gpu: nvgpu: enhance CE error reporting documentation

Update documentation for function nvgpu_report_ce_err to include fine granular implemenation details. In additiona, remove redundant descrptions from error reporting functions. Jira NVGPU-6948 Change-Id: Ie1675b0260809bfbc6fdeab6748c48347b5f3d7d Signed-off-by: Antony Clince Alex <aalex@nvidia.com> Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2554573 (cherry picked from commit a5f84edde5943358549534b8f736ee931a28c1ad) Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2555909 Reviewed-by: Dinesh T <dt@nvidia.com> Reviewed-by: svc_kernel_abi <svc_kernel_abi@nvidia.com> Reviewed-by: Ankur Kishore <ankkishore@nvidia.com> Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com> GVS: Gerrit_Virtual_Submit Tested-by: Rajesh Devaraj <rdevaraj@nvidia.com> Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
2025-12-24 10:34:43 +03:00 · 2021-07-06 09:27:32 +00:00
parent 3c97f3b932
commit bb5bffe571
1 changed files with 63 additions and 43 deletions
--- a/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h
@@ -395,8 +395,7 @@ struct nvgpu_hw_err_inject_info_desc {
 };

 /**
- * @brief GPU HW errors need to be reported to Safety_Services via SDL unit.
- *        This function provides an interface to report errors from HOST
+ * @brief This function provides an interface to report errors from HOST
 *        (FIFO/PBDMA/PBUS) unit to SDL unit.
 *
 * @param g [in]		- The GPU driver struct.
@@ -438,42 +437,71 @@ void nvgpu_report_host_err(struct gk20a *g, u32 hw_unit,
 	u32 inst, u32 err_id, u32 intr_info);

 /**
- * @brief GPU HW errors need to be reported to Safety_Services via SDL unit.
- *        This function provides an interface to report errors from CE unit
+ * @brief This function provides an interface to report errors from CE unit
 *        to SDL unit.
 *
 * @param g [in]		- The GPU driver struct.
- * @param hw_unit [in]		- Index of HW unit (CE).
- *				  - NVGPU_ERR_MODULE_CE
+ *                                - The function does not perform validation of
+ *                                  g parameter.
+ * @param hw_unit [in]		- Index of HW unit.
+ *				  - The function validates that hw_unit ==
+ *				    \link NVGPU_ERR_MODULE_CE \endlink.
 * @param inst [in]		- Instance ID.
 *				  - In case of multiple instances of the same HW
 *				    unit (e.g., there are multiple instances of
 *				    CE), it is used to identify the instance
 *				    that encountered a fault.
+ *				  - The function does not perform any validation
+ *				    on this parameter.
 * @param err_id [in]		- Error index.
- *				  - Min: GPU_CE_LAUNCH_ERROR
- *				  - Max: GPU_CE_METHOD_BUFFER_FAULT
+ *				  - The function validates that, this paramter
+ *				    has a value within valid range.
+ *				  - Min: \link GPU_CE_LAUNCH_ERROR \endlink
+ *				  - Max: \link GPU_CE_METHOD_BUFFER_FAULT \endlink
 * @param intr_info [in]	- Content of interrupt status register.
+ *				  - The function does not perform any validation
+ *				    on this parameter.
 *
- * - Checks whether SDL is supported in the current GPU platform. If SDL is not
- *   supported, it simply returns.
- * - Validates both \a hw_unit and \a err_id indices. In case of a failure,
- *   invokes #nvgpu_sdl_handle_report_failure() api.
- * - Gets the current time of a clock. In case of a failure, invokes
- *   #nvgpu_sdl_handle_report_failure() api.
- * - Gets error description from internal look-up table using \a hw_unit and
- *   \a err_id indices.
- * - Forms error packet using details such as time-stamp, \a hw_unit, \a err_id,
- *   criticality of the error, \a inst, error description, \a intr_info and
- *   size of the error packet.
- * - Performs compile-time assert check to ensure that the size of the error
- *   packet does not exceed the maximum allowable size specified in
- *   #MAX_ERR_MSG_SIZE.
- * - Invokes #nvgpu_sdl_report_error_rmos() to enqueue the error packet into
- *   error message queue. In case of any failure during this enqueue operation,
- *   #nvgpu_sdl_handle_report_failure() api is invoked to handle the failure.
- * - The error packet will be dequeued from the error message queue and reported
- *   to Safety_Services by #nvgpu_sdl_worker thread.
+ * - Check nvgpu_os_rmos.is_sdl_supported equals true (pointer to nvgpu_os_rmos
+ *   is obtained using \ref nvgpu_os_rmos_from_gk20a()
+ *   "nvgpu_os_rmos_from_gk20a(g)"), return on failure.
+ * - Perform validation of input paramters, see paramter section for detailed
+ *   validation criteria. In case of a failure, print error message and invoke
+ *   \ref nvgpu_sdl_handle_report_failure()
+ *   "nvgpu_sdl_handle_report_failure(g)" and return.
+ * - Get the current time using api clock_gettime(CLOCK_MONOTONIC, &ts).
+ *   In case of a failure, print error message and invoke
+ *   \ref nvgpu_sdl_handle_report_failure()
+ *   "nvgpu_sdl_handle_report_failure(g)" and return.
+ * - Declare and initialize an error message packet err_pkt of type
+ *   nvgpu_err_msg, using \ref nvgpu_init_ce_err_msg()
+ *   "nvgpu_init_ce_err_msg(&err_pkt)".
+ * - Get error description err_desc of type nvgpu_err_desc from internal
+ *   look-up table \ref nvgpu_os_rmos.sdl_rmos "nvgpu_os_rmos.sdl_rmos".\ref
+ *   nvgpu_sdl_rmos.err_lut "err_lut" using hw_unit and err_id.
+ * - Update the following fields in the error message packet err_pkt.
+ *   - nvgpu_err_msg.hw_unit_id = hw_unit
+ *   - nvgpu_err_msg.is_critical = err_desc->is_critical
+ *   - nvgpu_err_msg.err_id = err_desc->error_id
+ *   - nvgpu_err_msg.err_size = sizeof(gpu_error_info.ce_info)
+ *   - \ref nvgpu_err_msg.err_info "nvgpu_err_msg.err_info".\ref
+ *     gpu_error_info.ce_info "ce_info".\ref gpu_ce_error_info.header
+ *     "header".\ref gpu_err_header.sub_err_type "sub_err_type" = intr_info
+ *   - \ref nvgpu_err_msg.err_info "nvgpu_err_msg.err_info".\ref
+ *     gpu_error_info.ce_info "ce_info".\ref gpu_ce_error_info.header
+ *     "header".\ref gpu_err_header.timestamp_ns "timestamp_ns" =
+ *     \ref nvgpu_timespec2nsec() "nvgpu_timespec2nsec(&ts)"
+ *   - nvgpu_err_msg.err_desc = err_desc
+ * - Invoke \ref nvgpu_sdl_report_error_rmos()
+ *   "nvgpu_sdl_report_error_rmos(g, &err_pkt, sizeof(err_pkt))" to enqueue
+ *   the packet err_pkt into the circular buffer \ref nvgpu_os_rmos.sdl_rmos
+ *   "nvgpu_os_rmos.sdl_rmos".\ref nvgpu_sdl_rmos.emsg_q "emsg_q". In case
+ *   of failure, print error message and invoke
+ *   \ref nvgpu_sdl_handle_report_failure() "nvgpu_sdl_handle_report_failure(g)"
+ *   and return.
+ * - The error packet err_pkt will be dequeued from \ref
+ *   nvgpu_os_rmos.sdl_rmos "nvgpu_os_rmos.sdl_rmos".\ref nvgpu_sdl_rmos.emsg_q
+ *   "emsg_q" and reported to Safety Service by nvgpu_sdl_worker() thread.
 *
 * @return	None
 */
@@ -481,8 +509,7 @@ void nvgpu_report_ce_err(struct gk20a *g, u32 hw_unit,
 	u32 inst, u32 err_id, u32 intr_info);

 /**
- * @brief GPU HW errors need to be reported to Safety_Services via SDL unit.
- *        This function provides an interface to report ECC erros to SDL unit.
+ * @brief This function provides an interface to report ECC erros to SDL unit.
 *
 * @param g [in]		- The GPU driver struct.
 * @param hw_unit [in]		- Index of HW unit.
@@ -556,8 +583,7 @@ void nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst,
 		u32 err_id, u64 err_addr, u64 err_count);

 /**
- * @brief GPU HW errors need to be reported to Safety_Services via SDL unit.
- *        This is a wrapper function to report ECC errors from HUBMMU to SDL.
+ * @brief This is a wrapper function to report ECC errors from HUBMMU to SDL.
 *
 * @param g [in]		- The GPU driver struct.
 * @param err_id [in]		- Error index.
@@ -580,8 +606,7 @@ static inline void nvgpu_report_fb_ecc_err(struct gk20a *g, u32 err_id, u64 err_
 }

 /**
- * @brief GPU HW errors need to be reported to Safety_Services via SDL unit.
- *        This function provides an interface to report CTXSW erros to SDL unit.
+ * @brief This function provides an interface to report CTXSW erros to SDL unit.
 *
 * @param g [in]	- The GPU driver struct.
 * @param hw_unit [in]	- Index of HW unit (FECS).
@@ -618,8 +643,7 @@ void nvgpu_report_ctxsw_err(struct gk20a *g, u32 hw_unit, u32 err_id,
 		void *data);

 /**
- * @brief GPU HW errors need to be reported to Safety_Services via SDL unit.
- *        This function provides an interface to report SM and PGRAPH erros
+ * @brief This function provides an interface to report SM and PGRAPH erros
 *        to SDL unit.
 *
 * @param g [in]		- The GPU driver struct.
@@ -672,8 +696,7 @@ void nvgpu_report_gr_err(struct gk20a *g, u32 hw_unit, u32 inst,
 		u32 err_id, struct gr_err_info *err_info, u32 sub_err_type);

 /**
- * @brief GPU HW errors need to be reported to Safety_Services via SDL unit.
- *        This function provides an interface to report PMU erros to SDL unit.
+ * @brief This function provides an interface to report PMU erros to SDL unit.
 *
 * @param g [in]		- The GPU driver struct.
 * @param hw_unit [in]		- Index of HW unit (PMU).
@@ -711,8 +734,7 @@ void nvgpu_report_pmu_err(struct gk20a *g, u32 hw_unit, u32 err_id,
 	u32 sub_err_type, u32 status);

 /**
- * @brief GPU HW errors need to be reported to Safety_Services via SDL unit.
- *        This function provides an interface to report PRI erros to SDL unit.
+ * @brief This function provides an interface to report PRI erros to SDL unit.
 *
 * @param g [in]		- The GPU driver struct.
 * @param hw_unit [in]		- Index of HW unit (PRI).
@@ -757,8 +779,7 @@ void nvgpu_report_pri_err(struct gk20a *g, u32 hw_unit, u32 inst,
 		u32 err_id, u32 err_addr, u32 err_code);

 /**
- * @brief GPU HW errors need to be reported to Safety_Services via SDL unit.
- *        This function provides an interface to report HUBMMU erros to SDL.
+ * @brief This function provides an interface to report HUBMMU erros to SDL.
 *
 * @param g [in]		- The GPU driver struct.
 * @param hw_unit [in]		- Index of HW unit (HUBMMU).
@@ -799,8 +820,7 @@ void nvgpu_report_mmu_err(struct gk20a *g, u32 hw_unit,
 		u32 status, u32 sub_err_type);

 /**
- * @brief GPU HW errors need to be reported to Safety_Services via SDL unit.
- *        This is a wrapper function to report CTXSW errors to SDL unit.
+ * @brief This is a wrapper function to report CTXSW errors to SDL unit.
 *
 * @param g [in]		- The GPU driver struct.
 * @param err_type [in]		- Error index.