gpu: nvgpu: add detailed documentation for common.rc unit

- Add detailed documentation for common.rc unit as per new guidance.

Jira NVGPU-6995

Change-Id: Id1ec052022e8ffb3d3f73bb79c388c01490c4bab
Signed-off-by: shashank singh <shashsingh@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2587869
(cherry picked from commit 5b75e43f8c97e53131f466bb4b420dd6e17921b1)
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2672030
Reviewed-by: Sagar Kamble <skamble@nvidia.com>
Reviewed-by: Vaibhav Kachore <vkachore@nvidia.com>
GVS: Gerrit_Virtual_Submit
This commit is contained in:
shashank singh
2021-09-02 13:01:05 +05:30
committed by mobile promotions
parent 1bdca92c50
commit 2ca87051e1

View File

@@ -151,13 +151,24 @@ static inline const char *nvgpu_rc_type_to_str(unsigned int rc_type)
* @brief Do recovery processing for ctxsw timeout.
*
* @param g [in] Pointer to GPU driver struct.
* - The function does not perform validation of \a g parameter.
* @param eng_bitmask [in] Engine bitmask.
* - The function does not perform validation of \a eng_bitmask parameter
* because it is not used for safety.
* @param tsg [in] Pointer to TSG struct.
* - The function does not perform validation of \a tsg parameter because it is
* passed as is to \ref nvgpu_tsg_set_error_notifier.
* @param debug_dump [in] Whether debug dump required or not.
* - The function does not perform validation of \a debug_dump parameter
* because it is not used for safety.
*
* Trigger SW/HW sequence to recover from timeout during context switch. For
* safety, just set error notifier to NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT
* and print warning if quiesce is not triggered already.
* safety, the steps performed by this API are
* - Call \ref nvgpu_tsg_set_error_notifier
* "nvgpu_tsg_set_error_notifier(g, tsg, NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT)"
* to set error notifier for the faulting tsg.
* - Call \ref WARN_ON "WARN_ON(!g->sw_quiesce_pending)" to print warning if
* quiesce is not triggered already.
*/
void nvgpu_rc_ctxsw_timeout(struct gk20a *g, u32 eng_bitmask,
struct nvgpu_tsg *tsg, bool debug_dump);
@@ -166,12 +177,64 @@ void nvgpu_rc_ctxsw_timeout(struct gk20a *g, u32 eng_bitmask,
* @brief Do recovery processing for PBDMA fault.
*
* @param g [in] Pointer to GPU driver struct.
* - The function does not perform validation of \a g parameter.
* @param pbdma_id [in] Pbdma identifier.
* - The function does not perform validation of \a pbdma_id parameter
* because it is just used for logging.
* @param error_notifier [in] Error notifier type to be set.
* - The function validates \a error_notifier to be in the range
* [0, \ref NVGPU_ERR_NOTIFIER_INVAL].
* @param pbdma_status [in] Pointer to PBDMA status info.
*
* - The function does not perform validation of \a pbdma_status parameter
* because it is not deferenced in this function.
* Do PBDMA fault recovery. Set error notifier as per \a error_notifier and call
* \a nvgpu_rc_tsg_and_related_engines to do the recovery.
* \ref nvgpu_rc_tsg_and_related_engines to do the recovery. Steps involved are
* - If \a error_notifier is >= \ref NVGPU_ERR_NOTIFIER_INVAL, set error variable to
* -EINVAL and jump to label \a out.
* - If \ref nvgpu_pbdma_status_is_chsw_valid
* "nvgpu_pbdma_status_is_chsw_valid(pbdma_status)" or
* \ref nvgpu_pbdma_status_is_chsw_save
* "nvgpu_pbdma_status_is_chsw_save(pbdma_status)" returns true, set id and
* id_type to \ref nvgpu_pbdma_status_info "pbdma_status->id" and
* \ref nvgpu_pbdma_status_info "pbdma_status->id_type" respectively.
* - Else if \ref nvgpu_pbdma_status_is_chsw_load
* "nvgpu_pbdma_status_is_chsw_load(pbdma_status)" or
* \ref nvgpu_pbdma_status_is_chsw_switch
* "nvgpu_pbdma_status_is_chsw_switch(pbdma_status)" returns true, set id and
* id_type to \ref nvgpu_pbdma_status_info "pbdma_status->next_id" and
* \ref nvgpu_pbdma_status_info "pbdma_status->next_id_type" respectively.
* - Else if \ref nvgpu_pbdma_status_ch_not_loaded
* "nvgpu_pbdma_status_ch_not_loaded(pbdma_status)" returns true, log message
* but don't set error variable.
* - Else log error message and set error variable to -EINVAL and trigger
* quiesce.
* - If id_type set in above steps matches with \ref PBDMA_STATUS_ID_TYPE_TSGID,
* call \ref nvgpu_tsg_get_from_id to get
* pointer to struct \ref nvgpu_tsg and store in variable tsg, then call
* \ref nvgpu_tsg_set_error_notifier
* "nvgpu_tsg_set_error_notifier(g, tsg, error_notifier)" to set error
* notifier buffer. Finally, call
* \ref nvgpu_rc_tsg_and_related_engines
* "nvgpu_rc_tsg_and_related_engines(g, tsg, true, RC_TYPE_PBDMA_FAULT)"
* to do recovery for tsg and related engines.
* - If id_type set in above steps matches with \ref PBDMA_STATUS_ID_TYPE_CHID,
* call \ref nvgpu_channel_from_id to get
* pointer to struct \ref nvgpu_channel and store in variable ch. If ch is NULL
* log error, set error variable to -EINVAL and jump to label \a out else get
* pointer to struct \ref nvgpu_tsg using API \ref nvgpu_tsg_from_ch
* "nvgpu_tsg_from_ch(ch)" and store in variable tsg. If tsg is NULL log
* error, set error variable to -EINVAL and jump to label \a out else set
* error notifier buffer by calling \ref nvgpu_tsg_set_error_notifier
* "nvgpu_tsg_set_error_notifier(g, tsg, error_notifier)" followed by doing
* recovery by calling \ref nvgpu_rc_tsg_and_related_engines
* "nvgpu_rc_tsg_and_related_engines(g, tsg, true, RC_TYPE_PBDMA_FAULT)".
* Finally put the channel reference by calling \ref nvgpu_channel_put
* "nvgpu_channel_put(ch)".
* - If id_type is not set to any of \ref PBMDA_STATUS_ID_TYPE_TSGID or
* \ref PBMDA_STATUS_ID_TYPE_CHID, log error and set error variable to -EINVAL.
* - At \a out label, if error variable is set, call
* \ref nvgpu_sw_quiesce "nvgpu_sw_quiesce(g)".
* - Return error variable.
*
* @return 0 in case of success, < 0 in case of failure.
* @retval -EINVAL in case of following cases:
@@ -188,10 +251,14 @@ int nvgpu_rc_pbdma_fault(struct gk20a *g, u32 pbdma_id, u32 error_notifier,
* @brief Do recovery processing for runlist update timeout.
*
* @param g [in] Pointer to GPU driver struct.
* - The function does not perform validation of \a g parameter.
* @param runlist_id [in] Runlist identifier.
* - The function does not perform validation of \a runlist_id parameter
* because it is not used for safety.
*
* Do runlist update timeout recovery. For safety, just print warning if quiesce
* is not triggered already.
* Do runlist update timeout recovery. For safety, just print warning by calling
* \ref WARN_ON "WARN_ON(!g->sw_quiesce_pending)" if quiesce is not triggered
* already.
*/
void nvgpu_rc_runlist_update(struct gk20a *g, u32 runlist_id);
@@ -199,11 +266,17 @@ void nvgpu_rc_runlist_update(struct gk20a *g, u32 runlist_id);
* @brief Do recovery processing for preemption timeout.
*
* @param g [in] Pointer to GPU driver struct.
* - The function does not perform validation of \a g parameter.
* @param tsg [in] Pointer to TSG struct.
* - The function does not perform validation of \a tsg parameter because it is
* passed as is to \ref nvgpu_tsg_set_error_notifier().
*
* Do preemption timeout recovery. For safety, just set error notifier to
* NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT and call BUG() if quiesce is
* not triggered already.
* Do preemption timeout recovery. For safety, following steps are performed
* - Set error notifier buffer to \ref NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT by
* calling \ref nvgpu_tsg_set_error_notifier
* "nvgpu_tsg_set_error_notifier(g, tsg, NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT)".
* - Call \ref BUG_ON "BUG_ON(!g->sw_quiesce_pending)" if quiesce is not
* triggered already.
*/
void nvgpu_rc_preempt_timeout(struct gk20a *g, struct nvgpu_tsg *tsg);
@@ -211,11 +284,17 @@ void nvgpu_rc_preempt_timeout(struct gk20a *g, struct nvgpu_tsg *tsg);
* @brief Do recovery processing for GR fault.
*
* @param g [in] Pointer to GPU driver struct.
* - The function does not perform validation of \a g parameter.
* @param tsg [in] Pointer to TSG struct.
* - The function does not perform validation of \a tsg parameter
* because it is not used for safety.
* @param ch [in] Pointer to channel struct.
* - The function does not perform validation of \a ch parameter
* because it is not used for safety.
*
* Do GR fault recovery. For safety, just print warning if quiesce is not
* triggered already.
* Do GR fault recovery. For safety, just print warning by calling
* \ref WARN_ON "WARN_ON(!g->sw_quiesce_pending)" if quiesce is not triggered
* already.
*/
void nvgpu_rc_gr_fault(struct gk20a *g,
struct nvgpu_tsg *tsg, struct nvgpu_channel *ch);
@@ -224,9 +303,11 @@ void nvgpu_rc_gr_fault(struct gk20a *g,
* @brief Do recovery processing for bad TSG sched error.
*
* @param g [in] Pointer to GPU driver struct.
* - The function does not perform validation of \a g parameter.
*
* Do bad TSG sched error recovery. For safety, just print warning if quiesce is
* not triggered already.
* Do bad TSG sched error recovery. For safety, just print warning by calling
* \ref WARN_ON "WARN_ON(!g->sw_quiesce_pending)" if quiesce is not triggered
* already.
*/
void nvgpu_rc_sched_error_bad_tsg(struct gk20a *g);
@@ -234,12 +315,21 @@ void nvgpu_rc_sched_error_bad_tsg(struct gk20a *g);
* @brief Do recovery processing for TSG and related engines.
*
* @param g [in] Pointer to GPU driver struct.
* - The function does not perform validation of \a g parameter.
* @param tsg [in] Pointer to TSG struct.
* - The function does not perform validation of \a tsg parameter
* because it is not used for safety.
* @param debug_dump [in] Whether debug dump required or not.
* - The function does not perform validation of \a debug_dump parameter
* because it is not used for safety.
* @param rc_type [in] Recovery type.
* - The function does not perform validation of \a rc_type parameter
* because it is not used for safety.
*
* Do TSG and related engines recovery dependending on the \a rc_type. For
* safety just print warning if quiesce is not triggered already.
* safety, just print warning by calling
* \ref WARN_ON "WARN_ON(!g->sw_quiesce_pending)" if quiesce is not triggered
* already.
*/
void nvgpu_rc_tsg_and_related_engines(struct gk20a *g, struct nvgpu_tsg *tsg,
bool debug_dump, u32 rc_type);
@@ -248,24 +338,50 @@ void nvgpu_rc_tsg_and_related_engines(struct gk20a *g, struct nvgpu_tsg *tsg,
* @brief Do recovery processing for mmu fault.
*
* @param g [in] Pointer to GPU driver struct.
* - The function does not perform validation of \a g parameter.
* @param act_eng_bitmask [in] Engine bitmask.
* - The function does not perform validation of \a act_eng_bitmask parameter
* because it is just used for logging.
* @param id [in] Hw identifier.
* - The function validates \a id parameter to be less than
* \ref nvgpu_fifo "g->fifo.num_channels" i.e. in the range
* [0, g->fifo.num_channels).
* @param id_type [in] Hw id type.
* - The function validates \a id_type parameter to be less than or equal to
* #ID_TYPE_TSG.
* @param rc_type [in] Recovery type.
* - The function does not perform validation of \a rc_type parameter
* because it is not used for safety.
* @param mmufault [in] Mmu fault info
* - The function does not perform validation of \a mmufault parameter.
* because it is not used for safety.
*
* Validate the id. Valid range is [0, g->fifo.num_channels).
* Validate the id type parameter. Valid range is [ID_TYPE_CHANNEL, ID_TYPE_TSG].
* Do mmu fault recovery dependending on the \a rc_type, \a act_eng_bitmask,
* \a hw_id and \a id_type.
* For safety,
* - set error notifier to NVGPU_ERR_NOTIFIER_FIFO_ERROR_MMU_ERR_FLT
* when \a id_type is TSG.
* - Mark the channels of that TSG as unserviceable when \a id_type is TSG
* - print warning if quiesce is not triggered already.
* For safety, following steps are performed
* - If \a id is greater than or equal to \ref nvgpu_fifo
* "g->fifo.num_channels" i.e. not in valid range
* [0, g->fifo.num_channels), set error variable to -EINVAL and jump to label
* \a out.
* - If \a id_type is greater than #ID_TYPE_TSG i.e. not in valid range
* [ID_TYPE_CHANNEL, ID_TYPE_TSG], set error variable to -EINVAL and jump to
* label \a out.
* - Log values of \a id, \a id_type and \a act_eng_bitmask.
* - If \a id is not \ref INVAL_ID and \a id_type is \ref ID_TYPE_TSG, call
* \ref nvgpu_tsg_set_ctx_mmu_error "nvgpu_tsg_set_ctx_mmu_error(g, tsg)"
* to set mmu fault in error notifier buffer and call
* \ref nvgpu_tsg_mark_error "nvgpu_tsg_mark_error(g, tsg)" to mark error
* for all channels belonging to the given tsg. Pointer to struct
* \ref nvgpu_tsg is retrieved from \ref nvgpu_fifo "&g->fifo.tsg[id]".
* - print warning by calling
* \ref WARN_ON() "WARN_ON(!g->sw_quiesce_pending)" if quiesce is not
* triggered already.
* - At \a out label, if error variable is set, call \ref nvgpu_sw_quiesce
* "nvgpu_sw_quiesce(g)".
* - Return error variable.
*
* @return 0 in case of success, < 0 in case of failure.
* @retval -EINVAL in case ID and ID type are invalid.
* @retval -EINVAL in case \a id or \a id_type is invalid.
*/
int nvgpu_rc_mmu_fault(struct gk20a *g, u32 act_eng_bitmask,
u32 id, unsigned int id_type, unsigned int rc_type,
@@ -275,16 +391,31 @@ int nvgpu_rc_mmu_fault(struct gk20a *g, u32 act_eng_bitmask,
* @brief The core recovery sequence to teardown faulty TSG
*
* @param g [in] Pointer to GPU driver struct.
* - The function does not perform validation of \a g parameter.
* @param eng_bitmask [in] Engine bitmask.
* - The function does not perform validation of \a eng_bitmask parameter
* because it is not used for safety.
* @param hw_id [in] Hardware identifier.
* - The function does not perform validation of \a hw_id parameter
* because it is not used for safety.
* @param id_is_tsg [in] Whether hw id is TSG or not.
* - The function does not perform validation of \a id_is_tsg parameter
* because it is not used for safety.
* @param id_is_known [in] Whether hw id is known or not.
* - The function does not perform validation of \a id_is_known parameter
* because it is not used for safety.
* @param debug_dump [in] Whether debug dump required or not.
* - The function does not perform validation of \a debug_dump parameter
* because it is not used for safety.
* @param rc_type [in] Recovery type.
* - The function does not perform validation of \a rc_type parameter
* because it is not used for safety.
*
* Perform the manual recommended channel teardown sequence depending on the
* \a rc_type, \a eng_bitmask, \a hw_id, \a id_is_tsg and \a id_is_known. For
* safety, just print warning if quiesce is not triggered already.
* safety, just print warning by calling
* \ref WARN_ON "WARN_ON(!g->sw_quiesce_pending)" if quiesce is not triggered
* already.
*/
void nvgpu_rc_fifo_recover(struct gk20a *g,
u32 eng_bitmask, /* if zero, will be queried from HW */