From da3c83cd5e559f8464d555c5cffa86d0cdad6e59 Mon Sep 17 00:00:00 2001
From: Sagar Kamble <skamble@nvidia.com>
Date: Mon, 11 Oct 2021 17:08:47 +0530
Subject: [PATCH] gpu: nvgpu: update doxygen for common.ltc unit

Update the documentation as per SWUD feedback for common.ltc unit.

JIRA NVGPU-6982

Change-Id: I0a8406791bef2094bcd2804546db46378a269bb3
Signed-off-by: Sagar Kamble <skamble@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2608663
(cherry picked from commit dd1198870b4dbef5d4731fd5d292188c268967b6)
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2633960
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
Reviewed-by: svc-mobile-misra <svc-mobile-misra@nvidia.com>
Reviewed-by: svc-mobile-cert <svc-mobile-cert@nvidia.com>
Reviewed-by: Vijayakumar Subbu <vsubbu@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
GVS: Gerrit_Virtual_Submit
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
---
 drivers/gpu/nvgpu/hal/ltc/ltc_gm20b_fusa.c |  13 +-
 drivers/gpu/nvgpu/include/nvgpu/gops/ltc.h | 325 +++++++++++++++++++--
 drivers/gpu/nvgpu/include/nvgpu/ltc.h      | 173 ++++++-----
 3 files changed, 414 insertions(+), 97 deletions(-)
diff --git a/drivers/gpu/nvgpu/hal/ltc/ltc_gm20b_fusa.c b/drivers/gpu/nvgpu/hal/ltc/ltc_gm20b_fusa.c
index b6ae5f731..6ebb40f33 100644
--- a/drivers/gpu/nvgpu/hal/ltc/ltc_gm20b_fusa.c
+++ b/drivers/gpu/nvgpu/hal/ltc/ltc_gm20b_fusa.c
@@ -51,18 +51,17 @@ static int gm20b_ltc_wait_for_clean(struct gk20a *g)
 
 		/*
 		 * Use 5ms - this should be sufficient time to flush the cache.
-		 * On tegra, rough EMC BW available for old tegra chips (newer
-		 * chips are strictly faster) can be estimated as follows:
+		 * On tegra, rough EMC BW available can be estimated as follows:
 		 *
-		 * Lowest reasonable EMC clock speed will be around 102MHz on
-		 * t124 for display enabled boards and generally fixed to max
+		 * Lowest reasonable EMC clock speed will be around 204MHz on
+		 * t234 for display enabled boards and generally fixed to max
 		 * for non-display boards (since they are generally plugged in).
 		 *
-		 * Thus, the available BW is 64b * 2 * 102MHz = 1.3GB/s. Of that
+		 * Thus, the available BW is 128B * 2 * 204MHz = ~52GB/s. Of that
 		 * BW the GPU will likely get about half (display and overhead/
-		 * utilization inefficiency eating the rest) so 650MB/s at
+		 * utilization inefficiency eating the rest) so 26GB/s at
 		 * worst. Assuming at most 1MB of GPU L2 cache (less for most
-		 * chips) worst case is we take 1MB/650MB/s = 1.5ms.
+		 * chips) worst case is we take 1MB/26GB/s = 38us.
 		 *
 		 * So 5ms timeout here should be more than sufficient.
 		 */
diff --git a/drivers/gpu/nvgpu/include/nvgpu/gops/ltc.h b/drivers/gpu/nvgpu/include/nvgpu/gops/ltc.h
index 370fab22f..ce3d63db3 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/gops/ltc.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/gops/ltc.h
@@ -42,10 +42,174 @@ struct gops_ltc_intr {
 	/**
 	 * @brief ISR for handling ltc interrupts.
 	 *
-	 * @param g [in]		Pointer to GPU driver struct.
-	 * @param ltc [in]		LTC unit number
+	 * @param g [in]		- The GPU driver struct.
+	 *                                - The function does not perform
+	 *                                  validation of g parameter.
+	 * @param ltc [in]		- Index of LTC.
+	 *				  - The function validates that
+	 *				    ltc < g->ltc->ltc_count.
 	 *
-	 * This function handles ltc related ecc interrupts.
+	 * - For each ltc slice \a slice from 0 to g->ltc->slices_per_ltc - 1:
+	 *   -# The L2 has SEC-DED protection on its data RAM and parity protection on the
+	 *      byte enables RAM.
+	 *   -# See <a href="https://p4viewer.nvidia.com/get//hw/doc/gpu/ampere/ampere/design/Functional_Descriptions/Resiliency/Ampere_gpu_resiliency_ECC.docx">Ampere_gpu_resiliency_ECC.docx</a> for details.
+	 *   -# Following PRI registers are used for controlling parity ECC and
+	 *      getting the status and information of ECC.
+	 *      -# Control:
+	 *         -# ECC_CONTROL
+	 *      -# Error status and information:
+	 *         -# ECC_STATUS
+	 *         -# ECC_ADDRESS
+	 *         -# ECC_CORRECTED_ERR_COUNT
+	 *         -# ECC_UNCORRECTED_ERR_COUNT
+	 *   -# Detect and handle ECC PARITY errors and SEC-DED errors.
+	 *      SEC errors are reported as DSTG corrected errors and
+	 *      DED errors are reported as DSTG uncorrected errors.
+	 *      Below are the supported errors:
+	 *      -# UNCORRECTED_ERR_RSTG - signals a parity error in RSTG RAMS, for now only CBC RAMS
+	 *      -# UNCORRECTED_ERR_TSTG - signals a parity error in TSTG RAMS
+	 *      -# UNCORRECTED_ERR_DSTG - signals a parity error in DSTG RAMS, non-data RAMS
+	 *                                and DED in data RAMS.
+	 *      -# CORRECTED_ERR_DSTG - signals an ecc corrected error in DSTG data RAMS (SEC)
+	 *   -# Read ltc_ltc0_lts0_intr3_r() register corresponding to the slice adding the offset:
+	 *      \f$(ltc * GPU\_LIT\_LTC\_STRIDE) + (slice * GPU\_LIT\_LTS\_STRIDE)\f$
+	 *   -# Check if ltc_ltcs_ltss_intr3_ecc_uncorrected_m() or
+	 *      ltc_ltcs_ltss_intr3_ecc_corrected_m() is set in
+	 *      ltc_ltc0_lts0_intr3_r() register read above.
+	 *      If so, handle as below:
+	 *      -# Read following registers for the slice:
+	 *         -# ecc status register: ltc_ltc0_lts0_l2_cache_ecc_status_r()
+	 *         -# ecc address register: ltc_ltc0_lts0_l2_cache_ecc_address_r()
+	 *         -# ecc uncorrected count register:
+	 *            ltc_ltc0_lts0_l2_cache_ecc_uncorrected_err_count_r()
+	 *         -# ecc corrected count register:
+	 *            ltc_ltc0_lts0_l2_cache_ecc_corrected_err_count_r()
+	 *      -# Calculate counter delta by applying
+	 *         ltc_ltc0_lts0_l2_cache_ecc_uncorrected_err_count_total_v()
+	 *         to uncorrected count register read above.
+	 *      -# Check if the uncorrected count overflow happened by AND'ing ecc status
+	 *         read above with ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_total_counter_overflow_m().
+	 *      -# Reset the counter ltc_ltc0_lts0_l2_cache_ecc_uncorrected_err_count_r()
+	 *         to zero if the counter delta is non-zero or if there is overflow.
+	 *      -# Calculate counter delta by applying
+	 *         ltc_ltc0_lts0_l2_cache_ecc_corrected_err_count_total_v()
+	 *         to corrected count register read above.
+	 *      -# Check if the corrected count overflow happened by AND'ing ecc status
+	 *         read above with ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_total_counter_overflow_m().
+	 *      -# Reset the counter ltc_ltc0_lts0_l2_cache_ecc_corrected_err_count_r() to zero if
+	 *         the counter delta is non-zero or if there is overflow.
+	 *      -# Reset the counter ltc_ltc0_lts0_l2_cache_ecc_uncorrected_err_count_r() to zero if
+	 *         the counter delta is non-zero or if there is overflow.
+	 *      -# Write ltc_ltc0_lts0_l2_cache_ecc_status_reset_task_f() to
+	 *         ltc_ltc0_lts0_l2_cache_ecc_status_r() to reset the entire register.
+	 *      -# Add to the uncorrected counter delta
+	 *         BIT32(ltc_ltc0_lts0_l2_cache_ecc_uncorrected_err_count_total_s())
+	 *         if there is overflow.
+	 *      -# Add to the corrected counter delta
+	 *         BIT32(ltc_ltc0_lts0_l2_cache_ecc_corrected_err_count_total_s())
+	 *         if there is overflow.
+	 *      -# Handle ecc errors for subunits (part of the L2 slice detected an error).
+	 *         There are three subunits. Pass below parameters to these units:
+	 *         -# \a g
+	 *         -# \a ltc
+	 *         -# \a slice
+	 *         -# ecc status read
+	 *         -# ecc address read
+	 *         -# uncorrected delta
+	 *         -# corrected delta (This is passed to only DSTG ECC handling function)
+	 *
+	 *         ECC error handling for subunits is given below:
+	 *         -# r-stg : the input command queues and the compression bit cache
+	 *            -# If ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_rstg_m() is
+	 *               set in ecc status:
+	 *               -# Increment g->ecc.ltc.rstg_ecc_parity_count[\a ltc][\a slice].counter
+	 *                  with uncorrected counter delta with
+	 *                  \ref nvgpu_wrapping_add_u32 "nvgpu_wrapping_add_u32".
+	 *               -# Report to |qnx.sdl| unit by calling \ref nvgpu_report_ecc_err
+	 *                  "nvgpu_report_ecc_err" with following parameters:
+	 *                  -# \a g
+	 *                  -# \ref NVGPU_ERR_MODULE_LTC "NVGPU_ERR_MODULE_LTC"
+	 *                  -# (\a ltc << 8U) | \a slice
+	 *                  -# \ref GPU_LTC_CACHE_RSTG_ECC_UNCORRECTED
+	 *                     "GPU_LTC_CACHE_RSTG_ECC_UNCORRECTED"
+	 *                  -# ecc address read above
+	 *                  -# g->ecc.ltc.rstg_ecc_parity_count[\a ltc][\a slice].counter
+	 *            -# If ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_rstg_m() is
+	 *               set in ecc status, then it is considered as fatal error as it is not
+	 *               expected and call \ref BUG "BUG()".
+	 *         -# t-stg : tag lookup and miss fifos
+	 *            -# If ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_tstg_m() is
+	 *               set in ecc status:
+	 *               -# Increment g->ecc.ltc.tstg_ecc_parity_count[\a ltc][\a slice].counter
+	 *                  with uncorrected counter delta with
+	 *                  \ref nvgpu_wrapping_add_u32 "nvgpu_wrapping_add_u32".
+	 *               -# Report to |qnx.sdl| unit by calling \ref nvgpu_report_ecc_err
+	 *                  "nvgpu_report_ecc_err" with following parameters:
+	 *                  -# \a g
+	 *                  -# \ref NVGPU_ERR_MODULE_LTC "NVGPU_ERR_MODULE_LTC"
+	 *                  -# (\a ltc << 8U) | \a slice
+	 *                  -# \ref GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED
+	 *                     "GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED"
+	 *                  -# ecc address read above
+	 *                  -# g->ecc.ltc.tstg_ecc_parity_count[\a ltc][\a slice].counter
+	 *            -# If ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_tstg_m() is
+	 *               set in ecc status, then it is considered as fatal error as it is not
+	 *               expected and call \ref BUG "BUG()".
+	 *         -# d-stg : sram data banks and write data queues
+	 *            -# If ltc_ltc0_lts0_l2_cache_ecc_status_corrected_err_dstg_m() is
+	 *               set in ecc status:
+	 *               -# The correctable data ram errors are SEC errors.
+	 *               -# Increment g->ecc.ltc.ecc_sec_count[\a ltc][\a slice].counter
+	 *                  with corrected counter delta with
+	 *                  \ref nvgpu_wrapping_add_u32 "nvgpu_wrapping_add_u32".
+	 *               -# Report to |qnx.sdl| unit by calling \ref nvgpu_report_ecc_err
+	 *                  "nvgpu_report_ecc_err" with following parameters:
+	 *                  -# \a g
+	 *                  -# \ref NVGPU_ERR_MODULE_LTC "NVGPU_ERR_MODULE_LTC"
+	 *                  -# (\a ltc << 8U) | \a slice
+	 *                  -# \ref GPU_LTC_CACHE_DSTG_ECC_CORRECTED
+	 *                     "GPU_LTC_CACHE_DSTG_ECC_CORRECTED"
+	 *                  -# ecc address read above.
+	 *                  -# g->ecc.ltc.ecc_sec_count[\a ltc][\a slice].counter
+	 *               -# Flush the L2 cache by calling
+	 *                  \ref gops_mm_cache.l2_flush "gops_mm_cache.l2_flush".
+	 *               -# If it fails then call \ref BUG "BUG()".
+	 *            -# If ltc_ltc0_lts0_l2_cache_ecc_status_uncorrected_err_dstg_m() is
+	 *               set in ecc status:
+	 *               -# The uncorrectable data ram errors are reported with the dstg non-data
+	 *                  ram parity errors in the UNCORRECTED_ERR_DSTG field.
+	 *               -# Check if the ECC address corresponds to data ram:
+	 *                  -# Increment g->ecc.ltc.ecc_ded_count[\a ltc][\a slice].counter
+	 *                     with uncorrected counter delta with
+	 *                     \ref nvgpu_wrapping_add_u32 "nvgpu_wrapping_add_u32".
+	 *                  -# Report to |qnx.sdl| unit by calling \ref nvgpu_report_ecc_err
+	 *                     "nvgpu_report_ecc_err" with following parameters:
+	 *                     -# \a g
+	 *                     -# \ref NVGPU_ERR_MODULE_LTC "NVGPU_ERR_MODULE_LTC"
+	 *                     -# (\a ltc << 8U) | \a slice
+	 *                     -# \ref GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED
+	 *                        "GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED"
+	 *                     -# ecc address read above.
+	 *                     -# g->ecc.ltc.ecc_ded_count[\a ltc][\a slice].counter
+	 *               -# Else if the ECC address corresponds to DSTG BE RAM:
+	 *                  -# Increment g->ecc.ltc.dstg_be_ecc_parity_count[\a ltc][\a slice].counter
+	 *                     with uncorrected counter delta with
+	 *                     \ref nvgpu_wrapping_add_u32 "nvgpu_wrapping_add_u32".
+	 *                  -# Report to |qnx.sdl| unit by calling \ref nvgpu_report_ecc_err
+	 *                     "nvgpu_report_ecc_err" with following parameters:
+	 *                     -# \a g
+	 *                     -# \ref NVGPU_ERR_MODULE_LTC "NVGPU_ERR_MODULE_LTC"
+	 *                     -# (\a ltc << 8U) | \a slice
+	 *                     -# \ref GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED
+	 *                        "GPU_LTC_CACHE_DSTG_BE_ECC_UNCORRECTED"
+	 *                     -# ecc address read above
+	 *                     -# g->ecc.ltc.dstg_be_ecc_parity_count[\a ltc][\a slice].counter
+	 *               -# Else call \ref BUG "BUG()" as this type of ECC error is not supported.
+	 *      -# Clear the register ltc_ltc0_lts0_intr3_r() by writing the read value.
+	 * - return 0
+	 *
+	 * @return 0 in case of success, < 0 in case of failure.
+	 * @retval -ENODEV if invalid LTC number specified.
 	 */
 	void (*isr)(struct gk20a *g, u32 ltc);
 
@@ -68,13 +232,72 @@ struct gops_ltc_intr {
  */
 struct gops_ltc {
 	/**
-	 * @brief Initialize LTC support.
+	 * @brief Initialize Level Two Cache (LTC) support.
 	 *
-	 * @param g [in]		Pointer to GPU driver struct.
+	 * @param g [in]            - The GPU driver struct.
+	 *                            - The function does not perform validation
+	 *                              of g parameter.
 	 *
 	 * This function reads ltc unit info from GPU h/w and stores
 	 * it in #nvgpu_ltc structure. This function also initializes
-	 * LTC unit ecc counters.
+	 * LTC unit ecc counters. Steps are given below:
+	 *
+	 * - Allocate memory for g->ltc.
+	 * - Initialize LTC floorsweep state by calling the hal
+	 *   \ref gops_ltc.init_fs_state "gops_ltc.init_fs_state" with parameter \a g.
+	 *   - Initialize g->ltc->max_ltc_count with value returned by calling
+	 *     \ref gops_top.get_num_ltcs "g->ops.top.get_num_ltcs" with parameter \a g.
+	 *   - Initialize g->ltc->ltc_count with value returned by calling
+	 *     \ref gops_priv_ring.enum_ltc "g->ops.priv_ring.enum_ltc" with parameter \a g.
+	 *   - Initialize g->ltc->slices_per_ltc with value obtained by applying
+	 *     ltc_ltcs_ltss_cbc_param_slices_per_ltc_v() to register value read
+	 *     for the register ltc_ltcs_ltss_cbc_param_r().
+	 *   - Initialize g->ltc->cacheline_size with value obtained by shifting 512 to left by
+	 *     the shift value obtained by applying ltc_ltcs_ltss_cbc_param_cache_line_size_v()
+	 *     to register value read for the register ltc_ltcs_ltss_cbc_param_r().
+	 * - The L2 cache (LTC) has SEC-DED ECC protection on its data RAM and parity protection
+	 *   for byte enables.
+	 * - Initialize ECC counters for LTCs. On ga10b there are 2 LTCs and each LTC has 2 slices.
+	 *   For each slice, the following counters are initialized:
+	 *   -# ECC SEC count
+	 *   -# ECC DED count
+	 *   -# RSTG ECC parity count
+	 *   -# TSTG ECC parity count
+	 *   -# DSTG BE ECC parity count
+	 *   See also \ref gops_ltc.intr.isr "gops_ltc.intr.isr".
+	 * - Enable stalling interrupt for LTC unit.
+	 *   -# Enable interrupts at MC level: call #nvgpu_mc_intr_stall_unit_config by passing
+	 *      below parameters:
+	 *      -# \a g
+	 *      -# #MC_INTR_UNIT_LTC
+	 *      -# #MC_INTR_ENABLE
+	 *   -# Enable interrupts at unit level.
+	 *      The L2 interrupts controlled by ltc_ltcs_ltss_intr_r() register are only enabled
+	 *      by nvgpu. Various L2 interrupts are:
+	 *      -# IDLE_ERROR_CBC - flag if cbc gets a request while slcg clock is disabled
+	 *      -# IDLE_ERROR_TSTG - flag if tstg gets a request while slcg clock is disabled
+	 *      -# IDLE_ERROR_DSTG - flag if dstg gets a request while slcg clock is disabled
+	 *      -# EVICTED_CB - indicates that a CB was demoted.  Normally this should not happen
+	 *                      because the CBs should be flushed during context switch and/or
+	 *                      invalidated when no longer used.
+	 *      -# ILLEGAL_COMPSTAT - indicates an unexpected compression status given the kind.
+	 *      -# BLOCKLINEAR_CB  - indicates that a valid evict_last entry is accessed by a
+	 *                           block linear transaction.
+	 *      -# ECC_SEC_ERROR - single bit error in data banks. Obsolete.
+	 *      -# ECC_DED_ERROR - double bit error in data banks. Obsolete.
+	 *      -# DEBUG - unused
+	 *      -# ATOMIC_TO_Z - atomic to packing Z or S8.
+	 *      -# ILLEGAL_ATOMIC - unsupported atomic op and/or size received.
+	 *      -# BLKACTIVITY_ERR - internal error in power sensing block activity monitor
+	 *      -# ILLEGAL_COMPSTAT_ACCESS - indicates that some memory access read/wrote into
+	 *                                   the memory space reserved for the compression bit
+	 *                                   carveout (Bug 942161)
+	 *      -# ILLEGAL_ROP_ACCESS - zwr or cwr is scrubbed
+	 *
+	 *
+	 *      Of these, EVICTED_CB and ILLEGAL_COMPSTAT_ACCESS are disabled to reduce noise
+	 *      and increase performance. Rest of the interrupts are kept in hardware
+	 *      initialized state.
 	 *
 	 * @return 0 in case of success, < 0 in case of failure.
 	 * @retval -ENOMEM if memory allocation fails for #nvgpu_ltc.
@@ -84,20 +307,38 @@ struct gops_ltc {
 	/**
 	 * @brief Remove LTC support.
 	 *
-	 * @param g [in]		Pointer to GPU driver struct.
+	 * @param g [in]            - The GPU driver struct.
+	 *                            - The function does not perform validation
+	 *                              of g parameter.
 	 *
 	 * This function will free memory allocated for #nvgpu_ltc structure.
+	 * Steps are given below:
+	 *
+	 * - If g->ltc is NULL return.
+	 * - Free g->ltc.
+	 * - Set g->ltc to NULL.
 	 */
 	void (*ltc_remove_support)(struct gk20a *g);
 
 	/**
 	 * @brief Returns GPU L2 cache size.
 	 *
-	 * @param g [in]		Pointer to GPU driver struct.
+	 * @param g [in]            - The GPU driver struct.
+	 *                            - The function does not perform validation
+	 *                              of g parameter.
 	 *
-	 * This function returns GPU L2 cache size by reading h/w ltc
+	 * This function returns GPU L2 cache size by reading HW ltc
 	 * config register.
 	 *
+	 * - Read register ltc_ltc0_lts0_tstg_info_1_r().
+	 * - Get slice_size by applying ltc_ltc0_lts0_tstg_info_1_slice_size_in_kb_v()
+	 *   to the register value read above.
+	 * - Get slices_per_l2 by applying ltc_ltc0_lts0_tstg_info_1_slices_per_l2_v()
+	 *   to the register value read in 1st step.
+	 * - Calculate the size as:
+	 *   \f$ g->ltc->ltc\_count * slices\_per\_l2 * (slice\_size * 1024) \f$
+	 * - Return the size.
+	 *
 	 * @return Size of L2 cache in bytes.
 	 */
 	u64 (*determine_L2_size_bytes)(struct gk20a *g);
@@ -105,17 +346,67 @@ struct gops_ltc {
 	/**
 	 * @brief Flush GPU L2 cache.
 	 *
-	 * @param g [in]		Pointer to GPU driver struct.
+	 * @param g [in]            - The GPU driver struct.
+	 *                            - The function does not perform validation
+	 *                              of g parameter.
 	 *
 	 * This function flushes all L2 cache data to main memory by cleaning
-	 * and invaliding all cache sub-units. s/w will poll for completion of
-	 * each ltc unit cache cleaning/invalidation for 5 msec. This 5 msec
-	 * time out is based on following calculations:
-	 * Lowest EMC clock rate will be around 102MHz and thus available
-	 * bandwidth is 64b * 2 * 102MHz = 1.3GB/s. Of that bandwidth, GPU
-	 * will likely get about half, so 650MB/s at worst. Assuming at most
-	 * 1MB of GPU L2 cache, worst case it will take 1MB/650MB/s = 1.5ms.
+	 * and invalidating all cache sub-units. SW will poll for completion
+	 * of each ltc unit cache cleaning/invalidation for 5ms.
+	 *
+	 * The 5ms timeout is based on following calculations:
+	 * Lowest EMC clock rate will be around 204MHz and thus available
+	 * bandwidth is 128B (Cacheline size) * 2 (LTCs) * 204MHz = ~52GB/s.
+	 * Of that bandwidth, GPU will likely get about half, so 26GB/s
+	 * at worst. Assuming at most 1MB of GPU L2 cache, worst case
+	 * it will take 1MB/26GB/s = 38us.
 	 * So 5ms timeout here should be more than enough.
+	 *
+	 * - First stage is to clean the LTCs with the below write:
+	 *   \code
+	 *	nvgpu_writel(g, ltc_ltcs_ltss_tstg_cmgmt1_r(),
+	 *		ltc_ltcs_ltss_tstg_cmgmt1_clean_pending_f() |
+	 *		ltc_ltcs_ltss_tstg_cmgmt1_max_cycles_between_cleans_3_f() |
+	 *		ltc_ltcs_ltss_tstg_cmgmt1_clean_wait_for_fb_to_pull_true_f() |
+	 *		ltc_ltcs_ltss_tstg_cmgmt1_clean_evict_last_class_true_f() |
+	 *		ltc_ltcs_ltss_tstg_cmgmt1_clean_evict_normal_class_true_f() |
+	 *		ltc_ltcs_ltss_tstg_cmgmt1_clean_evict_first_class_true_f());
+	 *   \endcode
+	 * - This cleans all LTCs.
+	 * - For each LTC, wait for clean to finish for 5ms.
+	 *   -# Initialize poll timer with timeout of 5ms by calling
+	 *      \ref nvgpu_timeout_init "nvgpu_timeout_init"
+	 *      with below parameters:
+	 *      -# \a g
+	 *      -# local timeout variable
+	 *      -# 5
+	 *      -# \ref NVGPU_TIMER_CPU_TIMER "NVGPU_TIMER_CPU_TIMER"
+	 *   -# Repeat while the clean is still pending and the timeout has not expired:
+	 *      -# Read ltc_ltc0_ltss_tstg_cmgmt1_r() corresponding to the LTC.
+	 *         The offset is calculated as:
+	 *      \f$ltc\_ltc0\_ltss\_tstg\_cmgmt1\_r() + (ltc * GPU\_LIT\_LTC\_STRIDE)\f$
+	 *      -# Check if ltc_ltc0_ltss_tstg_cmgmt1_clean_pending_f() is cleared.
+	 * - Second stage is to invalidate the LTCs with the below write:
+	 *   \code
+	 *	nvgpu_writel(g, ltc_ltcs_ltss_tstg_cmgmt0_r(),
+	 *	     ltc_ltcs_ltss_tstg_cmgmt0_invalidate_pending_f() |
+	 *	     ltc_ltcs_ltss_tstg_cmgmt0_max_cycles_between_invalidates_3_f() |
+	 *	     ltc_ltcs_ltss_tstg_cmgmt0_invalidate_evict_last_class_true_f() |
+	 *	     ltc_ltcs_ltss_tstg_cmgmt0_invalidate_evict_normal_class_true_f() |
+	 *	     ltc_ltcs_ltss_tstg_cmgmt0_invalidate_evict_first_class_true_f());
+	 *   \endcode
+	 * - This invalidates all LTCs.
+	 * - For each LTC, wait for invalidate to finish for 5ms.
+	 *   -# Initialize poll timer with timeout of 5ms by calling
+	 *      \ref nvgpu_timeout_init "nvgpu_timeout_init"
+	 *      with below parameters:
+	 *      -# \a g
+	 *      -# local timeout variable
+	 *      -# 5
+	 *      -# \ref NVGPU_TIMER_CPU_TIMER "NVGPU_TIMER_CPU_TIMER"
+	 *   -# Repeat while the invalidate is still pending and the timeout has not expired:
+	 *      -# Read ltc_ltc0_ltss_tstg_cmgmt0_r() corresponding to the LTC.
+	 *      -# Check if ltc_ltc0_ltss_tstg_cmgmt0_invalidate_pending_f() is cleared.
 	 */
 	void (*flush)(struct gk20a *g);
 
diff --git a/drivers/gpu/nvgpu/include/nvgpu/ltc.h b/drivers/gpu/nvgpu/include/nvgpu/ltc.h
index 5b337653f..92491ceb2 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/ltc.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/ltc.h
@@ -55,6 +55,105 @@ struct nvgpu_ltc {
 	u32 cacheline_size;
 };
 
+/**
+ * @brief Get enumerated ltcs count.
+ *
+ * @param g [in]            - The GPU driver struct.
+ *                            - The function does not perform validation
+ *                              of g parameter.
+ *
+ * This function returns enumerated number of ltcs after floorsweeping.
+ * After floorsweeping enumerated ltcs may be less than maximum ltcs available.
+ *
+ * - Return value of g->ltc->ltc_count.
+ *
+ * @return Number of enumerated ltcs.
+ */
+u32 nvgpu_ltc_get_ltc_count(struct gk20a *g);
+
+/**
+ * @brief Get slices per ltc.
+ *
+ * @param g [in]            - The GPU driver struct.
+ *                            - The function does not perform validation
+ *                              of g parameter.
+ *
+ * This function returns slices per ltc.
+ * Each ltc unit is constituted by h/w configured multiple physical slices.
+ * Clients can use slice size info to make their cache requirement to
+ * a slice for better bandwidth and/or utilization.
+ *
+ * - Return value of g->ltc->slices_per_ltc.
+ *
+ * @return Number of slices per ltc.
+ */
+u32 nvgpu_ltc_get_slices_per_ltc(struct gk20a *g);
+
+/**
+ * @brief Get cacheline size.
+ *
+ * @param g [in]            - The GPU driver struct.
+ *                            - The function does not perform validation
+ *                              of g parameter.
+ *
+ * This function returns cacheline size in bytes.
+ * A cacheline is a chunk of memory that the cache can handle in one go.
+ * Cacheline size is configured as multiple of 512 bytes in h/w.
+ *
+ * - Return value of g->ltc->cacheline_size.
+ *
+ * @return Cacheline size in bytes.
+ */
+u32 nvgpu_ltc_get_cacheline_size(struct gk20a *g);
+
+#define NVGPU_L2_SECTOR_PROMOTE_FLAG_NONE		(1U << 0U)
+#define NVGPU_L2_SECTOR_PROMOTE_FLAG_64B		(1U << 1U)
+#define NVGPU_L2_SECTOR_PROMOTE_FLAG_128B		(1U << 2U)
+#define NVGPU_L2_SECTOR_PROMOTE_FLAG_INVALID		(1U << 3U)
+
+/**
+ * @brief Release all LTC ECC stats counters.
+ *
+ * @param g [in]            - The GPU driver struct.
+ *                            - The function does not perform validation
+ *                              of g parameter.
+ *
+ * Frees all error counters associated with the LTC unit.
+ *
+ * - For each ltc from 0 to \ref nvgpu_ltc_get_ltc_count "nvgpu_ltc_get_ltc_count(g)" - 1:
+ *   - Free dynamically allocated memory for following ECC counters for slices: SEC, DED,
+ *     RSTG parity, TSTG parity, DSTG parity.
+ * - Free container of the ECC counters for the LTCs.
+ *
+ */
+void nvgpu_ltc_ecc_free(struct gk20a *g);
+
+/** @cond DOXYGEN_SHOULD_SKIP_THIS */
+
+/**
+ * @brief Initialize #nvgpu_ltc structure.
+ *
+ * @param g [in]		Pointer to GPU driver struct.
+ *
+ * This function reads ltc unit info from GPU h/w and stores
+ * it in #nvgpu_ltc structure. This function allocates memory
+ * to track the ecc error counts for the LTC unit and enables
+ * LTC unit interrupts and stalling interrupt at MC level.
+ *
+ * @return 0 in case of success, < 0 in case of failure.
+ * @retval -ENOMEM if memory allocation for #nvgpu_ltc fails.
+ */
+int nvgpu_init_ltc_support(struct gk20a *g);
+/**
+ * @brief Remove support for LTC.
+ *
+ * @param g [in]		Pointer to GPU driver struct.
+ *
+ * This function will free memory allocated for #nvgpu_ltc structure.
+ * LTC unit data will be no longer accessible by s/w.
+ */
+void nvgpu_ltc_remove_support(struct gk20a *g);
+
 /**
  * @brief Allocate and initialize a error counters for all ltc-lts instances.
  *
@@ -81,79 +180,7 @@ int nvgpu_ecc_counter_init_per_lts(struct gk20a *g,
 #define NVGPU_ECC_COUNTER_INIT_PER_LTS(stat) \
 	nvgpu_ecc_counter_init_per_lts(g, &g->ecc.ltc.stat, #stat)
 
-#define NVGPU_L2_SECTOR_PROMOTE_FLAG_NONE		(1U << 0U)
-#define NVGPU_L2_SECTOR_PROMOTE_FLAG_64B		(1U << 1U)
-#define NVGPU_L2_SECTOR_PROMOTE_FLAG_128B		(1U << 2U)
-#define NVGPU_L2_SECTOR_PROMOTE_FLAG_INVALID		(1U << 3U)
-
-/**
- * @brief Release all LTC ECC stats counters.
- *
- * @param g [in] The GPU driver struct.
- *
- * Frees all error counters associated with the LTC unit.
- */
-void nvgpu_ltc_ecc_free(struct gk20a *g);
-
-/**
- * @brief Initialize #nvgpu_ltc structure.
- *
- * @param g [in]		Pointer to GPU driver struct.
- *
- * This function reads ltc unit info from GPU h/w and stores
- * it in #nvgpu_ltc structure. This function allocates memory
- * to track the ecc error counts for the LTC unit and enables
- * LTC unit interrupts and stalling interrupt at MC level.
- *
- * @return 0 in case of success, < 0 in case of failure.
- * @retval -ENOMEM if memory allocation for #nvgpu_ltc fails.
- */
-int nvgpu_init_ltc_support(struct gk20a *g);
-/**
- * @brief Remove support for LTC.
- *
- * @param g [in]		Pointer to GPU driver struct.
- *
- * This function will free memory allocated for #nvgpu_ltc structure.
- * LTC unit data will be no longer accessible by s/w.
- */
-void nvgpu_ltc_remove_support(struct gk20a *g);
-/**
- * @brief Get enumerated ltcs count.
- *
- * @param g [in]		Pointer to GPU driver struct.
- *
- * This function returns enumerated number of ltcs after floorsweeping.
- * After floorsweeping enumerated ltcs may be less than maximum ltcs available.
- *
- * @return Number of enumerated ltc count.
- */
-u32 nvgpu_ltc_get_ltc_count(struct gk20a *g);
-/**
- * @brief Get slices per ltc.
- *
- * @param g [in]		Pointer to GPU driver struct.
- *
- * This function returns slices per ltc.
- * Each ltc unit is constituted by h/w configured multiple physical slices.
- * Clients can use slice size info to make their cache requirement to
- * a slice for better bandwidth and/or utilization.
- *
- * @return Number of slices per ltc.
- */
-u32 nvgpu_ltc_get_slices_per_ltc(struct gk20a *g);
-/**
- * @brief Get cacheline size.
- *
- * @param g [in]	Pointer to GPU driver struct.
- *
- * This function returns cacheline size in bytes.
- * Cacheline is chunk of memory that can be handled in one go by cache.
- * Cacheline size is configured as multiple of 512 bytes in h/w.
- *
- * @return Cacheline size in bytes.
- */
-u32 nvgpu_ltc_get_cacheline_size(struct gk20a *g);
+/** @endcond DOXYGEN_SHOULD_SKIP_THIS */
 
 #if defined(CONFIG_NVGPU_NON_FUSA) || defined(CONFIG_NVGPU_KERNEL_MODE_SUBMIT)
 /**