mirror of
git://nv-tegra.nvidia.com/linux-nvgpu.git
synced 2025-12-22 17:36:20 +03:00
gpu: nvgpu: add doxygen comments for common.ecc unit
Add doxygen style comments for common.ecc unit. Jira NVGPU-2475 Change-Id: Ie31f27a5fb253ac33e7b1b795c7268fd4a626a32 Signed-off-by: Antony Clince Alex <aalex@nvidia.com> Reviewed-on: https://git-master.nvidia.com/r/2217455 Reviewed-by: Rajesh Devaraj <rdevaraj@nvidia.com> Reviewed-by: Philip Elcan <pelcan@nvidia.com> GVS: Gerrit_Virtual_Submit Reviewed-by: Ankur Kishore <ankkishore@nvidia.com> Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com> Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
committed by
Alex Waterman
parent
6336c78c40
commit
e469c9cd98
@@ -23,6 +23,58 @@
|
||||
#ifndef NVGPU_ECC_H
|
||||
#define NVGPU_ECC_H
|
||||
|
||||
/**
|
||||
* @file
|
||||
* @page unit-ecc Unit ECC(Error Control Codes)
|
||||
*
|
||||
* Acronyms
|
||||
* ========
|
||||
* ECC - Error Control Codes
|
||||
* SEC - Single Error Correction
|
||||
* SEC-DED - Standard single-error correction with double-error detection
|
||||
* SED - Single Error Detection
|
||||
*
|
||||
* Overview
|
||||
* ========
|
||||
* The memories within the GPU are protected using data integrity protection
|
||||
* mechanism like ecc or parity. This unit is responsible for allocating,
|
||||
* initializing and maintaining error counters for all memories which support
|
||||
* ecc/parity protection.
|
||||
*
|
||||
* + Initialization:
|
||||
* This unit allocates and initializes error counters (corrected and
|
||||
* uncorrected) for each memory and concatenates them into a list.
|
||||
*
|
||||
* Data Structures
|
||||
* ===============
|
||||
*
|
||||
* The data structures exposed by the ECC unit, conveys to the user information
|
||||
* regarding the corrected, uncorrected errors encountered in the constituent
|
||||
* memories in the GPU hardware units like (gr, ltc, pmu, etc).
|
||||
*
|
||||
* The following are the list of data structures:
|
||||
*
|
||||
* + struct nvgpu_ecc_stat
|
||||
*
|
||||
*
|
||||
* + struct nvgpu_ecc
|
||||
*
|
||||
*
|
||||
* Static Design
|
||||
* =============
|
||||
*
|
||||
* TODO
|
||||
*
|
||||
* Dynamic Design
|
||||
* =============
|
||||
*
|
||||
* TODO
|
||||
*
|
||||
* External APIs
|
||||
* -------------
|
||||
*
|
||||
*/
|
||||
|
||||
#include <nvgpu/types.h>
|
||||
#include <nvgpu/list.h>
|
||||
|
||||
@@ -30,12 +82,32 @@
|
||||
|
||||
struct gk20a;
|
||||
|
||||
/**
|
||||
* This struct holds the ecc/parity error information associated with each
|
||||
* memory. The error information includes a string that can be used to
|
||||
* uniquely identity the memory, error type. In addition it has a 32 bit
|
||||
* counter to track the number of instances of the errors.
|
||||
*/
|
||||
struct nvgpu_ecc_stat {
|
||||
/** The unique name associated with error */
|
||||
char name[NVGPU_ECC_STAT_NAME_MAX_SIZE];
|
||||
/** The 32-bit error counter */
|
||||
u32 counter;
|
||||
/**
|
||||
* The embedded list element, this is used to link the counters into
|
||||
* linked list.
|
||||
*/
|
||||
struct nvgpu_list_node node;
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Helper function to get struct nvgpu_ecc_stat from list node.
|
||||
*
|
||||
* @param node [in] List element node.
|
||||
*
|
||||
* @return Pointer to struct nvgpu_ecc_stat.
|
||||
*
|
||||
*/
|
||||
static inline struct nvgpu_ecc_stat *nvgpu_ecc_stat_from_node(
|
||||
struct nvgpu_list_node *node)
|
||||
{
|
||||
@@ -44,85 +116,166 @@ static inline struct nvgpu_ecc_stat *nvgpu_ecc_stat_from_node(
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* The structure contains the error statistics assocaited with constituent
|
||||
* memories within each gpu hardware unit. All statistics are linked together
|
||||
* into a list, the head of this list is stored in stats_list.
|
||||
*/
|
||||
struct nvgpu_ecc {
|
||||
/**
|
||||
* Contains error statistics for each memory contained within the gr
|
||||
* unit.
|
||||
*/
|
||||
struct {
|
||||
/* stats per tpc */
|
||||
|
||||
/** SM register file SEC count. */
|
||||
struct nvgpu_ecc_stat **sm_lrf_ecc_single_err_count;
|
||||
/** SM register file DED count. */
|
||||
struct nvgpu_ecc_stat **sm_lrf_ecc_double_err_count;
|
||||
|
||||
/** SM shared memory SEC count. */
|
||||
struct nvgpu_ecc_stat **sm_shm_ecc_sec_count;
|
||||
/** SM shared memory SED count. */
|
||||
struct nvgpu_ecc_stat **sm_shm_ecc_sed_count;
|
||||
/** SM shared memory DED count. */
|
||||
struct nvgpu_ecc_stat **sm_shm_ecc_ded_count;
|
||||
|
||||
/** TEX pipe0 total SEC count. */
|
||||
struct nvgpu_ecc_stat **tex_ecc_total_sec_pipe0_count;
|
||||
/** TEX pipe0 total DED count. */
|
||||
struct nvgpu_ecc_stat **tex_ecc_total_ded_pipe0_count;
|
||||
/** TEX pipe0 unique SEC count. */
|
||||
struct nvgpu_ecc_stat **tex_unique_ecc_sec_pipe0_count;
|
||||
/** TEX pipe0 unique DED count. */
|
||||
struct nvgpu_ecc_stat **tex_unique_ecc_ded_pipe0_count;
|
||||
/** TEX pipe1 total SEC count. */
|
||||
struct nvgpu_ecc_stat **tex_ecc_total_sec_pipe1_count;
|
||||
/** TEX pipe1 total DED count. */
|
||||
struct nvgpu_ecc_stat **tex_ecc_total_ded_pipe1_count;
|
||||
/** TEX pipe1 unique SEC count. */
|
||||
struct nvgpu_ecc_stat **tex_unique_ecc_sec_pipe1_count;
|
||||
/** TEX pipe1 unique DED count. */
|
||||
struct nvgpu_ecc_stat **tex_unique_ecc_ded_pipe1_count;
|
||||
|
||||
/** SM l1-tag correct error count. */
|
||||
struct nvgpu_ecc_stat **sm_l1_tag_ecc_corrected_err_count;
|
||||
/** SM l1-tag uncorrected error count. */
|
||||
struct nvgpu_ecc_stat **sm_l1_tag_ecc_uncorrected_err_count;
|
||||
/** SM CBU corrected error count. */
|
||||
struct nvgpu_ecc_stat **sm_cbu_ecc_corrected_err_count;
|
||||
/** SM CBU uncorrected error count. */
|
||||
struct nvgpu_ecc_stat **sm_cbu_ecc_uncorrected_err_count;
|
||||
/** SM l1-data corrected error count. */
|
||||
struct nvgpu_ecc_stat **sm_l1_data_ecc_corrected_err_count;
|
||||
/** SM l1-data uncorrected error count. */
|
||||
struct nvgpu_ecc_stat **sm_l1_data_ecc_uncorrected_err_count;
|
||||
/** SM icache corrected error count. */
|
||||
struct nvgpu_ecc_stat **sm_icache_ecc_corrected_err_count;
|
||||
/** SM icache uncorrected error count. */
|
||||
struct nvgpu_ecc_stat **sm_icache_ecc_uncorrected_err_count;
|
||||
|
||||
/* stats per gpc */
|
||||
|
||||
/** GCC l1.5-cache corrected error count. */
|
||||
struct nvgpu_ecc_stat *gcc_l15_ecc_corrected_err_count;
|
||||
/** GCC l1.5-cache uncorrected error count. */
|
||||
struct nvgpu_ecc_stat *gcc_l15_ecc_uncorrected_err_count;
|
||||
|
||||
/** GPCCS falcon i-mem, d-mem corrected error count. */
|
||||
struct nvgpu_ecc_stat *gpccs_ecc_corrected_err_count;
|
||||
/** GPCCS falcone i-mem, d-mem uncorrected error count. */
|
||||
struct nvgpu_ecc_stat *gpccs_ecc_uncorrected_err_count;
|
||||
|
||||
/** GMMU l1tlb corrected error count. */
|
||||
struct nvgpu_ecc_stat *mmu_l1tlb_ecc_corrected_err_count;
|
||||
/** GMMU l1tlb uncorrected error count. */
|
||||
struct nvgpu_ecc_stat *mmu_l1tlb_ecc_uncorrected_err_count;
|
||||
|
||||
/* stats per device */
|
||||
/** FECS falcon i-mem, d-mem corrected error count. */
|
||||
struct nvgpu_ecc_stat *fecs_ecc_corrected_err_count;
|
||||
/** FECS falcon i-mem, d-mem uncorrected error count. */
|
||||
struct nvgpu_ecc_stat *fecs_ecc_uncorrected_err_count;
|
||||
} gr;
|
||||
|
||||
/**
|
||||
* Contains error statistics for each memory contained within the ltc
|
||||
* unit.
|
||||
*/
|
||||
struct {
|
||||
/* stats per lts */
|
||||
/** ltc-lts sec count. */
|
||||
struct nvgpu_ecc_stat **ecc_sec_count;
|
||||
/** ltc-lts ded count. */
|
||||
struct nvgpu_ecc_stat **ecc_ded_count;
|
||||
} ltc;
|
||||
|
||||
/**
|
||||
* Contains error statistics for each memory contained within the fb
|
||||
* unit.
|
||||
*/
|
||||
struct {
|
||||
/* stats per device */
|
||||
/** hubmmu l2tlb corrected error count. */
|
||||
struct nvgpu_ecc_stat *mmu_l2tlb_ecc_corrected_err_count;
|
||||
/** hubmmu l2tlb uncorrected error count. */
|
||||
struct nvgpu_ecc_stat *mmu_l2tlb_ecc_uncorrected_err_count;
|
||||
/** hubmmu hubtlb corrected error count. */
|
||||
struct nvgpu_ecc_stat *mmu_hubtlb_ecc_corrected_err_count;
|
||||
/** hubmmu hubtlb uncorrected error count. */
|
||||
struct nvgpu_ecc_stat *mmu_hubtlb_ecc_uncorrected_err_count;
|
||||
/** hubmmu fillunit corrected error count. */
|
||||
struct nvgpu_ecc_stat *mmu_fillunit_ecc_corrected_err_count;
|
||||
/** hubmmu fillunit uncorrected error count. */
|
||||
struct nvgpu_ecc_stat *mmu_fillunit_ecc_uncorrected_err_count;
|
||||
} fb;
|
||||
|
||||
/**
|
||||
* Contains error statistics for each memory contained within the pmu
|
||||
* unit.
|
||||
*/
|
||||
struct {
|
||||
/* stats per device */
|
||||
/** PMU falcon imem, dmem corrected error count. */
|
||||
struct nvgpu_ecc_stat *pmu_ecc_corrected_err_count;
|
||||
/** PMU falcon imem, dmem uncorrected error count. */
|
||||
struct nvgpu_ecc_stat *pmu_ecc_uncorrected_err_count;
|
||||
} pmu;
|
||||
|
||||
/**
|
||||
* Contains error statistics for each memory contained within the fbpa
|
||||
* unit.
|
||||
*/
|
||||
struct {
|
||||
/* stats per fbpa */
|
||||
/** fbpa sec count. */
|
||||
struct nvgpu_ecc_stat *fbpa_ecc_sec_err_count;
|
||||
/** fbpa ded count. */
|
||||
struct nvgpu_ecc_stat *fbpa_ecc_ded_err_count;
|
||||
} fbpa;
|
||||
|
||||
/** Contains the head to the list of error statistics. */
|
||||
struct nvgpu_list_node stats_list;
|
||||
/** Contains the number of error statistics. */
|
||||
int stats_count;
|
||||
/** Flag stores the initialization status of ECC unit. */
|
||||
bool initialized;
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Allocate and initialize error counter specified by name for all
|
||||
* gpc-tpc instances.
|
||||
*
|
||||
* @param g [in] The GPU driver struct.
|
||||
* @param stat [out] Pointer to array of pointers of error counters.
|
||||
* @param name [in] Unique name for error counter.
|
||||
*
|
||||
* Calculates the total number of tpcs across all gpcs within the gr unit.
|
||||
* Then allocates, initializes memory to hold error counters associated with all
|
||||
* tpcs, which is then added to the stats_list in struct nvgpu_ecc.
|
||||
*
|
||||
* @return 0 in case of success, less than 0 for failure.
|
||||
*/
|
||||
int nvgpu_ecc_counter_init_per_tpc(struct gk20a *g,
|
||||
struct nvgpu_ecc_stat ***stat, const char *name);
|
||||
/*
|
||||
* @brief Allocate and initalize counter for memories common across a TPC.
|
||||
*
|
||||
* @param stat [in] Address of pointer to struct nvgpu_ecc_stat.
|
||||
*
|
||||
*/
|
||||
#define NVGPU_ECC_COUNTER_INIT_PER_TPC(stat) \
|
||||
do { \
|
||||
int err = 0; \
|
||||
@@ -133,38 +286,145 @@ int nvgpu_ecc_counter_init_per_tpc(struct gk20a *g,
|
||||
} \
|
||||
} while (false)
|
||||
|
||||
/**
|
||||
* @brief Allocate and initialize error counter specified by name for all gpc
|
||||
* instances.
|
||||
*
|
||||
* @param g [in] The GPU driver struct.
|
||||
* @param stat [out] Pointer to array of tpc error counters.
|
||||
* @param name [in] Unique name for error counter.
|
||||
*
|
||||
* Calculates the total number of gpcs within the gr unit. Then allocates,
|
||||
* initializes memory to hold error counters associated with all gpcs, which is
|
||||
* then added to the stats_list in struct nvgpu_ecc.
|
||||
*
|
||||
* @return 0 in case of success, less than 0 for failure.
|
||||
*/
|
||||
int nvgpu_ecc_counter_init_per_gpc(struct gk20a *g,
|
||||
struct nvgpu_ecc_stat **stat, const char *name);
|
||||
/*
|
||||
* @brief Allocate and initalize counters for memories shared across a GPC.
|
||||
*
|
||||
* @param stat [in] Address of pointer to struct nvgpu_ecc_stat.
|
||||
*
|
||||
*/
|
||||
#define NVGPU_ECC_COUNTER_INIT_PER_GPC(stat) \
|
||||
nvgpu_ecc_counter_init_per_gpc(g, &g->ecc.gr.stat, #stat)
|
||||
|
||||
/**
|
||||
* @brief Allocates, initializes an error counter with specified name.
|
||||
*
|
||||
* @param g [in] The GPU driver struct.
|
||||
* @param stat [out] Pointer to array of tpc error counters.
|
||||
* @param name [in] Unique name for error counter.
|
||||
*
|
||||
* Allocate memory for one error counter, initializes the counter with 0 and the
|
||||
* specified string identifier. Finally the counter is added to the status_list
|
||||
* of struct nvgpu_ecc.
|
||||
*
|
||||
* @return 0 in case of success, less than 0 for failure.
|
||||
*/
|
||||
int nvgpu_ecc_counter_init(struct gk20a *g,
|
||||
struct nvgpu_ecc_stat **stat, const char *name);
|
||||
/*
|
||||
* @brief Allocate and initalize counters for memories shared within GR.
|
||||
*
|
||||
* @param stat [in] Address of pointer to struct nvgpu_ecc_stat.
|
||||
*
|
||||
*/
|
||||
#define NVGPU_ECC_COUNTER_INIT_GR(stat) \
|
||||
nvgpu_ecc_counter_init(g, &g->ecc.gr.stat, #stat)
|
||||
/*
|
||||
* @brief Allocate and initalize counters for memories within FB.
|
||||
*
|
||||
* @param stat [in] Address of pointer to struct nvgpu_ecc_stat.
|
||||
*
|
||||
*/
|
||||
#define NVGPU_ECC_COUNTER_INIT_FB(stat) \
|
||||
nvgpu_ecc_counter_init(g, &g->ecc.fb.stat, #stat)
|
||||
/*
|
||||
* @brief Allocate and initalize counter for memories within PMU.
|
||||
*
|
||||
* @param stat [in] Address of pointer to struct nvgpu_ecc_stat.
|
||||
*
|
||||
*/
|
||||
#define NVGPU_ECC_COUNTER_INIT_PMU(stat) \
|
||||
nvgpu_ecc_counter_init(g, &g->ecc.pmu.stat, #stat)
|
||||
|
||||
/**
|
||||
* @brief Allocate and initialize a error counters for all ltc-lts instances.
|
||||
*
|
||||
* @param g [in] The GPU driver struct.
|
||||
* @param stat [out] Pointer to array of tpc error counters.
|
||||
* @param name [in] Unique name for error counter.
|
||||
*
|
||||
* Calculates the total number of ltc-lts instances, allocates memory for each
|
||||
* instance of error counter, initializes the counter with 0 and the specified
|
||||
* string identifier. Finally the counter is added to the stats_list of
|
||||
* struct nvgpu_ecc.
|
||||
*
|
||||
* @return 0 in case of success, less than 0 for failure.
|
||||
*/
|
||||
int nvgpu_ecc_counter_init_per_lts(struct gk20a *g,
|
||||
struct nvgpu_ecc_stat ***stat, const char *name);
|
||||
/*
|
||||
* @brief Allocate and initalize counters for memories within ltc-lts
|
||||
*
|
||||
* @param stat [in] Address of pointer to struct nvgpu_ecc_stat.
|
||||
*
|
||||
*/
|
||||
#define NVGPU_ECC_COUNTER_INIT_PER_LTS(stat) \
|
||||
nvgpu_ecc_counter_init_per_lts(g, &g->ecc.ltc.stat, #stat)
|
||||
|
||||
/**
|
||||
* @brief Allocate and initialize error counters for all fbpa instances.
|
||||
*
|
||||
* @param g [in] The GPU driver struct.
|
||||
* @param stat [out] Pointer to array of tpc error counters.
|
||||
* @param name [in] Unique name for error counter.
|
||||
*
|
||||
* Calculates the total number of fbpa instances, allocates memory for each
|
||||
* instance of error counter, initializes the counter with 0 and the specified
|
||||
* string identifier. Finally the counter is added to the stats_list of
|
||||
* struct nvgpu_ecc.
|
||||
*
|
||||
* @return 0 in case of success, less than 0 for failure.
|
||||
*/
|
||||
int nvgpu_ecc_counter_init_per_fbpa(struct gk20a *g,
|
||||
struct nvgpu_ecc_stat **stat, const char *name);
|
||||
#define NVGPU_ECC_COUNTER_INIT_PER_FBPA(stat) \
|
||||
nvgpu_ecc_counter_init_per_fbpa(g, &g->ecc.fbpa.stat, #stat)
|
||||
|
||||
/**
|
||||
* @brief Release memory associated with all error counters.
|
||||
*
|
||||
* @param g [in] The GPU driver struct.
|
||||
*
|
||||
* Releases memory associated with all error counters associated with a hardware
|
||||
* unit, this is done for every instance of the hardware unit.
|
||||
*/
|
||||
void nvgpu_ecc_free(struct gk20a *g);
|
||||
|
||||
/**
|
||||
* @brief Allocates and initializes error counters for memories within gpu
|
||||
* hardware units.
|
||||
*
|
||||
* @param g [in] The GPU driver struct.
|
||||
*
|
||||
* @return 0 in case of success, less than 0 for failure.
|
||||
*/
|
||||
int nvgpu_ecc_init_support(struct gk20a *g);
|
||||
|
||||
/**
|
||||
* @brief Destroys, frees up memory allocated to ecc/parity error counters.
|
||||
*
|
||||
* @param g [in] The GPU driver struct.
|
||||
*/
|
||||
void nvgpu_ecc_remove_support(struct gk20a *g);
|
||||
|
||||
/* OSes to implement */
|
||||
|
||||
#ifdef CONFIG_NVGPU_SYSFS
|
||||
int nvgpu_ecc_sysfs_init(struct gk20a *g);
|
||||
void nvgpu_ecc_sysfs_remove(struct gk20a *g);
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
Reference in New Issue
Block a user