gpu: nvgpu: tu10x: Add CE diversity gpu characteristic flag

Tu104 has multiple async-LCEs (3), GRCEs (2) and PCEs (4), so it is
possible to use a different LCE/PCE during redundant execution than
during primary execution. This will allow us to claim very high
coverage for permanent faults.

JIRA NVGPU-4370

Change-Id: Ib39013d8d4f377eb20820db100af57c57592c39d
Signed-off-by: Lakshmanan M <lm@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/2243984
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
Reviewed-by: svc-mobile-misra <svc-mobile-misra@nvidia.com>
Reviewed-by: svc-mobile-cert <svc-mobile-cert@nvidia.com>
Reviewed-by: Antony Clince Alex <aalex@nvidia.com>
Reviewed-by: Shashank Singh <shashsingh@nvidia.com>
GVS: Gerrit_Virtual_Submit
Reviewed-by: Vaibhav Kachore <vkachore@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
Lakshmanan M
2019-11-21 11:00:44 +05:30
committed by Alex Waterman
parent eb4349548d
commit d6a20e31b3
4 changed files with 25 additions and 2 deletions

View File

@@ -1650,6 +1650,22 @@ int tu104_init_hal(struct gk20a *g)
nvgpu_set_enabled(g, NVGPU_SUPPORT_DGPU_PCIE_SCRIPT_EXECUTE, true);
nvgpu_set_enabled(g, NVGPU_FMON_SUPPORT_ENABLE, true);
/*
* Tu104 has multiple async-LCEs (3), GRCEs (2) and PCEs (4).
* The allocation used for the HW structures is deterministic, so an
* LCE/PCE pair is likely to follow the same resource allocation in
* primary and redundant execution modes if the same pair is used for
* both execution modes. All available LCEs and GRCEs should be mapped
* to unique PCEs.
*
* The recommendation is to swap the GRCEs with each other during
* redundant execution. The async-LCEs have their own PCEs,
* so the suggestion is to use a different async-LCE during redundant
* execution. This will allow us to claim very high coverage for
* permanent faults.
*/
nvgpu_set_enabled(g, NVGPU_SUPPORT_COPY_ENGINE_DIVERSITY, true);
/* for now */
gops->clk.support_pmgr_domain = false;
gops->clk.support_lpwr_pg = false;

View File

@@ -243,10 +243,13 @@ struct gk20a;
/** FMON feature Enable */
#define NVGPU_FMON_SUPPORT_ENABLE 83U
/**
 * Copy Engine diversity enable bit.
 *
 * Set when the chip has enough copy engines (LCEs/PCEs) that redundant
 * execution can use a different LCE/PCE than primary execution.
 */
#define NVGPU_SUPPORT_COPY_ENGINE_DIVERSITY 84U
/*
* Must be greater than the largest bit offset in the above list.
*/
#define NVGPU_MAX_ENABLED_BITS 84U
#define NVGPU_MAX_ENABLED_BITS 85U
/**
* @brief Check if the passed flag is enabled.

View File

@@ -248,7 +248,9 @@ static struct nvgpu_flags_mapping flags_mapping[] = {
{NVGPU_GPU_FLAGS_SUPPORT_SET_CTX_MMU_DEBUG_MODE,
NVGPU_SUPPORT_SET_CTX_MMU_DEBUG_MODE},
{NVGPU_GPU_FLAGS_SUPPORT_FAULT_RECOVERY,
NVGPU_SUPPORT_FAULT_RECOVERY}
NVGPU_SUPPORT_FAULT_RECOVERY},
{NVGPU_GPU_FLAGS_SUPPORT_COPY_ENGINE_DIVERSITY,
NVGPU_SUPPORT_COPY_ENGINE_DIVERSITY}
};
static u64 nvgpu_ctrl_ioctl_gpu_characteristics_flags(struct gk20a *g)

View File

@@ -172,6 +172,8 @@ struct nvgpu_gpu_zbc_query_table_args {
#define NVGPU_GPU_FLAGS_SUPPORT_SET_CTX_MMU_DEBUG_MODE (1ULL << 32)
/* Fault recovery is enabled */
#define NVGPU_GPU_FLAGS_SUPPORT_FAULT_RECOVERY (1ULL << 33)
/*
 * Copy Engine diversity is available: a different LCE/PCE can be used
 * during redundant execution than during primary execution.
 */
#define NVGPU_GPU_FLAGS_SUPPORT_COPY_ENGINE_DIVERSITY (1ULL << 34)
/* SM LRF ECC is enabled */
#define NVGPU_GPU_FLAGS_ECC_ENABLED_SM_LRF (1ULL << 60)
/* SM SHM ECC is enabled */