From d6a20e31b3fc3070f6ad4f514533d760daa4b674 Mon Sep 17 00:00:00 2001 From: Lakshmanan M Date: Thu, 21 Nov 2019 11:00:44 +0530 Subject: [PATCH] gpu: nvgpu: tu10x: Add CE diversity gpu characteristic flag Tu104 has multiple async-LCE (3), GRCE (2) and PCE (4). So it is possible to use a different LCE/PCE during redundant execution. This will allow us to claim very high coverage for permanent fault. JIRA NVGPU-4370 Change-Id: Ib39013d8d4f377eb20820db100af57c57592c39d Signed-off-by: Lakshmanan M Reviewed-on: https://git-master.nvidia.com/r/2243984 Reviewed-by: svc-mobile-coverity Reviewed-by: svc-mobile-misra Reviewed-by: svc-mobile-cert Reviewed-by: Antony Clince Alex Reviewed-by: Shashank Singh GVS: Gerrit_Virtual_Submit Reviewed-by: Vaibhav Kachore Reviewed-by: mobile promotions Tested-by: mobile promotions --- drivers/gpu/nvgpu/hal/init/hal_tu104.c | 16 ++++++++++++++++ drivers/gpu/nvgpu/include/nvgpu/enabled.h | 5 ++++- drivers/gpu/nvgpu/os/linux/ioctl_ctrl.c | 4 +++- include/uapi/linux/nvgpu.h | 2 ++ 4 files changed, 25 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/nvgpu/hal/init/hal_tu104.c b/drivers/gpu/nvgpu/hal/init/hal_tu104.c index a8aa52f91..a4e50e2b6 100644 --- a/drivers/gpu/nvgpu/hal/init/hal_tu104.c +++ b/drivers/gpu/nvgpu/hal/init/hal_tu104.c @@ -1650,6 +1650,22 @@ int tu104_init_hal(struct gk20a *g) nvgpu_set_enabled(g, NVGPU_SUPPORT_DGPU_PCIE_SCRIPT_EXECUTE, true); nvgpu_set_enabled(g, NVGPU_FMON_SUPPORT_ENABLE, true); + /* + * Tu104 has multiple async-LCE (3), GRCE (2) and PCE (4). + * The allocation used for the HW structures is deterministic. + * LCE/PCE is likely to follow the same resource allocation in primary + * and redundant execution mode if we use the same LCE/PCE pairs for + * both execution modes. All available LCEs and GRCEs should be mapped + * to unique PCEs. + * + * The recommendation is to swap the GRCEs with each other during + * redundant execution. The async-LCEs have their own PCEs, + * so the suggestion is to use a different async-LCE during redundant + * execution. This will allow us to claim very high coverage for + * permanent fault. + */ + nvgpu_set_enabled(g, NVGPU_SUPPORT_COPY_ENGINE_DIVERSITY, true); + /* for now */ gops->clk.support_pmgr_domain = false; gops->clk.support_lpwr_pg = false; diff --git a/drivers/gpu/nvgpu/include/nvgpu/enabled.h b/drivers/gpu/nvgpu/include/nvgpu/enabled.h index 476e7355e..89354a263 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/enabled.h +++ b/drivers/gpu/nvgpu/include/nvgpu/enabled.h @@ -243,10 +243,13 @@ struct gk20a; /** FMON feature Enable */ #define NVGPU_FMON_SUPPORT_ENABLE 83U +/** Copy Engine diversity enable bit */ +#define NVGPU_SUPPORT_COPY_ENGINE_DIVERSITY 84U + /* * Must be greater than the largest bit offset in the above list. */ -#define NVGPU_MAX_ENABLED_BITS 84U +#define NVGPU_MAX_ENABLED_BITS 85U /** * @brief Check if the passed flag is enabled. diff --git a/drivers/gpu/nvgpu/os/linux/ioctl_ctrl.c b/drivers/gpu/nvgpu/os/linux/ioctl_ctrl.c index ed82bdebb..39b1db164 100644 --- a/drivers/gpu/nvgpu/os/linux/ioctl_ctrl.c +++ b/drivers/gpu/nvgpu/os/linux/ioctl_ctrl.c @@ -248,7 +248,9 @@ static struct nvgpu_flags_mapping flags_mapping[] = { {NVGPU_GPU_FLAGS_SUPPORT_SET_CTX_MMU_DEBUG_MODE, NVGPU_SUPPORT_SET_CTX_MMU_DEBUG_MODE}, {NVGPU_GPU_FLAGS_SUPPORT_FAULT_RECOVERY, - NVGPU_SUPPORT_FAULT_RECOVERY} + NVGPU_SUPPORT_FAULT_RECOVERY}, + {NVGPU_GPU_FLAGS_SUPPORT_COPY_ENGINE_DIVERSITY, + NVGPU_SUPPORT_COPY_ENGINE_DIVERSITY} }; static u64 nvgpu_ctrl_ioctl_gpu_characteristics_flags(struct gk20a *g) diff --git a/include/uapi/linux/nvgpu.h b/include/uapi/linux/nvgpu.h index b72a9a0e7..400011d31 100644 --- a/include/uapi/linux/nvgpu.h +++ b/include/uapi/linux/nvgpu.h @@ -172,6 +172,8 @@ struct nvgpu_gpu_zbc_query_table_args { #define NVGPU_GPU_FLAGS_SUPPORT_SET_CTX_MMU_DEBUG_MODE (1ULL << 32) /* Fault recovery is enabled */ #define NVGPU_GPU_FLAGS_SUPPORT_FAULT_RECOVERY (1ULL << 33) +/* Copy Engine diversity is available */ +#define NVGPU_GPU_FLAGS_SUPPORT_COPY_ENGINE_DIVERSITY (1ULL << 34) /* SM LRF ECC is enabled */ #define NVGPU_GPU_FLAGS_ECC_ENABLED_SM_LRF (1ULL << 60) /* SM SHM ECC is enabled */