gpu: nvgpu: tu10x: Add CE diversity gpu characteristic flag

Tu104 has multiple async-LCEs (3), GRCEs (2) and PCEs (4), so it is possible to use a different LCE/PCE during redundant execution. This will allow us to claim very high coverage for permanent faults. Set the NVGPU_SUPPORT_COPY_ENGINE_DIVERSITY flag for Tu104 and report it to userspace as the NVGPU_GPU_FLAGS_SUPPORT_COPY_ENGINE_DIVERSITY GPU characteristics flag. JIRA NVGPU-4370 Change-Id: Ib39013d8d4f377eb20820db100af57c57592c39d Signed-off-by: Lakshmanan M <lm@nvidia.com> Reviewed-on: https://git-master.nvidia.com/r/2243984 Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com> Reviewed-by: svc-mobile-misra <svc-mobile-misra@nvidia.com> Reviewed-by: svc-mobile-cert <svc-mobile-cert@nvidia.com> Reviewed-by: Antony Clince Alex <aalex@nvidia.com> Reviewed-by: Shashank Singh <shashsingh@nvidia.com> GVS: Gerrit_Virtual_Submit Reviewed-by: Vaibhav Kachore <vkachore@nvidia.com> Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com> Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
2025-12-22 17:36:20 +03:00 · 2019-11-21 11:00:44 +05:30
parent eb4349548d
commit d6a20e31b3
4 changed files with 25 additions and 2 deletions
--- a/drivers/gpu/nvgpu/hal/init/hal_tu104.c
+++ b/drivers/gpu/nvgpu/hal/init/hal_tu104.c
@@ -1650,6 +1650,22 @@ int tu104_init_hal(struct gk20a *g)
 	nvgpu_set_enabled(g, NVGPU_SUPPORT_DGPU_PCIE_SCRIPT_EXECUTE, true);
 	nvgpu_set_enabled(g, NVGPU_FMON_SUPPORT_ENABLE, true);

+	/*
+	 * Tu104 has multiple async-LCE (3), GRCE (2) and PCE (4).
+	 * The allocation used for the HW structures is deterministic.
+	 * LCE/PCE is likely to follow the same resource allocation in primary
+	 * and redundant execution mode if we use the same LCE/PCE pairs for
+	 * both execution modes. All available LCEs and GRCEs should be mapped
+	 * to unique PCEs.
+	 *
+	 * The recommendation is to swap the GRCEs with each other during
+	 * redundant execution. The async-LCEs have their own PCEs,
+	 * so the suggestion is to use a different async-LCE during redundant
+	 * execution. This will allow us to claim very high coverage for
+	 * permanent faults.
+	 */
+	nvgpu_set_enabled(g, NVGPU_SUPPORT_COPY_ENGINE_DIVERSITY, true);
+
 	/* for now */
 	gops->clk.support_pmgr_domain = false;
 	gops->clk.support_lpwr_pg = false;
--- a/drivers/gpu/nvgpu/include/nvgpu/enabled.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/enabled.h
@@ -243,10 +243,13 @@ struct gk20a;
 /** FMON feature Enable */
 #define NVGPU_FMON_SUPPORT_ENABLE		83U

+/** Copy Engine diversity enable bit */
+#define NVGPU_SUPPORT_COPY_ENGINE_DIVERSITY	84U
+
 /*
 * Must be greater than the largest bit offset in the above list.
 */
-#define NVGPU_MAX_ENABLED_BITS			84U
+#define NVGPU_MAX_ENABLED_BITS			85U

 /**
 * @brief Check if the passed flag is enabled.
--- a/drivers/gpu/nvgpu/os/linux/ioctl_ctrl.c
+++ b/drivers/gpu/nvgpu/os/linux/ioctl_ctrl.c
@@ -248,7 +248,9 @@ static struct nvgpu_flags_mapping flags_mapping[] = {
 	{NVGPU_GPU_FLAGS_SUPPORT_SET_CTX_MMU_DEBUG_MODE,
 		NVGPU_SUPPORT_SET_CTX_MMU_DEBUG_MODE},
 	{NVGPU_GPU_FLAGS_SUPPORT_FAULT_RECOVERY,
-		NVGPU_SUPPORT_FAULT_RECOVERY}
+		NVGPU_SUPPORT_FAULT_RECOVERY},
+	{NVGPU_GPU_FLAGS_SUPPORT_COPY_ENGINE_DIVERSITY,
+		NVGPU_SUPPORT_COPY_ENGINE_DIVERSITY}
 };

 static u64 nvgpu_ctrl_ioctl_gpu_characteristics_flags(struct gk20a *g)
--- a/include/uapi/linux/nvgpu.h
+++ b/include/uapi/linux/nvgpu.h
@@ -172,6 +172,8 @@ struct nvgpu_gpu_zbc_query_table_args {
 #define NVGPU_GPU_FLAGS_SUPPORT_SET_CTX_MMU_DEBUG_MODE	(1ULL << 32)
 /* Fault recovery is enabled */
 #define NVGPU_GPU_FLAGS_SUPPORT_FAULT_RECOVERY		(1ULL << 33)
+/* Copy Engine diversity is available */
+#define NVGPU_GPU_FLAGS_SUPPORT_COPY_ENGINE_DIVERSITY	(1ULL << 34)
 /* SM LRF ECC is enabled */
 #define NVGPU_GPU_FLAGS_ECC_ENABLED_SM_LRF	(1ULL << 60)
 /* SM SHM ECC is enabled */