From a52ee77837a2bc015ab5f52f35073464a0a9c27f Mon Sep 17 00:00:00 2001
From: Lakshmanan M <lm@nvidia.com>
Date: Thu, 28 Nov 2019 15:48:08 +0530
Subject: [PATCH] gpu: nvgpu: Add SM diversity gpu characteristic flag

To achieve permanent fault coverage, the CTAs launched by
each kernel in the mission and redundant contexts must execute
on different hardware resources.
This feature requires a change in software to make it possible
to modify the virtual SM id to TPC mapping across mission and
redundant contexts.

This CL adds only the SM diversity flags, which are exposed to
clients through the ioctl/devctl interfaces.
The actual virtual SM id to TPC mapping implementation
will be part of upcoming patch sets.

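Also add an nvgpu CFLAG, CONFIG_NVGPU_BUILD_CONFIGURATION_IS_SAFETY,
to identify the safety build, and check it in place of
NV_BUILD_CONFIGURATION_IS_SAFETY in nvgpu_init_gpu_characteristics().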

JIRA NVGPU-4133

Change-Id: I5a18256780e6726e399e39c1c8d155d2ef07d7bd
Signed-off-by: Lakshmanan M <lm@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/2250461
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
---
 drivers/gpu/nvgpu/Makefile.shared.configs     |  4 ++-
 drivers/gpu/nvgpu/common/init/nvgpu_init.c    |  2 +-
 drivers/gpu/nvgpu/hal/init/hal_gm20b.c        |  3 ++
 drivers/gpu/nvgpu/hal/init/hal_gp10b.c        |  3 ++
 drivers/gpu/nvgpu/hal/init/hal_gv11b.c        | 32 +++++++++++++++++++
 drivers/gpu/nvgpu/hal/init/hal_tu104.c        | 28 ++++++++++++++++
 .../gpu/nvgpu/hal/vgpu/init/vgpu_hal_gp10b.c  |  3 ++
 .../gpu/nvgpu/hal/vgpu/init/vgpu_hal_gv11b.c  | 31 ++++++++++++++++++
 drivers/gpu/nvgpu/include/nvgpu/enabled.h     |  5 ++-
 drivers/gpu/nvgpu/include/nvgpu/gk20a.h       |  4 +++
 drivers/gpu/nvgpu/include/nvgpu/gr/setup.h    |  9 ++++++
 .../gpu/nvgpu/include/nvgpu/vgpu/tegra_vgpu.h |  2 ++
 drivers/gpu/nvgpu/os/linux/ioctl_channel.c    | 10 ++++++
 drivers/gpu/nvgpu/os/linux/ioctl_ctrl.c       | 15 ++++++++-
 include/uapi/linux/nvgpu.h                    |  9 +++++-
 15 files changed, 155 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/nvgpu/Makefile.shared.configs b/drivers/gpu/nvgpu/Makefile.shared.configs
index 825f99d20..58f9af1d7 100644
--- a/drivers/gpu/nvgpu/Makefile.shared.configs
+++ b/drivers/gpu/nvgpu/Makefile.shared.configs
@@ -67,7 +67,9 @@ NVGPU_COMMON_CFLAGS                  += -DCONFIG_NVGPU_LOGGING
 ifeq ($(profile),$(filter $(profile),safety_debug safety_release))
 
 # Enable golden context verification only for safety debug/release build
-NVGPU_COMMON_CFLAGS             += -DCONFIG_NVGPU_GR_GOLDEN_CTX_VERIFICATION
+NVGPU_COMMON_CFLAGS             +=      \
+	-DCONFIG_NVGPU_GR_GOLDEN_CTX_VERIFICATION     \
+	-DCONFIG_NVGPU_BUILD_CONFIGURATION_IS_SAFETY
 
 endif
 
diff --git a/drivers/gpu/nvgpu/common/init/nvgpu_init.c b/drivers/gpu/nvgpu/common/init/nvgpu_init.c
index f0edc63df..a76eda5f2 100644
--- a/drivers/gpu/nvgpu/common/init/nvgpu_init.c
+++ b/drivers/gpu/nvgpu/common/init/nvgpu_init.c
@@ -690,7 +690,7 @@ int nvgpu_can_busy(struct gk20a *g)
 
 int nvgpu_init_gpu_characteristics(struct gk20a *g)
 {
-#ifdef NV_BUILD_CONFIGURATION_IS_SAFETY
+#ifdef CONFIG_NVGPU_BUILD_CONFIGURATION_IS_SAFETY
 	nvgpu_set_enabled(g, NVGPU_DRIVER_REDUCED_PROFILE, true);
 #endif
 	nvgpu_set_enabled(g, NVGPU_SUPPORT_MAP_DIRECT_KIND_CTRL, true);
diff --git a/drivers/gpu/nvgpu/hal/init/hal_gm20b.c b/drivers/gpu/nvgpu/hal/init/hal_gm20b.c
index ded09ee70..fbd21e618 100644
--- a/drivers/gpu/nvgpu/hal/init/hal_gm20b.c
+++ b/drivers/gpu/nvgpu/hal/init/hal_gm20b.c
@@ -1237,6 +1237,9 @@ int gm20b_init_hal(struct gk20a *g)
 	nvgpu_set_enabled(g, NVGPU_PMU_FECS_BOOTSTRAP_DONE, false);
 	nvgpu_set_enabled(g, NVGPU_SUPPORT_SET_CTX_MMU_DEBUG_MODE, false);
 
+	g->max_sm_diversity_config_count =
+		NVGPU_DEFAULT_SM_DIVERSITY_CONFIG_COUNT;
+
 	g->name = "gm20b";
 
 	return 0;
diff --git a/drivers/gpu/nvgpu/hal/init/hal_gp10b.c b/drivers/gpu/nvgpu/hal/init/hal_gp10b.c
index c4accd862..a2ce803dc 100644
--- a/drivers/gpu/nvgpu/hal/init/hal_gp10b.c
+++ b/drivers/gpu/nvgpu/hal/init/hal_gp10b.c
@@ -1332,6 +1332,9 @@ int gp10b_init_hal(struct gk20a *g)
 	nvgpu_set_enabled(g, NVGPU_PMU_FECS_BOOTSTRAP_DONE, false);
 	nvgpu_set_enabled(g, NVGPU_SUPPORT_SET_CTX_MMU_DEBUG_MODE, false);
 
+	g->max_sm_diversity_config_count =
+		NVGPU_DEFAULT_SM_DIVERSITY_CONFIG_COUNT;
+
 	g->name = "gp10b";
 
 	return 0;
diff --git a/drivers/gpu/nvgpu/hal/init/hal_gv11b.c b/drivers/gpu/nvgpu/hal/init/hal_gv11b.c
index 930da533e..7987ef7bf 100644
--- a/drivers/gpu/nvgpu/hal/init/hal_gv11b.c
+++ b/drivers/gpu/nvgpu/hal/init/hal_gv11b.c
@@ -1551,6 +1551,38 @@ int gv11b_init_hal(struct gk20a *g)
 	 */
 	nvgpu_set_enabled(g, NVGPU_MM_BYPASSES_IOMMU, true);
 
+#ifdef CONFIG_NVGPU_BUILD_CONFIGURATION_IS_SAFETY
+	/*
+	 * To achieve permanent fault coverage, the CTAs launched by each kernel
+	 * in the mission and redundant contexts must execute on different
+	 * hardware resources. This feature proposes modifications in the
+	 * software to modify the virtual SM id to TPC mapping across the
+	 * mission and redundant contexts.
+	 *
+	 * The virtual SM identifier to TPC mapping is done by the nvgpu
+	 * when setting up the golden context. Once the table with this mapping
+	 * is initialized, it is used by all subsequent contexts that are
+	 * created. The proposal is for setting up the virtual SM identifier
+	 * to TPC mapping on a per-context basis and initializing this
+	 * virtual SM identifier to TPC mapping differently for the mission and
+	 * redundant contexts.
+	 *
+	 * The recommendation for the redundant setting is to offset the
+	 * assignment by 1 (TPC). This will ensure both GPC and TPC diversity.
+	 * The SM and Quadrant diversity will happen naturally.
+	 *
+	 * For kernels with few CTAs, the diversity is guaranteed to be 100%.
+	 * In case of completely random CTA allocation, e.g. large number of
+	 * CTAs in the waiting queue, the diversity is 1 - 1/#SM,
+	 * or 87.5% for GV11B.
+	 */
+	nvgpu_set_enabled(g, NVGPU_SUPPORT_SM_DIVERSITY, true);
+	g->max_sm_diversity_config_count =
+		NVGPU_MAX_SM_DIVERSITY_CONFIG_COUNT;
+#else /* Non-safety builds expose only the default (mission) config. */
+	g->max_sm_diversity_config_count =
+		NVGPU_DEFAULT_SM_DIVERSITY_CONFIG_COUNT;
+#endif /* CONFIG_NVGPU_BUILD_CONFIGURATION_IS_SAFETY */
 	g->name = "gv11b";
 
 	return 0;
diff --git a/drivers/gpu/nvgpu/hal/init/hal_tu104.c b/drivers/gpu/nvgpu/hal/init/hal_tu104.c
index 230ac7b78..7f0a2ed22 100644
--- a/drivers/gpu/nvgpu/hal/init/hal_tu104.c
+++ b/drivers/gpu/nvgpu/hal/init/hal_tu104.c
@@ -1677,6 +1677,34 @@ int tu104_init_hal(struct gk20a *g)
 	 */
 	nvgpu_set_enabled(g, NVGPU_SUPPORT_COPY_ENGINE_DIVERSITY, true);
 
+	/*
+	 * To achieve permanent fault coverage, the CTAs launched by each kernel
+	 * in the mission and redundant contexts must execute on different
+	 * hardware resources. This feature proposes modifications in the
+	 * software to modify the virtual SM id to TPC mapping across the
+	 * mission and redundant contexts.
+	 *
+	 * The virtual SM identifier to TPC mapping is done by the nvgpu
+	 * when setting up the golden context. Once the table with this mapping
+	 * is initialized, it is used by all subsequent contexts that are
+	 * created. The proposal is for setting up the virtual SM identifier
+	 * to TPC mapping on a per-context basis and initializing this
+	 * virtual SM identifier to TPC mapping differently for the mission and
+	 * redundant contexts.
+	 *
+	 * The recommendation for the redundant setting is to offset the
+	 * assignment by 1 (TPC). This will ensure both GPC and TPC diversity.
+	 * The SM and Quadrant diversity will happen naturally.
+	 *
+	 * For kernels with few CTAs, the diversity is guaranteed to be 100%.
+	 * In case of completely random CTA allocation, e.g. large number of
+	 * CTAs in the waiting queue, the diversity is 1 - 1/#SM,
+	 * or 97.9% for TU104.
+	 */
+	nvgpu_set_enabled(g, NVGPU_SUPPORT_SM_DIVERSITY, true);
+	g->max_sm_diversity_config_count =
+		NVGPU_MAX_SM_DIVERSITY_CONFIG_COUNT;
+
 	/* for now */
 	gops->clk.support_pmgr_domain = false;
 	gops->clk.support_lpwr_pg = false;
diff --git a/drivers/gpu/nvgpu/hal/vgpu/init/vgpu_hal_gp10b.c b/drivers/gpu/nvgpu/hal/vgpu/init/vgpu_hal_gp10b.c
index 1692d9b6c..dabb0c2ec 100644
--- a/drivers/gpu/nvgpu/hal/vgpu/init/vgpu_hal_gp10b.c
+++ b/drivers/gpu/nvgpu/hal/vgpu/init/vgpu_hal_gp10b.c
@@ -932,6 +932,9 @@ int vgpu_gp10b_init_hal(struct gk20a *g)
 		gops->clk_arb.get_arbiter_clk_domains = NULL;
 	}
 
+	g->max_sm_diversity_config_count =
+		priv->constants.max_sm_diversity_config_count;
+
 	g->name = "gp10b";
 
 	return 0;
diff --git a/drivers/gpu/nvgpu/hal/vgpu/init/vgpu_hal_gv11b.c b/drivers/gpu/nvgpu/hal/vgpu/init/vgpu_hal_gv11b.c
index fb86faeff..b298499b7 100644
--- a/drivers/gpu/nvgpu/hal/vgpu/init/vgpu_hal_gv11b.c
+++ b/drivers/gpu/nvgpu/hal/vgpu/init/vgpu_hal_gv11b.c
@@ -1049,6 +1049,37 @@ int vgpu_gv11b_init_hal(struct gk20a *g)
 		gops->clk_arb.get_arbiter_clk_domains = NULL;
 	}
 
+	/*
+	 * To achieve permanent fault coverage, the CTAs launched by each kernel
+	 * in the mission and redundant contexts must execute on different
+	 * hardware resources. This feature proposes modifications in the
+	 * software to modify the virtual SM id to TPC mapping across the
+	 * mission and redundant contexts.
+	 *
+	 * The virtual SM identifier to TPC mapping is done by the nvgpu
+	 * when setting up the golden context. Once the table with this mapping
+	 * is initialized, it is used by all subsequent contexts created.
+	 * The proposal is for setting up the virtual SM identifier to TPC
+	 * mapping on a per-context basis and initializing this virtual SM
+	 * identifier to TPC mapping differently for the mission and
+	 * redundant contexts.
+	 *
+	 * The recommendation for the redundant setting is to offset the
+	 * assignment by 1 (TPC). This will ensure both GPC and TPC diversity.
+	 * The SM and Quadrant diversity will happen naturally.
+	 *
+	 * For kernels with few CTAs, the diversity is guaranteed to be 100%.
+	 * In case of completely random CTA allocation, e.g. large number of
+	 * CTAs in the waiting queue, the diversity is 1 - 1/#SM,
+	 * or 87.5% for GV11B.
+	 */
+	if (priv->constants.max_sm_diversity_config_count > 1U) {
+		nvgpu_set_enabled(g, NVGPU_SUPPORT_SM_DIVERSITY, true);
+	}
+
+	g->max_sm_diversity_config_count =
+		priv->constants.max_sm_diversity_config_count;
+
 	g->name = "gv11b";
 
 	return 0;
diff --git a/drivers/gpu/nvgpu/include/nvgpu/enabled.h b/drivers/gpu/nvgpu/include/nvgpu/enabled.h
index 89354a263..184980619 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/enabled.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/enabled.h
@@ -246,10 +246,13 @@ struct gk20a;
 /** Copy Engine diversity enable bit */
 #define NVGPU_SUPPORT_COPY_ENGINE_DIVERSITY	84U
 
+/** SM diversity enable bit */
+#define NVGPU_SUPPORT_SM_DIVERSITY		85U
+
 /*
  * Must be greater than the largest bit offset in the above list.
  */
-#define NVGPU_MAX_ENABLED_BITS			85U
+#define NVGPU_MAX_ENABLED_BITS			86U
 
 /**
  * @brief Check if the passed flag is enabled.
diff --git a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h
index 89d2ab2a2..f418b6a84 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h
@@ -998,6 +998,10 @@ struct gk20a {
 	/** @endcond */
 
 	u16 dgpu_max_clk;
+
+	/** Max SM diversity configuration count. */
+	u32 max_sm_diversity_config_count;
+
 };
 
 /**
diff --git a/drivers/gpu/nvgpu/include/nvgpu/gr/setup.h b/drivers/gpu/nvgpu/include/nvgpu/gr/setup.h
index bc8fea47e..42f609a33 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/gr/setup.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/gr/setup.h
@@ -34,6 +34,15 @@ struct nvgpu_channel;
 struct vm_gk20a;
 struct nvgpu_gr_ctx;
 
+/** Supports only mission (default) context. */
+#define NVGPU_DEFAULT_SM_DIVERSITY_CONFIG_COUNT 1U
+
+/** Max SM diversity configuration count.
+ * Offset 0 for mission (default) context.
+ * Offset 1 for redundant context.
+ */
+#define NVGPU_MAX_SM_DIVERSITY_CONFIG_COUNT 2U
+
 /**
  * @brief Allocate and setup object context s/w image for GPU channel.
  *
diff --git a/drivers/gpu/nvgpu/include/nvgpu/vgpu/tegra_vgpu.h b/drivers/gpu/nvgpu/include/nvgpu/vgpu/tegra_vgpu.h
index 40bbc7118..5831db5f7 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/vgpu/tegra_vgpu.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/vgpu/tegra_vgpu.h
@@ -535,6 +535,8 @@ struct tegra_vgpu_constants_params {
 	u32 sm_per_tpc;
 	u32 max_subctx_count;
 	u32 l2_en_mask[TEGRA_VGPU_L2_EN_MASK];
+	/** Max SM diversity configuration count. */
+	u32 max_sm_diversity_config_count;
 };
 
 enum {
diff --git a/drivers/gpu/nvgpu/os/linux/ioctl_channel.c b/drivers/gpu/nvgpu/os/linux/ioctl_channel.c
index d5d52b2f1..c23939e58 100644
--- a/drivers/gpu/nvgpu/os/linux/ioctl_channel.c
+++ b/drivers/gpu/nvgpu/os/linux/ioctl_channel.c
@@ -1133,6 +1133,16 @@ long gk20a_channel_ioctl(struct file *filp,
 		struct nvgpu_alloc_obj_ctx_args *args =
 				(struct nvgpu_alloc_obj_ctx_args *)buf;
 
+		if (nvgpu_is_enabled(ch->g, NVGPU_SUPPORT_SM_DIVERSITY)) {
+			if (args->sm_diversity_config >=
+				ch->g->max_sm_diversity_config_count) {
+				err = -EINVAL;
+				break;
+			}
+		} else {
+			args->sm_diversity_config = 0U;
+		}
+
 		err = gk20a_busy(ch->g);
 		if (err) {
 			dev_err(dev,
diff --git a/drivers/gpu/nvgpu/os/linux/ioctl_ctrl.c b/drivers/gpu/nvgpu/os/linux/ioctl_ctrl.c
index 39b1db164..ef5b4899d 100644
--- a/drivers/gpu/nvgpu/os/linux/ioctl_ctrl.c
+++ b/drivers/gpu/nvgpu/os/linux/ioctl_ctrl.c
@@ -250,7 +250,9 @@ static struct nvgpu_flags_mapping flags_mapping[] = {
 	{NVGPU_GPU_FLAGS_SUPPORT_FAULT_RECOVERY,
 		NVGPU_SUPPORT_FAULT_RECOVERY},
 	{NVGPU_GPU_FLAGS_SUPPORT_COPY_ENGINE_DIVERSITY,
-		NVGPU_SUPPORT_COPY_ENGINE_DIVERSITY}
+		NVGPU_SUPPORT_COPY_ENGINE_DIVERSITY},
+	{NVGPU_GPU_FLAGS_SUPPORT_SM_DIVERSITY,
+		NVGPU_SUPPORT_SM_DIVERSITY}
 };
 
 static u64 nvgpu_ctrl_ioctl_gpu_characteristics_flags(struct gk20a *g)
@@ -321,6 +323,8 @@ gk20a_ctrl_ioctl_gpu_characteristics(
 
 	gpu.num_tpc_per_gpc = nvgpu_gr_config_get_max_tpc_per_gpc_count(gr_config);
 
+	gpu.max_sm_diversity_config_count = g->max_sm_diversity_config_count;
+
 	gpu.bus_type = NVGPU_GPU_BUS_TYPE_AXI; /* always AXI for now */
 
 	gpu.compression_page_size = g->ops.fb.compression_page_size(g);
@@ -849,6 +853,15 @@ static int gk20a_ctrl_vsm_mapping(struct gk20a *g,
 	struct nvgpu_gr_config *gr_config = nvgpu_gr_get_config_ptr(g);
 	u32 i;
 
+	if (nvgpu_is_enabled(g, NVGPU_SUPPORT_SM_DIVERSITY)) {
+		if (args->sm_diversity_config >=
+			g->max_sm_diversity_config_count) {
+			return -EINVAL;
+		}
+	} else {
+		args->sm_diversity_config = 0U;
+	}
+
 	vsms_buf = nvgpu_kzalloc(g, write_size);
 	if (vsms_buf == NULL)
 		return -ENOMEM;
diff --git a/include/uapi/linux/nvgpu.h b/include/uapi/linux/nvgpu.h
index 400011d31..0b84423f4 100644
--- a/include/uapi/linux/nvgpu.h
+++ b/include/uapi/linux/nvgpu.h
@@ -174,6 +174,8 @@ struct nvgpu_gpu_zbc_query_table_args {
 #define NVGPU_GPU_FLAGS_SUPPORT_FAULT_RECOVERY		(1ULL << 33)
 /* Copy Engine diversity is available */
 #define NVGPU_GPU_FLAGS_SUPPORT_COPY_ENGINE_DIVERSITY	(1ULL << 34)
+/** SM diversity is available. */
+#define NVGPU_GPU_FLAGS_SUPPORT_SM_DIVERSITY		(1ULL << 35)
 /* SM LRF ECC is enabled */
 #define NVGPU_GPU_FLAGS_ECC_ENABLED_SM_LRF	(1ULL << 60)
 /* SM SHM ECC is enabled */
@@ -292,7 +294,8 @@ struct nvgpu_gpu_characteristics {
 	__u8 reserved2[6];
 
 	__u32 max_ctxsw_ring_buffer_size;
-	__u32 reserved3;
+	/* Max SM diversity configuration count. */
+	__u32 max_sm_diversity_config_count;
 
 	/* Notes:
 	   - This struct can be safely appended with new fields. However, always
@@ -457,6 +460,8 @@ struct nvgpu_gpu_vsms_mapping_entry {
 };
 
 struct nvgpu_gpu_vsms_mapping {
+	__u32 sm_diversity_config;
+	__u32 reserved;
 	__u64 vsms_map_buf_addr;
 };
 
@@ -1519,6 +1524,8 @@ struct nvgpu_set_nvmap_fd_args {
 struct nvgpu_alloc_obj_ctx_args {
 	__u32 class_num; /* kepler3d, 2d, compute, etc       */
 	__u32 flags;     /* input, output */
+	__u32 sm_diversity_config; /* input */
+	__u32 reserved;
 	__u64 obj_id;    /* output, used to free later       */
 };