diff --git a/drivers/gpu/nvgpu/Makefile.shared.configs b/drivers/gpu/nvgpu/Makefile.shared.configs
index 825f99d20..58f9af1d7 100644
--- a/drivers/gpu/nvgpu/Makefile.shared.configs
+++ b/drivers/gpu/nvgpu/Makefile.shared.configs
@@ -67,7 +67,9 @@
 NVGPU_COMMON_CFLAGS += -DCONFIG_NVGPU_LOGGING
 
 ifeq ($(profile),$(filter $(profile),safety_debug safety_release))
 # Enable golden context verification only for safety debug/release build
-NVGPU_COMMON_CFLAGS += -DCONFIG_NVGPU_GR_GOLDEN_CTX_VERIFICATION
+NVGPU_COMMON_CFLAGS += \
+	-DCONFIG_NVGPU_GR_GOLDEN_CTX_VERIFICATION \
+	-DCONFIG_NVGPU_BUILD_CONFIGURATION_IS_SAFETY
 endif
 
diff --git a/drivers/gpu/nvgpu/common/init/nvgpu_init.c b/drivers/gpu/nvgpu/common/init/nvgpu_init.c
index f0edc63df..a76eda5f2 100644
--- a/drivers/gpu/nvgpu/common/init/nvgpu_init.c
+++ b/drivers/gpu/nvgpu/common/init/nvgpu_init.c
@@ -690,7 +690,7 @@ int nvgpu_can_busy(struct gk20a *g)
 
 int nvgpu_init_gpu_characteristics(struct gk20a *g)
 {
-#ifdef NV_BUILD_CONFIGURATION_IS_SAFETY
+#ifdef CONFIG_NVGPU_BUILD_CONFIGURATION_IS_SAFETY
 	nvgpu_set_enabled(g, NVGPU_DRIVER_REDUCED_PROFILE, true);
 #endif
 	nvgpu_set_enabled(g, NVGPU_SUPPORT_MAP_DIRECT_KIND_CTRL, true);
diff --git a/drivers/gpu/nvgpu/hal/init/hal_gm20b.c b/drivers/gpu/nvgpu/hal/init/hal_gm20b.c
index ded09ee70..fbd21e618 100644
--- a/drivers/gpu/nvgpu/hal/init/hal_gm20b.c
+++ b/drivers/gpu/nvgpu/hal/init/hal_gm20b.c
@@ -1237,6 +1237,9 @@ int gm20b_init_hal(struct gk20a *g)
 	nvgpu_set_enabled(g, NVGPU_PMU_FECS_BOOTSTRAP_DONE, false);
 	nvgpu_set_enabled(g, NVGPU_SUPPORT_SET_CTX_MMU_DEBUG_MODE, false);
 
+	g->max_sm_diversity_config_count =
+		NVGPU_DEFAULT_SM_DIVERSITY_CONFIG_COUNT;
+
 	g->name = "gm20b";
 
 	return 0;
diff --git a/drivers/gpu/nvgpu/hal/init/hal_gp10b.c b/drivers/gpu/nvgpu/hal/init/hal_gp10b.c
index c4accd862..a2ce803dc 100644
--- a/drivers/gpu/nvgpu/hal/init/hal_gp10b.c
+++ b/drivers/gpu/nvgpu/hal/init/hal_gp10b.c
@@ -1332,6 +1332,9 @@ int gp10b_init_hal(struct gk20a *g)
 	nvgpu_set_enabled(g, NVGPU_PMU_FECS_BOOTSTRAP_DONE, false);
 	nvgpu_set_enabled(g, NVGPU_SUPPORT_SET_CTX_MMU_DEBUG_MODE, false);
 
+	g->max_sm_diversity_config_count =
+		NVGPU_DEFAULT_SM_DIVERSITY_CONFIG_COUNT;
+
 	g->name = "gp10b";
 
 	return 0;
diff --git a/drivers/gpu/nvgpu/hal/init/hal_gv11b.c b/drivers/gpu/nvgpu/hal/init/hal_gv11b.c
index 930da533e..7987ef7bf 100644
--- a/drivers/gpu/nvgpu/hal/init/hal_gv11b.c
+++ b/drivers/gpu/nvgpu/hal/init/hal_gv11b.c
@@ -1551,6 +1551,38 @@ int gv11b_init_hal(struct gk20a *g)
 	 */
 	nvgpu_set_enabled(g, NVGPU_MM_BYPASSES_IOMMU, true);
 
+#ifndef CONFIG_NVGPU_BUILD_CONFIGURATION_IS_SAFETY
+	/*
+	 * To achieve permanent fault coverage, the CTAs launched by each
+	 * kernel in the mission and redundant contexts must execute on
+	 * different hardware resources. This feature changes the software
+	 * so that the virtual SM id to TPC mapping differs between the
+	 * mission and redundant contexts.
+	 *
+	 * The virtual SM identifier to TPC mapping is set up by nvgpu
+	 * when the golden context is created. Once the table with this
+	 * mapping is initialized, it is used by all subsequently created
+	 * contexts. With this change, the virtual SM identifier to TPC
+	 * mapping is instead set up on a per-context basis and is
+	 * initialized differently for the mission and the redundant
+	 * contexts.
+	 *
+	 * The recommendation for the redundant setting is to offset the
+	 * assignment by 1 (TPC). This will ensure both GPC and TPC diversity.
+	 * The SM and Quadrant diversity will happen naturally.
+	 *
+	 * For kernels with few CTAs, the diversity is guaranteed to be 100%.
+	 * In the case of completely random CTA allocation, e.g. a large
+	 * number of CTAs in the waiting queue, the diversity is 1 - 1/#SM,
+	 * or 87.5% for GV11B.
+	 */
+	nvgpu_set_enabled(g, NVGPU_SUPPORT_SM_DIVERSITY, true);
+	g->max_sm_diversity_config_count =
+		NVGPU_MAX_SM_DIVERSITY_CONFIG_COUNT;
+#else
+	g->max_sm_diversity_config_count =
+		NVGPU_DEFAULT_SM_DIVERSITY_CONFIG_COUNT;
+#endif
 	g->name = "gv11b";
 
 	return 0;
diff --git a/drivers/gpu/nvgpu/hal/init/hal_tu104.c b/drivers/gpu/nvgpu/hal/init/hal_tu104.c
index 230ac7b78..7f0a2ed22 100644
--- a/drivers/gpu/nvgpu/hal/init/hal_tu104.c
+++ b/drivers/gpu/nvgpu/hal/init/hal_tu104.c
@@ -1677,6 +1677,34 @@ int tu104_init_hal(struct gk20a *g)
 	 */
 	nvgpu_set_enabled(g, NVGPU_SUPPORT_COPY_ENGINE_DIVERSITY, true);
 
+	/*
+	 * To achieve permanent fault coverage, the CTAs launched by each
+	 * kernel in the mission and redundant contexts must execute on
+	 * different hardware resources. This feature changes the software
+	 * so that the virtual SM id to TPC mapping differs between the
+	 * mission and redundant contexts.
+	 *
+	 * The virtual SM identifier to TPC mapping is set up by nvgpu
+	 * when the golden context is created. Once the table with this
+	 * mapping is initialized, it is used by all subsequently created
+	 * contexts. With this change, the virtual SM identifier to TPC
+	 * mapping is instead set up on a per-context basis and is
+	 * initialized differently for the mission and the redundant
+	 * contexts.
+	 *
+	 * The recommendation for the redundant setting is to offset the
+	 * assignment by 1 (TPC). This will ensure both GPC and TPC diversity.
+	 * The SM and Quadrant diversity will happen naturally.
+	 *
+	 * For kernels with few CTAs, the diversity is guaranteed to be 100%.
+	 * In the case of completely random CTA allocation, e.g. a large
+	 * number of CTAs in the waiting queue, the diversity is 1 - 1/#SM,
+	 * or 97.9% for TU104.
+	 */
+	nvgpu_set_enabled(g, NVGPU_SUPPORT_SM_DIVERSITY, true);
+	g->max_sm_diversity_config_count =
+		NVGPU_MAX_SM_DIVERSITY_CONFIG_COUNT;
+
 	/* for now */
 	gops->clk.support_pmgr_domain = false;
 	gops->clk.support_lpwr_pg = false;
diff --git a/drivers/gpu/nvgpu/hal/vgpu/init/vgpu_hal_gp10b.c b/drivers/gpu/nvgpu/hal/vgpu/init/vgpu_hal_gp10b.c
index 1692d9b6c..dabb0c2ec 100644
--- a/drivers/gpu/nvgpu/hal/vgpu/init/vgpu_hal_gp10b.c
+++ b/drivers/gpu/nvgpu/hal/vgpu/init/vgpu_hal_gp10b.c
@@ -932,6 +932,9 @@ int vgpu_gp10b_init_hal(struct gk20a *g)
 		gops->clk_arb.get_arbiter_clk_domains = NULL;
 	}
 
+	g->max_sm_diversity_config_count =
+		priv->constants.max_sm_diversity_config_count;
+
 	g->name = "gp10b";
 
 	return 0;
diff --git a/drivers/gpu/nvgpu/hal/vgpu/init/vgpu_hal_gv11b.c b/drivers/gpu/nvgpu/hal/vgpu/init/vgpu_hal_gv11b.c
index fb86faeff..b298499b7 100644
--- a/drivers/gpu/nvgpu/hal/vgpu/init/vgpu_hal_gv11b.c
+++ b/drivers/gpu/nvgpu/hal/vgpu/init/vgpu_hal_gv11b.c
@@ -1049,6 +1049,37 @@ int vgpu_gv11b_init_hal(struct gk20a *g)
 		gops->clk_arb.get_arbiter_clk_domains = NULL;
 	}
 
+	/*
+	 * To achieve permanent fault coverage, the CTAs launched by each
+	 * kernel in the mission and redundant contexts must execute on
+	 * different hardware resources. This feature changes the software
+	 * so that the virtual SM id to TPC mapping differs between the
+	 * mission and redundant contexts.
+	 *
+	 * The virtual SM identifier to TPC mapping is set up by nvgpu
+	 * when the golden context is created. Once the table with this
+	 * mapping is initialized, it is used by all subsequently created
+	 * contexts. With this change, the virtual SM identifier to TPC
+	 * mapping is instead set up on a per-context basis and is
+	 * initialized differently for the mission and the redundant
+	 * contexts.
+	 *
+	 * The recommendation for the redundant setting is to offset the
+	 * assignment by 1 (TPC). This will ensure both GPC and TPC diversity.
+	 * The SM and Quadrant diversity will happen naturally.
+	 *
+	 * For kernels with few CTAs, the diversity is guaranteed to be 100%.
+	 * In the case of completely random CTA allocation, e.g. a large
+	 * number of CTAs in the waiting queue, the diversity is 1 - 1/#SM,
+	 * or 87.5% for GV11B.
+	 */
+	if (priv->constants.max_sm_diversity_config_count > 1U) {
+		nvgpu_set_enabled(g, NVGPU_SUPPORT_SM_DIVERSITY, true);
+	}
+
+	g->max_sm_diversity_config_count =
+		priv->constants.max_sm_diversity_config_count;
+
 	g->name = "gv11b";
 
 	return 0;
diff --git a/drivers/gpu/nvgpu/include/nvgpu/enabled.h b/drivers/gpu/nvgpu/include/nvgpu/enabled.h
index 89354a263..184980619 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/enabled.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/enabled.h
@@ -246,10 +246,13 @@ struct gk20a;
 /** Copy Engine diversity enable bit */
 #define NVGPU_SUPPORT_COPY_ENGINE_DIVERSITY 84U
 
+/** SM diversity enable bit */
+#define NVGPU_SUPPORT_SM_DIVERSITY 85U
+
 /*
  * Must be greater than the largest bit offset in the above list.
  */
-#define NVGPU_MAX_ENABLED_BITS 85U
+#define NVGPU_MAX_ENABLED_BITS 86U
 
 /**
  * @brief Check if the passed flag is enabled.
diff --git a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h
index 89d2ab2a2..f418b6a84 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h
@@ -998,6 +998,10 @@ struct gk20a {
 	/** @endcond */
 
 	u16 dgpu_max_clk;
+
+	/** Max SM diversity configuration count. */
+	u32 max_sm_diversity_config_count;
+
 };
 
 /**
diff --git a/drivers/gpu/nvgpu/include/nvgpu/gr/setup.h b/drivers/gpu/nvgpu/include/nvgpu/gr/setup.h
index bc8fea47e..42f609a33 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/gr/setup.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/gr/setup.h
@@ -34,6 +34,15 @@ struct nvgpu_channel;
 struct vm_gk20a;
 struct nvgpu_gr_ctx;
 
+/** Supports only mission (default) context. */
+#define NVGPU_DEFAULT_SM_DIVERSITY_CONFIG_COUNT 1U
+
+/** Max SM diversity configuration count.
+ * Offset 0 for mission (default) context.
+ * Offset 1 for redundant context.
+ */
+#define NVGPU_MAX_SM_DIVERSITY_CONFIG_COUNT 2U
+
 /**
  * @brief Allocate and setup object context s/w image for GPU channel.
  *
diff --git a/drivers/gpu/nvgpu/include/nvgpu/vgpu/tegra_vgpu.h b/drivers/gpu/nvgpu/include/nvgpu/vgpu/tegra_vgpu.h
index 40bbc7118..5831db5f7 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/vgpu/tegra_vgpu.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/vgpu/tegra_vgpu.h
@@ -535,6 +535,8 @@ struct tegra_vgpu_constants_params {
 	u32 sm_per_tpc;
 	u32 max_subctx_count;
 	u32 l2_en_mask[TEGRA_VGPU_L2_EN_MASK];
+	/** Max SM diversity configuration count. */
+	u32 max_sm_diversity_config_count;
 };
 
 enum {
diff --git a/drivers/gpu/nvgpu/os/linux/ioctl_channel.c b/drivers/gpu/nvgpu/os/linux/ioctl_channel.c
index d5d52b2f1..c23939e58 100644
--- a/drivers/gpu/nvgpu/os/linux/ioctl_channel.c
+++ b/drivers/gpu/nvgpu/os/linux/ioctl_channel.c
@@ -1133,6 +1133,16 @@ long gk20a_channel_ioctl(struct file *filp,
 		struct nvgpu_alloc_obj_ctx_args *args =
 				(struct nvgpu_alloc_obj_ctx_args *)buf;
 
+		if (nvgpu_is_enabled(ch->g, NVGPU_SUPPORT_SM_DIVERSITY)) {
+			if (args->sm_diversity_config >=
+				ch->g->max_sm_diversity_config_count) {
+				err = -EINVAL;
+				break;
+			}
+		} else {
+			args->sm_diversity_config = 0U;
+		}
+
 		err = gk20a_busy(ch->g);
 		if (err) {
 			dev_err(dev,
diff --git a/drivers/gpu/nvgpu/os/linux/ioctl_ctrl.c b/drivers/gpu/nvgpu/os/linux/ioctl_ctrl.c
index 39b1db164..ef5b4899d 100644
--- a/drivers/gpu/nvgpu/os/linux/ioctl_ctrl.c
+++ b/drivers/gpu/nvgpu/os/linux/ioctl_ctrl.c
@@ -250,7 +250,9 @@ static struct nvgpu_flags_mapping flags_mapping[] = {
 	{NVGPU_GPU_FLAGS_SUPPORT_FAULT_RECOVERY,
 		NVGPU_SUPPORT_FAULT_RECOVERY},
 	{NVGPU_GPU_FLAGS_SUPPORT_COPY_ENGINE_DIVERSITY,
-		NVGPU_SUPPORT_COPY_ENGINE_DIVERSITY}
+		NVGPU_SUPPORT_COPY_ENGINE_DIVERSITY},
+	{NVGPU_GPU_FLAGS_SUPPORT_SM_DIVERSITY,
+		NVGPU_SUPPORT_SM_DIVERSITY}
 };
 
 static u64 nvgpu_ctrl_ioctl_gpu_characteristics_flags(struct gk20a *g)
@@ -321,6 +323,8 @@ gk20a_ctrl_ioctl_gpu_characteristics(
 	gpu.num_tpc_per_gpc =
 		nvgpu_gr_config_get_max_tpc_per_gpc_count(gr_config);
 
+	gpu.max_sm_diversity_config_count = g->max_sm_diversity_config_count;
+
 	gpu.bus_type = NVGPU_GPU_BUS_TYPE_AXI; /* always AXI for now */
 
 	gpu.compression_page_size = g->ops.fb.compression_page_size(g);
@@ -849,6 +853,15 @@ static int gk20a_ctrl_vsm_mapping(struct gk20a *g,
 	struct nvgpu_gr_config *gr_config = nvgpu_gr_get_config_ptr(g);
 	u32 i;
 
+	if (nvgpu_is_enabled(g, NVGPU_SUPPORT_SM_DIVERSITY)) {
+		if (args->sm_diversity_config >=
+			g->max_sm_diversity_config_count) {
+			return -EINVAL;
+		}
+	} else {
+		args->sm_diversity_config = 0U;
+	}
+
 	vsms_buf = nvgpu_kzalloc(g, write_size);
 	if (vsms_buf == NULL)
 		return -ENOMEM;
diff --git a/include/uapi/linux/nvgpu.h b/include/uapi/linux/nvgpu.h
index 400011d31..0b84423f4 100644
--- a/include/uapi/linux/nvgpu.h
+++ b/include/uapi/linux/nvgpu.h
@@ -174,6 +174,8 @@ struct nvgpu_gpu_zbc_query_table_args {
 #define NVGPU_GPU_FLAGS_SUPPORT_FAULT_RECOVERY (1ULL << 33)
 /* Copy Engine diversity is available */
 #define NVGPU_GPU_FLAGS_SUPPORT_COPY_ENGINE_DIVERSITY (1ULL << 34)
+/* SM diversity is available */
+#define NVGPU_GPU_FLAGS_SUPPORT_SM_DIVERSITY (1ULL << 35)
 /* SM LRF ECC is enabled */
 #define NVGPU_GPU_FLAGS_ECC_ENABLED_SM_LRF (1ULL << 60)
 /* SM SHM ECC is enabled */
@@ -292,7 +294,8 @@ struct nvgpu_gpu_characteristics {
 	__u8 reserved2[6];
 
 	__u32 max_ctxsw_ring_buffer_size;
-	__u32 reserved3;
+	/* Max SM diversity configuration count. */
+	__u32 max_sm_diversity_config_count;
 
 	/* Notes:
 	   - This struct can be safely appended with new fields. However, always
@@ -457,6 +460,8 @@ struct nvgpu_gpu_vsms_mapping_entry {
 };
 
 struct nvgpu_gpu_vsms_mapping {
+	__u32 sm_diversity_config;
+	__u32 reserved;
 	__u64 vsms_map_buf_addr;
 };
 
@@ -1519,6 +1524,8 @@ struct nvgpu_set_nvmap_fd_args {
 struct nvgpu_alloc_obj_ctx_args {
 	__u32 class_num; /* kepler3d, 2d, compute, etc */
 	__u32 flags; /* input, output */
+	__u32 sm_diversity_config; /* input */
+	__u32 reserved;
 	__u64 obj_id; /* output, used to free later */
 };
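
Illustration only (not part of the patch): a minimal userspace sketch of how the new fields might be exercised. It assumes an already-open control fd and two channel fds, uses the pre-existing nvgpu UAPI requests NVGPU_GPU_IOCTL_GET_CHARACTERISTICS and NVGPU_IOCTL_CHANNEL_ALLOC_OBJ_CTX as commonly declared in include/uapi/linux/nvgpu.h, and elides error handling; any name not introduced by this patch is an assumption.

/*
 * Hypothetical usage sketch: allocate a mission and a redundant object
 * context with different SM diversity configurations when the driver
 * advertises support. Error handling and fd setup are omitted.
 */
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/nvgpu.h>

static int alloc_ctx(int ch_fd, uint32_t class_num, uint32_t sm_config)
{
	struct nvgpu_alloc_obj_ctx_args args;

	memset(&args, 0, sizeof(args));
	args.class_num = class_num;
	/* 0 = mission (default) context, 1 = redundant context */
	args.sm_diversity_config = sm_config;
	return ioctl(ch_fd, NVGPU_IOCTL_CHANNEL_ALLOC_OBJ_CTX, &args);
}

int setup_redundant_pair(int ctrl_fd, int mission_ch, int redundant_ch,
			 uint32_t compute_class)
{
	struct nvgpu_gpu_get_characteristics req;
	struct nvgpu_gpu_characteristics chars;

	memset(&req, 0, sizeof(req));
	memset(&chars, 0, sizeof(chars));
	req.gpu_characteristics_buf_size = sizeof(chars);
	req.gpu_characteristics_buf_addr = (uintptr_t)&chars;
	if (ioctl(ctrl_fd, NVGPU_GPU_IOCTL_GET_CHARACTERISTICS, &req) != 0)
		return -1;

	if (!(chars.flags & NVGPU_GPU_FLAGS_SUPPORT_SM_DIVERSITY) ||
	    chars.max_sm_diversity_config_count < 2)
		/* No SM diversity: fall back to a single mission context. */
		return alloc_ctx(mission_ch, compute_class, 0);

	if (alloc_ctx(mission_ch, compute_class, 0) != 0)
		return -1;
	return alloc_ctx(redundant_ch, compute_class, 1);
}

The same sm_diversity_config value would be passed in struct nvgpu_gpu_vsms_mapping to query the virtual SM to TPC table for a given configuration; per the ioctl_channel.c hunk above, the driver rejects values at or above max_sm_diversity_config_count and forces the field to 0 when SM diversity is not supported.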