mirror of git://nv-tegra.nvidia.com/linux-nvgpu.git
gpu: nvgpu: move init_sm_id_table hal to hal.gr.config
Move init_sm_id_table hal to common.hal.gr.config. Two separate HALs are
added for gm20b and gv100.

JIRA NVGPU-1884

Change-Id: Id307542db67b103ec25b02b41fd3b9d9bd8f30f0
Signed-off-by: Nitin Kumbhar <nkumbhar@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/2073582
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
parent e649d19c65
commit 03e137b552
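Every hunk below makes the same mechanical move: the init_sm_id_table
function pointer leaves the top-level g->ops.gr HAL table and lands in the
g->ops.gr.config sub-struct, so common code and each chip's hal_*.c file
reference the per-unit location. A minimal self-contained sketch of the
pattern (the sketch_* names are illustrative, not the real nvgpu types):

    struct gk20a;

    struct sketch_gpu_ops {
        struct {
            struct {
                /* chip-specific SM-id table builder */
                int (*init_sm_id_table)(struct gk20a *g);
            } config;
        } gr;
    };

    /* common code now dereferences only the new location */
    static int sketch_init_fs_state(struct gk20a *g,
                                    const struct sketch_gpu_ops *ops)
    {
        if (ops->gr.config.init_sm_id_table != NULL) {
            return ops->gr.config.init_sm_id_table(g);
        }
        return 0;
    }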
@@ -442,7 +442,8 @@ nvgpu-$(CONFIG_GK20A_VIDMEM) += \
     common/mm/vidmem.o

 nvgpu-y += \
-    hal/gr/config/gr_config_gm20b.o
+    hal/gr/config/gr_config_gm20b.o \
+    hal/gr/config/gr_config_gv100.o

 nvgpu-$(CONFIG_TEGRA_GR_VIRTUALIZATION) += \
     common/vgpu/ltc/ltc_vgpu.o \
@@ -387,7 +387,8 @@ ifeq ($(NVGPU_DEBUGGER),1)
 srcs += common/debugger.c
 endif

-srcs += hal/gr/config/gr_config_gm20b.c
+srcs += hal/gr/config/gr_config_gm20b.c \
+    hal/gr/config/gr_config_gv100.c

 ifeq ($(NVGPU_LS_PMU),1)
 # Add LS PMU files which are required for normal build
@@ -101,8 +101,8 @@ int nvgpu_gr_init_fs_state(struct gk20a *g)
         return err;
     }

-    if (g->ops.gr.init_sm_id_table != NULL) {
-        err = g->ops.gr.init_sm_id_table(g);
+    if (g->ops.gr.config.init_sm_id_table != NULL) {
+        err = g->ops.gr.config.init_sm_id_table(g);
         if (err != 0) {
             return err;
         }
@@ -160,7 +160,6 @@ static const struct gpu_ops vgpu_gp10b_ops = {
         .suspend_contexts = vgpu_gr_suspend_contexts,
         .resume_contexts = vgpu_gr_resume_contexts,
         .get_preemption_mode_flags = gr_gp10b_get_preemption_mode_flags,
-        .init_sm_id_table = vgpu_gr_init_sm_id_table,
         .commit_inst = vgpu_gr_commit_inst,
         .trigger_suspend = NULL,
         .wait_for_pause = gr_gk20a_wait_for_pause,
@@ -282,6 +281,7 @@ static const struct gpu_ops vgpu_gp10b_ops = {
         },
         .config = {
             .get_gpc_tpc_mask = vgpu_gr_get_gpc_tpc_mask,
+            .init_sm_id_table = vgpu_gr_init_sm_id_table,
         },
         .zbc = {
             .add_color = NULL,
@@ -1151,11 +1151,11 @@ int vgpu_gr_init_sm_id_table(struct gk20a *g)

 int vgpu_gr_init_fs_state(struct gk20a *g)
 {
-    if (!g->ops.gr.init_sm_id_table) {
+    if (!g->ops.gr.config.init_sm_id_table) {
         return -EINVAL;
     }

-    return g->ops.gr.init_sm_id_table(g);
+    return g->ops.gr.config.init_sm_id_table(g);
 }

 int vgpu_gr_update_pc_sampling(struct channel_gk20a *ch, bool enable)
@@ -181,7 +181,6 @@ static const struct gpu_ops vgpu_gv11b_ops = {
         .suspend_contexts = vgpu_gr_suspend_contexts,
         .resume_contexts = vgpu_gr_resume_contexts,
         .get_preemption_mode_flags = gr_gp10b_get_preemption_mode_flags,
-        .init_sm_id_table = vgpu_gr_init_sm_id_table,
         .commit_inst = vgpu_gr_commit_inst,
         .trigger_suspend = NULL,
         .wait_for_pause = gr_gk20a_wait_for_pause,
@@ -329,6 +328,7 @@ static const struct gpu_ops vgpu_gv11b_ops = {
         },
         .config = {
             .get_gpc_tpc_mask = vgpu_gr_get_gpc_tpc_mask,
+            .init_sm_id_table = vgpu_gr_init_sm_id_table,
         },
         .zbc = {
             .add_color = NULL,
@@ -768,30 +768,6 @@ int gr_gk20a_commit_global_ctx_buffers(struct gk20a *g,
     return 0;
 }

-int gr_gk20a_init_sm_id_table(struct gk20a *g)
-{
-    u32 gpc, tpc;
-    u32 sm_id = 0;
-
-    for (tpc = 0;
-         tpc < nvgpu_gr_config_get_max_tpc_per_gpc_count(g->gr.config);
-         tpc++) {
-        for (gpc = 0; gpc < nvgpu_gr_config_get_gpc_count(g->gr.config); gpc++) {
-
-            if (tpc < nvgpu_gr_config_get_gpc_tpc_count(g->gr.config, gpc)) {
-                g->gr.sm_to_cluster[sm_id].tpc_index = tpc;
-                g->gr.sm_to_cluster[sm_id].gpc_index = gpc;
-                g->gr.sm_to_cluster[sm_id].sm_index = 0;
-                g->gr.sm_to_cluster[sm_id].global_tpc_index =
-                    sm_id;
-                sm_id++;
-            }
-        }
-    }
-    g->gr.no_of_sm = sm_id;
-    return 0;
-}
-
 int gr_gk20a_fecs_ctx_image_save(struct channel_gk20a *c, u32 save_type)
 {
     struct gk20a *g = c->g;
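The removed builder above is not dropped: it reappears verbatim as
gm20b_gr_config_init_sm_id_table in hal/gr/config/gr_config_gm20b.c later in
this diff. Its ordering is worth noting: TPC index is the outer loop, so
same-numbered TPCs across GPCs get adjacent SM ids. A standalone sketch,
assuming a hypothetical two-GPC topology with gpc_tpc_count = {2, 1}:

    #include <stdio.h>

    int main(void)
    {
        unsigned int gpc_tpc_count[2] = {2, 1}; /* assumed topology */
        unsigned int max_tpc_per_gpc = 2;
        unsigned int sm_id = 0, gpc, tpc;

        for (tpc = 0; tpc < max_tpc_per_gpc; tpc++) {  /* outer: TPC slot */
            for (gpc = 0; gpc < 2; gpc++) {            /* inner: GPC */
                if (tpc < gpc_tpc_count[gpc]) {
                    printf("sm_id %u -> gpc %u tpc %u\n", sm_id, gpc, tpc);
                    sm_id++;
                }
            }
        }
        /* prints: 0 -> gpc 0 tpc 0, 1 -> gpc 1 tpc 0, 2 -> gpc 0 tpc 1;
         * no_of_sm ends up 3, since the builder assigns one SM per TPC
         * (sm_index is always 0). */
        return 0;
    }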
@@ -457,8 +457,6 @@ int gr_gk20a_resume_from_pause(struct gk20a *g);
 int gr_gk20a_clear_sm_errors(struct gk20a *g);
 u32 gr_gk20a_tpc_enabled_exceptions(struct gk20a *g);

-int gr_gk20a_init_sm_id_table(struct gk20a *g);
-
 int gr_gk20a_commit_inst(struct channel_gk20a *c, u64 gpu_va);

 u32 gk20a_gr_gpc_offset(struct gk20a *g, u32 gpc);
@@ -284,7 +284,6 @@ static const struct gpu_ops gm20b_ops = {
         .suspend_contexts = gr_gk20a_suspend_contexts,
         .resume_contexts = gr_gk20a_resume_contexts,
         .get_preemption_mode_flags = gr_gm20b_get_preemption_mode_flags,
-        .init_sm_id_table = gr_gk20a_init_sm_id_table,
         .commit_inst = gr_gk20a_commit_inst,
         .trigger_suspend = gr_gk20a_trigger_suspend,
         .wait_for_pause = gr_gk20a_wait_for_pause,
@@ -399,6 +398,7 @@ static const struct gpu_ops gm20b_ops = {
             .get_pes_tpc_mask = gm20b_gr_config_get_pes_tpc_mask,
             .get_pd_dist_skip_table_size =
                 gm20b_gr_config_get_pd_dist_skip_table_size,
+            .init_sm_id_table = gm20b_gr_config_init_sm_id_table,
         },
         .zbc = {
             .add_color = gm20b_gr_zbc_add_color,
@@ -307,7 +307,6 @@ static const struct gpu_ops gp10b_ops = {
         .suspend_contexts = gr_gp10b_suspend_contexts,
         .resume_contexts = gr_gk20a_resume_contexts,
         .get_preemption_mode_flags = gr_gp10b_get_preemption_mode_flags,
-        .init_sm_id_table = gr_gk20a_init_sm_id_table,
         .commit_inst = gr_gk20a_commit_inst,
         .trigger_suspend = gr_gk20a_trigger_suspend,
         .wait_for_pause = gr_gk20a_wait_for_pause,
@@ -445,6 +444,7 @@ static const struct gpu_ops gp10b_ops = {
             .get_pes_tpc_mask = gm20b_gr_config_get_pes_tpc_mask,
             .get_pd_dist_skip_table_size =
                 gm20b_gr_config_get_pd_dist_skip_table_size,
+            .init_sm_id_table = gm20b_gr_config_init_sm_id_table,
         },
 #ifdef CONFIG_GK20A_CTXSW_TRACE
         .fecs_trace = {
@@ -40,250 +40,10 @@
 #include <nvgpu/hw/gv100/hw_proj_gv100.h>
 #include <nvgpu/hw/gv100/hw_perf_gv100.h>

-/*
- * Estimate performance if the given logical TPC in the given logical GPC were
- * removed.
- */
-static int gr_gv100_scg_estimate_perf(struct gk20a *g,
-    unsigned long *gpc_tpc_mask,
-    u32 disable_gpc_id, u32 disable_tpc_id,
-    int *perf)
-{
-    struct gr_gk20a *gr = &g->gr;
-    int err = 0;
-    u32 scale_factor = 512U; /* Use fx23.9 */
-    u32 pix_scale = 1024U*1024U; /* Pix perf in [29:20] */
-    u32 world_scale = 1024U; /* World performance in [19:10] */
-    u32 tpc_scale = 1U; /* TPC balancing in [9:0] */
-    u32 scg_num_pes = 0U;
-    u32 min_scg_gpc_pix_perf = scale_factor; /* Init perf as maximum */
-    u32 average_tpcs = 0U; /* Average of # of TPCs per GPC */
-    u32 deviation; /* absolute diff between TPC# and
-                    * average_tpcs, averaged across GPCs
-                    */
-    u32 norm_tpc_deviation; /* deviation/max_tpc_per_gpc */
-    u32 tpc_balance;
-    u32 scg_gpc_pix_perf;
-    u32 scg_world_perf;
-    u32 gpc_id;
-    u32 pes_id;
-    int diff;
-    bool is_tpc_removed_gpc = false;
-    bool is_tpc_removed_pes = false;
-    u32 max_tpc_gpc = 0U;
-    u32 num_tpc_mask;
-    u32 *num_tpc_gpc = nvgpu_kzalloc(g, sizeof(u32) *
-        nvgpu_get_litter_value(g, GPU_LIT_NUM_GPCS));
-
-    if (num_tpc_gpc == NULL) {
-        return -ENOMEM;
-    }
-
-    /* Calculate pix-perf-reduction-rate per GPC and find bottleneck TPC */
-    for (gpc_id = 0;
-         gpc_id < nvgpu_gr_config_get_gpc_count(gr->config);
-         gpc_id++) {
-        num_tpc_mask = gpc_tpc_mask[gpc_id];
-
-        if ((gpc_id == disable_gpc_id) &&
-            ((num_tpc_mask & BIT32(disable_tpc_id)) != 0U)) {
-            /* Safety check if a TPC is removed twice */
-            if (is_tpc_removed_gpc) {
-                err = -EINVAL;
-                goto free_resources;
-            }
-            /* Remove logical TPC from set */
-            num_tpc_mask &= ~(BIT32(disable_tpc_id));
-            is_tpc_removed_gpc = true;
-        }
-
-        /* track balancing of tpcs across gpcs */
-        num_tpc_gpc[gpc_id] = hweight32(num_tpc_mask);
-        average_tpcs += num_tpc_gpc[gpc_id];
-
-        /* save the maximum number of TPCs per GPC */
-        max_tpc_gpc = num_tpc_gpc[gpc_id] > max_tpc_gpc ?
-            num_tpc_gpc[gpc_id] : max_tpc_gpc;
-
-        /*
-         * Calculate ratio between TPC count post-FS and post-SCG
-         *
-         * ratio represents relative throughput of the GPC
-         */
-        scg_gpc_pix_perf = scale_factor * num_tpc_gpc[gpc_id] /
-            nvgpu_gr_config_get_gpc_tpc_count(gr->config, gpc_id);
-
-        if (min_scg_gpc_pix_perf > scg_gpc_pix_perf) {
-            min_scg_gpc_pix_perf = scg_gpc_pix_perf;
-        }
-
-        /* Calculate # of surviving PES */
-        for (pes_id = 0;
-             pes_id < nvgpu_gr_config_get_gpc_ppc_count(gr->config, gpc_id);
-             pes_id++) {
-            /* Count the number of TPC on the set */
-            num_tpc_mask = nvgpu_gr_config_get_pes_tpc_mask(
-                gr->config, gpc_id, pes_id) &
-                gpc_tpc_mask[gpc_id];
-
-            if ((gpc_id == disable_gpc_id) &&
-                ((num_tpc_mask & BIT32(disable_tpc_id)) != 0U)) {
-
-                if (is_tpc_removed_pes) {
-                    err = -EINVAL;
-                    goto free_resources;
-                }
-                num_tpc_mask &= ~(BIT32(disable_tpc_id));
-                is_tpc_removed_pes = true;
-            }
-            if (hweight32(num_tpc_mask) != 0UL) {
-                scg_num_pes++;
-            }
-        }
-    }
-
-    if (!is_tpc_removed_gpc || !is_tpc_removed_pes) {
-        err = -EINVAL;
-        goto free_resources;
-    }
-
-    if (max_tpc_gpc == 0U) {
-        *perf = 0;
-        goto free_resources;
-    }
-
-    /* Now calculate perf */
-    scg_world_perf = (scale_factor * scg_num_pes) /
-        nvgpu_gr_config_get_ppc_count(gr->config);
-    deviation = 0;
-    average_tpcs = scale_factor * average_tpcs /
-        nvgpu_gr_config_get_gpc_count(gr->config);
-    for (gpc_id = 0;
-         gpc_id < nvgpu_gr_config_get_gpc_count(gr->config);
-         gpc_id++) {
-        diff = average_tpcs - scale_factor * num_tpc_gpc[gpc_id];
-        if (diff < 0) {
-            diff = -diff;
-        }
-        deviation += U32(diff);
-    }
-
-    deviation /= nvgpu_gr_config_get_gpc_count(gr->config);
-
-    norm_tpc_deviation = deviation / max_tpc_gpc;
-
-    tpc_balance = scale_factor - norm_tpc_deviation;
-
-    if ((tpc_balance > scale_factor) ||
-        (scg_world_perf > scale_factor) ||
-        (min_scg_gpc_pix_perf > scale_factor) ||
-        (norm_tpc_deviation > scale_factor)) {
-        err = -EINVAL;
-        goto free_resources;
-    }
-
-    *perf = (pix_scale * min_scg_gpc_pix_perf) +
-        (world_scale * scg_world_perf) +
-        (tpc_scale * tpc_balance);
-free_resources:
-    nvgpu_kfree(g, num_tpc_gpc);
-    return err;
-}
-
 void gr_gv100_set_gpc_tpc_mask(struct gk20a *g, u32 gpc_index)
 {
 }

-int gr_gv100_init_sm_id_table(struct gk20a *g)
-{
-    unsigned long tpc;
-    u32 gpc, sm, pes, gtpc;
-    u32 sm_id = 0;
-    u32 sm_per_tpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_SM_PER_TPC);
-    struct gr_gk20a *gr = &g->gr;
-    u32 num_sm = sm_per_tpc * nvgpu_gr_config_get_tpc_count(gr->config);
-    int perf, maxperf;
-    int err = 0;
-    unsigned long *gpc_tpc_mask;
-    u32 *tpc_table, *gpc_table;
-
-    gpc_table = nvgpu_kzalloc(g, nvgpu_gr_config_get_tpc_count(gr->config) *
-        sizeof(u32));
-    tpc_table = nvgpu_kzalloc(g, nvgpu_gr_config_get_tpc_count(gr->config) *
-        sizeof(u32));
-    gpc_tpc_mask = nvgpu_kzalloc(g, sizeof(unsigned long) *
-        nvgpu_get_litter_value(g, GPU_LIT_NUM_GPCS));
-
-    if ((gpc_table == NULL) ||
-        (tpc_table == NULL) ||
-        (gpc_tpc_mask == NULL)) {
-        nvgpu_err(g, "Error allocating memory for sm tables");
-        err = -ENOMEM;
-        goto exit_build_table;
-    }
-
-    for (gpc = 0; gpc < nvgpu_gr_config_get_gpc_count(gr->config); gpc++) {
-        for (pes = 0;
-             pes < nvgpu_gr_config_get_gpc_ppc_count(g->gr.config, gpc);
-             pes++) {
-            gpc_tpc_mask[gpc] |= nvgpu_gr_config_get_pes_tpc_mask(
-                g->gr.config, gpc, pes);
-        }
-    }
-
-    for (gtpc = 0; gtpc < nvgpu_gr_config_get_tpc_count(gr->config); gtpc++) {
-        maxperf = -1;
-        for (gpc = 0; gpc < nvgpu_gr_config_get_gpc_count(gr->config); gpc++) {
-            for_each_set_bit(tpc, &gpc_tpc_mask[gpc],
-                nvgpu_gr_config_get_gpc_tpc_count(g->gr.config, gpc)) {
-                perf = -1;
-                err = gr_gv100_scg_estimate_perf(g,
-                    gpc_tpc_mask, gpc, tpc, &perf);
-
-                if (err != 0) {
-                    nvgpu_err(g,
-                        "Error while estimating perf");
-                    goto exit_build_table;
-                }
-
-                if (perf >= maxperf) {
-                    maxperf = perf;
-                    gpc_table[gtpc] = gpc;
-                    tpc_table[gtpc] = tpc;
-                }
-            }
-        }
-        gpc_tpc_mask[gpc_table[gtpc]] &= ~(BIT64(tpc_table[gtpc]));
-    }
-
-    for (tpc = 0, sm_id = 0; sm_id < num_sm; tpc++, sm_id += sm_per_tpc) {
-        for (sm = 0; sm < sm_per_tpc; sm++) {
-            u32 index = sm_id + sm;
-
-            g->gr.sm_to_cluster[index].gpc_index = gpc_table[tpc];
-            g->gr.sm_to_cluster[index].tpc_index = tpc_table[tpc];
-            g->gr.sm_to_cluster[index].sm_index = sm;
-            g->gr.sm_to_cluster[index].global_tpc_index = tpc;
-            nvgpu_log_info(g,
-                "gpc : %d tpc %d sm_index %d global_index: %d",
-                g->gr.sm_to_cluster[index].gpc_index,
-                g->gr.sm_to_cluster[index].tpc_index,
-                g->gr.sm_to_cluster[index].sm_index,
-                g->gr.sm_to_cluster[index].global_tpc_index);
-        }
-    }
-
-    g->gr.no_of_sm = num_sm;
-    nvgpu_log_info(g, " total number of sm = %d", g->gr.no_of_sm);
-exit_build_table:
-    nvgpu_kfree(g, gpc_table);
-    nvgpu_kfree(g, tpc_table);
-    nvgpu_kfree(g, gpc_tpc_mask);
-    return err;
-}
-
 u32 gr_gv100_get_patch_slots(struct gk20a *g)
 {
     struct gr_gk20a *gr = &g->gr;
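The SCG estimator above packs three bounded terms into a single integer
score. Each term is capped at scale_factor = 512 (fx23.9 fixed point), so
weighting by pix_scale (2^20), world_scale (2^10) and tpc_scale (2^0) places
pixel perf in bits [29:20], world perf in bits [19:10] and TPC balance in
bits [9:0]; comparing two scores therefore ranks candidates by pixel perf
first, world perf second and TPC balance last. A standalone arithmetic
check, with made-up term values:

    #include <stdio.h>

    int main(void)
    {
        unsigned int pix_scale = 1024u * 1024u; /* bits [29:20] */
        unsigned int world_scale = 1024u;       /* bits [19:10] */
        unsigned int tpc_scale = 1u;            /* bits [9:0]   */
        /* assumed term values, each bounded by scale_factor = 512 */
        unsigned int min_pix = 256u, world = 384u, balance = 500u;

        int perf = (int)(pix_scale * min_pix + world_scale * world +
                         tpc_scale * balance);

        /* 0x10000000 + 0x00060000 + 0x000001f4 = 0x100601f4 */
        printf("perf = %d (0x%08x)\n", perf, (unsigned int)perf);
        return 0;
    }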
@@ -1,7 +1,7 @@
 /*
  * GV100 GPU GR
  *
- * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -26,7 +26,6 @@
 #define NVGPU_GR_GV100_H

 void gr_gv100_set_gpc_tpc_mask(struct gk20a *g, u32 gpc_index);
-int gr_gv100_init_sm_id_table(struct gk20a *g);
 void gr_gv100_program_sm_id_numbering(struct gk20a *g,
     u32 gpc, u32 tpc, u32 smid);
 int gr_gv100_load_smid_config(struct gk20a *g);
@@ -57,6 +57,7 @@
 #include "hal/fifo/fifo_intr_gv11b.h"
 #include "hal/gr/fecs_trace/fecs_trace_gm20b.h"
 #include "hal/gr/config/gr_config_gm20b.h"
+#include "hal/gr/config/gr_config_gv100.h"
 #include "hal/gr/zbc/zbc_gp10b.h"
 #include "hal/gr/zbc/zbc_gv11b.h"
 #include "hal/gr/init/gr_init_gm20b.h"
@@ -421,7 +422,6 @@ static const struct gpu_ops gv100_ops = {
         .suspend_contexts = gr_gp10b_suspend_contexts,
         .resume_contexts = gr_gk20a_resume_contexts,
         .get_preemption_mode_flags = gr_gp10b_get_preemption_mode_flags,
-        .init_sm_id_table = gr_gv100_init_sm_id_table,
         .commit_inst = gr_gv11b_commit_inst,
         .trigger_suspend = gv11b_gr_sm_trigger_suspend,
         .wait_for_pause = gr_gk20a_wait_for_pause,
@@ -582,6 +582,7 @@ static const struct gpu_ops gv100_ops = {
             .get_pes_tpc_mask = gm20b_gr_config_get_pes_tpc_mask,
             .get_pd_dist_skip_table_size =
                 gm20b_gr_config_get_pd_dist_skip_table_size,
+            .init_sm_id_table = gv100_gr_config_init_sm_id_table,
         },
 #ifdef CONFIG_GK20A_CTXSW_TRACE
         .fecs_trace = {
@@ -30,6 +30,7 @@
 #include "hal/bus/bus_gm20b.h"
 #include "hal/priv_ring/priv_ring_gm20b.h"
 #include "hal/priv_ring/priv_ring_gp10b.h"
+#include "hal/gr/config/gr_config_gv100.h"
 #include "hal/power_features/cg/gv11b_gating_reglist.h"
 #include "hal/cbc/cbc_gm20b.h"
 #include "hal/cbc/cbc_gp10b.h"
@@ -372,7 +373,6 @@ static const struct gpu_ops gv11b_ops = {
         .suspend_contexts = gr_gp10b_suspend_contexts,
         .resume_contexts = gr_gk20a_resume_contexts,
         .get_preemption_mode_flags = gr_gp10b_get_preemption_mode_flags,
-        .init_sm_id_table = gr_gv100_init_sm_id_table,
         .commit_inst = gr_gv11b_commit_inst,
         .trigger_suspend = gv11b_gr_sm_trigger_suspend,
         .wait_for_pause = gr_gk20a_wait_for_pause,
@@ -542,6 +542,7 @@ static const struct gpu_ops gv11b_ops = {
             .get_pes_tpc_mask = gm20b_gr_config_get_pes_tpc_mask,
             .get_pd_dist_skip_table_size =
                 gm20b_gr_config_get_pd_dist_skip_table_size,
+            .init_sm_id_table = gv100_gr_config_init_sm_id_table,
         },
 #ifdef CONFIG_GK20A_CTXSW_TRACE
         .fecs_trace = {
@@ -28,6 +28,30 @@

 #include <nvgpu/hw/gm20b/hw_gr_gm20b.h>

+int gm20b_gr_config_init_sm_id_table(struct gk20a *g)
+{
+    u32 gpc, tpc;
+    u32 sm_id = 0;
+
+    for (tpc = 0;
+         tpc < nvgpu_gr_config_get_max_tpc_per_gpc_count(g->gr.config);
+         tpc++) {
+        for (gpc = 0; gpc < nvgpu_gr_config_get_gpc_count(g->gr.config); gpc++) {
+
+            if (tpc < nvgpu_gr_config_get_gpc_tpc_count(g->gr.config, gpc)) {
+                g->gr.sm_to_cluster[sm_id].tpc_index = tpc;
+                g->gr.sm_to_cluster[sm_id].gpc_index = gpc;
+                g->gr.sm_to_cluster[sm_id].sm_index = 0;
+                g->gr.sm_to_cluster[sm_id].global_tpc_index =
+                    sm_id;
+                sm_id++;
+            }
+        }
+    }
+    g->gr.no_of_sm = sm_id;
+    return 0;
+}
+
 u32 gm20b_gr_config_get_gpc_tpc_mask(struct gk20a *g,
     struct nvgpu_gr_config *config, u32 gpc_index)
 {
@@ -28,6 +28,7 @@
 struct gk20a;
 struct nvgpu_gr_config;

+int gm20b_gr_config_init_sm_id_table(struct gk20a *g);
 u32 gm20b_gr_config_get_gpc_tpc_mask(struct gk20a *g,
     struct nvgpu_gr_config *config, u32 gpc_index);
 u32 gm20b_gr_config_get_tpc_count_in_gpc(struct gk20a *g,
drivers/gpu/nvgpu/hal/gr/config/gr_config_gv100.c (new file, 266 lines)
@@ -0,0 +1,266 @@
+/*
+ * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <nvgpu/gk20a.h>
+#include <nvgpu/types.h>
+#include <nvgpu/gr/config.h>
+
+#include "gr_config_gv100.h"
+
+/*
+ * Estimate performance if the given logical TPC in the given logical GPC were
+ * removed.
+ */
+static int gr_gv100_scg_estimate_perf(struct gk20a *g,
+    unsigned long *gpc_tpc_mask,
+    u32 disable_gpc_id, u32 disable_tpc_id,
+    int *perf)
+{
+    struct gr_gk20a *gr = &g->gr;
+    int err = 0;
+    u32 scale_factor = 512U; /* Use fx23.9 */
+    u32 pix_scale = 1024U*1024U; /* Pix perf in [29:20] */
+    u32 world_scale = 1024U; /* World performance in [19:10] */
+    u32 tpc_scale = 1U; /* TPC balancing in [9:0] */
+    u32 scg_num_pes = 0U;
+    u32 min_scg_gpc_pix_perf = scale_factor; /* Init perf as maximum */
+    u32 average_tpcs = 0U; /* Average of # of TPCs per GPC */
+    u32 deviation; /* absolute diff between TPC# and
+                    * average_tpcs, averaged across GPCs
+                    */
+    u32 norm_tpc_deviation; /* deviation/max_tpc_per_gpc */
+    u32 tpc_balance;
+    u32 scg_gpc_pix_perf;
+    u32 scg_world_perf;
+    u32 gpc_id;
+    u32 pes_id;
+    int diff;
+    bool is_tpc_removed_gpc = false;
+    bool is_tpc_removed_pes = false;
+    u32 max_tpc_gpc = 0U;
+    u32 num_tpc_mask;
+    u32 *num_tpc_gpc = nvgpu_kzalloc(g, sizeof(u32) *
+        nvgpu_get_litter_value(g, GPU_LIT_NUM_GPCS));
+
+    if (num_tpc_gpc == NULL) {
+        return -ENOMEM;
+    }
+
+    /* Calculate pix-perf-reduction-rate per GPC and find bottleneck TPC */
+    for (gpc_id = 0;
+         gpc_id < nvgpu_gr_config_get_gpc_count(gr->config);
+         gpc_id++) {
+        num_tpc_mask = gpc_tpc_mask[gpc_id];
+
+        if ((gpc_id == disable_gpc_id) &&
+            ((num_tpc_mask & BIT32(disable_tpc_id)) != 0U)) {
+            /* Safety check if a TPC is removed twice */
+            if (is_tpc_removed_gpc) {
+                err = -EINVAL;
+                goto free_resources;
+            }
+            /* Remove logical TPC from set */
+            num_tpc_mask &= ~(BIT32(disable_tpc_id));
+            is_tpc_removed_gpc = true;
+        }
+
+        /* track balancing of tpcs across gpcs */
+        num_tpc_gpc[gpc_id] = hweight32(num_tpc_mask);
+        average_tpcs += num_tpc_gpc[gpc_id];
+
+        /* save the maximum number of TPCs per GPC */
+        max_tpc_gpc = num_tpc_gpc[gpc_id] > max_tpc_gpc ?
+            num_tpc_gpc[gpc_id] : max_tpc_gpc;
+
+        /*
+         * Calculate ratio between TPC count post-FS and post-SCG
+         *
+         * ratio represents relative throughput of the GPC
+         */
+        scg_gpc_pix_perf = scale_factor * num_tpc_gpc[gpc_id] /
+            nvgpu_gr_config_get_gpc_tpc_count(gr->config, gpc_id);
+
+        if (min_scg_gpc_pix_perf > scg_gpc_pix_perf) {
+            min_scg_gpc_pix_perf = scg_gpc_pix_perf;
+        }
+
+        /* Calculate # of surviving PES */
+        for (pes_id = 0;
+             pes_id < nvgpu_gr_config_get_gpc_ppc_count(gr->config, gpc_id);
+             pes_id++) {
+            /* Count the number of TPC on the set */
+            num_tpc_mask = nvgpu_gr_config_get_pes_tpc_mask(
+                gr->config, gpc_id, pes_id) &
+                gpc_tpc_mask[gpc_id];
+
+            if ((gpc_id == disable_gpc_id) &&
+                ((num_tpc_mask & BIT32(disable_tpc_id)) != 0U)) {
+
+                if (is_tpc_removed_pes) {
+                    err = -EINVAL;
+                    goto free_resources;
+                }
+                num_tpc_mask &= ~(BIT32(disable_tpc_id));
+                is_tpc_removed_pes = true;
+            }
+            if (hweight32(num_tpc_mask) != 0UL) {
+                scg_num_pes++;
+            }
+        }
+    }
+
+    if (!is_tpc_removed_gpc || !is_tpc_removed_pes) {
+        err = -EINVAL;
+        goto free_resources;
+    }
+
+    if (max_tpc_gpc == 0U) {
+        *perf = 0;
+        goto free_resources;
+    }
+
+    /* Now calculate perf */
+    scg_world_perf = (scale_factor * scg_num_pes) /
+        nvgpu_gr_config_get_ppc_count(gr->config);
+    deviation = 0;
+    average_tpcs = scale_factor * average_tpcs /
+        nvgpu_gr_config_get_gpc_count(gr->config);
+    for (gpc_id = 0;
+         gpc_id < nvgpu_gr_config_get_gpc_count(gr->config);
+         gpc_id++) {
+        diff = average_tpcs - scale_factor * num_tpc_gpc[gpc_id];
+        if (diff < 0) {
+            diff = -diff;
+        }
+        deviation += U32(diff);
+    }
+
+    deviation /= nvgpu_gr_config_get_gpc_count(gr->config);
+
+    norm_tpc_deviation = deviation / max_tpc_gpc;
+
+    tpc_balance = scale_factor - norm_tpc_deviation;
+
+    if ((tpc_balance > scale_factor) ||
+        (scg_world_perf > scale_factor) ||
+        (min_scg_gpc_pix_perf > scale_factor) ||
+        (norm_tpc_deviation > scale_factor)) {
+        err = -EINVAL;
+        goto free_resources;
+    }
+
+    *perf = (pix_scale * min_scg_gpc_pix_perf) +
+        (world_scale * scg_world_perf) +
+        (tpc_scale * tpc_balance);
+free_resources:
+    nvgpu_kfree(g, num_tpc_gpc);
+    return err;
+}
+
+int gv100_gr_config_init_sm_id_table(struct gk20a *g)
+{
+    unsigned long tpc;
+    u32 gpc, sm, pes, gtpc;
+    u32 sm_id = 0;
+    u32 sm_per_tpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_SM_PER_TPC);
+    struct gr_gk20a *gr = &g->gr;
+    u32 num_sm = sm_per_tpc * nvgpu_gr_config_get_tpc_count(gr->config);
+    int perf, maxperf;
+    int err = 0;
+    unsigned long *gpc_tpc_mask;
+    u32 *tpc_table, *gpc_table;
+
+    gpc_table = nvgpu_kzalloc(g, nvgpu_gr_config_get_tpc_count(gr->config) *
+        sizeof(u32));
+    tpc_table = nvgpu_kzalloc(g, nvgpu_gr_config_get_tpc_count(gr->config) *
+        sizeof(u32));
+    gpc_tpc_mask = nvgpu_kzalloc(g, sizeof(unsigned long) *
+        nvgpu_get_litter_value(g, GPU_LIT_NUM_GPCS));
+
+    if ((gpc_table == NULL) ||
+        (tpc_table == NULL) ||
+        (gpc_tpc_mask == NULL)) {
+        nvgpu_err(g, "Error allocating memory for sm tables");
+        err = -ENOMEM;
+        goto exit_build_table;
+    }
+
+    for (gpc = 0; gpc < nvgpu_gr_config_get_gpc_count(gr->config); gpc++) {
+        for (pes = 0;
+             pes < nvgpu_gr_config_get_gpc_ppc_count(g->gr.config, gpc);
+             pes++) {
+            gpc_tpc_mask[gpc] |= nvgpu_gr_config_get_pes_tpc_mask(
+                g->gr.config, gpc, pes);
+        }
+    }
+
+    for (gtpc = 0; gtpc < nvgpu_gr_config_get_tpc_count(gr->config); gtpc++) {
+        maxperf = -1;
+        for (gpc = 0; gpc < nvgpu_gr_config_get_gpc_count(gr->config); gpc++) {
+            for_each_set_bit(tpc, &gpc_tpc_mask[gpc],
+                nvgpu_gr_config_get_gpc_tpc_count(g->gr.config, gpc)) {
+                perf = -1;
+                err = gr_gv100_scg_estimate_perf(g,
+                    gpc_tpc_mask, gpc, tpc, &perf);
+
+                if (err != 0) {
+                    nvgpu_err(g,
+                        "Error while estimating perf");
+                    goto exit_build_table;
+                }
+
+                if (perf >= maxperf) {
+                    maxperf = perf;
+                    gpc_table[gtpc] = gpc;
+                    tpc_table[gtpc] = tpc;
+                }
+            }
+        }
+        gpc_tpc_mask[gpc_table[gtpc]] &= ~(BIT64(tpc_table[gtpc]));
+    }
+
+    for (tpc = 0, sm_id = 0; sm_id < num_sm; tpc++, sm_id += sm_per_tpc) {
+        for (sm = 0; sm < sm_per_tpc; sm++) {
+            u32 index = sm_id + sm;
+
+            g->gr.sm_to_cluster[index].gpc_index = gpc_table[tpc];
+            g->gr.sm_to_cluster[index].tpc_index = tpc_table[tpc];
+            g->gr.sm_to_cluster[index].sm_index = sm;
+            g->gr.sm_to_cluster[index].global_tpc_index = tpc;
+            nvgpu_log_info(g,
+                "gpc : %d tpc %d sm_index %d global_index: %d",
+                g->gr.sm_to_cluster[index].gpc_index,
+                g->gr.sm_to_cluster[index].tpc_index,
+                g->gr.sm_to_cluster[index].sm_index,
+                g->gr.sm_to_cluster[index].global_tpc_index);
+        }
+    }
+
+    g->gr.no_of_sm = num_sm;
+    nvgpu_log_info(g, " total number of sm = %d", g->gr.no_of_sm);
+exit_build_table:
+    nvgpu_kfree(g, gpc_table);
+    nvgpu_kfree(g, tpc_table);
+    nvgpu_kfree(g, gpc_tpc_mask);
+    return err;
+}
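With gr_config_gv100.c in place, the gv100, gv11b and tu104 ops tables above
hook the relocated op as .config.init_sm_id_table =
gv100_gr_config_init_sm_id_table, while gm20b and gp10b keep the simple
round-robin gm20b_gr_config_init_sm_id_table; the header added next exports
the gv100 symbol to those hal_*.c files.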
drivers/gpu/nvgpu/hal/gr/config/gr_config_gv100.h (new file, 32 lines)
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef NVGPU_GR_CONFIG_GV100_H
+#define NVGPU_GR_CONFIG_GV100_H
+
+#include <nvgpu/types.h>
+
+struct gk20a;
+
+int gv100_gr_config_init_sm_id_table(struct gk20a *g);
+
+#endif /* NVGPU_GR_CONFIG_GV100_H */
@@ -415,7 +415,6 @@ struct gpu_ops {
             u32 graphics_preempt_mode,
             u32 compute_preempt_mode);
         int (*set_boosted_ctx)(struct channel_gk20a *ch, bool boost);
-        int (*init_sm_id_table)(struct gk20a *g);
         int (*init_sw_veid_bundle)(struct gk20a *g);
         int (*commit_inst)(struct channel_gk20a *c, u64 gpu_va);
         int (*trigger_suspend)(struct gk20a *g);
@@ -582,6 +581,7 @@ struct gpu_ops {
                 struct nvgpu_gr_config *config, u32 gpc_index,
                 u32 pes_index);
             u32 (*get_pd_dist_skip_table_size)(void);
+            int (*init_sm_id_table)(struct gk20a *g);
         } config;

 #ifdef CONFIG_GK20A_CTXSW_TRACE
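The struct gpu_ops hunk above is the interface change the rest of the diff
keys off: the op pointer now sits in the gr.config sub-struct next to the
other per-GPC/TPC topology queries such as get_gpc_tpc_mask,
get_pes_tpc_mask and get_pd_dist_skip_table_size.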
@@ -61,6 +61,7 @@
 #include "hal/gr/fecs_trace/fecs_trace_gm20b.h"
 #include "hal/gr/fecs_trace/fecs_trace_gv11b.h"
 #include "hal/gr/config/gr_config_gm20b.h"
+#include "hal/gr/config/gr_config_gv100.h"
 #include "hal/gr/zbc/zbc_gp10b.h"
 #include "hal/gr/zbc/zbc_gv11b.h"
 #include "hal/gr/zcull/zcull_gm20b.h"
@@ -441,7 +442,6 @@ static const struct gpu_ops tu104_ops = {
         .suspend_contexts = gr_gp10b_suspend_contexts,
         .resume_contexts = gr_gk20a_resume_contexts,
         .get_preemption_mode_flags = gr_gp10b_get_preemption_mode_flags,
-        .init_sm_id_table = gr_gv100_init_sm_id_table,
         .commit_inst = gr_gv11b_commit_inst,
         .trigger_suspend = gv11b_gr_sm_trigger_suspend,
         .wait_for_pause = gr_gk20a_wait_for_pause,
@@ -610,6 +610,7 @@ static const struct gpu_ops tu104_ops = {
             .get_pes_tpc_mask = gm20b_gr_config_get_pes_tpc_mask,
             .get_pd_dist_skip_table_size =
                 gm20b_gr_config_get_pd_dist_skip_table_size,
+            .init_sm_id_table = gv100_gr_config_init_sm_id_table,
         },
 #ifdef CONFIG_GK20A_CTXSW_TRACE
         .fecs_trace = {