diff --git a/drivers/gpu/nvgpu/Makefile b/drivers/gpu/nvgpu/Makefile index 8f1d42dac..d6d601479 100644 --- a/drivers/gpu/nvgpu/Makefile +++ b/drivers/gpu/nvgpu/Makefile @@ -28,6 +28,7 @@ nvgpu-y += \ $(nvgpu-t19x)/gv100/fb_gv100.o \ $(nvgpu-t19x)/gv100/bios_gv100.o \ $(nvgpu-t19x)/gv100/fifo_gv100.o \ + $(nvgpu-t19x)/gv100/gr_gv100.o \ $(nvgpu-t19x)/gv100/hal_gv100.o nvgpu-$(CONFIG_TEGRA_GK20A) += $(nvgpu-t19x)/gv11b/platform_gv11b_tegra.o diff --git a/drivers/gpu/nvgpu/gv100/gr_gv100.c b/drivers/gpu/nvgpu/gv100/gr_gv100.c new file mode 100644 index 000000000..4b2038ba2 --- /dev/null +++ b/drivers/gpu/nvgpu/gv100/gr_gv100.c @@ -0,0 +1,289 @@ +/* + * GV100 GPU GR + * + * Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#include +#include +#include + +#include "gk20a/gk20a.h" +#include "gk20a/gr_gk20a.h" + +#include "gv100/gr_gv100.h" +#include "gv11b/subctx_gv11b.h" + +#include +#include + +/* + * Estimate performance if the given logical TPC in the given logical GPC were + * removed. + */ +static int gr_gv100_scg_estimate_perf(struct gk20a *g, + unsigned long *gpc_tpc_mask, + u32 disable_gpc_id, u32 disable_tpc_id, + int *perf) +{ + struct gr_gk20a *gr = &g->gr; + int err = 0; + u32 scale_factor = 512UL; /* Use fx23.9 */ + u32 pix_scale = 1024*1024UL; /* Pix perf in [29:20] */ + u32 world_scale = 1024UL; /* World performance in [19:10] */ + u32 tpc_scale = 1; /* TPC balancing in [9:0] */ + u32 scg_num_pes = 0; + u32 min_scg_gpc_pix_perf = scale_factor; /* Init perf as maximum */ + u32 average_tpcs = 0; /* Average of # of TPCs per GPC */ + u32 deviation; /* absolute diff between TPC# and + * average_tpcs, averaged across GPCs + */ + u32 norm_tpc_deviation; /* deviation/max_tpc_per_gpc */ + u32 tpc_balance; + u32 scg_gpc_pix_perf; + u32 scg_world_perf; + u32 gpc_id; + u32 pes_id; + int diff; + bool is_tpc_removed_gpc = false; + bool is_tpc_removed_pes = false; + u32 max_tpc_gpc = 0; + u32 num_tpc_mask; + u32 *num_tpc_gpc = nvgpu_kzalloc(g, sizeof(u32) * + nvgpu_get_litter_value(g, GPU_LIT_NUM_GPCS)); + + if (!num_tpc_gpc) + return -ENOMEM; + + /* Calculate pix-perf-reduction-rate per GPC and find bottleneck TPC */ + for (gpc_id = 0; gpc_id < gr->gpc_count; gpc_id++) { + num_tpc_mask = gpc_tpc_mask[gpc_id]; + + if ((gpc_id == disable_gpc_id) && num_tpc_mask & + (0x1 << disable_tpc_id)) { + /* Safety check if a TPC is removed twice */ + if (is_tpc_removed_gpc) { + err = -EINVAL; + goto free_resources; + } + /* Remove logical TPC from set */ + num_tpc_mask &= ~(0x1 << disable_tpc_id); + is_tpc_removed_gpc = true; + } + + /* track balancing of tpcs across gpcs */ + num_tpc_gpc[gpc_id] = hweight32(num_tpc_mask); + average_tpcs += num_tpc_gpc[gpc_id]; + + /* save the maximum numer of gpcs */ + max_tpc_gpc = num_tpc_gpc[gpc_id] > max_tpc_gpc ? + num_tpc_gpc[gpc_id] : max_tpc_gpc; + + /* + * Calculate ratio between TPC count and post-FS and post-SCG + * + * ratio represents relative throughput of the GPC + */ + scg_gpc_pix_perf = scale_factor * num_tpc_gpc[gpc_id] / + gr->gpc_tpc_count[gpc_id]; + + if (min_scg_gpc_pix_perf > scg_gpc_pix_perf) + min_scg_gpc_pix_perf = scg_gpc_pix_perf; + + /* Calculate # of surviving PES */ + for (pes_id = 0; pes_id < gr->gpc_ppc_count[gpc_id]; pes_id++) { + /* Count the number of TPC on the set */ + num_tpc_mask = gr->pes_tpc_mask[pes_id][gpc_id] & + gpc_tpc_mask[gpc_id]; + + if ((gpc_id == disable_gpc_id) && (num_tpc_mask & + (0x1 << disable_tpc_id))) { + + if (is_tpc_removed_pes) { + err = -EINVAL; + goto free_resources; + } + num_tpc_mask &= ~(0x1 << disable_tpc_id); + is_tpc_removed_pes = true; + } + if (hweight32(num_tpc_mask)) + scg_num_pes++; + } + } + + if (!is_tpc_removed_gpc || !is_tpc_removed_pes) { + err = -EINVAL; + goto free_resources; + } + + if (max_tpc_gpc == 0) { + *perf = 0; + goto free_resources; + } + + /* Now calculate perf */ + scg_world_perf = (scale_factor * scg_num_pes) / gr->ppc_count; + deviation = 0; + average_tpcs = scale_factor * average_tpcs / gr->gpc_count; + for (gpc_id =0; gpc_id < gr->gpc_count; gpc_id++) { + diff = average_tpcs - scale_factor * num_tpc_gpc[gpc_id]; + if (diff < 0) + diff = -diff; + deviation += diff; + } + + deviation /= gr->gpc_count; + + norm_tpc_deviation = deviation / max_tpc_gpc; + + tpc_balance = scale_factor - norm_tpc_deviation; + + if ((tpc_balance > scale_factor) || + (scg_world_perf > scale_factor) || + (min_scg_gpc_pix_perf > scale_factor) || + (norm_tpc_deviation > scale_factor)) { + err = -EINVAL; + goto free_resources; + } + + *perf = (pix_scale * min_scg_gpc_pix_perf) + + (world_scale * scg_world_perf) + + (tpc_scale * tpc_balance); +free_resources: + nvgpu_kfree(g, num_tpc_gpc); + return err; +} + +void gr_gv100_bundle_cb_defaults(struct gk20a *g) +{ + struct gr_gk20a *gr = &g->gr; + + gr->bundle_cb_default_size = + gr_scc_bundle_cb_size_div_256b__prod_v(); + gr->min_gpm_fifo_depth = + gr_pd_ab_dist_cfg2_state_limit_min_gpm_fifo_depths_v(); + gr->bundle_cb_token_limit = + gr_pd_ab_dist_cfg2_token_limit_init_v(); +} + +void gr_gv100_cb_size_default(struct gk20a *g) +{ + struct gr_gk20a *gr = &g->gr; + + if (!gr->attrib_cb_default_size) + gr->attrib_cb_default_size = + gr_gpc0_ppc0_cbm_beta_cb_size_v_default_v(); + gr->alpha_cb_default_size = + gr_gpc0_ppc0_cbm_alpha_cb_size_v_default_v(); +} + +void gr_gv100_set_gpc_tpc_mask(struct gk20a *g, u32 gpc_index) +{ +} + +void gr_gv100_init_sm_id_table(struct gk20a *g) +{ + u32 gpc, tpc, sm, pes, gtpc; + u32 sm_id = 0; + u32 sm_per_tpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_SM_PER_TPC); + u32 num_sm = sm_per_tpc * g->gr.tpc_count; + int perf, maxperf; + int err; + unsigned long *gpc_tpc_mask; + u32 *tpc_table, *gpc_table; + + gpc_table = nvgpu_kzalloc(g, g->gr.tpc_count * sizeof(u32)); + tpc_table = nvgpu_kzalloc(g, g->gr.tpc_count * sizeof(u32)); + gpc_tpc_mask = nvgpu_kzalloc(g, sizeof(unsigned long) * + nvgpu_get_litter_value(g, GPU_LIT_NUM_GPCS)); + + if (!gpc_table || !tpc_table || !gpc_tpc_mask) { + nvgpu_err(g, "Error allocating memory for sm tables"); + goto exit_build_table; + } + + for (gpc = 0; gpc < g->gr.gpc_count; gpc++) + for (pes = 0; pes < g->gr.gpc_ppc_count[gpc]; pes++) + gpc_tpc_mask[gpc] |= g->gr.pes_tpc_mask[pes][gpc]; + + for (gtpc = 0; gtpc < g->gr.tpc_count; gtpc++) { + maxperf = -1; + for (gpc = 0; gpc < g->gr.gpc_count; gpc++) { + for_each_set_bit(tpc, &gpc_tpc_mask[gpc], + g->gr.gpc_tpc_count[gpc]) { + perf = -1; + err = gr_gv100_scg_estimate_perf(g, + gpc_tpc_mask, gpc, tpc, &perf); + + if (err) { + nvgpu_err(g, + "Error while estimating perf"); + goto exit_build_table; + } + + if (perf >= maxperf) { + maxperf = perf; + gpc_table[gtpc] = gpc; + tpc_table[gtpc] = tpc; + } + } + } + gpc_tpc_mask[gpc_table[gtpc]] &= ~(0x1 << tpc_table[gtpc]); + } + + for (tpc = 0, sm_id = 0; sm_id < num_sm; tpc++, sm_id += sm_per_tpc) { + for (sm = 0; sm < sm_per_tpc; sm++) { + g->gr.sm_to_cluster[sm_id + sm].gpc_index = + gpc_table[tpc]; + g->gr.sm_to_cluster[sm_id + sm].tpc_index = + tpc_table[tpc]; + g->gr.sm_to_cluster[sm_id + sm].sm_index = sm; + g->gr.sm_to_cluster[sm_id + sm].global_tpc_index = tpc; + } + } + + g->gr.no_of_sm = num_sm; + nvgpu_log_info(g, " total number of sm = %d", g->gr.no_of_sm); +exit_build_table: + nvgpu_kfree(g, gpc_table); + nvgpu_kfree(g, tpc_table); + nvgpu_kfree(g, gpc_tpc_mask); +} + +void gr_gv100_load_tpc_mask(struct gk20a *g) +{ + u64 pes_tpc_mask = 0x0ULL; + u32 gpc, pes; + u32 num_tpc_per_gpc = nvgpu_get_litter_value(g, + GPU_LIT_NUM_TPC_PER_GPC); + + /* gv100 has 6 GPC and 7 TPC/GPC */ + for (gpc = 0; gpc < g->gr.gpc_count; gpc++) { + for (pes = 0; pes < g->gr.pe_count_per_gpc; pes++) { + pes_tpc_mask |= (u64) g->gr.pes_tpc_mask[pes][gpc] << + (num_tpc_per_gpc * gpc); + } + } + + nvgpu_log_info(g, "pes_tpc_mask: %016llx\n", pes_tpc_mask); + gk20a_writel(g, gr_fe_tpc_fs_r(0), u64_lo32(pes_tpc_mask)); + gk20a_writel(g, gr_fe_tpc_fs_r(1), u64_hi32(pes_tpc_mask)); +} diff --git a/drivers/gpu/nvgpu/gv100/gr_gv100.h b/drivers/gpu/nvgpu/gv100/gr_gv100.h new file mode 100644 index 000000000..460b05aeb --- /dev/null +++ b/drivers/gpu/nvgpu/gv100/gr_gv100.h @@ -0,0 +1,36 @@ +/* + * GV100 GPU GR + * + * Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#ifndef _NVGPU_GR_GV100_H_ +#define _NVGPU_GR_GV100_H_ + +void gr_gv100_bundle_cb_defaults(struct gk20a *g); +void gr_gv100_cb_size_default(struct gk20a *g); +void gr_gv100_set_gpc_tpc_mask(struct gk20a *g, u32 gpc_index); +void gr_gv100_init_sm_id_table(struct gk20a *g); +void gr_gv100_program_sm_id_numbering(struct gk20a *g, + u32 gpc, u32 tpc, u32 smid); +int gr_gv100_load_smid_config(struct gk20a *g); + +#endif diff --git a/drivers/gpu/nvgpu/gv100/hal_gv100.c b/drivers/gpu/nvgpu/gv100/hal_gv100.c index f20d2dcff..cefaf1aef 100644 --- a/drivers/gpu/nvgpu/gv100/hal_gv100.c +++ b/drivers/gpu/nvgpu/gv100/hal_gv100.c @@ -78,7 +78,7 @@ #include "gv11b/dbg_gpu_gv11b.h" #include "gv11b/hal_gv11b.h" -#include "gv11b/gr_gv11b.h" +#include "gv100/gr_gv100.h" #include "gv11b/mc_gv11b.h" #include "gv11b/ltc_gv11b.h" #include "gv11b/gv11b.h" @@ -263,8 +263,8 @@ static const struct gpu_ops gv100_ops = { }, .gr = { .init_gpc_mmu = gr_gv11b_init_gpc_mmu, - .bundle_cb_defaults = gr_gv11b_bundle_cb_defaults, - .cb_size_default = gr_gv11b_cb_size_default, + .bundle_cb_defaults = gr_gv100_bundle_cb_defaults, + .cb_size_default = gr_gv100_cb_size_default, .calc_global_ctx_buffer_size = gr_gv11b_calc_global_ctx_buffer_size, .commit_global_attrib_cb = gr_gv11b_commit_global_attrib_cb, @@ -285,7 +285,7 @@ static const struct gpu_ops gv100_ops = { .set_hww_esr_report_mask = gv11b_gr_set_hww_esr_report_mask, .falcon_load_ucode = gr_gm20b_load_ctxsw_ucode_segments, .load_ctxsw_ucode = gr_gm20b_load_ctxsw_ucode, - .set_gpc_tpc_mask = gr_gv11b_set_gpc_tpc_mask, + .set_gpc_tpc_mask = gr_gv100_set_gpc_tpc_mask, .get_gpc_tpc_mask = gr_gm20b_get_gpc_tpc_mask, .free_channel_ctx = gk20a_free_channel_ctx, .alloc_obj_ctx = gk20a_alloc_obj_ctx, @@ -335,7 +335,7 @@ static const struct gpu_ops gv100_ops = { .resume_contexts = gr_gk20a_resume_contexts, .get_preemption_mode_flags = gr_gp10b_get_preemption_mode_flags, .fuse_override = gp10b_gr_fuse_override, - .init_sm_id_table = gr_gv11b_init_sm_id_table, + .init_sm_id_table = gr_gv100_init_sm_id_table, .load_smid_config = gr_gv11b_load_smid_config, .program_sm_id_numbering = gr_gv11b_program_sm_id_numbering, .is_ltcs_ltss_addr = gr_gm20b_is_ltcs_ltss_addr,