From aba26fa082c988120cba2c1fa326325d550d270e Mon Sep 17 00:00:00 2001 From: Vedashree Vidwans Date: Tue, 13 Apr 2021 21:55:47 -0700 Subject: [PATCH] gpu: nvgpu: handle chip specific erratas Currently, there are few chip specific erratas present in nvgpu code. For better traceability of the erratas and corresponding fixes, introduce flags to indicate existing erratas on a chip. These flags decide if a corresponding solution is applied to the chip(s). This patch introduces below functions to handle errata flags: - nvgpu_init_errata_flags - nvgpu_set_errata - nvgpu_is_errata_present - nvgpu_print_errata_flags - nvgpu_free_errata_flags nvgpu_print_errata_flags: print below details of erratas present in chip 1. errata flag name 2. chip where the errata was first discovered 3. short description of the errata Flags corresponding to erratas present in a chip are set during chip hal init sequence. JIRA NVGPU-6510 Change-Id: Id5a8fb627222ac0a585aba071af052950f4de965 Signed-off-by: Vedashree Vidwans Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2498095 Reviewed-by: Seema Khowala Reviewed-by: Vaibhav Kachore Reviewed-by: mobile promotions Tested-by: mobile promotions GVS: Gerrit_Virtual_Submit --- arch/nvgpu-common.yaml | 6 +- drivers/gpu/nvgpu/Makefile | 1 + drivers/gpu/nvgpu/Makefile.sources | 1 + drivers/gpu/nvgpu/common/fifo/preempt.c | 14 +- drivers/gpu/nvgpu/common/mm/mm.c | 9 +- drivers/gpu/nvgpu/common/nvlink/nvlink.c | 10 +- .../nvgpu/common/sync/channel_sync_syncpt.c | 11 +- .../nvgpu/common/sync/channel_user_syncpt.c | 9 +- drivers/gpu/nvgpu/common/utils/errata.c | 125 ++++++++++++++++ drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gp10b.c | 29 ++-- drivers/gpu/nvgpu/hal/init/hal_gm20b.c | 5 + drivers/gpu/nvgpu/hal/init/hal_gp10b.c | 5 + drivers/gpu/nvgpu/hal/init/hal_gv11b.c | 5 + drivers/gpu/nvgpu/hal/init/hal_tu104.c | 7 + .../gpu/nvgpu/hal/vgpu/init/vgpu_hal_gv11b.c | 5 + drivers/gpu/nvgpu/include/nvgpu/enabled.h | 1 - 
drivers/gpu/nvgpu/include/nvgpu/errata.h | 137 ++++++++++++++++++ drivers/gpu/nvgpu/include/nvgpu/gk20a.h | 5 + drivers/gpu/nvgpu/os/linux/driver_common.c | 3 +- drivers/gpu/nvgpu/os/linux/module.c | 11 +- drivers/gpu/nvgpu/os/linux/pci.c | 11 +- .../gpu/nvgpu/os/linux/platform_gk20a_tegra.c | 3 +- drivers/gpu/nvgpu/os/linux/sysfs.c | 5 + drivers/gpu/nvgpu/os/linux/vgpu/vgpu_linux.c | 8 + drivers/gpu/nvgpu/os/posix/nvgpu.c | 7 + libs/dgpu/libnvgpu-drv-dgpu_safe.export | 3 + libs/igpu/libnvgpu-drv-igpu_safe.export | 3 + userspace/units/mm/mm/mm.c | 9 +- 28 files changed, 410 insertions(+), 38 deletions(-) create mode 100644 drivers/gpu/nvgpu/common/utils/errata.c create mode 100644 drivers/gpu/nvgpu/include/nvgpu/errata.h diff --git a/arch/nvgpu-common.yaml b/arch/nvgpu-common.yaml index 5d858b9ae..fa8f8163b 100644 --- a/arch/nvgpu-common.yaml +++ b/arch/nvgpu-common.yaml @@ -237,15 +237,17 @@ sim: utils: safe: yes - owner: Terje B + owner: Alex W sources: [ include/nvgpu/utils.h, include/nvgpu/worker.h, include/nvgpu/rbtree.h, include/nvgpu/enabled.h, + include/nvgpu/errata.h, common/utils/string.c, common/utils/worker.c, common/utils/rbtree.c, - common/utils/enabled.c ] + common/utils/enabled.c, + common/utils/errata.c ] ## ## Common elements. 
diff --git a/drivers/gpu/nvgpu/Makefile b/drivers/gpu/nvgpu/Makefile index 6a39b3790..849be93c2 100644 --- a/drivers/gpu/nvgpu/Makefile +++ b/drivers/gpu/nvgpu/Makefile @@ -200,6 +200,7 @@ nvgpu-y += \ nvgpu-y += \ common/device.o \ common/utils/enabled.o \ + common/utils/errata.o \ common/utils/rbtree.o \ common/utils/string.o \ common/utils/worker.o \ diff --git a/drivers/gpu/nvgpu/Makefile.sources b/drivers/gpu/nvgpu/Makefile.sources index de0409a56..57d8f4ac2 100644 --- a/drivers/gpu/nvgpu/Makefile.sources +++ b/drivers/gpu/nvgpu/Makefile.sources @@ -92,6 +92,7 @@ endif srcs += common/device.c \ common/utils/enabled.c \ + common/utils/errata.c \ common/utils/rbtree.c \ common/utils/string.c \ common/utils/worker.c \ diff --git a/drivers/gpu/nvgpu/common/fifo/preempt.c b/drivers/gpu/nvgpu/common/fifo/preempt.c index e6ae3565c..17d40ffaf 100644 --- a/drivers/gpu/nvgpu/common/fifo/preempt.c +++ b/drivers/gpu/nvgpu/common/fifo/preempt.c @@ -22,6 +22,7 @@ #include #include +#include #include #include #include @@ -54,8 +55,11 @@ int nvgpu_fifo_preempt_tsg(struct gk20a *g, struct nvgpu_tsg *tsg) nvgpu_mutex_acquire(&tsg->runlist->runlist_lock); - nvgpu_runlist_set_state(g, BIT32(tsg->runlist->id), - RUNLIST_DISABLED); + if (nvgpu_is_errata_present(g, NVGPU_ERRATA_2016608)) { + nvgpu_runlist_set_state(g, BIT32(tsg->runlist->id), + RUNLIST_DISABLED); + } + #ifdef CONFIG_NVGPU_LS_PMU mutex_ret = nvgpu_pmu_lock_acquire(g, g->pmu, PMU_MUTEX_ID_FIFO, &token); @@ -77,8 +81,10 @@ int nvgpu_fifo_preempt_tsg(struct gk20a *g, struct nvgpu_tsg *tsg) } } #endif - nvgpu_runlist_set_state(g, BIT32(tsg->runlist->id), - RUNLIST_ENABLED); + if (nvgpu_is_errata_present(g, NVGPU_ERRATA_2016608)) { + nvgpu_runlist_set_state(g, BIT32(tsg->runlist->id), + RUNLIST_ENABLED); + } nvgpu_mutex_release(&tsg->runlist->runlist_lock); diff --git a/drivers/gpu/nvgpu/common/mm/mm.c b/drivers/gpu/nvgpu/common/mm/mm.c index 3b01db471..862f63b97 100644 --- a/drivers/gpu/nvgpu/common/mm/mm.c +++ 
b/drivers/gpu/nvgpu/common/mm/mm.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -177,7 +178,7 @@ static void nvgpu_remove_mm_support(struct mm_gk20a *mm) #ifdef CONFIG_NVGPU_DGPU nvgpu_vidmem_destroy(g); - if (g->ops.ramin.deinit_pdb_cache_war != NULL) { + if (nvgpu_is_errata_present(g, NVGPU_ERRATA_INIT_PDB_CACHE)) { g->ops.ramin.deinit_pdb_cache_war(g); } #endif @@ -197,7 +198,7 @@ static int nvgpu_init_system_vm(struct mm_gk20a *mm) * For some reason the maxwell PMU code is dependent on the large page * size. No reason AFAICT for this. Probably a bug somewhere. */ - if (nvgpu_is_enabled(g, NVGPU_MM_FORCE_128K_PMU_VM)) { + if (nvgpu_is_errata_present(g, NVGPU_ERRATA_MM_FORCE_128K_PMU_VM)) { big_page_size = nvgpu_safe_cast_u64_to_u32(SZ_128K); } @@ -587,14 +588,14 @@ static int nvgpu_init_mm_pdb_cache_war(struct gk20a *g) { int err; - if (g->ops.ramin.init_pdb_cache_war != NULL) { + if (nvgpu_is_errata_present(g, NVGPU_ERRATA_INIT_PDB_CACHE)) { err = g->ops.ramin.init_pdb_cache_war(g); if (err != 0) { return err; } } - if (g->ops.fb.apply_pdb_cache_war != NULL) { + if (nvgpu_is_errata_present(g, NVGPU_ERRATA_FB_PDB_CACHE)) { err = g->ops.fb.apply_pdb_cache_war(g); if (err != 0) { return err; diff --git a/drivers/gpu/nvgpu/common/nvlink/nvlink.c b/drivers/gpu/nvgpu/common/nvlink/nvlink.c index 1c18ab88a..67489cbef 100644 --- a/drivers/gpu/nvgpu/common/nvlink/nvlink.c +++ b/drivers/gpu/nvgpu/common/nvlink/nvlink.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved. 
* * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -27,6 +27,7 @@ #include #include #include +#include #ifdef CONFIG_NVGPU_NVLINK @@ -104,7 +105,7 @@ static int nvgpu_nvlink_enable_links_post_top(struct gk20a *g, for_each_set_bit(bit, &enabled_links, NVLINK_MAX_LINKS_SW) { link_id = (u32)bit; - if (g->ops.nvlink.set_sw_war != NULL) { + if (nvgpu_is_errata_present(g, NVGPU_ERRATA_1888034)) { g->ops.nvlink.set_sw_war(g, link_id); } g->ops.nvlink.intr.init_link_err_intr(g, link_id); @@ -264,7 +265,10 @@ int nvgpu_nvlink_early_init(struct gk20a *g) * on the GPU. This is temporary WAR while we get the VBIOS updated with * correct mask. */ - g->ops.nvlink.get_connected_link_mask(&(g->nvlink.connected_links)); + if (nvgpu_is_errata_present(g, NVGPU_ERRATA_VBIOS_NVLINK_MASK)) { + g->ops.nvlink.get_connected_link_mask( + &(g->nvlink.connected_links)); + } nvgpu_log(g, gpu_dbg_nvlink, "connected_links = 0x%08x", g->nvlink.connected_links); diff --git a/drivers/gpu/nvgpu/common/sync/channel_sync_syncpt.c b/drivers/gpu/nvgpu/common/sync/channel_sync_syncpt.c index 924ac1fa9..8a30c228d 100644 --- a/drivers/gpu/nvgpu/common/sync/channel_sync_syncpt.c +++ b/drivers/gpu/nvgpu/common/sync/channel_sync_syncpt.c @@ -1,7 +1,7 @@ /* * GK20A Channel Synchronization Abstraction * - * Copyright (c) 2014-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2014-2021, NVIDIA CORPORATION. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -28,6 +28,7 @@ #include #include +#include #include #include #include @@ -378,8 +379,12 @@ nvgpu_channel_sync_syncpt_create(struct nvgpu_channel *c) * Once nvhost update the return value as NVGPU_INVALID_SYNCPT_ID, * we can remove the zero check. 
*/ - if ((sp->id == 0U) || - (sp->id == NVGPU_INVALID_SYNCPT_ID)) { + if ((nvgpu_is_errata_present(c->g, NVGPU_ERRATA_SYNCPT_INVALID_ID_0)) && + (sp->id == 0U)) { + nvgpu_err(c->g, "failed to get free syncpt"); + goto err_free; + } + if (sp->id == NVGPU_INVALID_SYNCPT_ID) { nvgpu_err(c->g, "failed to get free syncpt"); goto err_free; } diff --git a/drivers/gpu/nvgpu/common/sync/channel_user_syncpt.c b/drivers/gpu/nvgpu/common/sync/channel_user_syncpt.c index 6382e8883..e9088795b 100644 --- a/drivers/gpu/nvgpu/common/sync/channel_user_syncpt.c +++ b/drivers/gpu/nvgpu/common/sync/channel_user_syncpt.c @@ -27,6 +27,7 @@ #include #include #include +#include #include "channel_user_syncpt_priv.h" static int user_sync_build_debug_name(struct nvgpu_channel *ch, @@ -93,8 +94,12 @@ nvgpu_channel_user_syncpt_create(struct nvgpu_channel *ch) * Once nvhost update the return value as NVGPU_INVALID_SYNCPT_ID, * we can remove the zero check. */ - if ((s->syncpt_id == 0U) || - (s->syncpt_id == NVGPU_INVALID_SYNCPT_ID)) { + if ((nvgpu_is_errata_present(g, NVGPU_ERRATA_SYNCPT_INVALID_ID_0)) && + (s->syncpt_id == 0U)) { + nvgpu_err(g, "failed to get free syncpt"); + goto err_free; + } + if (s->syncpt_id == NVGPU_INVALID_SYNCPT_ID) { nvgpu_err(g, "failed to get free syncpt"); goto err_free; } diff --git a/drivers/gpu/nvgpu/common/utils/errata.c b/drivers/gpu/nvgpu/common/utils/errata.c new file mode 100644 index 000000000..8a6182405 --- /dev/null +++ b/drivers/gpu/nvgpu/common/utils/errata.c @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include + +/** + * Array of flag names + */ +#define DEFINE_ERRATA(flag, chip, desc) [flag] = nvgpu_stringify(flag) +static const char *errata_flag_names[NVGPU_MAX_ERRATA_BITS + 1U] = { + ERRATA_FLAGS_NEXT + ERRATA_FLAGS +}; +#undef DEFINE_ERRATA + +/** + * Array of flag descriptions + */ +#define DEFINE_ERRATA(flag, chip, desc) [flag] = desc +static const char *errata_flag_desc[NVGPU_MAX_ERRATA_BITS + 1U] = { + ERRATA_FLAGS_NEXT + ERRATA_FLAGS +}; +#undef DEFINE_ERRATA + +/** + * Array of chips where errata was first discovered + */ +#define DEFINE_ERRATA(flag, chip, desc) [flag] = chip +static const char *errata_flag_chip[NVGPU_MAX_ERRATA_BITS + 1U] = { + ERRATA_FLAGS_NEXT + ERRATA_FLAGS +}; +#undef DEFINE_ERRATA + +void nvgpu_print_errata_flags(struct gk20a *g) +{ + u32 i; + + nvgpu_log(g, gpu_dbg_info, "NVGPU Erratas present in chip"); + nvgpu_log(g, gpu_dbg_info, "%-55.55s %-5.5s %s", + "Flag", "Chip", "Description"); + nvgpu_log(g, gpu_dbg_info, "%-55.55s %-5.5s %s", + "----", "-----", "-----------"); + + for (i = 0U; i < U32(NVGPU_MAX_ERRATA_BITS); i++) { + /* Only print erratas present in chip */ + if (nvgpu_is_errata_present(g, i)) { + nvgpu_log(g, gpu_dbg_info, "%-55.55s %-5.5s %s", + errata_flag_names[i], + errata_flag_chip[i], + errata_flag_desc[i]); + } + } +} + +int nvgpu_init_errata_flags(struct gk20a *g) +{ + /* + * Zero all flags initially. Flags that should be set to non-zero states + * can be done so during hal init. + */ + g->errata_flags = nvgpu_kzalloc(g, + BITS_TO_LONGS(U32(NVGPU_MAX_ERRATA_BITS)) * + sizeof(unsigned long)); + if (g->errata_flags == NULL) { + return -ENOMEM; + } + + return 0; +} + +/* + * Call this on driver shutdown! 
+ */ +void nvgpu_free_errata_flags(struct gk20a *g) +{ + nvgpu_kfree(g, g->errata_flags); +} + +bool nvgpu_is_errata_present(struct gk20a *g, u32 flag) +{ + if (flag < NVGPU_MAX_ERRATA_BITS) { + return nvgpu_test_bit(flag, g->errata_flags); + } else { + return 0; + } +} + +void nvgpu_set_errata(struct gk20a *g, u32 flag, bool state) +{ + if (flag >= NVGPU_MAX_ERRATA_BITS) { + return; + } + + if (state) { + nvgpu_set_bit(flag, g->errata_flags); + } else { + nvgpu_clear_bit(flag, g->errata_flags); + } +} diff --git a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gp10b.c b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gp10b.c index 03121309f..25489c009 100644 --- a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gp10b.c +++ b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gp10b.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -173,11 +174,14 @@ int gp10b_gr_intr_handle_sm_exception(struct gk20a *g, nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr, "Single bit error detected in SM LRF!"); - gr_gp10b_sm_lrf_ecc_overcount_war(true, - lrf_ecc_sed_status, - lrf_ecc_ded_status, - &lrf_single_count_delta, - lrf_double_count_delta); + if (nvgpu_is_errata_present(g, + NVGPU_ERRATA_LRF_ECC_OVERCOUNT)) { + gr_gp10b_sm_lrf_ecc_overcount_war(true, + lrf_ecc_sed_status, + lrf_ecc_ded_status, + &lrf_single_count_delta, + lrf_double_count_delta); + } g->ecc.gr.sm_lrf_ecc_single_err_count[gpc][tpc].counter = nvgpu_safe_add_u32( g->ecc.gr.sm_lrf_ecc_single_err_count[gpc][tpc].counter, @@ -187,11 +191,14 @@ int gp10b_gr_intr_handle_sm_exception(struct gk20a *g, nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr, "Double bit error detected in SM LRF!"); - gr_gp10b_sm_lrf_ecc_overcount_war(false, - 
lrf_ecc_sed_status, - lrf_ecc_ded_status, - &lrf_double_count_delta, - lrf_single_count_delta); + if (nvgpu_is_errata_present(g, + NVGPU_ERRATA_LRF_ECC_OVERCOUNT)) { + gr_gp10b_sm_lrf_ecc_overcount_war(false, + lrf_ecc_sed_status, + lrf_ecc_ded_status, + &lrf_double_count_delta, + lrf_single_count_delta); + } g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter = nvgpu_safe_add_u32( g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter, diff --git a/drivers/gpu/nvgpu/hal/init/hal_gm20b.c b/drivers/gpu/nvgpu/hal/init/hal_gm20b.c index 8de0801c6..f55e66ee0 100644 --- a/drivers/gpu/nvgpu/hal/init/hal_gm20b.c +++ b/drivers/gpu/nvgpu/hal/init/hal_gm20b.c @@ -22,6 +22,7 @@ * DEALINGS IN THE SOFTWARE. */ #include +#include #include #include #include @@ -1168,6 +1169,10 @@ int gm20b_init_hal(struct gk20a *g) gops->get_litter_value = gm20b_get_litter_value; gops->semaphore_wakeup = nvgpu_channel_semaphore_wakeup; + nvgpu_set_errata(g, NVGPU_ERRATA_1547668, true); + nvgpu_set_errata(g, NVGPU_ERRATA_MM_FORCE_128K_PMU_VM, true); + nvgpu_set_errata(g, NVGPU_ERRATA_SYNCPT_INVALID_ID_0, true); + nvgpu_set_enabled(g, NVGPU_GR_USE_DMA_FOR_FW_BOOTSTRAP, true); #ifdef CONFIG_NVGPU_FECS_TRACE nvgpu_set_enabled(g, NVGPU_FECS_TRACE_VA, false); diff --git a/drivers/gpu/nvgpu/hal/init/hal_gp10b.c b/drivers/gpu/nvgpu/hal/init/hal_gp10b.c index 8601f702c..05034f4ec 100644 --- a/drivers/gpu/nvgpu/hal/init/hal_gp10b.c +++ b/drivers/gpu/nvgpu/hal/init/hal_gp10b.c @@ -22,6 +22,7 @@ * DEALINGS IN THE SOFTWARE. 
*/ #include +#include #include #include #include @@ -1253,6 +1254,10 @@ int gp10b_init_hal(struct gk20a *g) gops->get_litter_value = gp10b_get_litter_value; gops->semaphore_wakeup = nvgpu_channel_semaphore_wakeup; + nvgpu_set_errata(g, NVGPU_ERRATA_LRF_ECC_OVERCOUNT, true); + nvgpu_set_errata(g, NVGPU_ERRATA_200391931, true); + nvgpu_set_errata(g, NVGPU_ERRATA_SYNCPT_INVALID_ID_0, true); + nvgpu_set_enabled(g, NVGPU_GR_USE_DMA_FOR_FW_BOOTSTRAP, true); #ifdef CONFIG_NVGPU_FECS_TRACE nvgpu_set_enabled(g, NVGPU_FECS_TRACE_VA, false); diff --git a/drivers/gpu/nvgpu/hal/init/hal_gv11b.c b/drivers/gpu/nvgpu/hal/init/hal_gv11b.c index 8d716f70e..3ee57cafa 100644 --- a/drivers/gpu/nvgpu/hal/init/hal_gv11b.c +++ b/drivers/gpu/nvgpu/hal/init/hal_gv11b.c @@ -22,6 +22,7 @@ * DEALINGS IN THE SOFTWARE. */ #include +#include #include #include #include @@ -1519,6 +1520,10 @@ int gv11b_init_hal(struct gk20a *g) gops->get_litter_value = gv11b_get_litter_value; gops->semaphore_wakeup = nvgpu_channel_semaphore_wakeup; + nvgpu_set_errata(g, NVGPU_ERRATA_2016608, true); + nvgpu_set_errata(g, NVGPU_ERRATA_200391931, true); + nvgpu_set_errata(g, NVGPU_ERRATA_SYNCPT_INVALID_ID_0, true); + nvgpu_set_enabled(g, NVGPU_GR_USE_DMA_FOR_FW_BOOTSTRAP, false); /* Read fuses to check if gpu needs to boot in secure/non-secure mode */ diff --git a/drivers/gpu/nvgpu/hal/init/hal_tu104.c b/drivers/gpu/nvgpu/hal/init/hal_tu104.c index e9c7e8c76..1a053f1ef 100644 --- a/drivers/gpu/nvgpu/hal/init/hal_tu104.c +++ b/drivers/gpu/nvgpu/hal/init/hal_tu104.c @@ -22,6 +22,7 @@ * DEALINGS IN THE SOFTWARE. 
*/ #include +#include #include "hal/mm/mm_gm20b.h" #include "hal/mm/mm_gp10b.h" @@ -1723,6 +1724,12 @@ int tu104_init_hal(struct gk20a *g) gops->get_litter_value = tu104_get_litter_value; gops->semaphore_wakeup = nvgpu_channel_semaphore_wakeup; + nvgpu_set_errata(g, NVGPU_ERRATA_INIT_PDB_CACHE, true); + nvgpu_set_errata(g, NVGPU_ERRATA_FB_PDB_CACHE, true); + nvgpu_set_errata(g, NVGPU_ERRATA_VBIOS_NVLINK_MASK, true); + nvgpu_set_errata(g, NVGPU_ERRATA_200391931, true); + nvgpu_set_errata(g, NVGPU_ERRATA_SYNCPT_INVALID_ID_0, true); + nvgpu_set_enabled(g, NVGPU_SEC_PRIVSECURITY, true); nvgpu_set_enabled(g, NVGPU_SEC_SECUREGPCCS, true); nvgpu_set_enabled(g, NVGPU_SUPPORT_MULTIPLE_WPR, true); diff --git a/drivers/gpu/nvgpu/hal/vgpu/init/vgpu_hal_gv11b.c b/drivers/gpu/nvgpu/hal/vgpu/init/vgpu_hal_gv11b.c index 007eff742..5bd3bf6dd 100644 --- a/drivers/gpu/nvgpu/hal/vgpu/init/vgpu_hal_gv11b.c +++ b/drivers/gpu/nvgpu/hal/vgpu/init/vgpu_hal_gv11b.c @@ -92,6 +92,7 @@ #include "common/clk_arb/clk_arb_gp10b.h" #include +#include #include #include #include @@ -1063,6 +1064,10 @@ int vgpu_gv11b_init_hal(struct gk20a *g) gops->top = vgpu_gv11b_ops_top; gops->grmgr = vgpu_gv11b_ops_grmgr; + nvgpu_set_errata(g, NVGPU_ERRATA_2016608, true); + nvgpu_set_errata(g, NVGPU_ERRATA_200391931, true); + nvgpu_set_errata(g, NVGPU_ERRATA_SYNCPT_INVALID_ID_0, true); + #ifdef CONFIG_NVGPU_FECS_TRACE nvgpu_set_enabled(g, NVGPU_SUPPORT_FECS_CTXSW_TRACE, true); #endif diff --git a/drivers/gpu/nvgpu/include/nvgpu/enabled.h b/drivers/gpu/nvgpu/include/nvgpu/enabled.h index 6280d7d63..cc3001647 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/enabled.h +++ b/drivers/gpu/nvgpu/include/nvgpu/enabled.h @@ -86,7 +86,6 @@ struct gk20a; "Use coherent aperture for sysmem"), \ DEFINE_FLAG(NVGPU_MM_USE_PHYSICAL_SG, \ "Use physical scatter tables instead of IOMMU"), \ - DEFINE_FLAG(NVGPU_MM_FORCE_128K_PMU_VM, "WAR for gm20b chips"), \ DEFINE_FLAG(NVGPU_MM_BYPASSES_IOMMU, \ "Some chips (using nvlink) bypass the 
IOMMU on tegra"), \ /* Host Flags */ \ diff --git a/drivers/gpu/nvgpu/include/nvgpu/errata.h b/drivers/gpu/nvgpu/include/nvgpu/errata.h new file mode 100644 index 000000000..056598c14 --- /dev/null +++ b/drivers/gpu/nvgpu/include/nvgpu/errata.h @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#ifndef NVGPU_ERRATA_H +#define NVGPU_ERRATA_H + +struct gk20a; + +#include + +/** + * @defgroup errata + * @ingroup unit-common-utils + * @{ + */ + +/** @cond DOXYGEN_SHOULD_SKIP_THIS */ +#if defined(CONFIG_NVGPU_NON_FUSA) && defined(CONFIG_NVGPU_NEXT) +#include "include/nvgpu/nvgpu_next_errata.h" +#else +#define ERRATA_FLAGS_NEXT +#endif +/** @endcond DOXYGEN_SHOULD_SKIP_THIS */ + +/* + * Available flags that describe an errata with details about where the issues + * were first discovered. Each flag here is defined by its offset + * in a bitmap. 
+ */ + +#define ERRATA_FLAGS \ + /* GM20B */ \ + DEFINE_ERRATA(NVGPU_ERRATA_MM_FORCE_128K_PMU_VM, "GM20B", "MM"),\ + DEFINE_ERRATA(NVGPU_ERRATA_1547668, "GM20B", "CG"), \ + /* GP10B */ \ + DEFINE_ERRATA(NVGPU_ERRATA_LRF_ECC_OVERCOUNT, "GP10B", "GR ECC"), \ + DEFINE_ERRATA(NVGPU_ERRATA_200391931, "GP10B", "GR Perf"), \ + /* GV11B */ \ + DEFINE_ERRATA(NVGPU_ERRATA_2016608, "GV11B", "FIFO Runlist preempt"), \ + /* GV100 */ \ + DEFINE_ERRATA(NVGPU_ERRATA_1888034, "GV100", "Nvlink"), \ + /* TU104 */ \ + DEFINE_ERRATA(NVGPU_ERRATA_INIT_PDB_CACHE, "TU104", "MM PDB"), \ + DEFINE_ERRATA(NVGPU_ERRATA_FB_PDB_CACHE, "TU104", "FB PDB"), \ + DEFINE_ERRATA(NVGPU_ERRATA_VBIOS_NVLINK_MASK, "TU104", "Nvlink VBIOS"),\ + /* NvGPU Driver */ \ + DEFINE_ERRATA(NVGPU_ERRATA_SYNCPT_INVALID_ID_0, "SW", "Syncpt ID"),\ + DEFINE_ERRATA(NVGPU_MAX_ERRATA_BITS, "NA", "Marks max number of flags"), + +/** + * Enumerated array of flags + */ +#define DEFINE_ERRATA(flag, chip, desc) flag +enum enum_errata_flags { + ERRATA_FLAGS_NEXT + ERRATA_FLAGS +}; +#undef DEFINE_ERRATA + +/** + * @brief Check if the passed flag is enabled. + * + * @param g [in] The GPU. + * @param flag [in] Which flag to check. + * + * @return Boolean value to indicate the status of the bit. + * + * @retval TRUE if given errata is present. + * @retval FALSE if given errata is absent. + */ +bool nvgpu_is_errata_present(struct gk20a *g, u32 flag); + +/** + * @brief Initialize and allocate memory for errata flags. + * + * @param g [in] The GPU pointer. + * + * @return 0 for success, < 0 for error. + * + * @retval -ENOMEM if fails to allocate the necessary memory. + */ +int nvgpu_init_errata_flags(struct gk20a *g); + +/** + * @brief Free errata flags memory. Called during driver exit. + * + * @param g [in] The GPU pointer. + */ +void nvgpu_free_errata_flags(struct gk20a *g); + +/** + * @brief Print errata flags value. + * + * @param g [in] The GPU pointer. 
+ */ +void nvgpu_print_errata_flags(struct gk20a *g); + +/** + * @brief Set state of an errata flag. + * + * @param g [in] The GPU. + * @param flag [in] Flag index. + * @param state [in] The state to set the \a flag to. + * + * Set state of the given \a flag index to \a state. + * + * This is generally a somewhat low level operation with lots of potential + * side effects. Be wary about where and when you use this. Typically a bunch + * of calls to this early in the driver boot sequence makes sense (as + * information is determined about the GPU at run time). Calling this in steady + * state operation is probably an incorrect thing to do. + */ +void nvgpu_set_errata(struct gk20a *g, u32 flag, bool state); + +/** + * @} + */ +#endif /* NVGPU_ERRATA_H */ diff --git a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h index 489ae1c3f..28b742b95 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h +++ b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h @@ -357,6 +357,11 @@ struct gk20a { */ struct nvgpu_nvhost_dev *nvhost; + /** + * Used by . Do not access directly! + */ + unsigned long *errata_flags; + /** * Used by . Do not access directly! 
*/ diff --git a/drivers/gpu/nvgpu/os/linux/driver_common.c b/drivers/gpu/nvgpu/os/linux/driver_common.c index 22ac511c6..682f8425a 100644 --- a/drivers/gpu/nvgpu/os/linux/driver_common.c +++ b/drivers/gpu/nvgpu/os/linux/driver_common.c @@ -15,6 +15,7 @@ */ #include +#include #include #include #include @@ -256,7 +257,7 @@ static void nvgpu_init_mm_vars(struct gk20a *g) platform->unified_memory); nvgpu_set_enabled(g, NVGPU_MM_UNIFY_ADDRESS_SPACES, platform->unify_address_spaces); - nvgpu_set_enabled(g, NVGPU_MM_FORCE_128K_PMU_VM, + nvgpu_set_errata(g, NVGPU_ERRATA_MM_FORCE_128K_PMU_VM, platform->force_128K_pmu_vm); nvgpu_mutex_init(&g->mm.tlb_lock); diff --git a/drivers/gpu/nvgpu/os/linux/module.c b/drivers/gpu/nvgpu/os/linux/module.c index a68736b47..de6fa7ca0 100644 --- a/drivers/gpu/nvgpu/os/linux/module.c +++ b/drivers/gpu/nvgpu/os/linux/module.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include #include @@ -1009,6 +1010,7 @@ void gk20a_remove_support(struct gk20a *g) nvgpu_remove_usermode_support(g); nvgpu_free_enabled_flags(g); + nvgpu_free_errata_flags(g); gk20a_lockout_registers(g); } @@ -1616,9 +1618,13 @@ static int gk20a_probe(struct platform_device *dev) nvgpu_kmem_init(gk20a); + err = nvgpu_init_errata_flags(gk20a); + if (err) + goto return_err_platform; + err = nvgpu_init_enabled_flags(gk20a); if (err) - goto return_err; + goto return_err_errata; np = nvgpu_get_node(gk20a); if (of_dma_is_coherent(np)) { @@ -1730,6 +1736,9 @@ static int gk20a_probe(struct platform_device *dev) return_err: nvgpu_free_enabled_flags(gk20a); +return_err_errata: + nvgpu_free_errata_flags(gk20a); +return_err_platform: /* * Last since the above allocs may use data structures in here. 
diff --git a/drivers/gpu/nvgpu/os/linux/pci.c b/drivers/gpu/nvgpu/os/linux/pci.c index 6484dfbc8..343889b59 100644 --- a/drivers/gpu/nvgpu/os/linux/pci.c +++ b/drivers/gpu/nvgpu/os/linux/pci.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -542,10 +543,15 @@ static int nvgpu_pci_probe(struct pci_dev *pdev, pci_set_drvdata(pdev, platform); - err = nvgpu_init_enabled_flags(g); + err = nvgpu_init_errata_flags(g); if (err) goto err_free_platform; + err = nvgpu_init_enabled_flags(g); + if (err) { + goto err_free_errata; + } + platform->g = g; l->dev = &pdev->dev; @@ -690,6 +696,9 @@ err_disable_msi: if (g->msi_enabled) pci_disable_msi(pdev); #endif + nvgpu_free_enabled_flags(g); +err_free_errata: + nvgpu_free_errata_flags(g); err_free_platform: nvgpu_kfree(g, platform); err_free_l: diff --git a/drivers/gpu/nvgpu/os/linux/platform_gk20a_tegra.c b/drivers/gpu/nvgpu/os/linux/platform_gk20a_tegra.c index 67b9804a8..62754c191 100644 --- a/drivers/gpu/nvgpu/os/linux/platform_gk20a_tegra.c +++ b/drivers/gpu/nvgpu/os/linux/platform_gk20a_tegra.c @@ -61,6 +61,7 @@ #include #include #include +#include #include "hal/clk/clk_gm20b.h" @@ -864,7 +865,7 @@ static int gk20a_tegra_probe(struct device *dev) } platform->g->clk.gpc_pll.id = GK20A_GPC_PLL; - if (platform->platform_chip_id == TEGRA_210) { + if (nvgpu_is_errata_present(g, NVGPU_ERRATA_1547668)) { /* WAR for bug 1547668: Disable railgating and scaling irrespective of platform data if the rework was not made. 
*/ np = of_find_node_by_path("/gpu-dvfs-rework"); diff --git a/drivers/gpu/nvgpu/os/linux/sysfs.c b/drivers/gpu/nvgpu/os/linux/sysfs.c index 404a083bb..4157f10f9 100644 --- a/drivers/gpu/nvgpu/os/linux/sysfs.c +++ b/drivers/gpu/nvgpu/os/linux/sysfs.c @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -492,6 +493,10 @@ static ssize_t ldiv_slowdown_factor_store(struct device *dev, unsigned long val = 0; int err; + if (!nvgpu_is_errata_present(g, NVGPU_ERRATA_200391931)) { + return 0; + } + if (kstrtoul(buf, 10, &val) < 0) { nvgpu_err(g, "parse error for input SLOWDOWN factor\n"); return -EINVAL; diff --git a/drivers/gpu/nvgpu/os/linux/vgpu/vgpu_linux.c b/drivers/gpu/nvgpu/os/linux/vgpu/vgpu_linux.c index c154c5886..e5aeefbf3 100644 --- a/drivers/gpu/nvgpu/os/linux/vgpu/vgpu_linux.c +++ b/drivers/gpu/nvgpu/os/linux/vgpu/vgpu_linux.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -341,8 +342,15 @@ int vgpu_probe(struct platform_device *pdev) nvgpu_kmem_init(gk20a); + err = nvgpu_init_errata_flags(gk20a); + if (err) { + kfree(gk20a); + return err; + } + err = nvgpu_init_enabled_flags(gk20a); if (err) { + nvgpu_free_errata_flags(gk20a); kfree(gk20a); return err; } diff --git a/drivers/gpu/nvgpu/os/posix/nvgpu.c b/drivers/gpu/nvgpu/os/posix/nvgpu.c index c16d027ea..72b5a21d9 100644 --- a/drivers/gpu/nvgpu/os/posix/nvgpu.c +++ b/drivers/gpu/nvgpu/os/posix/nvgpu.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -267,6 +268,10 @@ struct gk20a *nvgpu_posix_probe(void) goto fail_kmem; } + if (nvgpu_init_errata_flags(g) != 0) { + goto fail_errata_flags; + } + if (nvgpu_init_enabled_flags(g) != 0) { goto fail_enabled_flags; } @@ -297,6 +302,8 @@ struct gk20a *nvgpu_posix_probe(void) return g; fail_enabled_flags: + nvgpu_free_errata_flags(g); +fail_errata_flags: nvgpu_kmem_fini(g, 0); fail_kmem: free(p); diff --git a/libs/dgpu/libnvgpu-drv-dgpu_safe.export 
b/libs/dgpu/libnvgpu-drv-dgpu_safe.export index 846646e05..7e5f4dde7 100644 --- a/libs/dgpu/libnvgpu-drv-dgpu_safe.export +++ b/libs/dgpu/libnvgpu-drv-dgpu_safe.export @@ -500,6 +500,7 @@ nvgpu_gr_suspend nvgpu_gr_sw_ready nvgpu_has_syncpoints nvgpu_init_enabled_flags +nvgpu_init_errata_flags nvgpu_init_hal nvgpu_init_ltc_support nvgpu_init_mm_support @@ -510,6 +511,7 @@ nvgpu_iommuable nvgpu_free_inst_block nvgpu_inst_block_ptr nvgpu_is_enabled +nvgpu_is_errata_present nvgpu_kcalloc_impl nvgpu_kfree_impl nvgpu_kmalloc_impl @@ -703,6 +705,7 @@ nvgpu_tsg_unbind_channel_check_hw_state nvgpu_tsg_unbind_channel_check_ctx_reload nvgpu_set_bit nvgpu_set_enabled +nvgpu_set_errata nvgpu_set_power_state nvgpu_set_pte nvgpu_sgt_alignment diff --git a/libs/igpu/libnvgpu-drv-igpu_safe.export b/libs/igpu/libnvgpu-drv-igpu_safe.export index fc3e3c38d..12cf3cb10 100644 --- a/libs/igpu/libnvgpu-drv-igpu_safe.export +++ b/libs/igpu/libnvgpu-drv-igpu_safe.export @@ -515,6 +515,7 @@ nvgpu_gr_suspend nvgpu_gr_sw_ready nvgpu_has_syncpoints nvgpu_init_enabled_flags +nvgpu_init_errata_flags nvgpu_init_fb_support nvgpu_init_hal nvgpu_init_ltc_support @@ -526,6 +527,7 @@ nvgpu_iommuable nvgpu_free_inst_block nvgpu_inst_block_ptr nvgpu_is_enabled +nvgpu_is_errata_present nvgpu_kcalloc_impl nvgpu_kfree_impl nvgpu_kmalloc_impl @@ -719,6 +721,7 @@ nvgpu_tsg_unbind_channel_check_hw_state nvgpu_tsg_unbind_channel_check_ctx_reload nvgpu_set_bit nvgpu_set_enabled +nvgpu_set_errata nvgpu_set_power_state nvgpu_set_pte nvgpu_sgt_alignment diff --git a/userspace/units/mm/mm/mm.c b/userspace/units/mm/mm/mm.c index 621a71658..a45272b95 100644 --- a/userspace/units/mm/mm/mm.c +++ b/userspace/units/mm/mm/mm.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. 
* * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -125,7 +126,7 @@ static void init_platform(struct unit_module *m, struct gk20a *g, bool is_iGPU) /* Enable extra features to increase line coverage */ nvgpu_set_enabled(g, NVGPU_SUPPORT_SEC2_VM, true); nvgpu_set_enabled(g, NVGPU_SUPPORT_GSP_VM, true); - nvgpu_set_enabled(g, NVGPU_MM_FORCE_128K_PMU_VM, true); + nvgpu_set_errata(g, NVGPU_ERRATA_MM_FORCE_128K_PMU_VM, true); } /* @@ -289,7 +290,7 @@ int test_nvgpu_init_mm(struct unit_module *m, struct gk20a *g, void *args) */ nvgpu_set_enabled(g, NVGPU_SUPPORT_SEC2_VM, false); nvgpu_set_enabled(g, NVGPU_SUPPORT_GSP_VM, false); - nvgpu_set_enabled(g, NVGPU_MM_FORCE_128K_PMU_VM, false); + nvgpu_set_errata(g, NVGPU_ERRATA_MM_FORCE_128K_PMU_VM, false); g->has_cde = false; errors += nvgpu_init_mm_support_inject_error(m, g, ERROR_TYPE_HAL, 1, @@ -297,7 +298,7 @@ int test_nvgpu_init_mm(struct unit_module *m, struct gk20a *g, void *args) nvgpu_set_enabled(g, NVGPU_SUPPORT_SEC2_VM, true); nvgpu_set_enabled(g, NVGPU_SUPPORT_GSP_VM, true); - nvgpu_set_enabled(g, NVGPU_MM_FORCE_128K_PMU_VM, true); + nvgpu_set_errata(g, NVGPU_ERRATA_MM_FORCE_128K_PMU_VM, true); g->has_cde = true; /*