diff --git a/arch/nvgpu-common.yaml b/arch/nvgpu-common.yaml index 5d858b9ae..fa8f8163b 100644 --- a/arch/nvgpu-common.yaml +++ b/arch/nvgpu-common.yaml @@ -237,15 +237,17 @@ sim: utils: safe: yes - owner: Terje B + owner: Alex W sources: [ include/nvgpu/utils.h, include/nvgpu/worker.h, include/nvgpu/rbtree.h, include/nvgpu/enabled.h, + include/nvgpu/errata.h, common/utils/string.c, common/utils/worker.c, common/utils/rbtree.c, - common/utils/enabled.c ] + common/utils/enabled.c, + common/utils/errata.c ] ## ## Common elements. diff --git a/drivers/gpu/nvgpu/Makefile b/drivers/gpu/nvgpu/Makefile index 6a39b3790..849be93c2 100644 --- a/drivers/gpu/nvgpu/Makefile +++ b/drivers/gpu/nvgpu/Makefile @@ -200,6 +200,7 @@ nvgpu-y += \ nvgpu-y += \ common/device.o \ common/utils/enabled.o \ + common/utils/errata.o \ common/utils/rbtree.o \ common/utils/string.o \ common/utils/worker.o \ diff --git a/drivers/gpu/nvgpu/Makefile.sources b/drivers/gpu/nvgpu/Makefile.sources index de0409a56..57d8f4ac2 100644 --- a/drivers/gpu/nvgpu/Makefile.sources +++ b/drivers/gpu/nvgpu/Makefile.sources @@ -92,6 +92,7 @@ endif srcs += common/device.c \ common/utils/enabled.c \ + common/utils/errata.c \ common/utils/rbtree.c \ common/utils/string.c \ common/utils/worker.c \ diff --git a/drivers/gpu/nvgpu/common/fifo/preempt.c b/drivers/gpu/nvgpu/common/fifo/preempt.c index e6ae3565c..17d40ffaf 100644 --- a/drivers/gpu/nvgpu/common/fifo/preempt.c +++ b/drivers/gpu/nvgpu/common/fifo/preempt.c @@ -22,6 +22,7 @@ #include #include +#include #include #include #include @@ -54,8 +55,11 @@ int nvgpu_fifo_preempt_tsg(struct gk20a *g, struct nvgpu_tsg *tsg) nvgpu_mutex_acquire(&tsg->runlist->runlist_lock); - nvgpu_runlist_set_state(g, BIT32(tsg->runlist->id), - RUNLIST_DISABLED); + if (nvgpu_is_errata_present(g, NVGPU_ERRATA_2016608)) { + nvgpu_runlist_set_state(g, BIT32(tsg->runlist->id), + RUNLIST_DISABLED); + } + #ifdef CONFIG_NVGPU_LS_PMU mutex_ret = nvgpu_pmu_lock_acquire(g, g->pmu, PMU_MUTEX_ID_FIFO, 
&token); @@ -77,8 +81,10 @@ int nvgpu_fifo_preempt_tsg(struct gk20a *g, struct nvgpu_tsg *tsg) } } #endif - nvgpu_runlist_set_state(g, BIT32(tsg->runlist->id), - RUNLIST_ENABLED); + if (nvgpu_is_errata_present(g, NVGPU_ERRATA_2016608)) { + nvgpu_runlist_set_state(g, BIT32(tsg->runlist->id), + RUNLIST_ENABLED); + } nvgpu_mutex_release(&tsg->runlist->runlist_lock); diff --git a/drivers/gpu/nvgpu/common/mm/mm.c b/drivers/gpu/nvgpu/common/mm/mm.c index 3b01db471..862f63b97 100644 --- a/drivers/gpu/nvgpu/common/mm/mm.c +++ b/drivers/gpu/nvgpu/common/mm/mm.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -177,7 +178,7 @@ static void nvgpu_remove_mm_support(struct mm_gk20a *mm) #ifdef CONFIG_NVGPU_DGPU nvgpu_vidmem_destroy(g); - if (g->ops.ramin.deinit_pdb_cache_war != NULL) { + if (nvgpu_is_errata_present(g, NVGPU_ERRATA_INIT_PDB_CACHE)) { g->ops.ramin.deinit_pdb_cache_war(g); } #endif @@ -197,7 +198,7 @@ static int nvgpu_init_system_vm(struct mm_gk20a *mm) * For some reason the maxwell PMU code is dependent on the large page * size. No reason AFAICT for this. Probably a bug somewhere. 
*/ - if (nvgpu_is_enabled(g, NVGPU_MM_FORCE_128K_PMU_VM)) { + if (nvgpu_is_errata_present(g, NVGPU_ERRATA_MM_FORCE_128K_PMU_VM)) { big_page_size = nvgpu_safe_cast_u64_to_u32(SZ_128K); } @@ -587,14 +588,14 @@ static int nvgpu_init_mm_pdb_cache_war(struct gk20a *g) { int err; - if (g->ops.ramin.init_pdb_cache_war != NULL) { + if (nvgpu_is_errata_present(g, NVGPU_ERRATA_INIT_PDB_CACHE)) { err = g->ops.ramin.init_pdb_cache_war(g); if (err != 0) { return err; } } - if (g->ops.fb.apply_pdb_cache_war != NULL) { + if (nvgpu_is_errata_present(g, NVGPU_ERRATA_FB_PDB_CACHE)) { err = g->ops.fb.apply_pdb_cache_war(g); if (err != 0) { return err; diff --git a/drivers/gpu/nvgpu/common/nvlink/nvlink.c b/drivers/gpu/nvgpu/common/nvlink/nvlink.c index 1c18ab88a..67489cbef 100644 --- a/drivers/gpu/nvgpu/common/nvlink/nvlink.c +++ b/drivers/gpu/nvgpu/common/nvlink/nvlink.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -27,6 +27,7 @@ #include #include #include +#include #ifdef CONFIG_NVGPU_NVLINK @@ -104,7 +105,7 @@ static int nvgpu_nvlink_enable_links_post_top(struct gk20a *g, for_each_set_bit(bit, &enabled_links, NVLINK_MAX_LINKS_SW) { link_id = (u32)bit; - if (g->ops.nvlink.set_sw_war != NULL) { + if (nvgpu_is_errata_present(g, NVGPU_ERRATA_1888034)) { g->ops.nvlink.set_sw_war(g, link_id); } g->ops.nvlink.intr.init_link_err_intr(g, link_id); @@ -264,7 +265,10 @@ int nvgpu_nvlink_early_init(struct gk20a *g) * on the GPU. This is temporary WAR while we get the VBIOS updated with * correct mask. 
*/ - g->ops.nvlink.get_connected_link_mask(&(g->nvlink.connected_links)); + if (nvgpu_is_errata_present(g, NVGPU_ERRATA_VBIOS_NVLINK_MASK)) { + g->ops.nvlink.get_connected_link_mask( + &(g->nvlink.connected_links)); + } nvgpu_log(g, gpu_dbg_nvlink, "connected_links = 0x%08x", g->nvlink.connected_links); diff --git a/drivers/gpu/nvgpu/common/sync/channel_sync_syncpt.c b/drivers/gpu/nvgpu/common/sync/channel_sync_syncpt.c index 924ac1fa9..8a30c228d 100644 --- a/drivers/gpu/nvgpu/common/sync/channel_sync_syncpt.c +++ b/drivers/gpu/nvgpu/common/sync/channel_sync_syncpt.c @@ -1,7 +1,7 @@ /* * GK20A Channel Synchronization Abstraction * - * Copyright (c) 2014-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2014-2021, NVIDIA CORPORATION. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -28,6 +28,7 @@ #include #include +#include #include #include #include @@ -378,8 +379,12 @@ nvgpu_channel_sync_syncpt_create(struct nvgpu_channel *c) * Once nvhost update the return value as NVGPU_INVALID_SYNCPT_ID, * we can remove the zero check. 
*/ - if ((sp->id == 0U) || - (sp->id == NVGPU_INVALID_SYNCPT_ID)) { + if ((nvgpu_is_errata_present(c->g, NVGPU_ERRATA_SYNCPT_INVALID_ID_0)) && + (sp->id == 0U)) { + nvgpu_err(c->g, "failed to get free syncpt"); + goto err_free; + } + if (sp->id == NVGPU_INVALID_SYNCPT_ID) { nvgpu_err(c->g, "failed to get free syncpt"); goto err_free; } diff --git a/drivers/gpu/nvgpu/common/sync/channel_user_syncpt.c b/drivers/gpu/nvgpu/common/sync/channel_user_syncpt.c index 6382e8883..e9088795b 100644 --- a/drivers/gpu/nvgpu/common/sync/channel_user_syncpt.c +++ b/drivers/gpu/nvgpu/common/sync/channel_user_syncpt.c @@ -27,6 +27,7 @@ #include #include #include +#include #include "channel_user_syncpt_priv.h" static int user_sync_build_debug_name(struct nvgpu_channel *ch, @@ -93,8 +94,12 @@ nvgpu_channel_user_syncpt_create(struct nvgpu_channel *ch) * Once nvhost update the return value as NVGPU_INVALID_SYNCPT_ID, * we can remove the zero check. */ - if ((s->syncpt_id == 0U) || - (s->syncpt_id == NVGPU_INVALID_SYNCPT_ID)) { + if ((nvgpu_is_errata_present(g, NVGPU_ERRATA_SYNCPT_INVALID_ID_0)) && + (s->syncpt_id == 0U)) { + nvgpu_err(g, "failed to get free syncpt"); + goto err_free; + } + if (s->syncpt_id == NVGPU_INVALID_SYNCPT_ID) { nvgpu_err(g, "failed to get free syncpt"); goto err_free; } diff --git a/drivers/gpu/nvgpu/common/utils/errata.c b/drivers/gpu/nvgpu/common/utils/errata.c new file mode 100644 index 000000000..8a6182405 --- /dev/null +++ b/drivers/gpu/nvgpu/common/utils/errata.c @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include + +/** + * Array of flag names + */ +#define DEFINE_ERRATA(flag, chip, desc) [flag] = nvgpu_stringify(flag) +static const char *errata_flag_names[NVGPU_MAX_ERRATA_BITS + 1U] = { + ERRATA_FLAGS_NEXT + ERRATA_FLAGS +}; +#undef DEFINE_ERRATA + +/** + * Array of flag descriptions + */ +#define DEFINE_ERRATA(flag, chip, desc) [flag] = desc +static const char *errata_flag_desc[NVGPU_MAX_ERRATA_BITS + 1U] = { + ERRATA_FLAGS_NEXT + ERRATA_FLAGS +}; +#undef DEFINE_ERRATA + +/** + * Array of chips where errata was first discovered + */ +#define DEFINE_ERRATA(flag, chip, desc) [flag] = chip +static const char *errata_flag_chip[NVGPU_MAX_ERRATA_BITS + 1U] = { + ERRATA_FLAGS_NEXT + ERRATA_FLAGS +}; +#undef DEFINE_ERRATA + +void nvgpu_print_errata_flags(struct gk20a *g) +{ + u32 i; + + nvgpu_log(g, gpu_dbg_info, "NVGPU Erratas present in chip"); + nvgpu_log(g, gpu_dbg_info, "%-55.55s %-5.5s %s", + "Flag", "Chip", "Description"); + nvgpu_log(g, gpu_dbg_info, "%-55.55s %-5.5s %s", + "----", "-----", "-----------"); + + for (i = 0U; i < U32(NVGPU_MAX_ERRATA_BITS); i++) { + /* Only print erratas present in chip */ + if (nvgpu_is_errata_present(g, i)) { + nvgpu_log(g, gpu_dbg_info, "%-55.55s %-5.5s %s", + errata_flag_names[i], + errata_flag_chip[i], + errata_flag_desc[i]); + } + } +} + +int nvgpu_init_errata_flags(struct gk20a *g) +{ + /* + * Zero all flags initially. Flags that should be set to non-zero states + * can be done so during hal init. + */ + g->errata_flags = nvgpu_kzalloc(g, + BITS_TO_LONGS(U32(NVGPU_MAX_ERRATA_BITS)) * + sizeof(unsigned long)); + if (g->errata_flags == NULL) { + return -ENOMEM; + } + + return 0; +} + +/* + * Call this on driver shutdown! 
+ */ +void nvgpu_free_errata_flags(struct gk20a *g) +{ + nvgpu_kfree(g, g->errata_flags); +} + +bool nvgpu_is_errata_present(struct gk20a *g, u32 flag) +{ + if (flag < NVGPU_MAX_ERRATA_BITS) { + return nvgpu_test_bit(flag, g->errata_flags); + } else { + return 0; + } +} + +void nvgpu_set_errata(struct gk20a *g, u32 flag, bool state) +{ + if (flag >= NVGPU_MAX_ERRATA_BITS) { + return; + } + + if (state) { + nvgpu_set_bit(flag, g->errata_flags); + } else { + nvgpu_clear_bit(flag, g->errata_flags); + } +} diff --git a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gp10b.c b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gp10b.c index 03121309f..25489c009 100644 --- a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gp10b.c +++ b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gp10b.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -173,11 +174,14 @@ int gp10b_gr_intr_handle_sm_exception(struct gk20a *g, nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr, "Single bit error detected in SM LRF!"); - gr_gp10b_sm_lrf_ecc_overcount_war(true, - lrf_ecc_sed_status, - lrf_ecc_ded_status, - &lrf_single_count_delta, - lrf_double_count_delta); + if (nvgpu_is_errata_present(g, + NVGPU_ERRATA_LRF_ECC_OVERCOUNT)) { + gr_gp10b_sm_lrf_ecc_overcount_war(true, + lrf_ecc_sed_status, + lrf_ecc_ded_status, + &lrf_single_count_delta, + lrf_double_count_delta); + } g->ecc.gr.sm_lrf_ecc_single_err_count[gpc][tpc].counter = nvgpu_safe_add_u32( g->ecc.gr.sm_lrf_ecc_single_err_count[gpc][tpc].counter, @@ -187,11 +191,14 @@ int gp10b_gr_intr_handle_sm_exception(struct gk20a *g, nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr, "Double bit error detected in SM LRF!"); - gr_gp10b_sm_lrf_ecc_overcount_war(false, - 
lrf_ecc_sed_status, - lrf_ecc_ded_status, - &lrf_double_count_delta, - lrf_single_count_delta); + if (nvgpu_is_errata_present(g, + NVGPU_ERRATA_LRF_ECC_OVERCOUNT)) { + gr_gp10b_sm_lrf_ecc_overcount_war(false, + lrf_ecc_sed_status, + lrf_ecc_ded_status, + &lrf_double_count_delta, + lrf_single_count_delta); + } g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter = nvgpu_safe_add_u32( g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter, diff --git a/drivers/gpu/nvgpu/hal/init/hal_gm20b.c b/drivers/gpu/nvgpu/hal/init/hal_gm20b.c index 8de0801c6..f55e66ee0 100644 --- a/drivers/gpu/nvgpu/hal/init/hal_gm20b.c +++ b/drivers/gpu/nvgpu/hal/init/hal_gm20b.c @@ -22,6 +22,7 @@ * DEALINGS IN THE SOFTWARE. */ #include +#include #include #include #include @@ -1168,6 +1169,10 @@ int gm20b_init_hal(struct gk20a *g) gops->get_litter_value = gm20b_get_litter_value; gops->semaphore_wakeup = nvgpu_channel_semaphore_wakeup; + nvgpu_set_errata(g, NVGPU_ERRATA_1547668, true); + nvgpu_set_errata(g, NVGPU_ERRATA_MM_FORCE_128K_PMU_VM, true); + nvgpu_set_errata(g, NVGPU_ERRATA_SYNCPT_INVALID_ID_0, true); + nvgpu_set_enabled(g, NVGPU_GR_USE_DMA_FOR_FW_BOOTSTRAP, true); #ifdef CONFIG_NVGPU_FECS_TRACE nvgpu_set_enabled(g, NVGPU_FECS_TRACE_VA, false); diff --git a/drivers/gpu/nvgpu/hal/init/hal_gp10b.c b/drivers/gpu/nvgpu/hal/init/hal_gp10b.c index 8601f702c..05034f4ec 100644 --- a/drivers/gpu/nvgpu/hal/init/hal_gp10b.c +++ b/drivers/gpu/nvgpu/hal/init/hal_gp10b.c @@ -22,6 +22,7 @@ * DEALINGS IN THE SOFTWARE. 
*/ #include +#include #include #include #include @@ -1253,6 +1254,10 @@ int gp10b_init_hal(struct gk20a *g) gops->get_litter_value = gp10b_get_litter_value; gops->semaphore_wakeup = nvgpu_channel_semaphore_wakeup; + nvgpu_set_errata(g, NVGPU_ERRATA_LRF_ECC_OVERCOUNT, true); + nvgpu_set_errata(g, NVGPU_ERRATA_200391931, true); + nvgpu_set_errata(g, NVGPU_ERRATA_SYNCPT_INVALID_ID_0, true); + nvgpu_set_enabled(g, NVGPU_GR_USE_DMA_FOR_FW_BOOTSTRAP, true); #ifdef CONFIG_NVGPU_FECS_TRACE nvgpu_set_enabled(g, NVGPU_FECS_TRACE_VA, false); diff --git a/drivers/gpu/nvgpu/hal/init/hal_gv11b.c b/drivers/gpu/nvgpu/hal/init/hal_gv11b.c index 8d716f70e..3ee57cafa 100644 --- a/drivers/gpu/nvgpu/hal/init/hal_gv11b.c +++ b/drivers/gpu/nvgpu/hal/init/hal_gv11b.c @@ -22,6 +22,7 @@ * DEALINGS IN THE SOFTWARE. */ #include +#include #include #include #include @@ -1519,6 +1520,10 @@ int gv11b_init_hal(struct gk20a *g) gops->get_litter_value = gv11b_get_litter_value; gops->semaphore_wakeup = nvgpu_channel_semaphore_wakeup; + nvgpu_set_errata(g, NVGPU_ERRATA_2016608, true); + nvgpu_set_errata(g, NVGPU_ERRATA_200391931, true); + nvgpu_set_errata(g, NVGPU_ERRATA_SYNCPT_INVALID_ID_0, true); + nvgpu_set_enabled(g, NVGPU_GR_USE_DMA_FOR_FW_BOOTSTRAP, false); /* Read fuses to check if gpu needs to boot in secure/non-secure mode */ diff --git a/drivers/gpu/nvgpu/hal/init/hal_tu104.c b/drivers/gpu/nvgpu/hal/init/hal_tu104.c index e9c7e8c76..1a053f1ef 100644 --- a/drivers/gpu/nvgpu/hal/init/hal_tu104.c +++ b/drivers/gpu/nvgpu/hal/init/hal_tu104.c @@ -22,6 +22,7 @@ * DEALINGS IN THE SOFTWARE. 
*/ #include +#include #include "hal/mm/mm_gm20b.h" #include "hal/mm/mm_gp10b.h" @@ -1723,6 +1724,12 @@ int tu104_init_hal(struct gk20a *g) gops->get_litter_value = tu104_get_litter_value; gops->semaphore_wakeup = nvgpu_channel_semaphore_wakeup; + nvgpu_set_errata(g, NVGPU_ERRATA_INIT_PDB_CACHE, true); + nvgpu_set_errata(g, NVGPU_ERRATA_FB_PDB_CACHE, true); + nvgpu_set_errata(g, NVGPU_ERRATA_VBIOS_NVLINK_MASK, true); + nvgpu_set_errata(g, NVGPU_ERRATA_200391931, true); + nvgpu_set_errata(g, NVGPU_ERRATA_SYNCPT_INVALID_ID_0, true); + nvgpu_set_enabled(g, NVGPU_SEC_PRIVSECURITY, true); nvgpu_set_enabled(g, NVGPU_SEC_SECUREGPCCS, true); nvgpu_set_enabled(g, NVGPU_SUPPORT_MULTIPLE_WPR, true); diff --git a/drivers/gpu/nvgpu/hal/vgpu/init/vgpu_hal_gv11b.c b/drivers/gpu/nvgpu/hal/vgpu/init/vgpu_hal_gv11b.c index 007eff742..5bd3bf6dd 100644 --- a/drivers/gpu/nvgpu/hal/vgpu/init/vgpu_hal_gv11b.c +++ b/drivers/gpu/nvgpu/hal/vgpu/init/vgpu_hal_gv11b.c @@ -92,6 +92,7 @@ #include "common/clk_arb/clk_arb_gp10b.h" #include +#include #include #include #include @@ -1063,6 +1064,10 @@ int vgpu_gv11b_init_hal(struct gk20a *g) gops->top = vgpu_gv11b_ops_top; gops->grmgr = vgpu_gv11b_ops_grmgr; + nvgpu_set_errata(g, NVGPU_ERRATA_2016608, true); + nvgpu_set_errata(g, NVGPU_ERRATA_200391931, true); + nvgpu_set_errata(g, NVGPU_ERRATA_SYNCPT_INVALID_ID_0, true); + #ifdef CONFIG_NVGPU_FECS_TRACE nvgpu_set_enabled(g, NVGPU_SUPPORT_FECS_CTXSW_TRACE, true); #endif diff --git a/drivers/gpu/nvgpu/include/nvgpu/enabled.h b/drivers/gpu/nvgpu/include/nvgpu/enabled.h index 6280d7d63..cc3001647 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/enabled.h +++ b/drivers/gpu/nvgpu/include/nvgpu/enabled.h @@ -86,7 +86,6 @@ struct gk20a; "Use coherent aperture for sysmem"), \ DEFINE_FLAG(NVGPU_MM_USE_PHYSICAL_SG, \ "Use physical scatter tables instead of IOMMU"), \ - DEFINE_FLAG(NVGPU_MM_FORCE_128K_PMU_VM, "WAR for gm20b chips"), \ DEFINE_FLAG(NVGPU_MM_BYPASSES_IOMMU, \ "Some chips (using nvlink) bypass the 
IOMMU on tegra"), \ /* Host Flags */ \ diff --git a/drivers/gpu/nvgpu/include/nvgpu/errata.h b/drivers/gpu/nvgpu/include/nvgpu/errata.h new file mode 100644 index 000000000..056598c14 --- /dev/null +++ b/drivers/gpu/nvgpu/include/nvgpu/errata.h @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#ifndef NVGPU_ERRATA_H +#define NVGPU_ERRATA_H + +struct gk20a; + +#include + +/** + * @defgroup errata + * @ingroup unit-common-utils + * @{ + */ + +/** @cond DOXYGEN_SHOULD_SKIP_THIS */ +#if defined(CONFIG_NVGPU_NON_FUSA) && defined(CONFIG_NVGPU_NEXT) +#include "include/nvgpu/nvgpu_next_errata.h" +#else +#define ERRATA_FLAGS_NEXT +#endif +/** @endcond DOXYGEN_SHOULD_SKIP_THIS */ + +/* + * Available flags that describe errata, with details about where the issues + * were first discovered. Each flag here is defined by its offset + * in a bitmap. 
+ */ + +#define ERRATA_FLAGS \ + /* GM20B */ \ + DEFINE_ERRATA(NVGPU_ERRATA_MM_FORCE_128K_PMU_VM, "GM20B", "MM"),\ + DEFINE_ERRATA(NVGPU_ERRATA_1547668, "GM20B", "CG"), \ + /* GP10B */ \ + DEFINE_ERRATA(NVGPU_ERRATA_LRF_ECC_OVERCOUNT, "GP10B", "GR ECC"), \ + DEFINE_ERRATA(NVGPU_ERRATA_200391931, "GP10B", "GR Perf"), \ + /* GV11B */ \ + DEFINE_ERRATA(NVGPU_ERRATA_2016608, "GV11B", "FIFO Runlist preempt"), \ + /* GV100 */ \ + DEFINE_ERRATA(NVGPU_ERRATA_1888034, "GV100", "Nvlink"), \ + /* TU104 */ \ + DEFINE_ERRATA(NVGPU_ERRATA_INIT_PDB_CACHE, "TU104", "MM PDB"), \ + DEFINE_ERRATA(NVGPU_ERRATA_FB_PDB_CACHE, "TU104", "FB PDB"), \ + DEFINE_ERRATA(NVGPU_ERRATA_VBIOS_NVLINK_MASK, "TU104", "Nvlink VBIOS"),\ + /* NvGPU Driver */ \ + DEFINE_ERRATA(NVGPU_ERRATA_SYNCPT_INVALID_ID_0, "SW", "Syncpt ID"),\ + DEFINE_ERRATA(NVGPU_MAX_ERRATA_BITS, "NA", "Marks max number of flags"), + +/** + * Enumerated array of flags + */ +#define DEFINE_ERRATA(flag, chip, desc) flag +enum enum_errata_flags { + ERRATA_FLAGS_NEXT + ERRATA_FLAGS +}; +#undef DEFINE_ERRATA + +/** + * @brief Check if the passed flag is enabled. + * + * @param g [in] The GPU. + * @param flag [in] Which flag to check. + * + * @return Boolean value to indicate the status of the bit. + * + * @retval TRUE if given errata is present. + * @retval FALSE if given errata is absent. + */ +bool nvgpu_is_errata_present(struct gk20a *g, u32 flag); + +/** + * @brief Initialize and allocate memory for errata flags. + * + * @param g [in] The GPU pointer. + * + * @return 0 for success, < 0 for error. + * + * @retval -ENOMEM if fails to allocate the necessary memory. + */ +int nvgpu_init_errata_flags(struct gk20a *g); + +/** + * @brief Free errata flags memory. Called during driver exit. + * + * @param g [in] The GPU pointer. + */ +void nvgpu_free_errata_flags(struct gk20a *g); + +/** + * @brief Print errata flags value. + * + * @param g [in] The GPU pointer. 
+ */ +void nvgpu_print_errata_flags(struct gk20a *g); + +/** + * @brief Set state of an errata flag. + * + * @param g [in] The GPU. + * @param flag [in] Flag index. + * @param state [in] The state to set the \a flag to. + * + * Set state of the given \a flag index to \a state. + * + * This is generally a somewhat low level operation with lots of potential + * side effects. Be wary about where and when you use this. Typically a bunch + * of calls to this early in the driver boot sequence makes sense (as + * information is determined about the GPU at run time). Calling this in steady + * state operation is probably an incorrect thing to do. + */ +void nvgpu_set_errata(struct gk20a *g, u32 flag, bool state); + +/** + * @} + */ +#endif /* NVGPU_ERRATA_H */ diff --git a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h index 489ae1c3f..28b742b95 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h +++ b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h @@ -357,6 +357,11 @@ struct gk20a { */ struct nvgpu_nvhost_dev *nvhost; + /** + * Used by . Do not access directly! + */ + unsigned long *errata_flags; + /** * Used by . Do not access directly! 
*/ diff --git a/drivers/gpu/nvgpu/os/linux/driver_common.c b/drivers/gpu/nvgpu/os/linux/driver_common.c index 22ac511c6..682f8425a 100644 --- a/drivers/gpu/nvgpu/os/linux/driver_common.c +++ b/drivers/gpu/nvgpu/os/linux/driver_common.c @@ -15,6 +15,7 @@ */ #include +#include #include #include #include @@ -256,7 +257,7 @@ static void nvgpu_init_mm_vars(struct gk20a *g) platform->unified_memory); nvgpu_set_enabled(g, NVGPU_MM_UNIFY_ADDRESS_SPACES, platform->unify_address_spaces); - nvgpu_set_enabled(g, NVGPU_MM_FORCE_128K_PMU_VM, + nvgpu_set_errata(g, NVGPU_ERRATA_MM_FORCE_128K_PMU_VM, platform->force_128K_pmu_vm); nvgpu_mutex_init(&g->mm.tlb_lock); diff --git a/drivers/gpu/nvgpu/os/linux/module.c b/drivers/gpu/nvgpu/os/linux/module.c index a68736b47..de6fa7ca0 100644 --- a/drivers/gpu/nvgpu/os/linux/module.c +++ b/drivers/gpu/nvgpu/os/linux/module.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include #include @@ -1009,6 +1010,7 @@ void gk20a_remove_support(struct gk20a *g) nvgpu_remove_usermode_support(g); nvgpu_free_enabled_flags(g); + nvgpu_free_errata_flags(g); gk20a_lockout_registers(g); } @@ -1616,9 +1618,13 @@ static int gk20a_probe(struct platform_device *dev) nvgpu_kmem_init(gk20a); + err = nvgpu_init_errata_flags(gk20a); + if (err) + goto return_err_platform; + err = nvgpu_init_enabled_flags(gk20a); if (err) - goto return_err; + goto return_err_errata; np = nvgpu_get_node(gk20a); if (of_dma_is_coherent(np)) { @@ -1730,6 +1736,9 @@ static int gk20a_probe(struct platform_device *dev) return_err: nvgpu_free_enabled_flags(gk20a); +return_err_errata: + nvgpu_free_errata_flags(gk20a); +return_err_platform: /* * Last since the above allocs may use data structures in here. 
diff --git a/drivers/gpu/nvgpu/os/linux/pci.c b/drivers/gpu/nvgpu/os/linux/pci.c index 6484dfbc8..343889b59 100644 --- a/drivers/gpu/nvgpu/os/linux/pci.c +++ b/drivers/gpu/nvgpu/os/linux/pci.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -542,10 +543,15 @@ static int nvgpu_pci_probe(struct pci_dev *pdev, pci_set_drvdata(pdev, platform); - err = nvgpu_init_enabled_flags(g); + err = nvgpu_init_errata_flags(g); if (err) goto err_free_platform; + err = nvgpu_init_enabled_flags(g); + if (err) { + goto err_free_errata; + } + platform->g = g; l->dev = &pdev->dev; @@ -690,6 +696,9 @@ err_disable_msi: if (g->msi_enabled) pci_disable_msi(pdev); #endif + nvgpu_free_enabled_flags(g); +err_free_errata: + nvgpu_free_errata_flags(g); err_free_platform: nvgpu_kfree(g, platform); err_free_l: diff --git a/drivers/gpu/nvgpu/os/linux/platform_gk20a_tegra.c b/drivers/gpu/nvgpu/os/linux/platform_gk20a_tegra.c index 67b9804a8..62754c191 100644 --- a/drivers/gpu/nvgpu/os/linux/platform_gk20a_tegra.c +++ b/drivers/gpu/nvgpu/os/linux/platform_gk20a_tegra.c @@ -61,6 +61,7 @@ #include #include #include +#include #include "hal/clk/clk_gm20b.h" @@ -864,7 +865,7 @@ static int gk20a_tegra_probe(struct device *dev) } platform->g->clk.gpc_pll.id = GK20A_GPC_PLL; - if (platform->platform_chip_id == TEGRA_210) { + if (nvgpu_is_errata_present(g, NVGPU_ERRATA_1547668)) { /* WAR for bug 1547668: Disable railgating and scaling irrespective of platform data if the rework was not made. 
*/ np = of_find_node_by_path("/gpu-dvfs-rework"); diff --git a/drivers/gpu/nvgpu/os/linux/sysfs.c b/drivers/gpu/nvgpu/os/linux/sysfs.c index 404a083bb..4157f10f9 100644 --- a/drivers/gpu/nvgpu/os/linux/sysfs.c +++ b/drivers/gpu/nvgpu/os/linux/sysfs.c @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -492,6 +493,10 @@ static ssize_t ldiv_slowdown_factor_store(struct device *dev, unsigned long val = 0; int err; + if (!nvgpu_is_errata_present(g, NVGPU_ERRATA_200391931)) { + return 0; + } + if (kstrtoul(buf, 10, &val) < 0) { nvgpu_err(g, "parse error for input SLOWDOWN factor\n"); return -EINVAL; diff --git a/drivers/gpu/nvgpu/os/linux/vgpu/vgpu_linux.c b/drivers/gpu/nvgpu/os/linux/vgpu/vgpu_linux.c index c154c5886..e5aeefbf3 100644 --- a/drivers/gpu/nvgpu/os/linux/vgpu/vgpu_linux.c +++ b/drivers/gpu/nvgpu/os/linux/vgpu/vgpu_linux.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -341,8 +342,15 @@ int vgpu_probe(struct platform_device *pdev) nvgpu_kmem_init(gk20a); + err = nvgpu_init_errata_flags(gk20a); + if (err) { + kfree(gk20a); + return err; + } + err = nvgpu_init_enabled_flags(gk20a); if (err) { + nvgpu_free_errata_flags(gk20a); kfree(gk20a); return err; } diff --git a/drivers/gpu/nvgpu/os/posix/nvgpu.c b/drivers/gpu/nvgpu/os/posix/nvgpu.c index c16d027ea..72b5a21d9 100644 --- a/drivers/gpu/nvgpu/os/posix/nvgpu.c +++ b/drivers/gpu/nvgpu/os/posix/nvgpu.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -267,6 +268,10 @@ struct gk20a *nvgpu_posix_probe(void) goto fail_kmem; } + if (nvgpu_init_errata_flags(g) != 0) { + goto fail_errata_flags; + } + if (nvgpu_init_enabled_flags(g) != 0) { goto fail_enabled_flags; } @@ -297,6 +302,8 @@ struct gk20a *nvgpu_posix_probe(void) return g; fail_enabled_flags: + nvgpu_free_errata_flags(g); +fail_errata_flags: nvgpu_kmem_fini(g, 0); fail_kmem: free(p); diff --git a/libs/dgpu/libnvgpu-drv-dgpu_safe.export 
b/libs/dgpu/libnvgpu-drv-dgpu_safe.export index 846646e05..7e5f4dde7 100644 --- a/libs/dgpu/libnvgpu-drv-dgpu_safe.export +++ b/libs/dgpu/libnvgpu-drv-dgpu_safe.export @@ -500,6 +500,7 @@ nvgpu_gr_suspend nvgpu_gr_sw_ready nvgpu_has_syncpoints nvgpu_init_enabled_flags +nvgpu_init_errata_flags nvgpu_init_hal nvgpu_init_ltc_support nvgpu_init_mm_support @@ -510,6 +511,7 @@ nvgpu_iommuable nvgpu_free_inst_block nvgpu_inst_block_ptr nvgpu_is_enabled +nvgpu_is_errata_present nvgpu_kcalloc_impl nvgpu_kfree_impl nvgpu_kmalloc_impl @@ -703,6 +705,7 @@ nvgpu_tsg_unbind_channel_check_hw_state nvgpu_tsg_unbind_channel_check_ctx_reload nvgpu_set_bit nvgpu_set_enabled +nvgpu_set_errata nvgpu_set_power_state nvgpu_set_pte nvgpu_sgt_alignment diff --git a/libs/igpu/libnvgpu-drv-igpu_safe.export b/libs/igpu/libnvgpu-drv-igpu_safe.export index fc3e3c38d..12cf3cb10 100644 --- a/libs/igpu/libnvgpu-drv-igpu_safe.export +++ b/libs/igpu/libnvgpu-drv-igpu_safe.export @@ -515,6 +515,7 @@ nvgpu_gr_suspend nvgpu_gr_sw_ready nvgpu_has_syncpoints nvgpu_init_enabled_flags +nvgpu_init_errata_flags nvgpu_init_fb_support nvgpu_init_hal nvgpu_init_ltc_support @@ -526,6 +527,7 @@ nvgpu_iommuable nvgpu_free_inst_block nvgpu_inst_block_ptr nvgpu_is_enabled +nvgpu_is_errata_present nvgpu_kcalloc_impl nvgpu_kfree_impl nvgpu_kmalloc_impl @@ -719,6 +721,7 @@ nvgpu_tsg_unbind_channel_check_hw_state nvgpu_tsg_unbind_channel_check_ctx_reload nvgpu_set_bit nvgpu_set_enabled +nvgpu_set_errata nvgpu_set_power_state nvgpu_set_pte nvgpu_sgt_alignment diff --git a/userspace/units/mm/mm/mm.c b/userspace/units/mm/mm/mm.c index 621a71658..a45272b95 100644 --- a/userspace/units/mm/mm/mm.c +++ b/userspace/units/mm/mm/mm.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. 
* * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -125,7 +126,7 @@ static void init_platform(struct unit_module *m, struct gk20a *g, bool is_iGPU) /* Enable extra features to increase line coverage */ nvgpu_set_enabled(g, NVGPU_SUPPORT_SEC2_VM, true); nvgpu_set_enabled(g, NVGPU_SUPPORT_GSP_VM, true); - nvgpu_set_enabled(g, NVGPU_MM_FORCE_128K_PMU_VM, true); + nvgpu_set_errata(g, NVGPU_ERRATA_MM_FORCE_128K_PMU_VM, true); } /* @@ -289,7 +290,7 @@ int test_nvgpu_init_mm(struct unit_module *m, struct gk20a *g, void *args) */ nvgpu_set_enabled(g, NVGPU_SUPPORT_SEC2_VM, false); nvgpu_set_enabled(g, NVGPU_SUPPORT_GSP_VM, false); - nvgpu_set_enabled(g, NVGPU_MM_FORCE_128K_PMU_VM, false); + nvgpu_set_errata(g, NVGPU_ERRATA_MM_FORCE_128K_PMU_VM, false); g->has_cde = false; errors += nvgpu_init_mm_support_inject_error(m, g, ERROR_TYPE_HAL, 1, @@ -297,7 +298,7 @@ int test_nvgpu_init_mm(struct unit_module *m, struct gk20a *g, void *args) nvgpu_set_enabled(g, NVGPU_SUPPORT_SEC2_VM, true); nvgpu_set_enabled(g, NVGPU_SUPPORT_GSP_VM, true); - nvgpu_set_enabled(g, NVGPU_MM_FORCE_128K_PMU_VM, true); + nvgpu_set_errata(g, NVGPU_ERRATA_MM_FORCE_128K_PMU_VM, true); g->has_cde = true; /*