gpu: nvgpu: handle chip specific erratas

Currently, there are a few chip-specific erratas present in the nvgpu code. For better traceability of the erratas and corresponding fixes, introduce flags to indicate existing erratas on a chip. These flags decide if a corresponding solution is applied to the chip(s). This patch introduces the functions below to handle errata flags: - nvgpu_init_errata_flags - nvgpu_set_errata - nvgpu_is_errata_present - nvgpu_print_errata_flags - nvgpu_free_errata_flags nvgpu_print_errata_flags: print the details below for erratas present in the chip 1. errata flag name 2. chip where the errata was first discovered 3. short description of the errata Flags corresponding to erratas present in a chip are set during the chip hal init sequence. JIRA NVGPU-6510 Change-Id: Id5a8fb627222ac0a585aba071af052950f4de965 Signed-off-by: Vedashree Vidwans <vvidwans@nvidia.com> Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2498095 Reviewed-by: Seema Khowala <seemaj@nvidia.com> Reviewed-by: Vaibhav Kachore <vkachore@nvidia.com> Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com> Tested-by: mobile promotions <svcmobile_promotions@nvidia.com> GVS: Gerrit_Virtual_Submit
2025-12-22 09:12:24 +03:00 · 2021-04-13 21:55:47 -07:00
parent 6222ebeaea
commit aba26fa082
28 changed files with 410 additions and 38 deletions
--- a/arch/nvgpu-common.yaml
+++ b/arch/nvgpu-common.yaml
@@ -237,15 +237,17 @@ sim:

 utils:
  safe: yes
-  owner: Terje B
+  owner: Alex W
  sources: [ include/nvgpu/utils.h,
             include/nvgpu/worker.h,
             include/nvgpu/rbtree.h,
             include/nvgpu/enabled.h,
+             include/nvgpu/errata.h,
             common/utils/string.c,
             common/utils/worker.c,
             common/utils/rbtree.c,
-             common/utils/enabled.c ]
+             common/utils/enabled.c,
+             common/utils/errata.c ]

 ##
 ## Common elements.
--- a/drivers/gpu/nvgpu/Makefile
+++ b/drivers/gpu/nvgpu/Makefile
@@ -200,6 +200,7 @@ nvgpu-y += \
 nvgpu-y += \
 	common/device.o \
 	common/utils/enabled.o \
+	common/utils/errata.o \
 	common/utils/rbtree.o \
 	common/utils/string.o \
 	common/utils/worker.o \
--- a/drivers/gpu/nvgpu/Makefile.sources
+++ b/drivers/gpu/nvgpu/Makefile.sources
@@ -92,6 +92,7 @@ endif

 srcs +=	common/device.c \
 	common/utils/enabled.c \
+	common/utils/errata.c \
 	common/utils/rbtree.c \
 	common/utils/string.c \
 	common/utils/worker.c \
--- a/drivers/gpu/nvgpu/common/fifo/preempt.c
+++ b/drivers/gpu/nvgpu/common/fifo/preempt.c
@@ -22,6 +22,7 @@

 #include <nvgpu/soc.h>
 #include <nvgpu/gk20a.h>
+#include <nvgpu/errata.h>
 #include <nvgpu/runlist.h>
 #include <nvgpu/types.h>
 #include <nvgpu/channel.h>
@@ -54,8 +55,11 @@ int nvgpu_fifo_preempt_tsg(struct gk20a *g, struct nvgpu_tsg *tsg)

 	nvgpu_mutex_acquire(&tsg->runlist->runlist_lock);

+	if (nvgpu_is_errata_present(g, NVGPU_ERRATA_2016608)) {
 		nvgpu_runlist_set_state(g, BIT32(tsg->runlist->id),
 					RUNLIST_DISABLED);
+	}
+
 #ifdef CONFIG_NVGPU_LS_PMU
 	mutex_ret = nvgpu_pmu_lock_acquire(g, g->pmu,
 						PMU_MUTEX_ID_FIFO, &token);
@@ -77,8 +81,10 @@ int nvgpu_fifo_preempt_tsg(struct gk20a *g, struct nvgpu_tsg *tsg)
 		}
 	}
 #endif
+	if (nvgpu_is_errata_present(g, NVGPU_ERRATA_2016608)) {
 		nvgpu_runlist_set_state(g, BIT32(tsg->runlist->id),
 					RUNLIST_ENABLED);
+	}

 	nvgpu_mutex_release(&tsg->runlist->runlist_lock);

--- a/drivers/gpu/nvgpu/common/mm/mm.c
+++ b/drivers/gpu/nvgpu/common/mm/mm.c
@@ -30,6 +30,7 @@
 #include <nvgpu/semaphore.h>
 #include <nvgpu/pramin.h>
 #include <nvgpu/enabled.h>
+#include <nvgpu/errata.h>
 #include <nvgpu/ce_app.h>
 #include <nvgpu/gk20a.h>
 #include <nvgpu/engines.h>
@@ -177,7 +178,7 @@ static void nvgpu_remove_mm_support(struct mm_gk20a *mm)
 #ifdef CONFIG_NVGPU_DGPU
 	nvgpu_vidmem_destroy(g);

-	if (g->ops.ramin.deinit_pdb_cache_war != NULL) {
+	if (nvgpu_is_errata_present(g, NVGPU_ERRATA_INIT_PDB_CACHE)) {
 		g->ops.ramin.deinit_pdb_cache_war(g);
 	}
 #endif
@@ -197,7 +198,7 @@ static int nvgpu_init_system_vm(struct mm_gk20a *mm)
 	 * For some reason the maxwell PMU code is dependent on the large page
 	 * size. No reason AFAICT for this. Probably a bug somewhere.
 	 */
-	if (nvgpu_is_enabled(g, NVGPU_MM_FORCE_128K_PMU_VM)) {
+	if (nvgpu_is_errata_present(g, NVGPU_ERRATA_MM_FORCE_128K_PMU_VM)) {
 		big_page_size = nvgpu_safe_cast_u64_to_u32(SZ_128K);
 	}

@@ -587,14 +588,14 @@ static int nvgpu_init_mm_pdb_cache_war(struct gk20a *g)
 {
 	int err;

-	if (g->ops.ramin.init_pdb_cache_war != NULL) {
+	if (nvgpu_is_errata_present(g, NVGPU_ERRATA_INIT_PDB_CACHE)) {
 		err = g->ops.ramin.init_pdb_cache_war(g);
 		if (err != 0) {
 			return err;
 		}
 	}

-	if (g->ops.fb.apply_pdb_cache_war != NULL) {
+	if (nvgpu_is_errata_present(g, NVGPU_ERRATA_FB_PDB_CACHE)) {
 		err = g->ops.fb.apply_pdb_cache_war(g);
 		if (err != 0) {
 			return err;
--- a/drivers/gpu/nvgpu/common/nvlink/nvlink.c
+++ b/drivers/gpu/nvgpu/common/nvlink/nvlink.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2018-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -27,6 +27,7 @@
 #include <nvgpu/device.h>
 #include <nvgpu/nvlink_bios.h>
 #include <nvgpu/device.h>
+#include <nvgpu/errata.h>

 #ifdef CONFIG_NVGPU_NVLINK

@@ -104,7 +105,7 @@ static int nvgpu_nvlink_enable_links_post_top(struct gk20a *g,

 	for_each_set_bit(bit, &enabled_links, NVLINK_MAX_LINKS_SW) {
 		link_id = (u32)bit;
-		if (g->ops.nvlink.set_sw_war != NULL) {
+		if (nvgpu_is_errata_present(g, NVGPU_ERRATA_1888034)) {
 			g->ops.nvlink.set_sw_war(g, link_id);
 		}
 		g->ops.nvlink.intr.init_link_err_intr(g, link_id);
@@ -264,7 +265,10 @@ int nvgpu_nvlink_early_init(struct gk20a *g)
 	 * on the GPU. This is temporary WAR while we get the VBIOS updated with
 	 * correct mask.
 	 */
-	g->ops.nvlink.get_connected_link_mask(&(g->nvlink.connected_links));
+	if (nvgpu_is_errata_present(g, NVGPU_ERRATA_VBIOS_NVLINK_MASK)) {
+		g->ops.nvlink.get_connected_link_mask(
+			&(g->nvlink.connected_links));
+	}

 	nvgpu_log(g, gpu_dbg_nvlink, "connected_links = 0x%08x",
 						g->nvlink.connected_links);
--- a/drivers/gpu/nvgpu/common/sync/channel_sync_syncpt.c
+++ b/drivers/gpu/nvgpu/common/sync/channel_sync_syncpt.c
@@ -1,7 +1,7 @@
 /*
 * GK20A Channel Synchronization Abstraction
 *
- * Copyright (c) 2014-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2014-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -28,6 +28,7 @@

 #include <nvgpu/kmem.h>
 #include <nvgpu/log.h>
+#include <nvgpu/errata.h>
 #include <nvgpu/atomic.h>
 #include <nvgpu/bug.h>
 #include <nvgpu/list.h>
@@ -378,8 +379,12 @@ nvgpu_channel_sync_syncpt_create(struct nvgpu_channel *c)
 	 * Once nvhost update the return value as NVGPU_INVALID_SYNCPT_ID,
 	 * we can remove the zero check.
 	 */
-	if ((sp->id == 0U) ||
-		(sp->id == NVGPU_INVALID_SYNCPT_ID)) {
+	if ((nvgpu_is_errata_present(c->g, NVGPU_ERRATA_SYNCPT_INVALID_ID_0)) &&
+		(sp->id == 0U)) {
+		nvgpu_err(c->g, "failed to get free syncpt");
+		goto err_free;
+	}
+	if (sp->id == NVGPU_INVALID_SYNCPT_ID) {
 		nvgpu_err(c->g, "failed to get free syncpt");
 		goto err_free;
 	}
--- a/drivers/gpu/nvgpu/common/sync/channel_user_syncpt.c
+++ b/drivers/gpu/nvgpu/common/sync/channel_user_syncpt.c
@@ -27,6 +27,7 @@
 #include <nvgpu/channel.h>
 #include <nvgpu/channel_user_syncpt.h>
 #include <nvgpu/string.h>
+#include <nvgpu/errata.h>
 #include "channel_user_syncpt_priv.h"

 static int user_sync_build_debug_name(struct nvgpu_channel *ch,
@@ -93,8 +94,12 @@ nvgpu_channel_user_syncpt_create(struct nvgpu_channel *ch)
 	 * Once nvhost update the return value as NVGPU_INVALID_SYNCPT_ID,
 	 * we can remove the zero check.
 	 */
-	if ((s->syncpt_id == 0U) ||
-			(s->syncpt_id == NVGPU_INVALID_SYNCPT_ID)) {
+	if ((nvgpu_is_errata_present(g, NVGPU_ERRATA_SYNCPT_INVALID_ID_0)) &&
+		(s->syncpt_id == 0U)) {
+		nvgpu_err(g, "failed to get free syncpt");
+		goto err_free;
+	}
+	if (s->syncpt_id == NVGPU_INVALID_SYNCPT_ID) {
 		nvgpu_err(g, "failed to get free syncpt");
 		goto err_free;
 	}
--- a/drivers/gpu/nvgpu/common/utils/errata.c
+++ b/drivers/gpu/nvgpu/common/utils/errata.c
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <nvgpu/errata.h>
+#include <nvgpu/bitops.h>
+#include <nvgpu/log.h>
+#include <nvgpu/gk20a.h>
+#include <nvgpu/static_analysis.h>
+#include <nvgpu/utils.h>
+
+/**
+ * Flag names, indexed by errata flag (the stringified flag identifier).
+ */
+#define DEFINE_ERRATA(flag, chip, desc) [flag] = nvgpu_stringify(flag)
+static const char * const errata_flag_names[NVGPU_MAX_ERRATA_BITS + 1U] = {
+	ERRATA_FLAGS_NEXT
+	ERRATA_FLAGS
+};
+#undef DEFINE_ERRATA
+
+/**
+ * Short flag descriptions, indexed by errata flag.
+ */
+#define DEFINE_ERRATA(flag, chip, desc) [flag] = desc
+static const char * const errata_flag_desc[NVGPU_MAX_ERRATA_BITS + 1U] = {
+	ERRATA_FLAGS_NEXT
+	ERRATA_FLAGS
+};
+#undef DEFINE_ERRATA
+
+/**
+ * Chip where each errata was first discovered, indexed by errata flag.
+ */
+#define DEFINE_ERRATA(flag, chip, desc) [flag] = chip
+static const char * const errata_flag_chip[NVGPU_MAX_ERRATA_BITS + 1U] = {
+	ERRATA_FLAGS_NEXT
+	ERRATA_FLAGS
+};
+#undef DEFINE_ERRATA
+
+void nvgpu_print_errata_flags(struct gk20a *g)
+{
+	u32 flag;
+
+	nvgpu_log(g, gpu_dbg_info, "NVGPU Erratas present in chip");
+	nvgpu_log(g, gpu_dbg_info, "%-55.55s %-5.5s %s",
+			"Flag", "Chip", "Description");
+	nvgpu_log(g, gpu_dbg_info, "%-55.55s %-5.5s %s",
+			"----", "-----", "-----------");
+
+	/* One row per errata whose bit is set; absent erratas are skipped. */
+	for (flag = 0U; flag < U32(NVGPU_MAX_ERRATA_BITS); flag++) {
+		if (!nvgpu_is_errata_present(g, flag)) {
+			continue;
+		}
+		nvgpu_log(g, gpu_dbg_info, "%-55.55s %-5.5s %s",
+			errata_flag_names[flag], errata_flag_chip[flag],
+			errata_flag_desc[flag]);
+	}
+}
+
+int nvgpu_init_errata_flags(struct gk20a *g)
+{
+	unsigned long *bitmap;
+
+	/*
+	 * The bitmap is allocated zeroed, so no errata is marked present
+	 * yet; chip hal init sets the bits for the erratas that apply.
+	 */
+	bitmap = nvgpu_kzalloc(g,
+			BITS_TO_LONGS(U32(NVGPU_MAX_ERRATA_BITS)) *
+			sizeof(unsigned long));
+	g->errata_flags = bitmap;
+
+	return (bitmap == NULL) ? -ENOMEM : 0;
+}
+
+/* Call this on driver shutdown! Releases the errata bitmap of \a g. */
+void nvgpu_free_errata_flags(struct gk20a *g)
+{
+	nvgpu_kfree(g, g->errata_flags);
+	/* Drop the stale pointer so later users see NULL, not freed memory. */
+	g->errata_flags = NULL;
+}
+
+bool nvgpu_is_errata_present(struct gk20a *g, u32 flag)
+{
+	/* Indices outside the bitmap are reported as "not present". */
+	if (flag >= U32(NVGPU_MAX_ERRATA_BITS)) {
+		return false;
+	}
+	return nvgpu_test_bit(flag, g->errata_flags);
+}
+
+void nvgpu_set_errata(struct gk20a *g, u32 flag, bool state)
+{
+	/* Only indices below NVGPU_MAX_ERRATA_BITS map to a bitmap bit. */
+	if (flag < NVGPU_MAX_ERRATA_BITS) {
+		if (!state) {
+			/* Mark the errata as absent. */
+			nvgpu_clear_bit(flag, g->errata_flags);
+		} else {
+			nvgpu_set_bit(flag, g->errata_flags);
+		}
+	}
+}
--- a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gp10b.c
+++ b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gp10b.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -23,6 +23,7 @@
 #include <nvgpu/gk20a.h>
 #include <nvgpu/io.h>
 #include <nvgpu/class.h>
+#include <nvgpu/errata.h>
 #include <nvgpu/channel.h>
 #include <nvgpu/static_analysis.h>

@@ -173,11 +174,14 @@ int gp10b_gr_intr_handle_sm_exception(struct gk20a *g,
 		nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr,
 			"Single bit error detected in SM LRF!");

+		if (nvgpu_is_errata_present(g,
+				NVGPU_ERRATA_LRF_ECC_OVERCOUNT)) {
 			gr_gp10b_sm_lrf_ecc_overcount_war(true,
 							lrf_ecc_sed_status,
 							lrf_ecc_ded_status,
 							&lrf_single_count_delta,
 							lrf_double_count_delta);
+		}
 		g->ecc.gr.sm_lrf_ecc_single_err_count[gpc][tpc].counter =
 			   nvgpu_safe_add_u32(
 				g->ecc.gr.sm_lrf_ecc_single_err_count[gpc][tpc].counter,
@@ -187,11 +191,14 @@ int gp10b_gr_intr_handle_sm_exception(struct gk20a *g,
 		nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr,
 			"Double bit error detected in SM LRF!");

+		if (nvgpu_is_errata_present(g,
+				NVGPU_ERRATA_LRF_ECC_OVERCOUNT)) {
 			gr_gp10b_sm_lrf_ecc_overcount_war(false,
 							lrf_ecc_sed_status,
 							lrf_ecc_ded_status,
 							&lrf_double_count_delta,
 							lrf_single_count_delta);
+		}
 		g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter =
 			   nvgpu_safe_add_u32(
 				g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter,
--- a/drivers/gpu/nvgpu/hal/init/hal_gm20b.c
+++ b/drivers/gpu/nvgpu/hal/init/hal_gm20b.c
@@ -22,6 +22,7 @@
 * DEALINGS IN THE SOFTWARE.
 */
 #include <nvgpu/ptimer.h>
+#include <nvgpu/errata.h>
 #include <nvgpu/error_notifier.h>
 #include <nvgpu/gk20a.h>
 #include <nvgpu/debugger.h>
@@ -1168,6 +1169,10 @@ int gm20b_init_hal(struct gk20a *g)
 	gops->get_litter_value = gm20b_get_litter_value;
 	gops->semaphore_wakeup = nvgpu_channel_semaphore_wakeup;

+	nvgpu_set_errata(g, NVGPU_ERRATA_1547668, true);
+	nvgpu_set_errata(g, NVGPU_ERRATA_MM_FORCE_128K_PMU_VM, true);
+	nvgpu_set_errata(g, NVGPU_ERRATA_SYNCPT_INVALID_ID_0, true);
+
 	nvgpu_set_enabled(g, NVGPU_GR_USE_DMA_FOR_FW_BOOTSTRAP, true);
 #ifdef CONFIG_NVGPU_FECS_TRACE
 	nvgpu_set_enabled(g, NVGPU_FECS_TRACE_VA, false);
--- a/drivers/gpu/nvgpu/hal/init/hal_gp10b.c
+++ b/drivers/gpu/nvgpu/hal/init/hal_gp10b.c
@@ -22,6 +22,7 @@
 * DEALINGS IN THE SOFTWARE.
 */
 #include <nvgpu/ptimer.h>
+#include <nvgpu/errata.h>
 #include <nvgpu/error_notifier.h>
 #include <nvgpu/gk20a.h>
 #include <nvgpu/debugger.h>
@@ -1253,6 +1254,10 @@ int gp10b_init_hal(struct gk20a *g)
 	gops->get_litter_value = gp10b_get_litter_value;
 	gops->semaphore_wakeup = nvgpu_channel_semaphore_wakeup;

+	nvgpu_set_errata(g, NVGPU_ERRATA_LRF_ECC_OVERCOUNT, true);
+	nvgpu_set_errata(g, NVGPU_ERRATA_200391931, true);
+	nvgpu_set_errata(g, NVGPU_ERRATA_SYNCPT_INVALID_ID_0, true);
+
 	nvgpu_set_enabled(g, NVGPU_GR_USE_DMA_FOR_FW_BOOTSTRAP, true);
 #ifdef CONFIG_NVGPU_FECS_TRACE
 	nvgpu_set_enabled(g, NVGPU_FECS_TRACE_VA, false);
--- a/drivers/gpu/nvgpu/hal/init/hal_gv11b.c
+++ b/drivers/gpu/nvgpu/hal/init/hal_gv11b.c
@@ -22,6 +22,7 @@
 * DEALINGS IN THE SOFTWARE.
 */
 #include <nvgpu/gk20a.h>
+#include <nvgpu/errata.h>
 #include <nvgpu/acr.h>
 #include <nvgpu/ce.h>
 #include <nvgpu/ce_app.h>
@@ -1519,6 +1520,10 @@ int gv11b_init_hal(struct gk20a *g)
 	gops->get_litter_value = gv11b_get_litter_value;
 	gops->semaphore_wakeup = nvgpu_channel_semaphore_wakeup;

+	nvgpu_set_errata(g, NVGPU_ERRATA_2016608, true);
+	nvgpu_set_errata(g, NVGPU_ERRATA_200391931, true);
+	nvgpu_set_errata(g, NVGPU_ERRATA_SYNCPT_INVALID_ID_0, true);
+
 	nvgpu_set_enabled(g, NVGPU_GR_USE_DMA_FOR_FW_BOOTSTRAP, false);

 	/* Read fuses to check if gpu needs to boot in secure/non-secure mode */
--- a/drivers/gpu/nvgpu/hal/init/hal_tu104.c
+++ b/drivers/gpu/nvgpu/hal/init/hal_tu104.c
@@ -22,6 +22,7 @@
 * DEALINGS IN THE SOFTWARE.
 */
 #include <nvgpu/preempt.h>
+#include <nvgpu/errata.h>

 #include "hal/mm/mm_gm20b.h"
 #include "hal/mm/mm_gp10b.h"
@@ -1723,6 +1724,12 @@ int tu104_init_hal(struct gk20a *g)
 	gops->get_litter_value = tu104_get_litter_value;
 	gops->semaphore_wakeup = nvgpu_channel_semaphore_wakeup;

+	nvgpu_set_errata(g, NVGPU_ERRATA_INIT_PDB_CACHE, true);
+	nvgpu_set_errata(g, NVGPU_ERRATA_FB_PDB_CACHE, true);
+	nvgpu_set_errata(g, NVGPU_ERRATA_VBIOS_NVLINK_MASK, true);
+	nvgpu_set_errata(g, NVGPU_ERRATA_200391931, true);
+	nvgpu_set_errata(g, NVGPU_ERRATA_SYNCPT_INVALID_ID_0, true);
+
 	nvgpu_set_enabled(g, NVGPU_SEC_PRIVSECURITY, true);
 	nvgpu_set_enabled(g, NVGPU_SEC_SECUREGPCCS, true);
 	nvgpu_set_enabled(g, NVGPU_SUPPORT_MULTIPLE_WPR, true);
--- a/drivers/gpu/nvgpu/hal/vgpu/init/vgpu_hal_gv11b.c
+++ b/drivers/gpu/nvgpu/hal/vgpu/init/vgpu_hal_gv11b.c
@@ -92,6 +92,7 @@
 #include "common/clk_arb/clk_arb_gp10b.h"

 #include <nvgpu/gk20a.h>
+#include <nvgpu/errata.h>
 #include <nvgpu/gr/gr.h>
 #include <nvgpu/gr/gr_intr.h>
 #include <nvgpu/vgpu/vgpu.h>
@@ -1063,6 +1064,10 @@ int vgpu_gv11b_init_hal(struct gk20a *g)
 	gops->top = vgpu_gv11b_ops_top;
 	gops->grmgr = vgpu_gv11b_ops_grmgr;

+	nvgpu_set_errata(g, NVGPU_ERRATA_2016608, true);
+	nvgpu_set_errata(g, NVGPU_ERRATA_200391931, true);
+	nvgpu_set_errata(g, NVGPU_ERRATA_SYNCPT_INVALID_ID_0, true);
+
 #ifdef CONFIG_NVGPU_FECS_TRACE
 	nvgpu_set_enabled(g, NVGPU_SUPPORT_FECS_CTXSW_TRACE, true);
 #endif
--- a/drivers/gpu/nvgpu/include/nvgpu/enabled.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/enabled.h
@@ -86,7 +86,6 @@ struct gk20a;
 		"Use coherent aperture for sysmem"),			\
 	DEFINE_FLAG(NVGPU_MM_USE_PHYSICAL_SG,				\
 		"Use physical scatter tables instead of IOMMU"),	\
-	DEFINE_FLAG(NVGPU_MM_FORCE_128K_PMU_VM, "WAR for gm20b chips"),	\
 	DEFINE_FLAG(NVGPU_MM_BYPASSES_IOMMU,				\
 		"Some chips (using nvlink) bypass the IOMMU on tegra"),	\
 	/* Host Flags */						\
--- a/drivers/gpu/nvgpu/include/nvgpu/errata.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/errata.h
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef NVGPU_ERRATA_H
+#define NVGPU_ERRATA_H
+
+struct gk20a;
+
+#include <nvgpu/types.h>
+
+/**
+ * @defgroup errata
+ * @ingroup unit-common-utils
+ * @{
+ */
+
+/** @cond DOXYGEN_SHOULD_SKIP_THIS */
+#if defined(CONFIG_NVGPU_NON_FUSA) && defined(CONFIG_NVGPU_NEXT)
+#include "include/nvgpu/nvgpu_next_errata.h"
+#else
+#define ERRATA_FLAGS_NEXT
+#endif
+/** @endcond DOXYGEN_SHOULD_SKIP_THIS */
+
+/*
+ * Available flags, each describing an errata along with details about where
+ * the issue was first discovered. Each flag here is defined by its offset
+ * in a bitmap.
+ */
+
+#define ERRATA_FLAGS							\
+	/* GM20B */							\
+	DEFINE_ERRATA(NVGPU_ERRATA_MM_FORCE_128K_PMU_VM, "GM20B", "MM"),\
+	DEFINE_ERRATA(NVGPU_ERRATA_1547668, "GM20B", "CG"),		\
+	/* GP10B */							\
+	DEFINE_ERRATA(NVGPU_ERRATA_LRF_ECC_OVERCOUNT, "GP10B", "GR ECC"),	\
+	DEFINE_ERRATA(NVGPU_ERRATA_200391931, "GP10B", "GR Perf"),	\
+	/* GV11B */							\
+	DEFINE_ERRATA(NVGPU_ERRATA_2016608, "GV11B", "FIFO Runlist preempt"), \
+	/* GV100 */							\
+	DEFINE_ERRATA(NVGPU_ERRATA_1888034, "GV100", "Nvlink"),	\
+	/* TU104 */							\
+	DEFINE_ERRATA(NVGPU_ERRATA_INIT_PDB_CACHE, "TU104", "MM PDB"),	\
+	DEFINE_ERRATA(NVGPU_ERRATA_FB_PDB_CACHE, "TU104", "FB PDB"),	\
+	DEFINE_ERRATA(NVGPU_ERRATA_VBIOS_NVLINK_MASK, "TU104", "Nvlink VBIOS"),\
+	/* NvGPU Driver */						\
+	DEFINE_ERRATA(NVGPU_ERRATA_SYNCPT_INVALID_ID_0, "SW", "Syncpt ID"),\
+	DEFINE_ERRATA(NVGPU_MAX_ERRATA_BITS, "NA", "Marks max number of flags"),
+
+/**
+ * Errata flag indices, one enumerator per DEFINE_ERRATA() entry.
+ */
+#define DEFINE_ERRATA(flag, chip, desc) flag
+enum enum_errata_flags {
+	ERRATA_FLAGS_NEXT
+	ERRATA_FLAGS
+};
+#undef DEFINE_ERRATA
+
+/**
+ * @brief Check if the passed flag is enabled.
+ *
+ * @param g [in]	The GPU.
+ * @param flag [in]	Which flag to check.
+ *
+ * @return Boolean value to indicate the status of the bit.
+ *
+ * @retval TRUE if given errata is present.
+ * @retval FALSE if given errata is absent.
+ */
+bool nvgpu_is_errata_present(struct gk20a *g, u32 flag);
+
+/**
+ * @brief Initialize and allocate memory for errata flags.
+ *
+ * @param g [in]	The GPU pointer.
+ *
+ * @return 0 for success, < 0 for error.
+ *
+ * @retval -ENOMEM if fails to allocate the necessary memory.
+ */
+int nvgpu_init_errata_flags(struct gk20a *g);
+
+/**
+ * @brief Free errata flags memory. Called during driver exit.
+ *
+ * @param g [in]	The GPU pointer.
+ */
+void nvgpu_free_errata_flags(struct gk20a *g);
+
+/**
+ * @brief Print errata flags value.
+ *
+ * @param g [in]	The GPU pointer.
+ */
+void nvgpu_print_errata_flags(struct gk20a *g);
+
+/**
+ * @brief Set state of an errata flag.
+ *
+ * @param g [in]	The GPU.
+ * @param flag [in]	Flag index.
+ * @param state [in]	The state to set the \a flag to.
+ *
+ * Set state of the given \a flag index to \a state.
+ *
+ * This is generally a somewhat low level operation with lots of potential
+ * side effects. Be wary about where and when you use this. Typically a bunch
+ * of calls to this early in the driver boot sequence makes sense (as
+ * information is determined about the GPU at run time). Calling this in steady
+ * state operation is probably an incorrect thing to do.
+ */
+void nvgpu_set_errata(struct gk20a *g, u32 flag, bool state);
+
+/**
+ * @}
+ */
+#endif /* NVGPU_ERRATA_H */
--- a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h
@@ -357,6 +357,11 @@ struct gk20a {
 	 */
 	struct nvgpu_nvhost_dev *nvhost;

+	/**
+	 * Used by <nvgpu/errata.h>. Do not access directly!
+	 */
+	unsigned long *errata_flags;
+
 	/**
 	 * Used by <nvgpu/enabled.h>. Do not access directly!
 	 */
--- a/drivers/gpu/nvgpu/os/linux/driver_common.c
+++ b/drivers/gpu/nvgpu/os/linux/driver_common.c
@@ -15,6 +15,7 @@
 */

 #include <linux/reboot.h>
+#include <nvgpu/errata.h>
 #include <linux/dma-mapping.h>
 #include <linux/mm.h>
 #include <linux/slab.h>
@@ -256,7 +257,7 @@ static void nvgpu_init_mm_vars(struct gk20a *g)
 			    platform->unified_memory);
 	nvgpu_set_enabled(g, NVGPU_MM_UNIFY_ADDRESS_SPACES,
 			    platform->unify_address_spaces);
-	nvgpu_set_enabled(g, NVGPU_MM_FORCE_128K_PMU_VM,
+	nvgpu_set_errata(g, NVGPU_ERRATA_MM_FORCE_128K_PMU_VM,
 			    platform->force_128K_pmu_vm);

 	nvgpu_mutex_init(&g->mm.tlb_lock);
--- a/drivers/gpu/nvgpu/os/linux/module.c
+++ b/drivers/gpu/nvgpu/os/linux/module.c
@@ -47,6 +47,7 @@
 #include <nvgpu/soc.h>
 #include <nvgpu/fbp.h>
 #include <nvgpu/enabled.h>
+#include <nvgpu/errata.h>
 #include <nvgpu/debug.h>
 #include <nvgpu/vidmem.h>
 #include <nvgpu/sim.h>
@@ -1009,6 +1010,7 @@ void gk20a_remove_support(struct gk20a *g)
 	nvgpu_remove_usermode_support(g);

 	nvgpu_free_enabled_flags(g);
+	nvgpu_free_errata_flags(g);

 	gk20a_lockout_registers(g);
 }
@@ -1616,9 +1618,13 @@ static int gk20a_probe(struct platform_device *dev)

 	nvgpu_kmem_init(gk20a);

+	err = nvgpu_init_errata_flags(gk20a);
+	if (err)
+		goto return_err_platform;
+
 	err = nvgpu_init_enabled_flags(gk20a);
 	if (err)
-		goto return_err;
+		goto return_err_errata;

 	np = nvgpu_get_node(gk20a);
 	if (of_dma_is_coherent(np)) {
@@ -1730,6 +1736,9 @@ static int gk20a_probe(struct platform_device *dev)

 return_err:
 	nvgpu_free_enabled_flags(gk20a);
+return_err_errata:
+	nvgpu_free_errata_flags(gk20a);
+return_err_platform:

 	/*
 	 * Last since the above allocs may use data structures in here.
--- a/drivers/gpu/nvgpu/os/linux/pci.c
+++ b/drivers/gpu/nvgpu/os/linux/pci.c
@@ -25,6 +25,7 @@
 #include <nvgpu/kmem.h>
 #include <nvgpu/mc.h>
 #include <nvgpu/enabled.h>
+#include <nvgpu/errata.h>
 #include <nvgpu/nvlink_probe.h>
 #include <nvgpu/soc.h>
 #include <nvgpu/sim.h>
@@ -542,10 +543,15 @@ static int nvgpu_pci_probe(struct pci_dev *pdev,

 	pci_set_drvdata(pdev, platform);

-	err = nvgpu_init_enabled_flags(g);
+	err = nvgpu_init_errata_flags(g);
 	if (err)
 		goto err_free_platform;

+	err = nvgpu_init_enabled_flags(g);
+	if (err) {
+		goto err_free_errata;
+	}
+
 	platform->g = g;
 	l->dev = &pdev->dev;

@@ -690,6 +696,9 @@ err_disable_msi:
 	if (g->msi_enabled)
 		pci_disable_msi(pdev);
 #endif
+	nvgpu_free_enabled_flags(g);
+err_free_errata:
+	nvgpu_free_errata_flags(g);
 err_free_platform:
 	nvgpu_kfree(g, platform);
 err_free_l:
--- a/drivers/gpu/nvgpu/os/linux/platform_gk20a_tegra.c
+++ b/drivers/gpu/nvgpu/os/linux/platform_gk20a_tegra.c
@@ -61,6 +61,7 @@
 #include <nvgpu/pmu/pmu_perfmon.h>
 #include <nvgpu/linux/dma.h>
 #include <nvgpu/soc.h>
+#include <nvgpu/errata.h>

 #include "hal/clk/clk_gm20b.h"

@@ -864,7 +865,7 @@ static int gk20a_tegra_probe(struct device *dev)
 	}

 	platform->g->clk.gpc_pll.id = GK20A_GPC_PLL;
-	if (platform->platform_chip_id == TEGRA_210) {
+	if (nvgpu_is_errata_present(g, NVGPU_ERRATA_1547668)) {
 		/* WAR for bug 1547668: Disable railgating and scaling
 		   irrespective of platform data if the rework was not made. */
 		np = of_find_node_by_path("/gpu-dvfs-rework");
--- a/drivers/gpu/nvgpu/os/linux/sysfs.c
+++ b/drivers/gpu/nvgpu/os/linux/sysfs.c
@@ -19,6 +19,7 @@
 #include <linux/fb.h>
 #include <linux/version.h>

+#include <nvgpu/errata.h>
 #include <nvgpu/kmem.h>
 #include <nvgpu/nvhost.h>
 #include <nvgpu/ptimer.h>
@@ -492,6 +493,10 @@ static ssize_t ldiv_slowdown_factor_store(struct device *dev,
 	unsigned long val = 0;
 	int err;

+	if (!nvgpu_is_errata_present(g, NVGPU_ERRATA_200391931)) {
+		return 0;
+	}
+
 	if (kstrtoul(buf, 10, &val) < 0) {
 		nvgpu_err(g, "parse error for input SLOWDOWN factor\n");
 		return -EINVAL;
--- a/drivers/gpu/nvgpu/os/linux/vgpu/vgpu_linux.c
+++ b/drivers/gpu/nvgpu/os/linux/vgpu/vgpu_linux.c
@@ -34,6 +34,7 @@
 #include <nvgpu/kmem.h>
 #include <nvgpu/bug.h>
 #include <nvgpu/enabled.h>
+#include <nvgpu/errata.h>
 #include <nvgpu/debug.h>
 #include <nvgpu/soc.h>
 #include <nvgpu/defaults.h>
@@ -341,8 +342,15 @@ int vgpu_probe(struct platform_device *pdev)

 	nvgpu_kmem_init(gk20a);

+	err = nvgpu_init_errata_flags(gk20a);
+	if (err) {
+		kfree(gk20a);
+		return err;
+	}
+
 	err = nvgpu_init_enabled_flags(gk20a);
 	if (err) {
+		nvgpu_free_errata_flags(gk20a);
 		kfree(gk20a);
 		return err;
 	}
--- a/drivers/gpu/nvgpu/os/posix/nvgpu.c
+++ b/drivers/gpu/nvgpu/os/posix/nvgpu.c
@@ -33,6 +33,7 @@
 #include <nvgpu/os_sched.h>
 #include <nvgpu/gk20a.h>
 #include <nvgpu/enabled.h>
+#include <nvgpu/errata.h>

 #include <nvgpu/posix/probe.h>
 #include <nvgpu/posix/mock-regs.h>
@@ -267,6 +268,10 @@ struct gk20a *nvgpu_posix_probe(void)
 		goto fail_kmem;
 	}

+	if (nvgpu_init_errata_flags(g) != 0) {
+		goto fail_errata_flags;
+	}
+
 	if (nvgpu_init_enabled_flags(g) != 0) {
 		goto fail_enabled_flags;
 	}
@@ -297,6 +302,8 @@ struct gk20a *nvgpu_posix_probe(void)
 	return g;

 fail_enabled_flags:
+	nvgpu_free_errata_flags(g);
+fail_errata_flags:
 	nvgpu_kmem_fini(g, 0);
 fail_kmem:
 	free(p);
--- a/libs/dgpu/libnvgpu-drv-dgpu_safe.export
+++ b/libs/dgpu/libnvgpu-drv-dgpu_safe.export
@@ -500,6 +500,7 @@ nvgpu_gr_suspend
 nvgpu_gr_sw_ready
 nvgpu_has_syncpoints
 nvgpu_init_enabled_flags
+nvgpu_init_errata_flags
 nvgpu_init_hal
 nvgpu_init_ltc_support
 nvgpu_init_mm_support
@@ -510,6 +511,7 @@ nvgpu_iommuable
 nvgpu_free_inst_block
 nvgpu_inst_block_ptr
 nvgpu_is_enabled
+nvgpu_is_errata_present
 nvgpu_kcalloc_impl
 nvgpu_kfree_impl
 nvgpu_kmalloc_impl
@@ -703,6 +705,7 @@ nvgpu_tsg_unbind_channel_check_hw_state
 nvgpu_tsg_unbind_channel_check_ctx_reload
 nvgpu_set_bit
 nvgpu_set_enabled
+nvgpu_set_errata
 nvgpu_set_power_state
 nvgpu_set_pte
 nvgpu_sgt_alignment
--- a/libs/igpu/libnvgpu-drv-igpu_safe.export
+++ b/libs/igpu/libnvgpu-drv-igpu_safe.export
@@ -515,6 +515,7 @@ nvgpu_gr_suspend
 nvgpu_gr_sw_ready
 nvgpu_has_syncpoints
 nvgpu_init_enabled_flags
+nvgpu_init_errata_flags
 nvgpu_init_fb_support
 nvgpu_init_hal
 nvgpu_init_ltc_support
@@ -526,6 +527,7 @@ nvgpu_iommuable
 nvgpu_free_inst_block
 nvgpu_inst_block_ptr
 nvgpu_is_enabled
+nvgpu_is_errata_present
 nvgpu_kcalloc_impl
 nvgpu_kfree_impl
 nvgpu_kmalloc_impl
@@ -719,6 +721,7 @@ nvgpu_tsg_unbind_channel_check_hw_state
 nvgpu_tsg_unbind_channel_check_ctx_reload
 nvgpu_set_bit
 nvgpu_set_enabled
+nvgpu_set_errata
 nvgpu_set_power_state
 nvgpu_set_pte
 nvgpu_sgt_alignment
--- a/userspace/units/mm/mm/mm.c
+++ b/userspace/units/mm/mm/mm.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -24,6 +24,7 @@
 #include <unit/io.h>
 #include <unit/unit.h>
 #include <unit/core.h>
+#include <nvgpu/errata.h>

 #include <nvgpu/nvgpu_init.h>
 #include <nvgpu/posix/io.h>
@@ -125,7 +126,7 @@ static void init_platform(struct unit_module *m, struct gk20a *g, bool is_iGPU)
 	/* Enable extra features to increase line coverage */
 	nvgpu_set_enabled(g, NVGPU_SUPPORT_SEC2_VM, true);
 	nvgpu_set_enabled(g, NVGPU_SUPPORT_GSP_VM, true);
-	nvgpu_set_enabled(g, NVGPU_MM_FORCE_128K_PMU_VM, true);
+	nvgpu_set_errata(g, NVGPU_ERRATA_MM_FORCE_128K_PMU_VM, true);
 }

 /*
@@ -289,7 +290,7 @@ int test_nvgpu_init_mm(struct unit_module *m, struct gk20a *g, void *args)
 	 */
 	nvgpu_set_enabled(g, NVGPU_SUPPORT_SEC2_VM, false);
 	nvgpu_set_enabled(g, NVGPU_SUPPORT_GSP_VM, false);
-	nvgpu_set_enabled(g, NVGPU_MM_FORCE_128K_PMU_VM, false);
+	nvgpu_set_errata(g, NVGPU_ERRATA_MM_FORCE_128K_PMU_VM, false);
 	g->has_cde = false;

 	errors += nvgpu_init_mm_support_inject_error(m, g, ERROR_TYPE_HAL, 1,
@@ -297,7 +298,7 @@ int test_nvgpu_init_mm(struct unit_module *m, struct gk20a *g, void *args)

 	nvgpu_set_enabled(g, NVGPU_SUPPORT_SEC2_VM, true);
 	nvgpu_set_enabled(g, NVGPU_SUPPORT_GSP_VM, true);
-	nvgpu_set_enabled(g, NVGPU_MM_FORCE_128K_PMU_VM, true);
+	nvgpu_set_errata(g, NVGPU_ERRATA_MM_FORCE_128K_PMU_VM, true);
 	g->has_cde = true;

 	/*