Open source GPL/LGPL release

svcmobrel-release
2022-07-21 16:03:29 -07:00
commit f338182221
2260 changed files with 576813 additions and 0 deletions


@@ -0,0 +1,514 @@
/*
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <nvgpu/bitops.h>
#include <nvgpu/allocator.h>
#include <nvgpu/kmem.h>
#include <nvgpu/bug.h>
#include <nvgpu/barrier.h>
#include "bitmap_allocator_priv.h"
static u64 nvgpu_bitmap_alloc_length(struct nvgpu_allocator *a)
{
struct nvgpu_bitmap_allocator *ba = a->priv;
return ba->length;
}
static u64 nvgpu_bitmap_alloc_base(struct nvgpu_allocator *a)
{
struct nvgpu_bitmap_allocator *ba = a->priv;
return ba->base;
}
static bool nvgpu_bitmap_alloc_inited(struct nvgpu_allocator *a)
{
struct nvgpu_bitmap_allocator *ba = a->priv;
bool inited = ba->inited;
nvgpu_smp_rmb();
return inited;
}
static u64 nvgpu_bitmap_alloc_end(struct nvgpu_allocator *a)
{
struct nvgpu_bitmap_allocator *ba = a->priv;
return nvgpu_safe_add_u64(ba->base, ba->length);
}
/*
* @page_size is ignored.
*/
static u64 nvgpu_bitmap_balloc_fixed(struct nvgpu_allocator *na,
u64 base, u64 len, u32 page_size)
{
struct nvgpu_bitmap_allocator *a = bitmap_allocator(na);
u64 blks, offs, ret;
/* Compute the bit offset and make sure it's aligned to a block. */
offs = base >> a->blk_shift;
if (nvgpu_safe_mult_u64(offs, a->blk_size) != base) {
return 0;
}
offs = nvgpu_safe_sub_u64(offs, a->bit_offs);
blks = len >> a->blk_shift;
if (nvgpu_safe_mult_u64(blks, a->blk_size) != len) {
blks++;
}
nvgpu_assert(blks <= U32_MAX);
alloc_lock(na);
/* Check if the space requested is already occupied. */
ret = bitmap_find_next_zero_area(a->bitmap, a->num_bits, offs,
(u32)blks, 0UL);
if (ret != offs) {
goto fail;
}
nvgpu_assert(blks <= U32_MAX);
nvgpu_bitmap_set(a->bitmap, (u32)offs, U32(blks));
a->bytes_alloced = nvgpu_safe_add_u64(a->bytes_alloced,
nvgpu_safe_mult_u64(blks, a->blk_size));
NVGPU_COV_WHITELIST(false_positive, NVGPU_MISRA(Rule, 14_3), "Bug 2615925")
nvgpu_assert(a->nr_fixed_allocs < U64_MAX);
a->nr_fixed_allocs++;
alloc_unlock(na);
alloc_dbg(na, "Alloc-fixed 0x%-10llx 0x%-5llx [bits=0x%llx (%llu)]",
base, len, blks, blks);
return base;
fail:
alloc_unlock(na);
alloc_dbg(na, "Alloc-fixed failed! (0x%llx)", base);
return 0;
}
/*
* Two possibilities for this function: either we are freeing a fixed allocation
* or we are freeing a regular alloc but with GPU_ALLOC_NO_ALLOC_PAGE defined.
*
* Note: this function does very little error checking, so misusing it can
* easily corrupt the allocator's state.
*/
static void nvgpu_bitmap_free_fixed(struct nvgpu_allocator *na,
u64 base, u64 len)
{
struct nvgpu_bitmap_allocator *a = bitmap_allocator(na);
u64 blks, offs;
offs = base >> a->blk_shift;
if (nvgpu_safe_mult_u64(offs, a->blk_size) != base) {
nvgpu_do_assert();
return;
}
offs = nvgpu_safe_sub_u64(offs, a->bit_offs);
blks = len >> a->blk_shift;
if (nvgpu_safe_mult_u64(blks, a->blk_size) != len) {
blks++;
}
alloc_lock(na);
nvgpu_assert(offs <= U32_MAX);
nvgpu_assert(blks <= (u32)INT_MAX);
nvgpu_bitmap_clear(a->bitmap, (u32)offs, (u32)blks);
a->bytes_freed = nvgpu_safe_add_u64(a->bytes_freed,
nvgpu_safe_mult_u64(blks, a->blk_size));
alloc_unlock(na);
alloc_dbg(na, "Free-fixed 0x%-10llx 0x%-5llx [bits=0x%llx (%llu)]",
base, len, blks, blks);
}
/*
* Add the passed alloc to the tree of stored allocations.
*/
static void insert_alloc_metadata(struct nvgpu_bitmap_allocator *a,
struct nvgpu_bitmap_alloc *alloc)
{
alloc->alloc_entry.key_start = alloc->base;
alloc->alloc_entry.key_end = nvgpu_safe_add_u64(alloc->base,
alloc->length);
nvgpu_rbtree_insert(&alloc->alloc_entry, &a->allocs);
}
/*
* Find and remove meta-data from the outstanding allocations.
*/
static struct nvgpu_bitmap_alloc *find_alloc_metadata(
struct nvgpu_bitmap_allocator *a, u64 addr)
{
struct nvgpu_bitmap_alloc *alloc;
struct nvgpu_rbtree_node *node = NULL;
nvgpu_rbtree_search(addr, &node, a->allocs);
if (node == NULL) {
return NULL;
}
alloc = nvgpu_bitmap_alloc_from_rbtree_node(node);
nvgpu_rbtree_unlink(node, &a->allocs);
return alloc;
}
/*
* The tree of alloc meta-data stores the address of the alloc, not the
* bit offset.
*/
static int nvgpu_bitmap_store_alloc(struct nvgpu_bitmap_allocator *a,
u64 addr, u64 len)
{
struct nvgpu_bitmap_alloc *alloc =
nvgpu_kmem_cache_alloc(a->meta_data_cache);
if (alloc == NULL) {
return -ENOMEM;
}
alloc->base = addr;
alloc->length = len;
insert_alloc_metadata(a, alloc);
return 0;
}
/*
* @len is in bytes. This routine will figure out the right number of bits to
* actually allocate. The return is the address in bytes as well.
*
* This is a first-fit allocator. It checks the input parameters for
* validity, acquires the alloc_lock, searches the bitmap for the first
* free region large enough to satisfy the requested number of bits using
* bitmap_find_next_zero_area(), and then releases the alloc_lock.
*/
static u64 nvgpu_bitmap_balloc(struct nvgpu_allocator *na, u64 len)
{
u64 tmp_u64, addr;
u32 blks;
unsigned long offs, adjusted_offs, limit;
struct nvgpu_bitmap_allocator *a = bitmap_allocator(na);
if (len == 0ULL) {
alloc_dbg(na, "len = 0, Alloc failed!");
return 0;
}
tmp_u64 = len >> a->blk_shift;
nvgpu_assert(tmp_u64 <= U32_MAX);
blks = (u32)tmp_u64;
if (nvgpu_safe_mult_u64(blks, a->blk_size) != len) {
blks++;
}
alloc_lock(na);
/*
* First look from next_blk and onwards...
*/
offs = bitmap_find_next_zero_area(a->bitmap, a->num_bits,
a->next_blk, blks, 0);
if (offs >= a->num_bits) {
/*
* If that didn't work try the remaining area. Since there can
* be available space that spans across a->next_blk we need to
* search up to the first set bit after that.
*/
limit = find_next_bit(a->bitmap, a->num_bits, a->next_blk);
offs = bitmap_find_next_zero_area(a->bitmap, limit,
0, blks, 0);
if (offs >= a->next_blk) {
goto fail;
}
}
nvgpu_assert(offs <= U32_MAX);
nvgpu_bitmap_set(a->bitmap, (u32)offs, blks);
a->next_blk = offs + blks;
adjusted_offs = nvgpu_safe_add_u64(offs, a->bit_offs);
addr = nvgpu_safe_mult_u64(((u64)adjusted_offs), a->blk_size);
/*
* Only do meta-data storage if we are allowed to allocate storage for
* that meta-data. The issue with using malloc and friends is that in
* latency- and success-critical paths an alloc_page() call can either
* sleep for a potentially long time or fail. Since neither of those may
* be acceptable, assume that the caller will keep whatever data it needs
* around to successfully free this allocation.
*/
if ((a->flags & GPU_ALLOC_NO_ALLOC_PAGE) == 0ULL) {
if (nvgpu_bitmap_store_alloc(a, addr,
blks * a->blk_size) != 0) {
goto fail_reset_bitmap;
}
}
alloc_dbg(na, "Alloc 0x%-10llx 0x%-5llx [bits=0x%x (%u)]",
addr, len, blks, blks);
NVGPU_COV_WHITELIST(false_positive, NVGPU_MISRA(Rule, 14_3), "Bug 2615925")
nvgpu_assert(a->nr_allocs < U64_MAX);
a->nr_allocs++;
a->bytes_alloced = nvgpu_safe_add_u64(a->bytes_alloced,
nvgpu_safe_mult_u64(blks, a->blk_size));
alloc_unlock(na);
return addr;
fail_reset_bitmap:
nvgpu_assert(blks <= (u32)INT_MAX);
nvgpu_assert(offs <= U32_MAX);
nvgpu_bitmap_clear(a->bitmap, (u32)offs, blks);
fail:
a->next_blk = 0;
alloc_unlock(na);
alloc_dbg(na, "Alloc failed!");
return 0;
}
static void nvgpu_bitmap_free(struct nvgpu_allocator *na, u64 addr)
{
struct nvgpu_bitmap_allocator *a = bitmap_allocator(na);
struct nvgpu_bitmap_alloc *alloc = NULL;
u64 offs, adjusted_offs, blks;
alloc_lock(na);
if ((a->flags & GPU_ALLOC_NO_ALLOC_PAGE) != 0ULL) {
(void) WARN(true,
"Using wrong free for NO_ALLOC_PAGE bitmap allocator");
goto done;
}
alloc = find_alloc_metadata(a, addr);
if (alloc == NULL) {
goto done;
}
/*
* The address comes from the adjusted offset (i.e. the bit offset with
* a->bit_offs added), so start with that and then work out the real
* offset into the bitmap.
*/
adjusted_offs = addr >> a->blk_shift;
offs = adjusted_offs - a->bit_offs;
blks = alloc->length >> a->blk_shift;
nvgpu_assert(blks <= (u32)INT_MAX);
nvgpu_assert(offs <= U32_MAX);
nvgpu_bitmap_clear(a->bitmap, (u32)offs, (u32)blks);
alloc_dbg(na, "Free 0x%-10llx", addr);
a->bytes_freed = nvgpu_safe_add_u64(a->bytes_freed, alloc->length);
done:
if ((a->meta_data_cache != NULL) && (alloc != NULL)) {
nvgpu_kmem_cache_free(a->meta_data_cache, alloc);
}
alloc_unlock(na);
}
static void nvgpu_bitmap_alloc_destroy(struct nvgpu_allocator *na)
{
struct nvgpu_bitmap_allocator *a = bitmap_allocator(na);
struct nvgpu_bitmap_alloc *alloc;
struct nvgpu_rbtree_node *node = NULL;
/*
* Kill any outstanding allocations.
*/
nvgpu_rbtree_enum_start(0, &node, a->allocs);
while (node != NULL) {
alloc = nvgpu_bitmap_alloc_from_rbtree_node(node);
nvgpu_rbtree_unlink(node, &a->allocs);
nvgpu_kmem_cache_free(a->meta_data_cache, alloc);
nvgpu_rbtree_enum_start(0, &node, a->allocs);
}
nvgpu_kmem_cache_destroy(a->meta_data_cache);
nvgpu_kfree(nvgpu_alloc_to_gpu(na), a->bitmap);
nvgpu_kfree(nvgpu_alloc_to_gpu(na), a);
}
#ifdef __KERNEL__
static void nvgpu_bitmap_print_stats(struct nvgpu_allocator *na,
struct seq_file *s, int lock)
{
struct nvgpu_bitmap_allocator *a = bitmap_allocator(na);
alloc_pstat(s, na, "Bitmap allocator params:");
alloc_pstat(s, na, " start = 0x%llx", a->base);
alloc_pstat(s, na, " end = 0x%llx", a->base + a->length);
alloc_pstat(s, na, " blks = 0x%llx", a->num_bits);
/* Actual stats. */
alloc_pstat(s, na, "Stats:");
alloc_pstat(s, na, " Number allocs = 0x%llx", a->nr_allocs);
alloc_pstat(s, na, " Number fixed = 0x%llx", a->nr_fixed_allocs);
alloc_pstat(s, na, " Bytes alloced = 0x%llx", a->bytes_alloced);
alloc_pstat(s, na, " Bytes freed = 0x%llx", a->bytes_freed);
alloc_pstat(s, na, " Outstanding = 0x%llx",
a->bytes_alloced - a->bytes_freed);
}
#endif
NVGPU_COV_WHITELIST_BLOCK_BEGIN(false_positive, 1, NVGPU_MISRA(Rule, 8_7), "Bug 2823817")
static const struct nvgpu_allocator_ops bitmap_ops = {
NVGPU_COV_WHITELIST_BLOCK_END(NVGPU_MISRA(Rule, 8_7))
.alloc = nvgpu_bitmap_balloc,
.free_alloc = nvgpu_bitmap_free,
.alloc_fixed = nvgpu_bitmap_balloc_fixed,
.free_fixed = nvgpu_bitmap_free_fixed,
.base = nvgpu_bitmap_alloc_base,
.length = nvgpu_bitmap_alloc_length,
.end = nvgpu_bitmap_alloc_end,
.inited = nvgpu_bitmap_alloc_inited,
.fini = nvgpu_bitmap_alloc_destroy,
#ifdef __KERNEL__
.print_stats = nvgpu_bitmap_print_stats,
#endif
};
static int nvgpu_bitmap_check_argument_limits(u64 base, u64 length,
u64 blk_size)
{
bool is_blk_size_pwr_2;
bool is_base_aligned;
bool is_length_aligned;
nvgpu_assert(blk_size > 0ULL);
is_blk_size_pwr_2 = (blk_size & (blk_size - 1ULL)) == 0ULL;
is_base_aligned = (base & (blk_size - 1ULL)) == 0ULL;
is_length_aligned = (length & (blk_size - 1ULL)) == 0ULL;
if (!is_blk_size_pwr_2) {
nvgpu_do_assert();
return -EINVAL;
}
if (!is_base_aligned || !is_length_aligned) {
return -EINVAL;
}
if (length == 0ULL) {
return -EINVAL;
}
return 0;
}
int nvgpu_bitmap_allocator_init(struct gk20a *g, struct nvgpu_allocator *na,
const char *name, u64 base, u64 length,
u64 blk_size, u64 flags)
{
int err;
struct nvgpu_bitmap_allocator *a;
err = nvgpu_bitmap_check_argument_limits(base, length, blk_size);
if (err != 0) {
return err;
}
if (base == 0ULL) {
base = blk_size;
length = nvgpu_safe_sub_u64(length, blk_size);
}
a = nvgpu_kzalloc(g, sizeof(struct nvgpu_bitmap_allocator));
if (a == NULL) {
return -ENOMEM;
}
err = nvgpu_alloc_common_init(na, g, name, a, false, &bitmap_ops);
if (err != 0) {
goto fail;
}
if ((flags & GPU_ALLOC_NO_ALLOC_PAGE) == 0ULL) {
a->meta_data_cache = nvgpu_kmem_cache_create(g,
sizeof(struct nvgpu_bitmap_alloc));
if (a->meta_data_cache == NULL) {
err = -ENOMEM;
goto fail;
}
}
a->base = base;
a->length = length;
a->blk_size = blk_size;
a->blk_shift = nvgpu_safe_sub_u64(nvgpu_ffs(a->blk_size), 1UL);
a->num_bits = length >> a->blk_shift;
a->bit_offs = a->base >> a->blk_shift;
a->flags = flags;
a->allocs = NULL;
a->bitmap = nvgpu_kcalloc(g, BITS_TO_LONGS(a->num_bits),
sizeof(*a->bitmap));
if (a->bitmap == NULL) {
err = -ENOMEM;
goto fail;
}
nvgpu_smp_wmb();
a->inited = true;
#ifdef CONFIG_DEBUG_FS
nvgpu_init_alloc_debug(g, na);
#endif
alloc_dbg(na, "New allocator: type bitmap");
alloc_dbg(na, " base 0x%llx", a->base);
alloc_dbg(na, " bit_offs 0x%llx", a->bit_offs);
alloc_dbg(na, " size 0x%llx", a->length);
alloc_dbg(na, " blk_size 0x%llx", a->blk_size);
alloc_dbg(na, " flags 0x%llx", a->flags);
return 0;
fail:
if (a->meta_data_cache != NULL) {
nvgpu_kmem_cache_destroy(a->meta_data_cache);
}
nvgpu_kfree(g, a);
return err;
}
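For illustration only, a minimal sketch of how the bitmap allocator above might be exercised through the generic nvgpu_allocator interface (nvgpu_alloc()/nvgpu_free()/nvgpu_alloc_destroy() from nvgpu_allocator.c further below); the gk20a pointer, allocator name, and address range are placeholder assumptions, not values taken from the driver.

#include <nvgpu/allocator.h>
#include <nvgpu/gk20a.h>

/* Illustrative sketch, not part of this file. */
static int nvgpu_bitmap_allocator_example(struct gk20a *g)
{
	struct nvgpu_allocator na;
	u64 addr;
	int err;

	/* 1 MB space at base 0x10000, managed in 4 KB blocks, no flags. */
	err = nvgpu_bitmap_allocator_init(g, &na, "example", 0x10000ULL,
					  0x100000ULL, 0x1000ULL, 0ULL);
	if (err != 0) {
		return err;
	}

	/* First-fit allocation of two blocks; a return of 0 means failure. */
	addr = nvgpu_alloc(&na, 0x2000ULL);
	if (addr != 0ULL) {
		nvgpu_free(&na, addr);
	}

	nvgpu_alloc_destroy(&na);
	return 0;
}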


@@ -0,0 +1,183 @@
/*
* Copyright (c) 2016-2017, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#ifndef BITMAP_ALLOCATOR_PRIV_H
#define BITMAP_ALLOCATOR_PRIV_H
/**
* @file
*
* Implementation of a bitmap allocator.
*/
#include <nvgpu/rbtree.h>
#include <nvgpu/kmem.h>
struct nvgpu_allocator;
/**
* Structure to hold the implementation details of the bitmap allocator.
*/
struct nvgpu_bitmap_allocator {
/**
* Pointer to the common allocator structure.
*/
struct nvgpu_allocator *owner;
/**
* Base address of the space.
*/
u64 base;
/**
* Length of the space.
*/
u64 length;
/**
* Size that corresponds to 1 bit.
*/
u64 blk_size;
/**
* Bit shift to divide by blk_size.
*/
u64 blk_shift;
/**
* Number of allocatable bits.
*/
u64 num_bits;
/**
* Bit offset of the allocator's base address (i.e. base >> blk_shift).
*/
u64 bit_offs;
/**
* Optimization for making repeated allocations faster. Keep track of
* the next bit after the most recent allocation. This is where the next
* search will start from. This should make allocation faster in cases
* where lots of allocations get made one after another. It shouldn't
* have a negative impact on the case where the allocator is fragmented.
*/
u64 next_blk;
/**
* The actual bitmap used for allocations.
*/
unsigned long *bitmap;
/**
* Tree of outstanding allocations.
*/
struct nvgpu_rbtree_node *allocs;
/**
* Metadata cache of allocations (contains address and size of
* allocations).
*/
struct nvgpu_kmem_cache *meta_data_cache;
/**
* Configuration flags of the allocator. See \a GPU_ALLOC_* flags.
*/
u64 flags;
/**
* Boolean to indicate if the allocator has been fully initialized.
*/
bool inited;
/**
* Statistics: track the number of non-fixed allocations.
*/
u64 nr_allocs;
/**
* Statistics: track the number of fixed allocations.
*/
u64 nr_fixed_allocs;
/**
* Statistics: total number of bytes allocated for both fixed and non-
* fixed allocations.
*/
u64 bytes_alloced;
/**
* Statistics: total number of bytes freed for both fixed and non-fixed
* allocations.
*/
u64 bytes_freed;
};
/**
* Structure to hold the allocation metadata.
*/
struct nvgpu_bitmap_alloc {
/**
* Base address of the allocation.
*/
u64 base;
/**
* Size of the allocation.
*/
u64 length;
/**
* RB tree of allocations.
*/
struct nvgpu_rbtree_node alloc_entry;
};
/**
* @brief Given a tree node, retrieve the metadata of the allocation.
*
* @param[in] node Pointer to the tree node.
*
* @return pointer to the struct nvgpu_bitmap_alloc of the node.
*/
static inline struct nvgpu_bitmap_alloc *
nvgpu_bitmap_alloc_from_rbtree_node(struct nvgpu_rbtree_node *node)
{
return (struct nvgpu_bitmap_alloc *)
((uintptr_t)node - offsetof(struct nvgpu_bitmap_alloc, alloc_entry));
};
/**
* @brief Given a generic allocator context, retrieve a pointer to the bitmap
* allocator context structure.
*
* @param[in] a Pointer to nvgpu allocator.
*
* @return pointer to the struct nvgpu_bitmap_allocator.
*/
static inline struct nvgpu_bitmap_allocator *bitmap_allocator(
struct nvgpu_allocator *a)
{
return (struct nvgpu_bitmap_allocator *)(a)->priv;
}
#endif
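As an illustration of how the fields above relate (not part of the header): with blk_size = 4 KB, blk_shift is 12, num_bits = length >> blk_shift, and bit_offs = base >> blk_shift. The hypothetical helpers below show the address-to-bit mapping that the allocator uses internally.

/* Illustrative helpers only; names are hypothetical. */
static inline u64 example_bitmap_addr_to_bit(struct nvgpu_bitmap_allocator *ba,
					     u64 addr)
{
	/* Matches the offs computation in nvgpu_bitmap_balloc_fixed(). */
	return (addr >> ba->blk_shift) - ba->bit_offs;
}

static inline u64 example_bitmap_bit_to_addr(struct nvgpu_bitmap_allocator *ba,
					     u64 bit)
{
	/* Inverse mapping: blk_size is a power of two, so shift == multiply. */
	return (bit + ba->bit_offs) << ba->blk_shift;
}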


File diff suppressed because it is too large.


@@ -0,0 +1,451 @@
/*
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#ifndef NVGPU_MM_BUDDY_ALLOCATOR_PRIV_H
#define NVGPU_MM_BUDDY_ALLOCATOR_PRIV_H
/**
* @file
*
* Implementation of the buddy allocator.
*/
#include <nvgpu/rbtree.h>
#include <nvgpu/list.h>
#include <nvgpu/static_analysis.h>
struct nvgpu_kmem_cache;
struct nvgpu_allocator;
struct vm_gk20a;
/**
* Structure that defines each buddy as an element in a binary tree.
*/
struct nvgpu_buddy {
/**
* Parent node.
*/
struct nvgpu_buddy *parent;
/**
* This node's buddy.
*/
struct nvgpu_buddy *buddy;
/**
* Lower address sub-node.
*/
struct nvgpu_buddy *left;
/**
* Higher address sub-node.
*/
struct nvgpu_buddy *right;
/**
* List entry for various lists.
*/
struct nvgpu_list_node buddy_entry;
/**
* RB tree of allocations.
*/
struct nvgpu_rbtree_node alloced_entry;
/**
* Start address of this buddy.
*/
u64 start;
/**
* End address of this buddy.
*/
u64 end;
/**
* Buddy order.
*/
u64 order;
/**
* Possible flags to use in the buddy allocator. Set in the #flags
* member.
* @addtogroup BALLOC_BUDDY_FLAGS
* @{
*/
#define BALLOC_BUDDY_ALLOCED 0x1U
#define BALLOC_BUDDY_SPLIT 0x2U
#define BALLOC_BUDDY_IN_LIST 0x4U
/**@}*/
/**
* Buddy flags among the @ref BALLOC_BUDDY_FLAGS
*/
u32 flags;
/**
* Possible PDE sizes. This allows for grouping like sized allocations
* into the same PDE. Set in the #pte_size member.
* @addtogroup BALLOC_PTE_SIZE
* @{
*/
#define BALLOC_PTE_SIZE_ANY (~0U)
#define BALLOC_PTE_SIZE_INVALID 0U
#define BALLOC_PTE_SIZE_SMALL 1U
#define BALLOC_PTE_SIZE_BIG 2U
/**@}*/
/**
* Size of the PDE this buddy is using. Possible values in
* @ref BALLOC_PTE_SIZE
*/
u32 pte_size;
};
/**
* @brief Given a list node, retrieve the buddy.
*
* @param[in] node Pointer to the list node.
*
* @return pointer to the struct nvgpu_buddy of the node.
*/
static inline struct nvgpu_buddy *
nvgpu_buddy_from_buddy_entry(struct nvgpu_list_node *node)
{
return (struct nvgpu_buddy *)
((uintptr_t)node - offsetof(struct nvgpu_buddy, buddy_entry));
};
/**
* @brief Given a tree node, retrieve the buddy.
*
* @param[in] node Pointer to the tree node.
*
* @return pointer to the struct nvgpu_buddy of the node.
*/
static inline struct nvgpu_buddy *
nvgpu_buddy_from_rbtree_node(struct nvgpu_rbtree_node *node)
{
return (struct nvgpu_buddy *)
((uintptr_t)node - offsetof(struct nvgpu_buddy, alloced_entry));
};
/**
* @brief Macro generator to create is/set/clr operations for each of the
* flags in @ref BALLOC_BUDDY_FLAGS.
*
* The created functions are:
*
* bool buddy_is_alloced(struct nvgpu_buddy *b);
* void buddy_set_alloced(struct nvgpu_buddy *b);
* void buddy_clr_alloced(struct nvgpu_buddy *b);
*
* bool buddy_is_split(struct nvgpu_buddy *b);
* void buddy_set_split(struct nvgpu_buddy *b);
* void buddy_clr_split(struct nvgpu_buddy *b);
*
* bool buddy_is_in_list(struct nvgpu_buddy *b);
* void buddy_set_in_list(struct nvgpu_buddy *b);
* void buddy_clr_in_list(struct nvgpu_buddy *b);
*
* @param[in] flag Lower-case flag name: alloced, split or in_list
* @param[in] flag_up Upper-case flag suffix used to form the
* @ref BALLOC_BUDDY_FLAGS name: ALLOCED, SPLIT or IN_LIST
*
* @{
*/
#define nvgpu_buddy_allocator_flag_ops(flag, flag_up) \
static inline bool buddy_is_ ## flag(struct nvgpu_buddy *b) \
{ \
return (b->flags & BALLOC_BUDDY_ ## flag_up) != 0U; \
} \
static inline void buddy_set_ ## flag(struct nvgpu_buddy *b) \
{ \
b->flags |= BALLOC_BUDDY_ ## flag_up; \
} \
static inline void buddy_clr_ ## flag(struct nvgpu_buddy *b) \
{ \
b->flags &= ~BALLOC_BUDDY_ ## flag_up; \
}
nvgpu_buddy_allocator_flag_ops(alloced, ALLOCED);
nvgpu_buddy_allocator_flag_ops(split, SPLIT);
nvgpu_buddy_allocator_flag_ops(in_list, IN_LIST);
/**@} */
/**
* Structure to keep information for a fixed allocation.
*/
struct nvgpu_fixed_alloc {
/**
* List of buddies.
*/
struct nvgpu_list_node buddies;
/**
* RB tree of fixed allocations.
*/
struct nvgpu_rbtree_node alloced_entry;
/**
* Start of fixed block.
*/
u64 start;
/**
* End address.
*/
u64 end;
};
/**
* @brief Given a tree node, retrieve the fixed allocation.
*
* @param[in] node Pointer to the tree node.
*
* @return pointer to the struct nvgpu_fixed_alloc of the node.
*/
static inline struct nvgpu_fixed_alloc *
nvgpu_fixed_alloc_from_rbtree_node(struct nvgpu_rbtree_node *node)
{
return (struct nvgpu_fixed_alloc *)
((uintptr_t)node - offsetof(struct nvgpu_fixed_alloc, alloced_entry));
};
/**
* GPU buddy allocator for the various GPU address spaces. Each addressable unit
* doesn't have to correspond to a byte. In some cases each unit is a more
* complex object such as a comp_tag line or the like.
*
* The max order is computed based on the size of the minimum order and the size
* of the address space.
*
* #blk_size is the size of an order 0 buddy.
*/
struct nvgpu_buddy_allocator {
/**
* Pointer to the common allocator structure.
*/
struct nvgpu_allocator *owner;
/**
* Parent VM - can be NULL.
*/
struct vm_gk20a *vm;
/**
* Base address of the space.
*/
u64 base;
/**
* Length of the space.
*/
u64 length;
/**
* Size of order 0 allocation.
*/
u64 blk_size;
/**
* Shift to divide by blk_size.
*/
u64 blk_shift;
/**
* Internal: real start (aligned to #blk_size).
*/
u64 start;
/**
* Internal: real end, trimmed if needed.
*/
u64 end;
/**
* Internal: count of objects in space.
*/
u64 count;
/**
* Internal: count of blks in the space.
*/
u64 blks;
/**
* Internal: specific maximum order.
*/
u64 max_order;
/**
* Outstanding allocations.
*/
struct nvgpu_rbtree_node *alloced_buddies;
/**
* Outstanding fixed allocations.
*/
struct nvgpu_rbtree_node *fixed_allocs;
/**
* List of carveouts.
*/
struct nvgpu_list_node co_list;
/**
* Cache of allocations (contains address and size of allocations).
*/
struct nvgpu_kmem_cache *buddy_cache;
/**
* Impose an upper bound on the maximum order.
*/
#define GPU_BALLOC_ORDER_LIST_LEN (GPU_BALLOC_MAX_ORDER + 1U)
/**
* List of buddies.
*/
struct nvgpu_list_node buddy_list[GPU_BALLOC_ORDER_LIST_LEN];
/**
* Length of the buddy list.
*/
u64 buddy_list_len[GPU_BALLOC_ORDER_LIST_LEN];
/**
* Number of split nodes.
*/
u64 buddy_list_split[GPU_BALLOC_ORDER_LIST_LEN];
/**
* Number of allocated nodes.
*/
u64 buddy_list_alloced[GPU_BALLOC_ORDER_LIST_LEN];
/**
* This is for when the allocator is managing a GVA space (the
* #GPU_ALLOC_GVA_SPACE bit is set in #flags). This requires
* that we group like sized allocations into PDE blocks.
*/
u64 pte_blk_order;
/**
* Boolean to indicate if the allocator has been fully initialized.
*/
bool initialized;
/**
* Boolean set to true after the first allocation is made.
*/
bool alloc_made;
/**
* Flags used by the allocator, as defined by @ref GPU_ALLOC_FLAGS
*/
u64 flags;
/**
* Statistics: total number of bytes allocated.
*/
u64 bytes_alloced;
/**
* Statistics: total number of bytes allocated taking into account the
* buddy order.
*/
u64 bytes_alloced_real;
/**
* Statistics: total number of bytes freed.
*/
u64 bytes_freed;
};
/**
* @brief Given a generic allocator context, retrieve a pointer to the buddy
* allocator context structure.
*
* @param[in] a Pointer to nvgpu allocator.
*
* @return pointer to the struct nvgpu_buddy_allocator.
*/
static inline struct nvgpu_buddy_allocator *buddy_allocator(
struct nvgpu_allocator *a)
{
return (struct nvgpu_buddy_allocator *)(a)->priv;
}
/**
* @brief Given a buddy allocator, retrieve the list of buddies of the chosen
* order.
*
* @param[in] a Pointer to the buddy allocator.
* @param[in] order Buddy order.
*
* @return list of buddies whose order is \a order.
*/
static inline struct nvgpu_list_node *balloc_get_order_list(
struct nvgpu_buddy_allocator *a, u64 order)
{
return &a->buddy_list[order];
}
/**
* @brief Convert a buddy order to a length in bytes, based on the block size.
*
* @param[in] a Pointer to the buddy allocator.
* @param[in] order Buddy order.
*
* @return length in bytes.
*/
static inline u64 balloc_order_to_len(struct nvgpu_buddy_allocator *a,
u64 order)
{
return nvgpu_safe_mult_u64(BIT64(order), a->blk_size);
}
/**
* @brief Given a base address, shift it down by the allocator's start address.
*
* @param[in] a Pointer to the buddy allocator.
* @param[in] base Base address.
*
* @return shifted address.
*/
static inline u64 balloc_base_shift(struct nvgpu_buddy_allocator *a,
u64 base)
{
return nvgpu_safe_sub_u64(base, a->start);
}
/**
* @brief Given a shifted address, add back the allocator's start address.
*
* @param[in] a Pointer to the buddy allocator.
* @param[in] base Shifted address.
*
* @return unshifted address.
*/
static inline u64 balloc_base_unshift(struct nvgpu_buddy_allocator *a,
u64 base)
{
return nvgpu_safe_add_u64(base, a->start);
}
/**
* @brief Given a buddy allocator context, retrieve a pointer to the generic
* allocator context structure.
*
* @param[in] a Pointer to nvgpu buddy allocator.
*
* @return pointer to the struct nvgpu_allocator.
*/
static inline struct nvgpu_allocator *balloc_owner(
struct nvgpu_buddy_allocator *a)
{
return a->owner;
}
#endif /* NVGPU_MM_BUDDY_ALLOCATOR_PRIV_H */
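For a concrete feel of the helpers above (illustration only, not part of the header): with blk_size = 4 KB, balloc_order_to_len() for order 3 yields BIT64(3) * 4 KB = 32 KB, and the shift/unshift pair converts between absolute addresses and allocator-relative addresses.

/* Illustrative sketch; the function name is hypothetical. */
static inline bool example_balloc_round_trip(struct nvgpu_buddy_allocator *ba,
					     u64 addr)
{
	/* Length covered by an order-3 buddy: BIT64(3) * blk_size. */
	u64 len = balloc_order_to_len(ba, 3ULL);

	/* Convert to allocator-relative space and back again. */
	u64 rel = balloc_base_shift(ba, addr);
	u64 back = balloc_base_unshift(ba, rel);

	return (back == addr) && (len == (ba->blk_size << 3));
}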


@@ -0,0 +1,216 @@
/*
* gk20a allocator
*
* Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <nvgpu/allocator.h>
#include <nvgpu/gk20a.h>
u64 nvgpu_alloc_length(struct nvgpu_allocator *a)
{
if (a->ops->length != NULL) {
return a->ops->length(a);
}
return 0;
}
u64 nvgpu_alloc_base(struct nvgpu_allocator *a)
{
if (a->ops->base != NULL) {
return a->ops->base(a);
}
return 0;
}
bool nvgpu_alloc_initialized(struct nvgpu_allocator *a)
{
if ((a->ops == NULL) || (a->ops->inited == NULL)) {
return false;
}
return a->ops->inited(a);
}
u64 nvgpu_alloc_end(struct nvgpu_allocator *a)
{
if (a->ops->end != NULL) {
return a->ops->end(a);
}
return 0;
}
u64 nvgpu_alloc_space(struct nvgpu_allocator *a)
{
if (a->ops->space != NULL) {
return a->ops->space(a);
}
return 0;
}
u64 nvgpu_alloc(struct nvgpu_allocator *a, u64 len)
{
return a->ops->alloc(a, len);
}
u64 nvgpu_alloc_pte(struct nvgpu_allocator *a, u64 len, u32 page_size)
{
return a->ops->alloc_pte(a, len, page_size);
}
void nvgpu_free(struct nvgpu_allocator *a, u64 addr)
{
a->ops->free_alloc(a, addr);
}
u64 nvgpu_alloc_fixed(struct nvgpu_allocator *a, u64 base, u64 len,
u32 page_size)
{
if ((U64_MAX - base) < len) {
return 0ULL;
}
if (a->ops->alloc_fixed != NULL) {
return a->ops->alloc_fixed(a, base, len, page_size);
}
return 0;
}
void nvgpu_free_fixed(struct nvgpu_allocator *a, u64 base, u64 len)
{
/*
* If this operation is not defined for the allocator then just do
* nothing. The alternative would be to fall back on the regular
* free but that may be harmful in unexpected ways.
*/
if (a->ops->free_fixed != NULL) {
a->ops->free_fixed(a, base, len);
}
}
int nvgpu_alloc_reserve_carveout(struct nvgpu_allocator *a,
struct nvgpu_alloc_carveout *co)
{
if (a->ops->reserve_carveout != NULL) {
return a->ops->reserve_carveout(a, co);
}
return -ENODEV;
}
void nvgpu_alloc_release_carveout(struct nvgpu_allocator *a,
struct nvgpu_alloc_carveout *co)
{
if (a->ops->release_carveout != NULL) {
a->ops->release_carveout(a, co);
}
}
void nvgpu_alloc_destroy(struct nvgpu_allocator *a)
{
a->ops->fini(a);
nvgpu_mutex_destroy(&a->lock);
(void) memset(a, 0, sizeof(*a));
}
#ifdef __KERNEL__
void nvgpu_alloc_print_stats(struct nvgpu_allocator *na,
struct seq_file *s, int lock)
{
na->ops->print_stats(na, s, lock);
}
#endif
/*
* Handle the common init stuff for an nvgpu_allocator.
*/
int nvgpu_alloc_common_init(struct nvgpu_allocator *a, struct gk20a *g,
const char *name, void *priv, bool dbg,
const struct nvgpu_allocator_ops *ops)
{
if (ops == NULL) {
return -EINVAL;
}
/*
* These are the bare minimum operations required for a sensible
* allocator.
*/
if ((ops->alloc == NULL) || (ops->free_alloc == NULL) ||
(ops->fini == NULL)) {
return -EINVAL;
}
nvgpu_mutex_init(&a->lock);
a->g = g;
a->ops = ops;
a->priv = priv;
a->debug = dbg;
(void) strncpy(a->name, name, sizeof(a->name));
a->name[sizeof(a->name) - 1U] = '\0';
return 0;
}
/*
* Initialize requested type of allocator
*/
int nvgpu_allocator_init(struct gk20a *g, struct nvgpu_allocator *na,
struct vm_gk20a *vm, const char *name,
u64 base, u64 length, u64 blk_size, u64 max_order,
u64 flags, enum nvgpu_allocator_type alloc_type)
{
int err = -EINVAL;
switch (alloc_type) {
case BUDDY_ALLOCATOR:
err = nvgpu_buddy_allocator_init(g, na, vm, name, base, length,
blk_size, max_order, flags);
break;
#ifdef CONFIG_NVGPU_DGPU
case PAGE_ALLOCATOR:
err = nvgpu_page_allocator_init(g, na, name, base, length,
blk_size, flags);
break;
#endif
case BITMAP_ALLOCATOR:
err = nvgpu_bitmap_allocator_init(g, na, name, base, length,
blk_size, flags);
break;
default:
nvgpu_err(g, "Incorrect allocator type, couldn't initialize");
break;
}
if (err < 0) {
nvgpu_err(g, "Failed!");
}
return err;
}
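For illustration, a minimal sketch of selecting an allocator type through nvgpu_allocator_init(); the VA range, block size, and max_order value below are placeholder assumptions rather than values used by the driver.

#include <nvgpu/allocator.h>
#include <nvgpu/gk20a.h>

/* Illustrative sketch: set up a buddy allocator over a 256 MB range. */
static int example_init_buddy(struct gk20a *g, struct vm_gk20a *vm,
			      struct nvgpu_allocator *na)
{
	return nvgpu_allocator_init(g, na, vm, "example-va",
				    0x10000000ULL,	/* base */
				    0x10000000ULL,	/* length: 256 MB */
				    0x1000ULL,		/* blk_size: 4 KB */
				    0ULL,		/* max_order (assumed: allocator default) */
				    0ULL,		/* flags */
				    BUDDY_ALLOCATOR);
}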


File diff suppressed because it is too large.


@@ -0,0 +1,245 @@
/*
* GK20A Address Spaces
*
* Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <nvgpu/trace.h>
#include <nvgpu/kmem.h>
#include <nvgpu/vm.h>
#include <nvgpu/log2.h>
#include <nvgpu/gk20a.h>
#include <nvgpu/string.h>
#include <nvgpu/nvgpu_init.h>
#define VM_NAME_PREFIX "as_"
/* dumb allocator... */
static int generate_as_share_id(struct gk20a_as *as)
{
struct gk20a *g = gk20a_from_as(as);
nvgpu_log_fn(g, " ");
as->last_share_id = nvgpu_safe_add_s32(as->last_share_id, 1);
return as->last_share_id;
}
/* still dumb */
static void release_as_share_id(struct gk20a_as_share *as_share)
{
struct gk20a *g = gk20a_from_as(as_share->as);
nvgpu_log_fn(g, " ");
return;
}
/* address space interfaces for the gk20a module */
static int gk20a_vm_alloc_share(struct gk20a_as_share *as_share,
u32 big_page_size, u32 flags,
u64 va_range_start, u64 va_range_end,
u64 va_range_split)
{
struct gk20a_as *as = as_share->as;
struct gk20a *g = gk20a_from_as(as);
struct mm_gk20a *mm = &g->mm;
struct vm_gk20a *vm;
char name[NVGPU_VM_NAME_LEN] = VM_NAME_PREFIX;
char *p;
u64 user_size;
u64 kernel_size = mm->channel.kernel_size;
u64 pde_size, pde_size_mask;
bool big_pages;
const bool userspace_managed =
(flags & NVGPU_AS_ALLOC_USERSPACE_MANAGED) != 0U;
const bool unified_va =
nvgpu_is_enabled(g, NVGPU_MM_UNIFY_ADDRESS_SPACES) ||
((flags & NVGPU_AS_ALLOC_UNIFIED_VA) != 0U);
nvgpu_log_fn(g, " ");
if (big_page_size == 0U) {
big_pages = false;
big_page_size = g->ops.mm.gmmu.get_default_big_page_size();
} else {
if (!is_power_of_2(big_page_size)) {
return -EINVAL;
}
if ((big_page_size &
nvgpu_mm_get_available_big_page_sizes(g)) == 0U) {
return -EINVAL;
}
big_pages = true;
}
pde_size = BIT64(nvgpu_vm_pde_coverage_bit_count(g, big_page_size));
pde_size_mask = nvgpu_safe_sub_u64(pde_size, U64(1));
if ((va_range_start == 0ULL) ||
((va_range_start & pde_size_mask) != 0ULL)) {
return -EINVAL;
}
if ((va_range_end == 0ULL) ||
((va_range_end & pde_size_mask) != 0ULL)) {
return -EINVAL;
}
if (va_range_start >= va_range_end) {
return -EINVAL;
}
user_size = nvgpu_safe_sub_u64(va_range_end, va_range_start);
if (unified_va || !big_pages) {
if (va_range_split != 0ULL) {
return -EINVAL;
}
} else {
/* non-unified VA: split required */
if ((va_range_split == 0ULL) ||
((va_range_split & pde_size_mask) != 0ULL)) {
return -EINVAL;
}
/* non-unified VA: split range checks */
if ((va_range_split <= va_range_start) ||
(va_range_split >= va_range_end)) {
return -EINVAL;
}
}
nvgpu_log_info(g,
"vm: low_hole=0x%llx, user_size=0x%llx, kernel_size=0x%llx",
va_range_start, user_size, kernel_size);
p = name + strlen(name);
(void) nvgpu_strnadd_u32(p, nvgpu_safe_cast_s32_to_u32(as_share->id),
sizeof(name) - sizeof(VM_NAME_PREFIX), 10U);
vm = nvgpu_vm_init(g, big_page_size,
va_range_start,
user_size,
kernel_size,
va_range_split,
big_pages, userspace_managed, unified_va, name);
if (vm == NULL) {
return -ENOMEM;
}
as_share->vm = vm;
vm->as_share = as_share;
vm->enable_ctag = true;
return 0;
}
int gk20a_as_alloc_share(struct gk20a *g,
u32 big_page_size, u32 flags, u64 va_range_start,
u64 va_range_end, u64 va_range_split,
struct gk20a_as_share **out)
{
struct gk20a_as_share *as_share;
int err = 0;
nvgpu_log_fn(g, " ");
g = nvgpu_get(g);
if (g == NULL) {
return -ENODEV;
}
*out = NULL;
as_share = nvgpu_kzalloc(g, sizeof(*as_share));
if (as_share == NULL) {
return -ENOMEM;
}
as_share->as = &g->as;
as_share->id = generate_as_share_id(as_share->as);
/* this will set as_share->vm. */
err = gk20a_busy(g);
if (err != 0) {
goto failed;
}
err = gk20a_vm_alloc_share(as_share, big_page_size, flags,
va_range_start, va_range_end, va_range_split);
gk20a_idle(g);
if (err != 0) {
goto failed;
}
*out = as_share;
return 0;
failed:
nvgpu_kfree(g, as_share);
return err;
}
int gk20a_vm_release_share(struct gk20a_as_share *as_share)
{
struct vm_gk20a *vm = as_share->vm;
struct gk20a *g = gk20a_from_vm(vm);
nvgpu_log_fn(g, " ");
vm->as_share = NULL;
as_share->vm = NULL;
nvgpu_vm_put(vm);
return 0;
}
/*
* Channels and the device nodes call this to release.
* Once the ref_cnt hits zero the share is deleted.
*/
int gk20a_as_release_share(struct gk20a_as_share *as_share)
{
struct gk20a *g = as_share->vm->mm->g;
int err;
nvgpu_log_fn(g, " ");
err = gk20a_busy(g);
if (err != 0) {
goto release_fail;
}
err = gk20a_vm_release_share(as_share);
gk20a_idle(g);
release_fail:
release_as_share_id(as_share);
nvgpu_put(g);
nvgpu_kfree(g, as_share);
return err;
}
struct gk20a *gk20a_from_as(struct gk20a_as *as)
{
return (struct gk20a *)((uintptr_t)as - offsetof(struct gk20a, as));
}
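A hedged sketch of driving the address-space share API above; the flag choice and the VA range are assumptions and must satisfy the PDE-alignment checks performed in gk20a_vm_alloc_share().

/* Illustrative sketch: allocate and release an address space share. */
static int example_as_share(struct gk20a *g)
{
	struct gk20a_as_share *share = NULL;
	int err;

	/*
	 * big_page_size = 0 selects the default big page size, and with a
	 * unified VA no va_range_split is needed. The range below is an
	 * assumption; it must be non-zero and PDE aligned.
	 */
	err = gk20a_as_alloc_share(g, 0U, NVGPU_AS_ALLOC_UNIFIED_VA,
				   0x10000000ULL,	/* va_range_start */
				   0x4000000000ULL,	/* va_range_end */
				   0ULL,		/* va_range_split */
				   &share);
	if (err != 0) {
		return err;
	}

	return gk20a_as_release_share(share);
}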


@@ -0,0 +1,105 @@
/*
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <nvgpu/bug.h>
#include <nvgpu/bitops.h>
#include <nvgpu/comptags.h>
#include <nvgpu/gk20a.h>
int gk20a_comptaglines_alloc(struct gk20a_comptag_allocator *allocator,
u32 *offset, u32 len)
{
unsigned long addr;
int err = 0;
if (allocator->size == 0UL) {
return -EINVAL;
}
nvgpu_mutex_acquire(&allocator->lock);
addr = bitmap_find_next_zero_area(allocator->bitmap, allocator->size,
0, len, 0);
if (addr < allocator->size) {
/* number zero is reserved; bitmap base is 1 */
nvgpu_assert(addr < U64(U32_MAX));
*offset = 1U + U32(addr);
nvgpu_bitmap_set(allocator->bitmap, U32(addr), len);
} else {
err = -ENOMEM;
}
nvgpu_mutex_release(&allocator->lock);
return err;
}
void gk20a_comptaglines_free(struct gk20a_comptag_allocator *allocator,
u32 offset, u32 len)
{
/* number zero is reserved; bitmap base is 1 */
u32 addr = offset - 1U;
if (allocator->size == 0UL) {
return;
}
WARN_ON(offset == 0U);
WARN_ON(addr > allocator->size);
WARN_ON((unsigned long)addr + (unsigned long)len > allocator->size);
nvgpu_mutex_acquire(&allocator->lock);
nvgpu_bitmap_clear(allocator->bitmap, addr, len);
nvgpu_mutex_release(&allocator->lock);
}
int gk20a_comptag_allocator_init(struct gk20a *g,
struct gk20a_comptag_allocator *allocator,
unsigned long size)
{
nvgpu_mutex_init(&allocator->lock);
/*
* 0th comptag is special and is never used. The base for this bitmap
* is 1, and its size is one less than the size of the comptag store.
*/
size--;
allocator->bitmap = nvgpu_vzalloc(g,
BITS_TO_LONGS(size) * sizeof(long));
if (allocator->bitmap == NULL) {
return -ENOMEM;
}
allocator->size = size;
return 0;
}
void gk20a_comptag_allocator_destroy(struct gk20a *g,
struct gk20a_comptag_allocator *allocator)
{
/*
* called only when exiting the driver (gk20a_remove, or unwinding the
* init stage); no users should be active, so taking the mutex is
* unnecessary here.
*/
allocator->size = 0;
nvgpu_vfree(g, allocator->bitmap);
}
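For illustration, a short round trip through the comptag-line allocator above; note the 1-based offset convention, since comptag line 0 is reserved. The sizes used here are placeholders.

/* Illustrative sketch: allocate and free a run of comptag lines. */
static int example_comptaglines(struct gk20a *g,
				struct gk20a_comptag_allocator *allocator)
{
	u32 offset = 0U;
	int err;

	/* Manage 1024 comptag lines; line 0 stays reserved internally. */
	err = gk20a_comptag_allocator_init(g, allocator, 1024UL);
	if (err != 0) {
		return err;
	}

	/* On success, offset >= 1 (bitmap index plus the reserved line). */
	err = gk20a_comptaglines_alloc(allocator, &offset, 16U);
	if (err == 0) {
		gk20a_comptaglines_free(allocator, offset, 16U);
	}

	gk20a_comptag_allocator_destroy(g, allocator);
	return err;
}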


@@ -0,0 +1,234 @@
/*
* Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <nvgpu/mm.h>
#include <nvgpu/vm.h>
#include <nvgpu/dma.h>
#include <nvgpu/gmmu.h>
#include <nvgpu/enabled.h>
#include <nvgpu/nvgpu_mem.h>
int nvgpu_dma_alloc(struct gk20a *g, size_t size, struct nvgpu_mem *mem)
{
return nvgpu_dma_alloc_flags(g, 0, size, mem);
}
int nvgpu_dma_alloc_flags(struct gk20a *g, unsigned long flags, size_t size,
struct nvgpu_mem *mem)
{
#ifdef CONFIG_NVGPU_DGPU
if (!nvgpu_is_enabled(g, NVGPU_MM_UNIFIED_MEMORY)) {
/*
* Force the no-kernel-mapping flag on because we don't support
* the lack of it for vidmem - the user should not care whether the
* memory is vidmem when using nvgpu_gmmu_alloc_map, and if there is a
* difference, the user should use the flag explicitly anyway.
*
* Incoming flags are ignored here, since bits other than the
* no-kernel-mapping flag are ignored by the vidmem mapping
* functions anyway.
*/
int err = nvgpu_dma_alloc_flags_vid(g,
NVGPU_DMA_NO_KERNEL_MAPPING,
size, mem);
if (err == 0) {
return 0;
}
/*
* Fall back to sysmem (which may then also fail) in case
* vidmem is exhausted.
*/
}
#endif
return nvgpu_dma_alloc_flags_sys(g, flags, size, mem);
}
int nvgpu_dma_alloc_sys(struct gk20a *g, size_t size, struct nvgpu_mem *mem)
{
return nvgpu_dma_alloc_flags_sys(g, 0, size, mem);
}
#ifdef CONFIG_NVGPU_DGPU
int nvgpu_dma_alloc_vid(struct gk20a *g, size_t size, struct nvgpu_mem *mem)
{
return nvgpu_dma_alloc_flags_vid(g,
NVGPU_DMA_NO_KERNEL_MAPPING, size, mem);
}
int nvgpu_dma_alloc_flags_vid(struct gk20a *g, unsigned long flags,
size_t size, struct nvgpu_mem *mem)
{
return nvgpu_dma_alloc_flags_vid_at(g, flags, size, mem, 0);
}
int nvgpu_dma_alloc_vid_at(struct gk20a *g,
size_t size, struct nvgpu_mem *mem, u64 at)
{
return nvgpu_dma_alloc_flags_vid_at(g,
NVGPU_DMA_NO_KERNEL_MAPPING, size, mem, at);
}
#endif
int nvgpu_dma_alloc_map(struct vm_gk20a *vm, size_t size,
struct nvgpu_mem *mem)
{
int err = nvgpu_dma_alloc_map_flags(vm, 0, size, mem);
if (err < 0) {
nvgpu_err(vm->mm->g, "Failed!");
}
return err;
}
int nvgpu_dma_alloc_map_flags(struct vm_gk20a *vm, unsigned long flags,
size_t size, struct nvgpu_mem *mem)
{
int err = 0;
#ifdef CONFIG_NVGPU_DGPU
if (!nvgpu_is_enabled(gk20a_from_vm(vm), NVGPU_MM_UNIFIED_MEMORY)) {
/*
* Force the no-kernel-mapping flag on because we don't support
* the lack of it for vidmem - the user should not care whether the
* memory is vidmem when using nvgpu_dma_alloc_map, and if there is a
* difference, the user should use the flag explicitly anyway.
*/
err = nvgpu_dma_alloc_map_flags_vid(vm,
flags | NVGPU_DMA_NO_KERNEL_MAPPING,
size, mem);
if (err == 0) {
return 0;
}
/*
* Fall back to sysmem (which may then also fail) in case
* vidmem is exhausted.
*/
}
#endif
err = nvgpu_dma_alloc_map_flags_sys(vm, flags, size, mem);
if (err < 0) {
nvgpu_err(vm->mm->g, "Failed!");
}
return err;
}
int nvgpu_dma_alloc_map_sys(struct vm_gk20a *vm, size_t size,
struct nvgpu_mem *mem)
{
int err = 0;
err = nvgpu_dma_alloc_map_flags_sys(vm, 0, size, mem);
if (err < 0) {
nvgpu_err(vm->mm->g, "Failed!");
}
return err;
}
int nvgpu_dma_alloc_map_flags_sys(struct vm_gk20a *vm, unsigned long flags,
size_t size, struct nvgpu_mem *mem)
{
int err = nvgpu_dma_alloc_flags_sys(vm->mm->g, flags, size, mem);
if (err != 0) {
return err;
}
mem->gpu_va = nvgpu_gmmu_map(vm, mem, size, 0,
gk20a_mem_flag_none, false,
mem->aperture);
if (mem->gpu_va == 0ULL) {
err = -ENOMEM;
goto fail_free;
}
return 0;
fail_free:
nvgpu_dma_free(vm->mm->g, mem);
return err;
}
#ifdef CONFIG_NVGPU_DGPU
int nvgpu_dma_alloc_map_vid(struct vm_gk20a *vm, size_t size,
struct nvgpu_mem *mem)
{
return nvgpu_dma_alloc_map_flags_vid(vm,
NVGPU_DMA_NO_KERNEL_MAPPING, size, mem);
}
int nvgpu_dma_alloc_map_flags_vid(struct vm_gk20a *vm, unsigned long flags,
size_t size, struct nvgpu_mem *mem)
{
int err = nvgpu_dma_alloc_flags_vid(vm->mm->g, flags, size, mem);
if (err != 0) {
return err;
}
mem->gpu_va = nvgpu_gmmu_map(vm, mem, size, 0,
gk20a_mem_flag_none, false,
mem->aperture);
if (mem->gpu_va == 0ULL) {
err = -ENOMEM;
goto fail_free;
}
return 0;
fail_free:
nvgpu_dma_free(vm->mm->g, mem);
return err;
}
#endif
void nvgpu_dma_free(struct gk20a *g, struct nvgpu_mem *mem)
{
switch (mem->aperture) {
case APERTURE_SYSMEM:
nvgpu_dma_free_sys(g, mem);
break;
#ifdef CONFIG_NVGPU_DGPU
case APERTURE_VIDMEM:
nvgpu_dma_free_vid(g, mem);
break;
#endif
default:
/* like free() on "null" memory */
break;
}
}
void nvgpu_dma_unmap_free(struct vm_gk20a *vm, struct nvgpu_mem *mem)
{
if (mem->gpu_va != 0ULL) {
nvgpu_gmmu_unmap(vm, mem, mem->gpu_va);
}
mem->gpu_va = 0;
nvgpu_dma_free(vm->mm->g, mem);
}
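For illustration, a minimal sysmem round trip through the helpers above: allocate, map into the GMMU, then unmap and free. The buffer size is a placeholder.

#include <nvgpu/dma.h>
#include <nvgpu/nvgpu_mem.h>

/* Illustrative sketch: sysmem alloc + GMMU map, then tear down. */
static int example_dma_round_trip(struct vm_gk20a *vm)
{
	struct nvgpu_mem mem = { };
	int err;

	/* 64 KB sysmem buffer; mem.gpu_va is valid on success. */
	err = nvgpu_dma_alloc_map_sys(vm, 0x10000, &mem);
	if (err != 0) {
		return err;
	}

	/* ... use mem.gpu_va (GPU) and mem.cpu_va (CPU) here ... */

	nvgpu_dma_unmap_free(vm, &mem);
	return 0;
}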


File diff suppressed because it is too large.


@@ -0,0 +1,514 @@
/*
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <nvgpu/bug.h>
#include <nvgpu/log.h>
#include <nvgpu/dma.h>
#include <nvgpu/gmmu.h>
#include <nvgpu/nvgpu_mem.h>
#include <nvgpu/list.h>
#include <nvgpu/log2.h>
#include <nvgpu/gk20a.h>
#include <nvgpu/enabled.h>
#include <nvgpu/static_analysis.h>
#include "pd_cache_priv.h"
static inline struct nvgpu_pd_mem_entry *
nvgpu_pd_mem_entry_from_list_entry(struct nvgpu_list_node *node)
{
return (struct nvgpu_pd_mem_entry *)
((uintptr_t)node -
offsetof(struct nvgpu_pd_mem_entry, list_entry));
};
static inline struct nvgpu_pd_mem_entry *
nvgpu_pd_mem_entry_from_tree_entry(struct nvgpu_rbtree_node *node)
{
return (struct nvgpu_pd_mem_entry *)
((uintptr_t)node -
offsetof(struct nvgpu_pd_mem_entry, tree_entry));
};
static u32 nvgpu_pd_cache_nr(u32 bytes)
{
unsigned long tmp = ilog2((unsigned long)bytes >>
((unsigned long)NVGPU_PD_CACHE_MIN_SHIFT - 1UL));
nvgpu_assert(tmp <= U32_MAX);
return (u32)tmp;
}
static u32 nvgpu_pd_cache_get_nr_entries(struct nvgpu_pd_mem_entry *pentry)
{
BUG_ON(pentry->pd_size == 0);
return (nvgpu_safe_cast_u64_to_u32(NVGPU_PD_CACHE_SIZE)) /
pentry->pd_size;
}
/*
* Return the _physical_ address of a page directory.
*/
u64 nvgpu_pd_gpu_addr(struct gk20a *g, struct nvgpu_gmmu_pd *pd)
{
u64 page_addr;
if (nvgpu_is_enabled(g, NVGPU_SUPPORT_NVLINK)) {
page_addr = nvgpu_mem_get_phys_addr(g, pd->mem);
} else {
page_addr = nvgpu_mem_get_addr(g, pd->mem);
}
return nvgpu_safe_add_u64(page_addr, U64(pd->mem_offs));
}
u32 nvgpu_pd_offset_from_index(const struct gk20a_mmu_level *l, u32 pd_idx)
{
return nvgpu_safe_mult_u32(pd_idx, l->entry_size) / U32(sizeof(u32));
}
void nvgpu_pd_write(struct gk20a *g, struct nvgpu_gmmu_pd *pd,
size_t w, u32 data)
{
u64 tmp_offset = nvgpu_safe_add_u64((pd->mem_offs / sizeof(u32)), w);
nvgpu_mem_wr32(g, pd->mem,
nvgpu_safe_cast_u64_to_u32(tmp_offset),
data);
}
int nvgpu_pd_cache_init(struct gk20a *g)
{
struct nvgpu_pd_cache *cache;
u32 i;
/*
* This gets called from finalize_poweron() so we need to make sure we
* don't reinit the pd_cache over and over.
*/
if (g->mm.pd_cache != NULL) {
return 0;
}
cache = nvgpu_kzalloc(g, sizeof(*cache));
if (cache == NULL) {
nvgpu_err(g, "Failed to alloc pd_cache!");
return -ENOMEM;
}
for (i = 0U; i < NVGPU_PD_CACHE_COUNT; i++) {
nvgpu_init_list_node(&cache->full[i]);
nvgpu_init_list_node(&cache->partial[i]);
}
cache->mem_tree = NULL;
nvgpu_mutex_init(&cache->lock);
g->mm.pd_cache = cache;
pd_dbg(g, "PD cache initialized!");
return 0;
}
void nvgpu_pd_cache_fini(struct gk20a *g)
{
u32 i;
struct nvgpu_pd_cache *cache = g->mm.pd_cache;
if (cache == NULL) {
return;
}
for (i = 0U; i < NVGPU_PD_CACHE_COUNT; i++) {
nvgpu_assert(nvgpu_list_empty(&cache->full[i]));
nvgpu_assert(nvgpu_list_empty(&cache->partial[i]));
}
nvgpu_kfree(g, g->mm.pd_cache);
g->mm.pd_cache = NULL;
}
/*
* This is the simple pass-through for PDs that are page sized or larger.
*
* Note: this does not need the cache lock since it does not modify any of the
* PD cache data structures.
*/
int nvgpu_pd_cache_alloc_direct(struct gk20a *g,
struct nvgpu_gmmu_pd *pd, u32 bytes)
{
int err;
unsigned long flags = 0;
pd_dbg(g, "PD-Alloc [D] %u bytes", bytes);
pd->mem = nvgpu_kzalloc(g, sizeof(*pd->mem));
if (pd->mem == NULL) {
nvgpu_err(g, "OOM allocating nvgpu_mem struct!");
return -ENOMEM;
}
/*
* If bytes == NVGPU_CPU_PAGE_SIZE then it's impossible to get a discontiguous DMA
* allocation. Some DMA implementations may, despite this fact, still
* use the contiguous pool for page sized allocations. As such only
* request explicitly contiguous allocs if the page directory is larger
* than the page size. Also, of course, this is all only relevant for
* GPUs not using an IOMMU. If there is an IOMMU, DMA allocs are always
* going to be virtually contiguous and we don't have to force the
* underlying allocations to be physically contiguous as well.
*/
if (!nvgpu_iommuable(g) && (bytes > NVGPU_CPU_PAGE_SIZE)) {
flags = NVGPU_DMA_PHYSICALLY_ADDRESSED;
}
err = nvgpu_dma_alloc_flags(g, flags, bytes, pd->mem);
if (err != 0) {
nvgpu_err(g, "OOM allocating page directory!");
nvgpu_kfree(g, pd->mem);
return -ENOMEM;
}
pd->cached = false;
pd->mem_offs = 0;
return 0;
}
/*
* Make a new nvgpu_pd_mem_entry and allocate a PD from it. Update the passed
* pd to reflect this allocation.
*/
static int nvgpu_pd_cache_alloc_new(struct gk20a *g,
struct nvgpu_pd_cache *cache,
struct nvgpu_gmmu_pd *pd,
u32 bytes)
{
struct nvgpu_pd_mem_entry *pentry;
u64 flags = 0UL;
int32_t err;
pd_dbg(g, "PD-Alloc [C] New: offs=0");
pentry = nvgpu_kzalloc(g, sizeof(*pentry));
if (pentry == NULL) {
nvgpu_err(g, "OOM allocating pentry!");
return -ENOMEM;
}
if (!nvgpu_iommuable(g) && (NVGPU_PD_CACHE_SIZE > NVGPU_CPU_PAGE_SIZE)) {
flags = NVGPU_DMA_PHYSICALLY_ADDRESSED;
}
err = nvgpu_dma_alloc_flags(g, flags,
NVGPU_PD_CACHE_SIZE, &pentry->mem);
if (err != 0) {
nvgpu_kfree(g, pentry);
/* Not enough contiguous space, but a direct
* allocation may work
*/
if (err == -ENOMEM) {
return nvgpu_pd_cache_alloc_direct(g, pd, bytes);
}
nvgpu_err(g, "Unable to DMA alloc!");
return -ENOMEM;
}
pentry->pd_size = bytes;
nvgpu_list_add(&pentry->list_entry,
&cache->partial[nvgpu_pd_cache_nr(bytes)]);
/*
* This allocates the very first PD table in the set of tables in this
* nvgpu_pd_mem_entry.
*/
nvgpu_set_bit(0U, pentry->alloc_map);
pentry->allocs = 1;
/*
* Now update the nvgpu_gmmu_pd to reflect this allocation.
*/
pd->mem = &pentry->mem;
pd->mem_offs = 0;
pd->cached = true;
pentry->tree_entry.key_start = (u64)(uintptr_t)&pentry->mem;
nvgpu_rbtree_insert(&pentry->tree_entry, &cache->mem_tree);
return 0;
}
static int nvgpu_pd_cache_alloc_from_partial(struct gk20a *g,
struct nvgpu_pd_cache *cache,
struct nvgpu_pd_mem_entry *pentry,
struct nvgpu_gmmu_pd *pd)
{
u32 bit_offs;
u32 mem_offs;
u32 nr_bits = nvgpu_pd_cache_get_nr_entries(pentry);
/*
* Find and allocate an open PD.
*/
bit_offs = nvgpu_safe_cast_u64_to_u32(
find_first_zero_bit(pentry->alloc_map, nr_bits));
mem_offs = nvgpu_safe_mult_u32(bit_offs, pentry->pd_size);
pd_dbg(g, "PD-Alloc [C] Partial: offs=%u nr_bits=%d src=0x%p",
bit_offs, nr_bits, pentry);
/* Bitmap full. Something's wrong. */
nvgpu_assert(bit_offs < nr_bits);
nvgpu_set_bit(bit_offs, pentry->alloc_map);
pentry->allocs = nvgpu_safe_add_u32(pentry->allocs, 1U);
/*
* First update the pd.
*/
pd->mem = &pentry->mem;
pd->mem_offs = mem_offs;
pd->cached = true;
/*
* Now make sure the pentry is in the correct list (full vs partial).
*/
if (pentry->allocs >= nr_bits) {
pd_dbg(g, "Adding pentry to full list!");
nvgpu_list_del(&pentry->list_entry);
nvgpu_list_add(&pentry->list_entry,
&cache->full[nvgpu_pd_cache_nr(pentry->pd_size)]);
}
return 0;
}
/*
* Get a partially full nvgpu_pd_mem_entry. Returns NULL if there is no
* partial nvgpu_pd_mem_entry available.
*/
static struct nvgpu_pd_mem_entry *nvgpu_pd_cache_get_partial(
struct nvgpu_pd_cache *cache, u32 bytes)
{
struct nvgpu_list_node *list =
&cache->partial[nvgpu_pd_cache_nr(bytes)];
if (nvgpu_list_empty(list)) {
return NULL;
}
return nvgpu_list_first_entry(list,
nvgpu_pd_mem_entry,
list_entry);
}
/*
* Allocate memory from an nvgpu_mem for the page directory.
*/
static int nvgpu_pd_cache_alloc(struct gk20a *g, struct nvgpu_pd_cache *cache,
struct nvgpu_gmmu_pd *pd, u32 bytes)
{
struct nvgpu_pd_mem_entry *pentry;
int err;
bool bytes_valid;
pd_dbg(g, "PD-Alloc [C] %u bytes", bytes);
bytes_valid = bytes >= NVGPU_PD_CACHE_MIN;
if (bytes_valid) {
bytes_valid = (bytes & nvgpu_safe_sub_u32(bytes, 1U)) == 0U;
}
if (!bytes_valid) {
pd_dbg(g, "PD-Alloc [C] Invalid (bytes=%u)!", bytes);
return -EINVAL;
}
nvgpu_assert(bytes < NVGPU_PD_CACHE_SIZE);
pentry = nvgpu_pd_cache_get_partial(cache, bytes);
if (pentry == NULL) {
err = nvgpu_pd_cache_alloc_new(g, cache, pd, bytes);
} else {
err = nvgpu_pd_cache_alloc_from_partial(g, cache, pentry, pd);
}
if (err != 0) {
nvgpu_err(g, "PD-Alloc [C] Failed!");
}
return err;
}
/*
* Allocate the DMA memory for a page directory. This handles the necessary PD
* cache logistics. Since on Parker and later GPUs some of the page directories
* are smaller than a page, packing these PDs together saves a lot of memory.
*/
int nvgpu_pd_alloc(struct vm_gk20a *vm, struct nvgpu_gmmu_pd *pd, u32 bytes)
{
struct gk20a *g = gk20a_from_vm(vm);
int err;
/*
* Simple case: PD is bigger than a page so just do a regular DMA
* alloc.
*/
if (bytes >= NVGPU_PD_CACHE_SIZE) {
err = nvgpu_pd_cache_alloc_direct(g, pd, bytes);
if (err != 0) {
return err;
}
pd->pd_size = bytes;
return 0;
}
if (g->mm.pd_cache == NULL) {
nvgpu_do_assert();
return -ENOMEM;
}
nvgpu_mutex_acquire(&g->mm.pd_cache->lock);
err = nvgpu_pd_cache_alloc(g, g->mm.pd_cache, pd, bytes);
if (err == 0) {
pd->pd_size = bytes;
}
nvgpu_mutex_release(&g->mm.pd_cache->lock);
return err;
}
static void nvgpu_pd_cache_free_direct(struct gk20a *g,
struct nvgpu_gmmu_pd *pd)
{
pd_dbg(g, "PD-Free [D] 0x%p", pd->mem);
if (pd->mem == NULL) {
return;
}
nvgpu_dma_free(g, pd->mem);
nvgpu_kfree(g, pd->mem);
pd->mem = NULL;
}
static void nvgpu_pd_cache_free_mem_entry(struct gk20a *g,
struct nvgpu_pd_cache *cache,
struct nvgpu_pd_mem_entry *pentry)
{
nvgpu_dma_free(g, &pentry->mem);
nvgpu_list_del(&pentry->list_entry);
nvgpu_rbtree_unlink(&pentry->tree_entry, &cache->mem_tree);
nvgpu_kfree(g, pentry);
}
static void nvgpu_pd_cache_do_free(struct gk20a *g,
struct nvgpu_pd_cache *cache,
struct nvgpu_pd_mem_entry *pentry,
struct nvgpu_gmmu_pd *pd)
{
u32 bit = pd->mem_offs / pentry->pd_size;
/* Mark entry as free. */
nvgpu_clear_bit(bit, pentry->alloc_map);
pentry->allocs = nvgpu_safe_sub_u32(pentry->allocs, 1U);
if (pentry->allocs > 0U) {
/*
* Partially full still. If it was already on the partial list
* this just re-adds it.
*
		 * Since the memory used for the entries is still mapped, on
		 * iGPU make sure the entries are invalidated so that the hw
		 * doesn't accidentally try to prefetch non-existent fb memory.
		 *
		 * Because IOMMU prefetching of invalid pd entries causes an
		 * IOMMU fault, fill them with zero.
*/
if ((nvgpu_iommuable(g)) &&
(NVGPU_PD_CACHE_SIZE > NVGPU_CPU_SMALL_PAGE_SIZE) &&
(pd->mem->cpu_va != NULL)) {
(void)memset(((u8 *)pd->mem->cpu_va + pd->mem_offs), 0,
pd->pd_size);
}
nvgpu_list_del(&pentry->list_entry);
nvgpu_list_add(&pentry->list_entry,
&cache->partial[nvgpu_pd_cache_nr(pentry->pd_size)]);
} else {
/* Empty now so free it. */
nvgpu_pd_cache_free_mem_entry(g, cache, pentry);
}
pd->mem = NULL;
}
static struct nvgpu_pd_mem_entry *nvgpu_pd_cache_look_up(
struct nvgpu_pd_cache *cache,
struct nvgpu_gmmu_pd *pd)
{
struct nvgpu_rbtree_node *node = NULL;
nvgpu_rbtree_search((u64)(uintptr_t)pd->mem, &node,
cache->mem_tree);
if (node == NULL) {
return NULL;
}
return nvgpu_pd_mem_entry_from_tree_entry(node);
}
static void nvgpu_pd_cache_free(struct gk20a *g, struct nvgpu_pd_cache *cache,
struct nvgpu_gmmu_pd *pd)
{
struct nvgpu_pd_mem_entry *pentry;
pd_dbg(g, "PD-Free [C] 0x%p", pd->mem);
pentry = nvgpu_pd_cache_look_up(cache, pd);
if (pentry == NULL) {
nvgpu_do_assert_print(g, "Attempting to free non-existent pd");
return;
}
nvgpu_pd_cache_do_free(g, cache, pentry, pd);
}
void nvgpu_pd_free(struct vm_gk20a *vm, struct nvgpu_gmmu_pd *pd)
{
struct gk20a *g = gk20a_from_vm(vm);
/*
* Simple case: just DMA free.
*/
if (!pd->cached) {
return nvgpu_pd_cache_free_direct(g, pd);
}
nvgpu_mutex_acquire(&g->mm.pd_cache->lock);
nvgpu_pd_cache_free(g, g->mm.pd_cache, pd);
nvgpu_mutex_release(&g->mm.pd_cache->lock);
}

View File

@@ -0,0 +1,177 @@
/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#ifndef NVGPU_GMMU_PD_CACHE_PRIV_H
#define NVGPU_GMMU_PD_CACHE_PRIV_H
/**
* @file
*
* Page directory cache private interface
* --------------------------------------
*
 * To save memory when using sub-page sized PD levels in Pascal and beyond, a way
 * of packing PD tables together is necessary. If a PD table only requires 1024
* bytes, then it is possible to have 4 of these PDs in one page. This is even
* more pronounced for 256 byte PD tables.
*
* This also matters for page directories on any chip when using a 64K page
* granule. Having 4K PDs packed into a 64K page saves a bunch of memory. Even
* more so for the 256B PDs on Pascal+.
*
 * The pd cache is basically just a slab allocator. Each instance of the nvgpu
* driver makes one of these structs:
*
* struct nvgpu_pd_cache {
* struct nvgpu_list_node full[NVGPU_PD_CACHE_COUNT];
* struct nvgpu_list_node partial[NVGPU_PD_CACHE_COUNT];
*
* struct nvgpu_rbtree_node *mem_tree;
* };
*
* There are two sets of lists, the full and the partial. The full lists contain
* pages of memory for which all the memory in that page is in use. The partial
* lists contain partially full pages of memory which can be used for more PD
 * allocations. There are a couple of assumptions here:
*
* 1. PDs greater than or equal to the page size bypass the pd cache.
* 2. PDs are always power of 2 and greater than %NVGPU_PD_CACHE_MIN bytes.
*
 * There are NVGPU_PD_CACHE_COUNT full lists and the same number of partial
 * lists. For a 4KB page NVGPU_PD_CACHE_COUNT is 4, which is enough space for
 * 256, 512, 1024, and 2048 byte PDs. An illustrative size-to-index sketch
 * follows the size macros below.
*
* nvgpu_pd_alloc() will allocate a PD for the GMMU. It will check if the PD
* size is page size or larger and choose the correct allocation scheme - either
* from the PD cache or directly. Similarly nvgpu_pd_free() will free a PD
* allocated by nvgpu_pd_alloc().
*/
#include <nvgpu/bug.h>
#include <nvgpu/log.h>
#include <nvgpu/gmmu.h>
#include <nvgpu/nvgpu_mem.h>
#include <nvgpu/list.h>
#include <nvgpu/rbtree.h>
#include <nvgpu/lock.h>
#define pd_dbg(g, fmt, args...) nvgpu_log(g, gpu_dbg_pd_cache, fmt, ##args)
/**
 * Minimum size of a cache. The number of different caches in the nvgpu_pd_cache
 * structure of course depends on this.
*/
#define NVGPU_PD_CACHE_MIN 256UL
/**
* MIN_SHIFT is the right number of bits to shift to determine
* which list to use in the array of lists.
*/
#define NVGPU_PD_CACHE_MIN_SHIFT 9UL
/**
* Maximum PD cache count. This specifies the number of slabs; since each
 * slab represents a PO2 increase in size, a count of 8 leads to:
*
* NVGPU_PD_CACHE_SIZE = 256B * 2^8 = 64KB
*
* For Linux with 4K pages, if the cache size is larger than 4KB then we
* need to allocate from CMA. This puts a lot of pressure on the CMA space.
 * For a kernel with a PAGE_SIZE of 64K this isn't the case, so allow the
 * PD cache size to be 64K if PAGE_SIZE > 4K (i.e. PAGE_SIZE == 64K).
*/
#ifdef __KERNEL__
# if NVGPU_CPU_PAGE_SIZE > 4096
# define NVGPU_PD_CACHE_COUNT 8UL
# else
# define NVGPU_PD_CACHE_COUNT 4UL
# endif
#else
#define NVGPU_PD_CACHE_COUNT 8UL
#endif
#define NVGPU_PD_CACHE_SIZE (NVGPU_PD_CACHE_MIN * \
(1UL << NVGPU_PD_CACHE_COUNT))
/**
* This structure describes a slab within the slab allocator.
*/
struct nvgpu_pd_mem_entry {
/**
* Structure for storing the PD memory information.
*/
struct nvgpu_mem mem;
/**
* Size of the page directories (not the mem).
*/
u32 pd_size;
/**
	 * alloc_map is a bitmap showing which PDs have been allocated.
	 * The size of mem will always be one page. pd_size will always be
	 * a power of 2.
*/
DECLARE_BITMAP(alloc_map, NVGPU_PD_CACHE_SIZE / NVGPU_PD_CACHE_MIN);
/**
* Total number of allocations in this PD.
*/
u32 allocs;
/**
	 * List node used to link this entry into either a full or partial
	 * list in #nvgpu_pd_cache.
*/
struct nvgpu_list_node list_entry;
/**
	 * Tree node used to link this entry into the mem_tree of
	 * #nvgpu_pd_cache.
*/
struct nvgpu_rbtree_node tree_entry;
};
/**
* A cache for allocating PD memory. This enables smaller PDs to be packed
* into single pages.
*/
struct nvgpu_pd_cache {
/**
	 * Array of lists of completely full nvgpu_pd_mem_entries, one list
	 * per PD size.
*/
struct nvgpu_list_node full[NVGPU_PD_CACHE_COUNT];
/**
	 * Array of lists of partially full nvgpu_pd_mem_entries, one list
	 * per PD size.
*/
struct nvgpu_list_node partial[NVGPU_PD_CACHE_COUNT];
/**
* Tree of all allocated struct nvgpu_mem's for fast look up.
*/
struct nvgpu_rbtree_node *mem_tree;
/**
	 * All access to the cache must be locked. This protects the lists and
* the rb tree.
*/
struct nvgpu_mutex lock;
};
#endif /* NVGPU_GMMU_PD_CACHE_PRIV_H */

View File

@@ -0,0 +1,119 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <nvgpu/gk20a.h>
#include <nvgpu/static_analysis.h>
#include <nvgpu/string.h>
#include <nvgpu/gmmu.h>
u32 nvgpu_gmmu_default_big_page_size(void)
{
return U32(SZ_64K);
}
/*
* MSS NVLINK HW settings are in force_snoop mode.
* This will force all the GPU mappings to be coherent.
* By default the mem aperture is set to sysmem_non_coherent and will use L2
* atomics.
 * Change the target PTE aperture to sysmem_coherent if the mem attribute
 * requests platform atomics, so that the RMW atomic capability is used.
 * A usage sketch follows the function below.
 */
u32 nvgpu_gmmu_aperture_mask(struct gk20a *g,
enum nvgpu_aperture mem_ap,
bool platform_atomic_attr,
u32 sysmem_mask,
u32 sysmem_coh_mask,
u32 vidmem_mask)
{
if (nvgpu_is_enabled(g, NVGPU_SUPPORT_PLATFORM_ATOMIC) &&
platform_atomic_attr) {
mem_ap = APERTURE_SYSMEM_COH;
}
return nvgpu_aperture_mask_raw(g, mem_ap,
sysmem_mask,
sysmem_coh_mask,
vidmem_mask);
}
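/*
 * Editorial sketch (not part of the original file): one way a chip HAL might
 * call nvgpu_gmmu_aperture_mask() above when assembling the aperture field of
 * a PTE. The three mask values are hypothetical placeholders, not real
 * register fields.
 */
#define SKETCH_APERTURE_SYSMEM_NCOH	0x1U
#define SKETCH_APERTURE_SYSMEM_COH	0x2U
#define SKETCH_APERTURE_VIDMEM		0x3U

static u32 nvgpu_gmmu_aperture_field_sketch(struct gk20a *g,
					enum nvgpu_aperture ap,
					bool platform_atomic)
{
	/*
	 * When platform atomics are supported and requested, the coherent
	 * sysmem mask is selected for sysmem buffers.
	 */
	return nvgpu_gmmu_aperture_mask(g, ap, platform_atomic,
					SKETCH_APERTURE_SYSMEM_NCOH,
					SKETCH_APERTURE_SYSMEM_COH,
					SKETCH_APERTURE_VIDMEM);
}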
static char *map_attrs_to_str(char *dest, struct nvgpu_gmmu_attrs *attrs)
{
dest[0] = attrs->cacheable ? 'C' : '-';
dest[1] = attrs->sparse ? 'S' : '-';
dest[2] = attrs->priv ? 'P' : '-';
dest[3] = attrs->valid ? 'V' : '-';
dest[4] = attrs->platform_atomic ? 'A' : '-';
dest[5] = '\0';
return dest;
}
void nvgpu_pte_dbg_print(struct gk20a *g,
struct nvgpu_gmmu_attrs *attrs,
const char *vm_name, u32 pd_idx, u32 mmu_level_entry_size,
u64 virt_addr, u64 phys_addr, u32 page_size, u32 *pte_w)
{
char attrs_str[6];
char ctag_str[32] = "\0";
const char *aperture_str = nvgpu_aperture_str(attrs->aperture);
const char *perm_str = nvgpu_gmmu_perm_str(attrs->rw_flag);
#ifdef CONFIG_NVGPU_COMPRESSION
u64 ctag_tmp = attrs->ctag;
u32 str_len = 0U;
u32 ctag_num = 0U;
/*
* attrs->ctag is incremented to count current page size as well.
* Subtract to get this page's ctag line number.
*/
if (ctag_tmp != 0ULL) {
ctag_tmp = nvgpu_safe_sub_u64(ctag_tmp, page_size);
}
ctag_num = nvgpu_safe_cast_u64_to_u32(ctag_tmp /
g->ops.fb.compression_page_size(g));
(void)strcpy(ctag_str, "ctag=0x\0");
str_len = (u32)strlen(ctag_str);
(void)nvgpu_strnadd_u32(ctag_str + str_len, ctag_num,
nvgpu_safe_sub_u32(31U, str_len), 16U);
#endif
(void)map_attrs_to_str(attrs_str, attrs);
pte_dbg(g, attrs,
"vm=%s "
"PTE: i=%-4u size=%-2u | "
"GPU %#-12llx phys %#-12llx "
"pgsz: %3dkb perm=%-2s kind=%#02x APT=%-6s %-5s "
"%s "
"[0x%08x, 0x%08x]",
vm_name,
pd_idx, mmu_level_entry_size,
virt_addr, phys_addr,
page_size >> 10,
perm_str,
attrs->kind_v,
aperture_str,
attrs_str,
ctag_str,
pte_w[1], pte_w[0]);
}

View File

@@ -0,0 +1,710 @@
/*
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <nvgpu/mm.h>
#include <nvgpu/vm.h>
#include <nvgpu/dma.h>
#include <nvgpu/vm_area.h>
#include <nvgpu/acr.h>
#include <nvgpu/gmmu.h>
#include <nvgpu/vidmem.h>
#include <nvgpu/semaphore.h>
#include <nvgpu/pramin.h>
#include <nvgpu/enabled.h>
#include <nvgpu/errata.h>
#include <nvgpu/ce_app.h>
#include <nvgpu/gk20a.h>
#include <nvgpu/engines.h>
#include <nvgpu/static_analysis.h>
#include <nvgpu/power_features/cg.h>
int nvgpu_mm_suspend(struct gk20a *g)
{
int err;
nvgpu_log_info(g, "MM suspend running...");
#ifdef CONFIG_NVGPU_DGPU
nvgpu_vidmem_thread_pause_sync(&g->mm);
#endif
#ifdef CONFIG_NVGPU_COMPRESSION
g->ops.mm.cache.cbc_clean(g);
#endif
err = g->ops.mm.cache.l2_flush(g, false);
if (err != 0) {
nvgpu_err(g, "l2_flush failed");
return err;
}
if (g->ops.fb.intr.disable != NULL) {
g->ops.fb.intr.disable(g);
}
if (g->ops.mm.mmu_fault.disable_hw != NULL) {
g->ops.mm.mmu_fault.disable_hw(g);
}
nvgpu_log_info(g, "MM suspend done!");
return err;
}
u64 nvgpu_inst_block_addr(struct gk20a *g, struct nvgpu_mem *inst_block)
{
if (nvgpu_is_enabled(g, NVGPU_SUPPORT_NVLINK)) {
return nvgpu_mem_get_phys_addr(g, inst_block);
} else {
return nvgpu_mem_get_addr(g, inst_block);
}
}
u32 nvgpu_inst_block_ptr(struct gk20a *g, struct nvgpu_mem *inst_block)
{
u64 addr = nvgpu_inst_block_addr(g, inst_block) >>
g->ops.ramin.base_shift();
nvgpu_assert(u64_hi32(addr) == 0U);
return u64_lo32(addr);
}
void nvgpu_free_inst_block(struct gk20a *g, struct nvgpu_mem *inst_block)
{
if (nvgpu_mem_is_valid(inst_block)) {
nvgpu_dma_free(g, inst_block);
}
}
int nvgpu_alloc_inst_block(struct gk20a *g, struct nvgpu_mem *inst_block)
{
int err;
nvgpu_log_fn(g, " ");
err = nvgpu_dma_alloc(g, g->ops.ramin.alloc_size(), inst_block);
if (err != 0) {
nvgpu_err(g, "%s: memory allocation failed", __func__);
return err;
}
nvgpu_log_fn(g, "done");
return 0;
}
static int nvgpu_alloc_sysmem_flush(struct gk20a *g)
{
return nvgpu_dma_alloc_sys(g, SZ_4K, &g->mm.sysmem_flush);
}
#ifdef CONFIG_NVGPU_DGPU
static void nvgpu_remove_mm_ce_support(struct mm_gk20a *mm)
{
struct gk20a *g = gk20a_from_mm(mm);
if (mm->vidmem.ce_ctx_id != NVGPU_CE_INVAL_CTX_ID) {
nvgpu_ce_app_delete_context(g, mm->vidmem.ce_ctx_id);
}
mm->vidmem.ce_ctx_id = NVGPU_CE_INVAL_CTX_ID;
nvgpu_vm_put(mm->ce.vm);
}
#endif
static void nvgpu_remove_mm_support(struct mm_gk20a *mm)
{
struct gk20a *g = gk20a_from_mm(mm);
nvgpu_dma_free(g, &mm->mmu_wr_mem);
nvgpu_dma_free(g, &mm->mmu_rd_mem);
#if defined(CONFIG_NVGPU_HAL_NON_FUSA) && defined(CONFIG_NVGPU_NEXT)
if (nvgpu_fb_vab_teardown_hal(g) != 0) {
nvgpu_err(g, "failed to teardown VAB");
}
#endif
if (g->ops.mm.mmu_fault.info_mem_destroy != NULL) {
g->ops.mm.mmu_fault.info_mem_destroy(g);
}
if (g->ops.mm.remove_bar2_vm != NULL) {
g->ops.mm.remove_bar2_vm(g);
}
nvgpu_free_inst_block(g, &mm->bar1.inst_block);
nvgpu_vm_put(mm->bar1.vm);
nvgpu_free_inst_block(g, &mm->pmu.inst_block);
nvgpu_free_inst_block(g, &mm->hwpm.inst_block);
nvgpu_vm_put(mm->pmu.vm);
if (nvgpu_is_enabled(g, NVGPU_SUPPORT_SEC2_VM)) {
nvgpu_free_inst_block(g, &mm->sec2.inst_block);
nvgpu_vm_put(mm->sec2.vm);
}
if (nvgpu_is_enabled(g, NVGPU_SUPPORT_GSP_VM)) {
nvgpu_free_inst_block(g, &mm->gsp.inst_block);
nvgpu_vm_put(mm->gsp.vm);
}
if (g->has_cde) {
nvgpu_vm_put(mm->cde.vm);
}
#ifdef CONFIG_NVGPU_SW_SEMAPHORE
nvgpu_semaphore_sea_destroy(g);
#endif
#ifdef CONFIG_NVGPU_DGPU
nvgpu_vidmem_destroy(g);
if (nvgpu_is_errata_present(g, NVGPU_ERRATA_INIT_PDB_CACHE)) {
g->ops.ramin.deinit_pdb_cache_errata(g);
}
#endif
nvgpu_pd_cache_fini(g);
}
/* pmu vm, share channel_vm interfaces */
static int nvgpu_init_system_vm(struct mm_gk20a *mm)
{
int err;
struct gk20a *g = gk20a_from_mm(mm);
struct nvgpu_mem *inst_block = &mm->pmu.inst_block;
u32 big_page_size = g->ops.mm.gmmu.get_default_big_page_size();
u64 low_hole, aperture_size;
/*
* For some reason the maxwell PMU code is dependent on the large page
* size. No reason AFAICT for this. Probably a bug somewhere.
*/
if (nvgpu_is_errata_present(g, NVGPU_ERRATA_MM_FORCE_128K_PMU_VM)) {
big_page_size = nvgpu_safe_cast_u64_to_u32(SZ_128K);
}
/*
* No user region - so we will pass that as zero sized.
*/
low_hole = SZ_4K * 16UL;
aperture_size = GK20A_PMU_VA_SIZE;
mm->pmu.aperture_size = GK20A_PMU_VA_SIZE;
nvgpu_log_info(g, "pmu vm size = 0x%x", mm->pmu.aperture_size);
mm->pmu.vm = nvgpu_vm_init(g, big_page_size,
low_hole,
0ULL,
nvgpu_safe_sub_u64(aperture_size, low_hole),
0ULL,
true,
false,
false,
"system");
if (mm->pmu.vm == NULL) {
return -ENOMEM;
}
err = nvgpu_alloc_inst_block(g, inst_block);
if (err != 0) {
goto clean_up_vm;
}
g->ops.mm.init_inst_block(inst_block, mm->pmu.vm, big_page_size);
return 0;
clean_up_vm:
nvgpu_vm_put(mm->pmu.vm);
return err;
}
static int nvgpu_init_hwpm(struct mm_gk20a *mm)
{
int err;
struct gk20a *g = gk20a_from_mm(mm);
struct nvgpu_mem *inst_block = &mm->hwpm.inst_block;
err = nvgpu_alloc_inst_block(g, inst_block);
if (err != 0) {
return err;
}
g->ops.mm.init_inst_block(inst_block, mm->pmu.vm, 0);
return 0;
}
static int nvgpu_init_cde_vm(struct mm_gk20a *mm)
{
struct gk20a *g = gk20a_from_mm(mm);
u64 user_size, kernel_size;
u32 big_page_size = g->ops.mm.gmmu.get_default_big_page_size();
g->ops.mm.get_default_va_sizes(NULL, &user_size, &kernel_size);
mm->cde.vm = nvgpu_vm_init(g, big_page_size,
U64(big_page_size) << U64(10),
nvgpu_safe_sub_u64(user_size,
U64(big_page_size) << U64(10)),
kernel_size,
0ULL,
false, false, false, "cde");
if (mm->cde.vm == NULL) {
return -ENOMEM;
}
return 0;
}
static int nvgpu_init_ce_vm(struct mm_gk20a *mm)
{
struct gk20a *g = gk20a_from_mm(mm);
u64 user_size, kernel_size;
u32 big_page_size = g->ops.mm.gmmu.get_default_big_page_size();
g->ops.mm.get_default_va_sizes(NULL, &user_size, &kernel_size);
mm->ce.vm = nvgpu_vm_init(g, big_page_size,
U64(big_page_size) << U64(10),
nvgpu_safe_sub_u64(user_size,
U64(big_page_size) << U64(10)),
kernel_size,
0ULL,
false, false, false, "ce");
if (mm->ce.vm == NULL) {
return -ENOMEM;
}
return 0;
}
static int nvgpu_init_mmu_debug(struct mm_gk20a *mm)
{
struct gk20a *g = gk20a_from_mm(mm);
int err;
if (!nvgpu_mem_is_valid(&mm->mmu_wr_mem)) {
err = nvgpu_dma_alloc_sys(g, SZ_4K, &mm->mmu_wr_mem);
if (err != 0) {
goto err;
}
}
if (!nvgpu_mem_is_valid(&mm->mmu_rd_mem)) {
err = nvgpu_dma_alloc_sys(g, SZ_4K, &mm->mmu_rd_mem);
if (err != 0) {
goto err_free_wr_mem;
}
}
return 0;
err_free_wr_mem:
nvgpu_dma_free(g, &mm->mmu_wr_mem);
err:
return -ENOMEM;
}
#if defined(CONFIG_NVGPU_DGPU)
void nvgpu_init_mm_ce_context(struct gk20a *g)
{
if (g->mm.vidmem.size > 0U &&
(g->mm.vidmem.ce_ctx_id == NVGPU_CE_INVAL_CTX_ID)) {
g->mm.vidmem.ce_ctx_id =
nvgpu_ce_app_create_context(g,
nvgpu_engine_get_fast_ce_runlist_id(g),
-1,
-1);
if (g->mm.vidmem.ce_ctx_id == NVGPU_CE_INVAL_CTX_ID) {
nvgpu_err(g,
"Failed to allocate CE context for vidmem page clearing support");
}
}
}
#endif
static int nvgpu_init_bar1_vm(struct mm_gk20a *mm)
{
int err;
struct gk20a *g = gk20a_from_mm(mm);
struct nvgpu_mem *inst_block = &mm->bar1.inst_block;
u32 big_page_size = g->ops.mm.gmmu.get_default_big_page_size();
mm->bar1.aperture_size = bar1_aperture_size_mb_gk20a() << 20;
nvgpu_log_info(g, "bar1 vm size = 0x%x", mm->bar1.aperture_size);
mm->bar1.vm = nvgpu_vm_init(g,
big_page_size,
SZ_64K,
0ULL,
nvgpu_safe_sub_u64(mm->bar1.aperture_size, SZ_64K),
0ULL,
true, false, false,
"bar1");
if (mm->bar1.vm == NULL) {
return -ENOMEM;
}
err = nvgpu_alloc_inst_block(g, inst_block);
if (err != 0) {
goto clean_up_vm;
}
g->ops.mm.init_inst_block(inst_block, mm->bar1.vm, big_page_size);
return 0;
clean_up_vm:
nvgpu_vm_put(mm->bar1.vm);
return err;
}
static int nvgpu_init_engine_ucode_vm(struct gk20a *g,
struct engine_ucode *ucode, const char *address_space_name)
{
int err;
struct nvgpu_mem *inst_block = &ucode->inst_block;
u32 big_page_size = g->ops.mm.gmmu.get_default_big_page_size();
/* ucode aperture size is 32MB */
ucode->aperture_size = U32(32) << 20U;
nvgpu_log_info(g, "%s vm size = 0x%x", address_space_name,
ucode->aperture_size);
ucode->vm = nvgpu_vm_init(g, big_page_size, SZ_4K,
0ULL, nvgpu_safe_sub_u64(ucode->aperture_size, SZ_4K), 0ULL,
false, false, false,
address_space_name);
if (ucode->vm == NULL) {
return -ENOMEM;
}
/* allocate instance mem for engine ucode */
err = nvgpu_alloc_inst_block(g, inst_block);
if (err != 0) {
goto clean_up_va;
}
g->ops.mm.init_inst_block(inst_block, ucode->vm, big_page_size);
return 0;
clean_up_va:
nvgpu_vm_put(ucode->vm);
return err;
}
static int nvgpu_init_mm_setup_bar(struct gk20a *g)
{
struct mm_gk20a *mm = &g->mm;
int err;
err = nvgpu_init_bar1_vm(mm);
if (err != 0) {
return err;
}
if (g->ops.mm.init_bar2_vm != NULL) {
err = g->ops.mm.init_bar2_vm(g);
if (err != 0) {
return err;
}
}
err = nvgpu_init_system_vm(mm);
if (err != 0) {
return err;
}
err = nvgpu_init_hwpm(mm);
if (err != 0) {
return err;
}
return err;
}
static int nvgpu_init_mm_setup_vm(struct gk20a *g)
{
struct mm_gk20a *mm = &g->mm;
int err;
if (nvgpu_is_enabled(g, NVGPU_SUPPORT_SEC2_VM)) {
err = nvgpu_init_engine_ucode_vm(g, &mm->sec2, "sec2");
if (err != 0) {
return err;
}
}
if (nvgpu_is_enabled(g, NVGPU_SUPPORT_GSP_VM)) {
err = nvgpu_init_engine_ucode_vm(g, &mm->gsp, "gsp");
if (err != 0) {
return err;
}
}
if (g->has_cde) {
err = nvgpu_init_cde_vm(mm);
if (err != 0) {
return err;
}
}
err = nvgpu_init_ce_vm(mm);
if (err != 0) {
return err;
}
return err;
}
static int nvgpu_init_mm_components(struct gk20a *g)
{
int err = 0;
struct mm_gk20a *mm = &g->mm;
err = nvgpu_alloc_sysmem_flush(g);
if (err != 0) {
return err;
}
err = nvgpu_init_mm_setup_bar(g);
if (err != 0) {
return err;
}
err = nvgpu_init_mm_setup_vm(g);
if (err != 0) {
return err;
}
err = nvgpu_init_mmu_debug(mm);
if (err != 0) {
return err;
}
/*
* Some chips support replayable MMU faults. For such chips make sure
* SW is initialized.
*/
if (g->ops.mm.mmu_fault.setup_sw != NULL) {
err = g->ops.mm.mmu_fault.setup_sw(g);
if (err != 0) {
return err;
}
}
return 0;
}
static int nvgpu_init_mm_setup_sw(struct gk20a *g)
{
struct mm_gk20a *mm = &g->mm;
int err = 0;
if (mm->sw_ready) {
nvgpu_log_info(g, "skip init");
return 0;
}
mm->g = g;
nvgpu_mutex_init(&mm->l2_op_lock);
	/* TBD: make channel vm size configurable */
g->ops.mm.get_default_va_sizes(NULL, &mm->channel.user_size,
&mm->channel.kernel_size);
nvgpu_log_info(g, "channel vm size: user %uMB kernel %uMB",
nvgpu_safe_cast_u64_to_u32(mm->channel.user_size >> U64(20)),
nvgpu_safe_cast_u64_to_u32(mm->channel.kernel_size >> U64(20)));
#ifdef CONFIG_NVGPU_DGPU
mm->vidmem.ce_ctx_id = NVGPU_CE_INVAL_CTX_ID;
nvgpu_init_pramin(mm);
err = nvgpu_vidmem_init(mm);
if (err != 0) {
return err;
}
/*
 * This requires fixed allocations in vidmem which must be
 * allocated before all other buffers.
*/
if (!nvgpu_is_enabled(g, NVGPU_MM_UNIFIED_MEMORY) &&
nvgpu_is_enabled(g, NVGPU_SEC_PRIVSECURITY)) {
err = nvgpu_acr_alloc_blob_prerequisite(g, g->acr, 0);
if (err != 0) {
return err;
}
}
#endif
err = nvgpu_init_mm_components(g);
if (err != 0) {
return err;
}
if ((g->ops.fb.ecc.init != NULL) && !g->ecc.initialized) {
err = g->ops.fb.ecc.init(g);
if (err != 0) {
return err;
}
}
#if defined(CONFIG_NVGPU_HAL_NON_FUSA) && defined(CONFIG_NVGPU_NEXT)
if (nvgpu_fb_vab_init_hal(g) != 0) {
nvgpu_err(g, "failed to init VAB");
}
#endif
mm->remove_support = nvgpu_remove_mm_support;
#ifdef CONFIG_NVGPU_DGPU
mm->remove_ce_support = nvgpu_remove_mm_ce_support;
#endif
mm->sw_ready = true;
return 0;
}
#ifdef CONFIG_NVGPU_DGPU
static int nvgpu_init_mm_pdb_cache_errata(struct gk20a *g)
{
int err;
if (nvgpu_is_errata_present(g, NVGPU_ERRATA_INIT_PDB_CACHE)) {
err = g->ops.ramin.init_pdb_cache_errata(g);
if (err != 0) {
return err;
}
}
if (nvgpu_is_errata_present(g, NVGPU_ERRATA_FB_PDB_CACHE)) {
err = g->ops.fb.apply_pdb_cache_errata(g);
if (err != 0) {
return err;
}
}
return 0;
}
#endif
/*
* Called through the HAL to handle vGPU: the vGPU doesn't have HW to initialize
* here.
*/
int nvgpu_mm_setup_hw(struct gk20a *g)
{
struct mm_gk20a *mm = &g->mm;
int err;
nvgpu_log_fn(g, " ");
if (g->ops.fb.set_mmu_page_size != NULL) {
g->ops.fb.set_mmu_page_size(g);
}
#ifdef CONFIG_NVGPU_COMPRESSION
if (g->ops.fb.set_use_full_comp_tag_line != NULL) {
mm->use_full_comp_tag_line =
g->ops.fb.set_use_full_comp_tag_line(g);
}
#endif
g->ops.fb.init_hw(g);
if (g->ops.bus.bar1_bind != NULL) {
err = g->ops.bus.bar1_bind(g, &mm->bar1.inst_block);
if (err != 0) {
return err;
}
}
if (g->ops.bus.bar2_bind != NULL) {
err = g->ops.bus.bar2_bind(g, &mm->bar2.inst_block);
if (err != 0) {
return err;
}
}
if ((g->ops.mm.cache.fb_flush(g) != 0) ||
(g->ops.mm.cache.fb_flush(g) != 0)) {
return -EBUSY;
}
if (g->ops.mm.mmu_fault.setup_hw != NULL) {
g->ops.mm.mmu_fault.setup_hw(g);
}
nvgpu_log_fn(g, "done");
return 0;
}
int nvgpu_init_mm_support(struct gk20a *g)
{
int err;
#ifdef CONFIG_NVGPU_DGPU
err = nvgpu_init_mm_pdb_cache_errata(g);
if (err != 0) {
return err;
}
#endif
err = nvgpu_init_mm_setup_sw(g);
if (err != 0) {
return err;
}
if (g->ops.mm.setup_hw != NULL) {
err = g->ops.mm.setup_hw(g);
}
return err;
}
u32 nvgpu_mm_get_default_big_page_size(struct gk20a *g)
{
u32 big_page_size;
big_page_size = g->ops.mm.gmmu.get_default_big_page_size();
if (g->mm.disable_bigpage) {
big_page_size = 0;
}
return big_page_size;
}
u32 nvgpu_mm_get_available_big_page_sizes(struct gk20a *g)
{
u32 available_big_page_sizes = 0;
if (g->mm.disable_bigpage) {
return available_big_page_sizes;
}
available_big_page_sizes = g->ops.mm.gmmu.get_default_big_page_size();
if (g->ops.mm.gmmu.get_big_page_sizes != NULL) {
available_big_page_sizes |= g->ops.mm.gmmu.get_big_page_sizes();
}
return available_big_page_sizes;
}

View File

@@ -0,0 +1,418 @@
/*
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <nvgpu/bug.h>
#include <nvgpu/kmem.h>
#include <nvgpu/nvgpu_mem.h>
#include <nvgpu/nvgpu_sgt.h>
#include <nvgpu/dma.h>
#include <nvgpu/vidmem.h>
#include <nvgpu/gk20a.h>
#include <nvgpu/pramin.h>
#include <nvgpu/string.h>
/*
* Make sure to use the right coherency aperture if you use this function! This
* will not add any checks. If you want to simply use the default coherency then
* use nvgpu_aperture_mask().
*/
u32 nvgpu_aperture_mask_raw(struct gk20a *g, enum nvgpu_aperture aperture,
u32 sysmem_mask, u32 sysmem_coh_mask,
u32 vidmem_mask)
{
u32 ret_mask = 0;
if ((aperture == APERTURE_INVALID) || (aperture >= APERTURE_MAX_ENUM)) {
nvgpu_do_assert_print(g, "Bad aperture");
return 0;
}
/*
	 * Some iGPUs treat sysmem (i.e. SoC DRAM) as vidmem. In these cases the
* "sysmem" aperture should really be translated to VIDMEM.
*/
if (!nvgpu_is_enabled(g, NVGPU_MM_HONORS_APERTURE)) {
aperture = APERTURE_VIDMEM;
}
switch (aperture) {
case APERTURE_SYSMEM_COH:
ret_mask = sysmem_coh_mask;
break;
case APERTURE_SYSMEM:
ret_mask = sysmem_mask;
break;
case APERTURE_VIDMEM:
ret_mask = vidmem_mask;
break;
default:
nvgpu_do_assert_print(g, "Bad aperture");
ret_mask = 0;
break;
}
return ret_mask;
}
u32 nvgpu_aperture_mask(struct gk20a *g, struct nvgpu_mem *mem,
u32 sysmem_mask, u32 sysmem_coh_mask, u32 vidmem_mask)
{
enum nvgpu_aperture ap = mem->aperture;
return nvgpu_aperture_mask_raw(g, ap,
sysmem_mask,
sysmem_coh_mask,
vidmem_mask);
}
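/*
 * Editorial sketch (not part of the original file): the difference between
 * nvgpu_aperture_mask_raw() and nvgpu_aperture_mask() above. The mask values
 * are hypothetical placeholders.
 */
static u32 nvgpu_aperture_mask_usage_sketch(struct gk20a *g,
					struct nvgpu_mem *mem)
{
	/*
	 * Default coherency: the aperture is taken from the nvgpu_mem itself
	 * instead of being passed in explicitly.
	 */
	return nvgpu_aperture_mask(g, mem, 0x1U, 0x2U, 0x3U);
}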
bool nvgpu_aperture_is_sysmem(enum nvgpu_aperture ap)
{
return (ap == APERTURE_SYSMEM_COH) || (ap == APERTURE_SYSMEM);
}
bool nvgpu_mem_is_sysmem(struct nvgpu_mem *mem)
{
return nvgpu_aperture_is_sysmem(mem->aperture);
}
u64 nvgpu_mem_iommu_translate(struct gk20a *g, u64 phys)
{
	/* Ensure this is not a vidmem allocation. */
#ifdef CONFIG_NVGPU_DGPU
WARN_ON(nvgpu_addr_is_vidmem_page_alloc(phys));
#endif
if (nvgpu_iommuable(g) && (g->ops.mm.gmmu.get_iommu_bit != NULL)) {
return phys | (1ULL << g->ops.mm.gmmu.get_iommu_bit(g));
}
return phys;
}
u32 nvgpu_mem_rd32(struct gk20a *g, struct nvgpu_mem *mem, u64 w)
{
u32 data = 0;
if (mem->aperture == APERTURE_SYSMEM) {
u32 *ptr = mem->cpu_va;
NVGPU_COV_WHITELIST_BLOCK_BEGIN(false_positive, 1, NVGPU_MISRA(Rule, 10_3), "Bug 2277532")
NVGPU_COV_WHITELIST_BLOCK_BEGIN(false_positive, 1, NVGPU_MISRA(Rule, 14_4), "Bug 2277532")
NVGPU_COV_WHITELIST_BLOCK_BEGIN(false_positive, 1, NVGPU_MISRA(Rule, 15_6), "Bug 2277532")
WARN_ON(ptr == NULL);
NVGPU_COV_WHITELIST_BLOCK_END(NVGPU_MISRA(Rule, 10_3))
NVGPU_COV_WHITELIST_BLOCK_END(NVGPU_MISRA(Rule, 14_4))
NVGPU_COV_WHITELIST_BLOCK_END(NVGPU_MISRA(Rule, 15_6))
data = ptr[w];
}
#ifdef CONFIG_NVGPU_DGPU
else if (mem->aperture == APERTURE_VIDMEM) {
nvgpu_pramin_rd_n(g, mem, w * (u64)sizeof(u32),
(u64)sizeof(u32), &data);
}
#endif
else {
nvgpu_do_assert_print(g, "Accessing unallocated nvgpu_mem");
}
return data;
}
u64 nvgpu_mem_rd32_pair(struct gk20a *g, struct nvgpu_mem *mem, u32 lo, u32 hi)
{
u64 lo_data = U64(nvgpu_mem_rd32(g, mem, lo));
u64 hi_data = U64(nvgpu_mem_rd32(g, mem, hi));
return lo_data | (hi_data << 32ULL);
}
u32 nvgpu_mem_rd(struct gk20a *g, struct nvgpu_mem *mem, u64 offset)
{
NVGPU_COV_WHITELIST_BLOCK_BEGIN(false_positive, 1, NVGPU_MISRA(Rule, 10_3), "Bug 2277532")
NVGPU_COV_WHITELIST_BLOCK_BEGIN(false_positive, 1, NVGPU_MISRA(Rule, 14_4), "Bug 2277532")
NVGPU_COV_WHITELIST_BLOCK_BEGIN(false_positive, 1, NVGPU_MISRA(Rule, 15_6), "Bug 2277532")
WARN_ON((offset & 3ULL) != 0ULL);
NVGPU_COV_WHITELIST_BLOCK_END(NVGPU_MISRA(Rule, 10_3))
NVGPU_COV_WHITELIST_BLOCK_END(NVGPU_MISRA(Rule, 14_4))
NVGPU_COV_WHITELIST_BLOCK_END(NVGPU_MISRA(Rule, 15_6))
return nvgpu_mem_rd32(g, mem, offset / (u64)sizeof(u32));
}
void nvgpu_mem_rd_n(struct gk20a *g, struct nvgpu_mem *mem,
u64 offset, void *dest, u64 size)
{
NVGPU_COV_WHITELIST_BLOCK_BEGIN(false_positive, 2, NVGPU_MISRA(Rule, 10_3), "Bug 2277532")
NVGPU_COV_WHITELIST_BLOCK_BEGIN(false_positive, 2, NVGPU_MISRA(Rule, 14_4), "Bug 2277532")
NVGPU_COV_WHITELIST_BLOCK_BEGIN(false_positive, 2, NVGPU_MISRA(Rule, 15_6), "Bug 2277532")
WARN_ON((offset & 3ULL) != 0ULL);
WARN_ON((size & 3ULL) != 0ULL);
NVGPU_COV_WHITELIST_BLOCK_END(NVGPU_MISRA(Rule, 10_3))
NVGPU_COV_WHITELIST_BLOCK_END(NVGPU_MISRA(Rule, 14_4))
NVGPU_COV_WHITELIST_BLOCK_END(NVGPU_MISRA(Rule, 15_6))
if (mem->aperture == APERTURE_SYSMEM) {
u8 *src = (u8 *)mem->cpu_va + offset;
NVGPU_COV_WHITELIST_BLOCK_BEGIN(false_positive, 1, NVGPU_MISRA(Rule, 10_3), "Bug 2277532")
NVGPU_COV_WHITELIST_BLOCK_BEGIN(false_positive, 1, NVGPU_MISRA(Rule, 14_4), "Bug 2277532")
NVGPU_COV_WHITELIST_BLOCK_BEGIN(false_positive, 1, NVGPU_MISRA(Rule, 15_6), "Bug 2277532")
WARN_ON(mem->cpu_va == NULL);
NVGPU_COV_WHITELIST_BLOCK_END(NVGPU_MISRA(Rule, 10_3))
NVGPU_COV_WHITELIST_BLOCK_END(NVGPU_MISRA(Rule, 14_4))
NVGPU_COV_WHITELIST_BLOCK_END(NVGPU_MISRA(Rule, 15_6))
nvgpu_memcpy((u8 *)dest, src, size);
}
#ifdef CONFIG_NVGPU_DGPU
else if (mem->aperture == APERTURE_VIDMEM) {
nvgpu_pramin_rd_n(g, mem, offset, size, dest);
}
#endif
else {
nvgpu_do_assert_print(g, "Accessing unallocated nvgpu_mem");
}
}
void nvgpu_mem_wr32(struct gk20a *g, struct nvgpu_mem *mem, u64 w, u32 data)
{
if (mem->aperture == APERTURE_SYSMEM) {
u32 *ptr = mem->cpu_va;
NVGPU_COV_WHITELIST_BLOCK_BEGIN(false_positive, 1, NVGPU_MISRA(Rule, 10_3), "Bug 2277532")
NVGPU_COV_WHITELIST_BLOCK_BEGIN(false_positive, 1, NVGPU_MISRA(Rule, 14_4), "Bug 2277532")
NVGPU_COV_WHITELIST_BLOCK_BEGIN(false_positive, 1, NVGPU_MISRA(Rule, 15_6), "Bug 2277532")
WARN_ON(ptr == NULL);
NVGPU_COV_WHITELIST_BLOCK_END(NVGPU_MISRA(Rule, 10_3))
NVGPU_COV_WHITELIST_BLOCK_END(NVGPU_MISRA(Rule, 14_4))
NVGPU_COV_WHITELIST_BLOCK_END(NVGPU_MISRA(Rule, 15_6))
ptr[w] = data;
}
#ifdef CONFIG_NVGPU_DGPU
else if (mem->aperture == APERTURE_VIDMEM) {
nvgpu_pramin_wr_n(g, mem, w * (u64)sizeof(u32),
(u64)sizeof(u32), &data);
if (!mem->skip_wmb) {
nvgpu_wmb();
}
}
#endif
else {
nvgpu_do_assert_print(g, "Accessing unallocated nvgpu_mem");
}
}
void nvgpu_mem_wr(struct gk20a *g, struct nvgpu_mem *mem, u64 offset, u32 data)
{
NVGPU_COV_WHITELIST_BLOCK_BEGIN(false_positive, 1, NVGPU_MISRA(Rule, 10_3), "Bug 2277532")
NVGPU_COV_WHITELIST_BLOCK_BEGIN(false_positive, 1, NVGPU_MISRA(Rule, 14_4), "Bug 2277532")
NVGPU_COV_WHITELIST_BLOCK_BEGIN(false_positive, 1, NVGPU_MISRA(Rule, 15_6), "Bug 2277532")
WARN_ON((offset & 3ULL) != 0ULL);
NVGPU_COV_WHITELIST_BLOCK_END(NVGPU_MISRA(Rule, 10_3))
NVGPU_COV_WHITELIST_BLOCK_END(NVGPU_MISRA(Rule, 14_4))
NVGPU_COV_WHITELIST_BLOCK_END(NVGPU_MISRA(Rule, 15_6))
nvgpu_mem_wr32(g, mem, offset / (u64)sizeof(u32), data);
}
void nvgpu_mem_wr_n(struct gk20a *g, struct nvgpu_mem *mem, u64 offset,
void *src, u64 size)
{
NVGPU_COV_WHITELIST_BLOCK_BEGIN(false_positive, 2, NVGPU_MISRA(Rule, 10_3), "Bug 2277532")
NVGPU_COV_WHITELIST_BLOCK_BEGIN(false_positive, 2, NVGPU_MISRA(Rule, 14_4), "Bug 2277532")
NVGPU_COV_WHITELIST_BLOCK_BEGIN(false_positive, 2, NVGPU_MISRA(Rule, 15_6), "Bug 2277532")
WARN_ON((offset & 3ULL) != 0ULL);
WARN_ON((size & 3ULL) != 0ULL);
NVGPU_COV_WHITELIST_BLOCK_END(NVGPU_MISRA(Rule, 10_3))
NVGPU_COV_WHITELIST_BLOCK_END(NVGPU_MISRA(Rule, 14_4))
NVGPU_COV_WHITELIST_BLOCK_END(NVGPU_MISRA(Rule, 15_6))
if (mem->aperture == APERTURE_SYSMEM) {
u8 *dest = (u8 *)mem->cpu_va + offset;
NVGPU_COV_WHITELIST_BLOCK_BEGIN(false_positive, 1, NVGPU_MISRA(Rule, 10_3), "Bug 2277532")
NVGPU_COV_WHITELIST_BLOCK_BEGIN(false_positive, 1, NVGPU_MISRA(Rule, 14_4), "Bug 2277532")
NVGPU_COV_WHITELIST_BLOCK_BEGIN(false_positive, 1, NVGPU_MISRA(Rule, 15_6), "Bug 2277532")
WARN_ON(mem->cpu_va == NULL);
NVGPU_COV_WHITELIST_BLOCK_END(NVGPU_MISRA(Rule, 10_3))
NVGPU_COV_WHITELIST_BLOCK_END(NVGPU_MISRA(Rule, 14_4))
NVGPU_COV_WHITELIST_BLOCK_END(NVGPU_MISRA(Rule, 15_6))
nvgpu_memcpy(dest, (u8 *)src, size);
}
#ifdef CONFIG_NVGPU_DGPU
else if (mem->aperture == APERTURE_VIDMEM) {
nvgpu_pramin_wr_n(g, mem, offset, size, src);
if (!mem->skip_wmb) {
nvgpu_wmb();
}
}
#endif
else {
nvgpu_do_assert_print(g, "Accessing unallocated nvgpu_mem");
}
}
void nvgpu_memset(struct gk20a *g, struct nvgpu_mem *mem, u64 offset,
u32 c, u64 size)
{
NVGPU_COV_WHITELIST_BLOCK_BEGIN(false_positive, 3, NVGPU_MISRA(Rule, 10_3), "Bug 2277532")
NVGPU_COV_WHITELIST_BLOCK_BEGIN(false_positive, 3, NVGPU_MISRA(Rule, 14_4), "Bug 2277532")
NVGPU_COV_WHITELIST_BLOCK_BEGIN(false_positive, 3, NVGPU_MISRA(Rule, 15_6), "Bug 2277532")
WARN_ON((offset & 3ULL) != 0ULL);
WARN_ON((size & 3ULL) != 0ULL);
WARN_ON((c & ~0xffU) != 0U);
NVGPU_COV_WHITELIST_BLOCK_END(NVGPU_MISRA(Rule, 10_3))
NVGPU_COV_WHITELIST_BLOCK_END(NVGPU_MISRA(Rule, 14_4))
NVGPU_COV_WHITELIST_BLOCK_END(NVGPU_MISRA(Rule, 15_6))
c &= 0xffU;
if (mem->aperture == APERTURE_SYSMEM) {
u8 *dest = (u8 *)mem->cpu_va + offset;
NVGPU_COV_WHITELIST_BLOCK_BEGIN(false_positive, 1, NVGPU_MISRA(Rule, 10_3), "Bug 2277532")
NVGPU_COV_WHITELIST_BLOCK_BEGIN(false_positive, 1, NVGPU_MISRA(Rule, 14_4), "Bug 2277532")
NVGPU_COV_WHITELIST_BLOCK_BEGIN(false_positive, 1, NVGPU_MISRA(Rule, 15_6), "Bug 2277532")
WARN_ON(mem->cpu_va == NULL);
NVGPU_COV_WHITELIST_BLOCK_END(NVGPU_MISRA(Rule, 10_3))
NVGPU_COV_WHITELIST_BLOCK_END(NVGPU_MISRA(Rule, 14_4))
NVGPU_COV_WHITELIST_BLOCK_END(NVGPU_MISRA(Rule, 15_6))
(void) memset(dest, (int)c, size);
}
#ifdef CONFIG_NVGPU_DGPU
else if (mem->aperture == APERTURE_VIDMEM) {
u32 repeat_value = c | (c << 8) | (c << 16) | (c << 24);
nvgpu_pramin_memset(g, mem, offset, size, repeat_value);
if (!mem->skip_wmb) {
nvgpu_wmb();
}
}
#endif
else {
nvgpu_do_assert_print(g, "Accessing unallocated nvgpu_mem");
}
}
static void *nvgpu_mem_phys_sgl_next(void *sgl)
{
struct nvgpu_mem_sgl *sgl_impl = (struct nvgpu_mem_sgl *)sgl;
	return (void *)sgl_impl->next;
}
/*
* Provided for compatibility - the DMA address is the same as the phys address
* for these nvgpu_mem's.
*/
static u64 nvgpu_mem_phys_sgl_dma(void *sgl)
{
struct nvgpu_mem_sgl *sgl_impl = (struct nvgpu_mem_sgl *)sgl;
return sgl_impl->phys;
}
static u64 nvgpu_mem_phys_sgl_phys(struct gk20a *g, void *sgl)
{
struct nvgpu_mem_sgl *sgl_impl = (struct nvgpu_mem_sgl *)sgl;
return sgl_impl->phys;
}
static u64 nvgpu_mem_phys_sgl_ipa_to_pa(struct gk20a *g,
void *sgl, u64 ipa, u64 *pa_len)
{
return ipa;
}
static u64 nvgpu_mem_phys_sgl_length(void *sgl)
{
struct nvgpu_mem_sgl *sgl_impl = (struct nvgpu_mem_sgl *)sgl;
return sgl_impl->length;
}
static u64 nvgpu_mem_phys_sgl_gpu_addr(struct gk20a *g, void *sgl,
struct nvgpu_gmmu_attrs *attrs)
{
struct nvgpu_mem_sgl *sgl_impl = (struct nvgpu_mem_sgl *)sgl;
return sgl_impl->phys;
}
static void nvgpu_mem_phys_sgt_free(struct gk20a *g, struct nvgpu_sgt *sgt)
{
/*
* No-op here. The free is handled by freeing the nvgpu_mem itself.
*/
}
NVGPU_COV_WHITELIST_BLOCK_BEGIN(false_positive, 1, NVGPU_MISRA(Rule, 8_7), "Bug 2823817")
static const struct nvgpu_sgt_ops nvgpu_mem_phys_ops = {
NVGPU_COV_WHITELIST_BLOCK_END(NVGPU_MISRA(Rule, 8_7))
.sgl_next = nvgpu_mem_phys_sgl_next,
.sgl_dma = nvgpu_mem_phys_sgl_dma,
.sgl_phys = nvgpu_mem_phys_sgl_phys,
.sgl_ipa = nvgpu_mem_phys_sgl_phys,
.sgl_ipa_to_pa = nvgpu_mem_phys_sgl_ipa_to_pa,
.sgl_length = nvgpu_mem_phys_sgl_length,
.sgl_gpu_addr = nvgpu_mem_phys_sgl_gpu_addr,
.sgt_free = nvgpu_mem_phys_sgt_free,
/*
* The physical nvgpu_mems are never IOMMU'able by definition.
*/
.sgt_iommuable = NULL
};
int nvgpu_mem_create_from_phys(struct gk20a *g, struct nvgpu_mem *dest,
u64 src_phys, u64 nr_pages)
{
int ret = 0;
struct nvgpu_sgt *sgt;
struct nvgpu_mem_sgl *sgl;
/*
* Do the two operations that can fail before touching *dest.
*/
sgt = nvgpu_kzalloc(g, sizeof(*sgt));
sgl = nvgpu_kzalloc(g, sizeof(*sgl));
if ((sgt == NULL) || (sgl == NULL)) {
nvgpu_kfree(g, sgt);
nvgpu_kfree(g, sgl);
return -ENOMEM;
}
(void) memset(dest, 0, sizeof(*dest));
dest->aperture = APERTURE_SYSMEM;
dest->size = nvgpu_safe_mult_u64(nr_pages,
(u64)NVGPU_CPU_PAGE_SIZE);
dest->aligned_size = dest->size;
dest->mem_flags = NVGPU_MEM_FLAG_NO_DMA;
dest->phys_sgt = sgt;
sgl->next = NULL;
sgl->phys = src_phys;
sgl->length = dest->size;
sgt->sgl = (void *)sgl;
sgt->ops = &nvgpu_mem_phys_ops;
return ret;
}
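/*
 * Editorial sketch (not part of the original file): wrapping a physically
 * contiguous region in an nvgpu_mem via nvgpu_mem_create_from_phys() above so
 * it can be handed to code expecting an SGT-backed buffer. The base address
 * and page count are made-up example values.
 */
static int nvgpu_mem_from_phys_usage_sketch(struct gk20a *g,
					struct nvgpu_mem *mem)
{
	/* 16 CPU pages starting at a hypothetical physical address. */
	return nvgpu_mem_create_from_phys(g, mem, 0x80000000ULL, 16ULL);
}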

View File

@@ -0,0 +1,135 @@
/*
* Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <nvgpu/dma.h>
#include <nvgpu/bitops.h>
#include <nvgpu/nvgpu_mem.h>
#include <nvgpu/nvgpu_sgt.h>
#include <nvgpu/nvgpu_sgt_os.h>
#include <nvgpu/log.h>
void *nvgpu_sgt_get_next(struct nvgpu_sgt *sgt, void *sgl)
{
return sgt->ops->sgl_next(sgl);
}
u64 nvgpu_sgt_get_phys(struct gk20a *g, struct nvgpu_sgt *sgt, void *sgl)
{
return sgt->ops->sgl_phys(g, sgl);
}
u64 nvgpu_sgt_get_ipa(struct gk20a *g, struct nvgpu_sgt *sgt, void *sgl)
{
return sgt->ops->sgl_ipa(g, sgl);
}
u64 nvgpu_sgt_ipa_to_pa(struct gk20a *g, struct nvgpu_sgt *sgt,
void *sgl, u64 ipa, u64 *pa_len)
{
return sgt->ops->sgl_ipa_to_pa(g, sgl, ipa, pa_len);
}
u64 nvgpu_sgt_get_dma(struct nvgpu_sgt *sgt, void *sgl)
{
return sgt->ops->sgl_dma(sgl);
}
u64 nvgpu_sgt_get_length(struct nvgpu_sgt *sgt, void *sgl)
{
return sgt->ops->sgl_length(sgl);
}
u64 nvgpu_sgt_get_gpu_addr(struct gk20a *g, struct nvgpu_sgt *sgt, void *sgl,
struct nvgpu_gmmu_attrs *attrs)
{
return sgt->ops->sgl_gpu_addr(g, sgl, attrs);
}
bool nvgpu_sgt_iommuable(struct gk20a *g, struct nvgpu_sgt *sgt)
{
if (sgt->ops->sgt_iommuable != NULL) {
return sgt->ops->sgt_iommuable(g, sgt);
}
return false;
}
void nvgpu_sgt_free(struct gk20a *g, struct nvgpu_sgt *sgt)
{
if ((sgt != NULL) && (sgt->ops->sgt_free != NULL)) {
sgt->ops->sgt_free(g, sgt);
}
}
/*
* Determine alignment for a passed buffer. Necessary since the buffer may
* appear big enough to map with large pages but the SGL may have chunks that
* are not aligned on a 64/128kB large page boundary. There's also the
 * possibility that chunks are odd sizes, which will necessitate small page
 * mappings to correctly glue them together into a contiguous virtual mapping.
 * A worked example follows the function.
 */
u64 nvgpu_sgt_alignment(struct gk20a *g, struct nvgpu_sgt *sgt)
{
u64 align = 0, chunk_align = 0;
void *sgl;
/*
* If this SGT is iommuable and we want to use the IOMMU address then
* the SGT's first entry has the IOMMU address. We will align on this
* and double check length of buffer later. Also, since there's an
* IOMMU we know that this DMA address is contiguous.
*/
if (nvgpu_iommuable(g) &&
nvgpu_sgt_iommuable(g, sgt) &&
(nvgpu_sgt_get_dma(sgt, sgt->sgl) != 0ULL)) {
return 1ULL << (nvgpu_ffs(nvgpu_sgt_get_dma(sgt, sgt->sgl))
- 1UL);
}
/*
* Otherwise the buffer is not iommuable (VIDMEM, for example) or we are
* bypassing the IOMMU and need to use the underlying physical entries
* of the SGT.
*/
nvgpu_sgt_for_each_sgl(sgl, sgt) {
chunk_align = 1ULL << nvgpu_safe_sub_u64(nvgpu_ffs(
nvgpu_sgt_get_phys(g, sgt, sgl) |
nvgpu_sgt_get_length(sgt, sgl)), 1UL);
if (align != 0ULL) {
align = min(align, chunk_align);
} else {
align = chunk_align;
}
}
return align;
}
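/*
 * Editorial worked example (not part of the original file): for a hypothetical
 * two-chunk SGL, nvgpu_sgt_alignment() computes each chunk's alignment as the
 * lowest set bit of (phys | length) and returns the minimum:
 *
 *   chunk 0: phys = 0x0030000, length = 0x10000 -> alignment 0x10000
 *   chunk 1: phys = 0x1000000, length = 0x04000 -> alignment 0x04000
 *
 * The result is 0x4000 (16K), so even though the total size would allow a 64K
 * big-page mapping, small pages must be used to glue the chunks together.
 * When the buffer is IOMMU-mapped and the IOMMU address is used, the alignment
 * is instead taken from the lowest set bit of the single DMA address.
 */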
struct nvgpu_sgt *nvgpu_sgt_create_from_mem(struct gk20a *g,
struct nvgpu_mem *mem)
{
if ((mem->mem_flags & NVGPU_MEM_FLAG_NO_DMA) != 0U) {
return mem->phys_sgt;
}
return nvgpu_sgt_os_create_from_mem(g, mem);
}

View File

@@ -0,0 +1,621 @@
/*
* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <nvgpu/bug.h>
#include <nvgpu/ce_app.h>
#include <nvgpu/timers.h>
#include <nvgpu/dma.h>
#include <nvgpu/vidmem.h>
#include <nvgpu/page_allocator.h>
#include <nvgpu/enabled.h>
#include <nvgpu/sizes.h>
#include <nvgpu/gk20a.h>
#include <nvgpu/nvgpu_sgt.h>
#include <nvgpu/fence.h>
/*
* This is expected to be called from the shutdown path (or the error path in
* the vidmem init code). As such we do not expect new vidmem frees to be
* enqueued.
*/
void nvgpu_vidmem_destroy(struct gk20a *g)
{
struct nvgpu_timeout timeout;
int err;
if (g->ops.fb.get_vidmem_size == NULL) {
return;
}
err = nvgpu_timeout_init(g, &timeout, 100, NVGPU_TIMER_RETRY_TIMER);
if (err != 0) {
nvgpu_err(g, "nvgpu_timeout_init() failed err=%d", err);
}
/*
* Ensure that the thread runs one last time to flush anything in the
* queue.
*/
nvgpu_cond_signal_interruptible(&g->mm.vidmem.clearing_thread_cond);
/*
* Wait for at most 1 second before just continuing on. It doesn't make
* sense to hang the system over some potential memory leaks.
*/
do {
bool empty;
nvgpu_mutex_acquire(&g->mm.vidmem.clear_list_mutex);
empty = nvgpu_list_empty(&g->mm.vidmem.clear_list_head);
nvgpu_mutex_release(&g->mm.vidmem.clear_list_mutex);
if (empty) {
break;
}
nvgpu_msleep(10);
} while (nvgpu_timeout_expired(&timeout) == 0);
/*
* Kill the vidmem clearing thread now. This will wake the thread up
	 * automatically and cause the wait_interruptible condition to trigger.
*/
nvgpu_thread_stop(&g->mm.vidmem.clearing_thread);
if (nvgpu_alloc_initialized(&g->mm.vidmem.allocator)) {
nvgpu_alloc_destroy(&g->mm.vidmem.allocator);
}
if (nvgpu_alloc_initialized(&g->mm.vidmem.bootstrap_allocator)) {
nvgpu_alloc_destroy(&g->mm.vidmem.bootstrap_allocator);
}
}
static int nvgpu_vidmem_clear_fence_wait(struct gk20a *g,
struct nvgpu_fence_type *fence_out)
{
struct nvgpu_timeout timeout;
bool done;
int err;
err = nvgpu_timeout_init(g, &timeout,
nvgpu_get_poll_timeout(g),
NVGPU_TIMER_CPU_TIMER);
if (err != 0) {
nvgpu_err(g, "nvgpu_timeout_init() failed err=%d", err);
return err;
}
do {
err = nvgpu_fence_wait(g, fence_out,
nvgpu_get_poll_timeout(g));
if (err != -ERESTARTSYS) {
done = true;
} else if (nvgpu_timeout_expired(&timeout) != 0) {
done = true;
} else {
done = false;
}
} while (!done);
nvgpu_fence_put(fence_out);
if (err != 0) {
nvgpu_err(g,
"fence wait failed for CE execute ops");
return err;
}
return 0;
}
static int nvgpu_vidmem_do_clear_all(struct gk20a *g)
{
struct mm_gk20a *mm = &g->mm;
struct nvgpu_fence_type *fence_out = NULL;
int err = 0;
if (mm->vidmem.ce_ctx_id == NVGPU_CE_INVAL_CTX_ID) {
return -EINVAL;
}
vidmem_dbg(g, "Clearing all VIDMEM:");
#ifdef CONFIG_NVGPU_DGPU
err = nvgpu_ce_execute_ops(g,
mm->vidmem.ce_ctx_id,
0,
mm->vidmem.base,
mm->vidmem.bootstrap_base - mm->vidmem.base,
0x00000000,
NVGPU_CE_DST_LOCATION_LOCAL_FB,
NVGPU_CE_MEMSET,
0,
&fence_out);
if (err != 0) {
nvgpu_err(g,
"Failed to clear vidmem : %d", err);
return err;
}
#else
/* fail due to lack of ce app support */
return -ENOSYS;
#endif
if (fence_out != NULL) {
err = nvgpu_vidmem_clear_fence_wait(g, fence_out);
if (err != 0) {
return err;
}
}
mm->vidmem.cleared = true;
vidmem_dbg(g, "Done!");
return 0;
}
void nvgpu_vidmem_thread_pause_sync(struct mm_gk20a *mm)
{
/*
* On the first increment of the pause_count (0 -> 1) take the pause
* lock and prevent the vidmem clearing thread from processing work
* items.
*
* Otherwise the increment is all that's needed - it's essentially a
* ref-count for the number of pause() calls.
*
* The sync component is implemented by waiting for the lock to be
* released by the clearing thread in case the thread is currently
* processing work items.
*/
if (nvgpu_atomic_inc_return(&mm->vidmem.pause_count) == 1) {
nvgpu_mutex_acquire(&mm->vidmem.clearing_thread_lock);
}
vidmem_dbg(mm->g, "Clearing thread paused; new count=%d",
nvgpu_atomic_read(&mm->vidmem.pause_count));
}
void nvgpu_vidmem_thread_unpause(struct mm_gk20a *mm)
{
vidmem_dbg(mm->g, "Unpausing clearing thread; current count=%d",
nvgpu_atomic_read(&mm->vidmem.pause_count));
/*
* And on the last decrement (1 -> 0) release the pause lock and let
* the vidmem clearing thread continue.
*/
if (nvgpu_atomic_dec_return(&mm->vidmem.pause_count) == 0) {
nvgpu_mutex_release(&mm->vidmem.clearing_thread_lock);
vidmem_dbg(mm->g, " > Clearing thread really unpaused!");
}
}
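/*
 * Editorial sketch (not part of the original file): the pause/unpause pair
 * above is reference counted, so nested callers are safe. A caller that needs
 * the clearing thread quiescent brackets its critical section as below; the
 * function name is hypothetical.
 */
static void nvgpu_vidmem_quiesced_work_sketch(struct mm_gk20a *mm)
{
	nvgpu_vidmem_thread_pause_sync(mm);

	/* ... do work that must not race with vidmem clears ... */

	nvgpu_vidmem_thread_unpause(mm);
}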
int nvgpu_vidmem_clear_list_enqueue(struct gk20a *g, struct nvgpu_mem *mem)
{
struct mm_gk20a *mm = &g->mm;
/*
* Crap. Can't enqueue new vidmem bufs! CE may be gone!
*
* However, an errant app can hold a vidmem dma_buf FD open past when
* the nvgpu driver has exited. Thus when the FD does get closed
* eventually the dma_buf release function will try to call the vidmem
* free function which will attempt to enqueue the vidmem into the
* vidmem clearing thread.
*/
if (nvgpu_is_enabled(g, NVGPU_DRIVER_IS_DYING)) {
return -ENOSYS;
}
nvgpu_mutex_acquire(&mm->vidmem.clear_list_mutex);
nvgpu_list_add_tail(&mem->clear_list_entry,
&mm->vidmem.clear_list_head);
nvgpu_atomic64_add((long)mem->aligned_size, &mm->vidmem.bytes_pending);
nvgpu_mutex_release(&mm->vidmem.clear_list_mutex);
nvgpu_cond_signal_interruptible(&mm->vidmem.clearing_thread_cond);
return 0;
}
static struct nvgpu_mem *nvgpu_vidmem_clear_list_dequeue(struct mm_gk20a *mm)
{
struct nvgpu_mem *mem = NULL;
nvgpu_mutex_acquire(&mm->vidmem.clear_list_mutex);
if (!nvgpu_list_empty(&mm->vidmem.clear_list_head)) {
mem = nvgpu_list_first_entry(&mm->vidmem.clear_list_head,
nvgpu_mem, clear_list_entry);
nvgpu_list_del(&mem->clear_list_entry);
}
nvgpu_mutex_release(&mm->vidmem.clear_list_mutex);
return mem;
}
static void nvgpu_vidmem_clear_pending_allocs(struct mm_gk20a *mm)
{
struct gk20a *g = mm->g;
struct nvgpu_mem *mem;
int err;
vidmem_dbg(g, "Running VIDMEM clearing thread:");
while ((mem = nvgpu_vidmem_clear_list_dequeue(mm)) != NULL) {
err = nvgpu_vidmem_clear(g, mem);
if (err != 0) {
nvgpu_err(g, "nvgpu_vidmem_clear() failed err=%d", err);
}
WARN_ON(nvgpu_atomic64_sub_return((long)mem->aligned_size,
&g->mm.vidmem.bytes_pending) < 0);
mem->size = 0;
mem->aperture = APERTURE_INVALID;
nvgpu_mem_free_vidmem_alloc(g, mem);
nvgpu_kfree(g, mem);
}
vidmem_dbg(g, "Done!");
}
static int nvgpu_vidmem_clear_pending_allocs_thr(void *mm_ptr)
{
struct mm_gk20a *mm = mm_ptr;
/*
	 * Simple thread whose sole job is to periodically clear userspace
* vidmem allocations that have been recently freed.
*
* Since it doesn't make sense to run unless there's pending work a
* condition field is used to wait for work. When the DMA API frees a
* userspace vidmem buf it enqueues it into the clear list and alerts us
* that we have some work to do.
*/
while (!nvgpu_thread_should_stop(&mm->vidmem.clearing_thread)) {
int ret;
/*
* Wait for work but also make sure we should not be paused.
*/
ret = NVGPU_COND_WAIT_INTERRUPTIBLE(
&mm->vidmem.clearing_thread_cond,
nvgpu_thread_should_stop(
&mm->vidmem.clearing_thread) ||
!nvgpu_list_empty(&mm->vidmem.clear_list_head),
0U);
if (ret == -ERESTARTSYS) {
continue;
}
/*
* Use this lock to implement a pause mechanism. By taking this
* lock some other code can prevent this thread from processing
* work items.
*/
if (nvgpu_mutex_tryacquire(&mm->vidmem.clearing_thread_lock)
== 0) {
continue;
}
nvgpu_vidmem_clear_pending_allocs(mm);
nvgpu_mutex_release(&mm->vidmem.clearing_thread_lock);
}
return 0;
}
int nvgpu_vidmem_init(struct mm_gk20a *mm)
{
struct gk20a *g = mm->g;
u64 bootstrap_base, base;
u64 bootstrap_size = SZ_512M;
u64 default_page_size = SZ_64K;
size_t size;
int err;
static struct nvgpu_alloc_carveout bootstrap_co =
NVGPU_CARVEOUT("bootstrap-region", 0, 0);
if (g->ops.fb.get_vidmem_size == NULL) {
/*
		 * As this is a common function, the return value
		 * needs to be handled for iGPU.
*/
return 0;
} else {
size = g->ops.fb.get_vidmem_size(g);
if (size == 0UL) {
nvgpu_err(g, "Found zero vidmem");
return -ENOMEM;
}
}
vidmem_dbg(g, "init begin");
#ifdef CONFIG_NVGPU_SIM
if (nvgpu_is_enabled(g, NVGPU_IS_FMODEL)) {
bootstrap_size = SZ_32M;
}
#endif
bootstrap_co.base = size - bootstrap_size;
bootstrap_co.length = bootstrap_size;
bootstrap_base = bootstrap_co.base;
base = default_page_size;
/*
* Bootstrap allocator for use before the CE is initialized (CE
* initialization requires vidmem but we want to use the CE to zero
	 * out vidmem before allocating it...).
*/
err = nvgpu_allocator_init(g, &g->mm.vidmem.bootstrap_allocator,
NULL, "vidmem-bootstrap", bootstrap_base,
bootstrap_size, SZ_4K, 0ULL,
GPU_ALLOC_FORCE_CONTIG, PAGE_ALLOCATOR);
err = nvgpu_allocator_init(g, &g->mm.vidmem.allocator, NULL,
"vidmem", base, size - base, default_page_size, 0ULL,
GPU_ALLOC_4K_VIDMEM_PAGES, PAGE_ALLOCATOR);
if (err != 0) {
nvgpu_err(g, "Failed to register vidmem for size %zu: %d",
size, err);
return err;
}
/* Reserve bootstrap region in vidmem allocator */
err = nvgpu_alloc_reserve_carveout(&g->mm.vidmem.allocator,
&bootstrap_co);
if (err != 0) {
nvgpu_err(g, "nvgpu_alloc_reserve_carveout() failed err=%d",
err);
goto fail;
}
mm->vidmem.base = base;
mm->vidmem.size = size - base;
mm->vidmem.bootstrap_base = bootstrap_base;
mm->vidmem.bootstrap_size = bootstrap_size;
err = nvgpu_cond_init(&mm->vidmem.clearing_thread_cond);
if (err != 0) {
goto fail;
}
nvgpu_atomic64_set(&mm->vidmem.bytes_pending, 0);
nvgpu_init_list_node(&mm->vidmem.clear_list_head);
nvgpu_mutex_init(&mm->vidmem.clear_list_mutex);
nvgpu_mutex_init(&mm->vidmem.clearing_thread_lock);
nvgpu_mutex_init(&mm->vidmem.first_clear_mutex);
nvgpu_atomic_set(&mm->vidmem.pause_count, 0);
/*
* Start the thread off in the paused state. The thread doesn't have to
* be running for this to work. It will be woken up later on in
* finalize_poweron(). We won't necessarily have a CE context yet
* either, so hypothetically one could cause a race where we try to
* clear a vidmem struct before we have a CE context to do so.
*/
nvgpu_vidmem_thread_pause_sync(mm);
err = nvgpu_thread_create(&mm->vidmem.clearing_thread, mm,
nvgpu_vidmem_clear_pending_allocs_thr,
"vidmem-clear");
if (err != 0) {
goto fail;
}
vidmem_dbg(g, "VIDMEM Total: %zu MB", size >> 20);
vidmem_dbg(g, "VIDMEM Ranges:");
vidmem_dbg(g, " 0x%-10llx -> 0x%-10llx Primary",
mm->vidmem.base, mm->vidmem.base + mm->vidmem.size);
vidmem_dbg(g, " 0x%-10llx -> 0x%-10llx Bootstrap",
mm->vidmem.bootstrap_base,
mm->vidmem.bootstrap_base + mm->vidmem.bootstrap_size);
vidmem_dbg(g, "VIDMEM carveouts:");
vidmem_dbg(g, " 0x%-10llx -> 0x%-10llx %s",
bootstrap_co.base, bootstrap_co.base + bootstrap_co.length,
bootstrap_co.name);
return 0;
fail:
nvgpu_cond_destroy(&mm->vidmem.clearing_thread_cond);
nvgpu_vidmem_destroy(g);
return err;
}
int nvgpu_vidmem_get_space(struct gk20a *g, u64 *space)
{
struct nvgpu_allocator *allocator = &g->mm.vidmem.allocator;
nvgpu_log_fn(g, " ");
if (!nvgpu_alloc_initialized(allocator)) {
return -ENOSYS;
}
*space = nvgpu_alloc_space(allocator) +
U64(nvgpu_atomic64_read(&g->mm.vidmem.bytes_pending));
return 0;
}
int nvgpu_vidmem_clear(struct gk20a *g, struct nvgpu_mem *mem)
{
struct nvgpu_fence_type *fence_out = NULL;
struct nvgpu_fence_type *last_fence = NULL;
struct nvgpu_page_alloc *alloc = NULL;
void *sgl = NULL;
int err = 0;
if (g->mm.vidmem.ce_ctx_id == NVGPU_CE_INVAL_CTX_ID) {
return -EINVAL;
}
alloc = mem->vidmem_alloc;
nvgpu_sgt_for_each_sgl(sgl, &alloc->sgt) {
if (last_fence != NULL) {
nvgpu_fence_put(last_fence);
}
#ifdef CONFIG_NVGPU_DGPU
err = nvgpu_ce_execute_ops(g,
g->mm.vidmem.ce_ctx_id,
0,
nvgpu_sgt_get_phys(g, &alloc->sgt, sgl),
nvgpu_sgt_get_length(&alloc->sgt, sgl),
0x00000000,
NVGPU_CE_DST_LOCATION_LOCAL_FB,
NVGPU_CE_MEMSET,
0,
&fence_out);
#else
/* fail due to lack of ce app support */
err = -ENOSYS;
#endif
if (err != 0) {
#ifdef CONFIG_NVGPU_DGPU
nvgpu_err(g,
"Failed nvgpu_ce_execute_ops[%d]", err);
#endif
return err;
}
vidmem_dbg(g, " > [0x%llx +0x%llx]",
nvgpu_sgt_get_phys(g, &alloc->sgt, sgl),
nvgpu_sgt_get_length(&alloc->sgt, sgl));
last_fence = fence_out;
}
if (last_fence != NULL) {
err = nvgpu_vidmem_clear_fence_wait(g, last_fence);
if (err != 0) {
return err;
}
}
vidmem_dbg(g, " Done");
return err;
}
static int nvgpu_vidmem_clear_all(struct gk20a *g)
{
	int err;

	/* Fast path: the whole of vidmem has already been cleared once. */
	if (g->mm.vidmem.cleared) {
		return 0;
	}

	nvgpu_mutex_acquire(&g->mm.vidmem.first_clear_mutex);
	/* Re-check under the mutex so only the first caller does the clear. */
	if (!g->mm.vidmem.cleared) {
		err = nvgpu_vidmem_do_clear_all(g);
		if (err != 0) {
			nvgpu_mutex_release(&g->mm.vidmem.first_clear_mutex);
			nvgpu_err(g, "failed to clear whole vidmem");
			return err;
		}
	}
	nvgpu_mutex_release(&g->mm.vidmem.first_clear_mutex);

	return 0;
}
int nvgpu_vidmem_user_alloc(struct gk20a *g, size_t bytes,
struct nvgpu_vidmem_buf **vidmem_buf)
{
struct nvgpu_vidmem_buf *buf;
int err;
if (vidmem_buf == NULL) {
return -EINVAL;
}
err = nvgpu_vidmem_clear_all(g);
if (err != 0) {
return -ENOMEM;
}
buf = nvgpu_kzalloc(g, sizeof(*buf));
if (buf == NULL) {
return -ENOMEM;
}
buf->g = g;
buf->mem = nvgpu_kzalloc(g, sizeof(*buf->mem));
if (buf->mem == NULL) {
err = -ENOMEM;
goto fail;
}
err = nvgpu_dma_alloc_vid(g, bytes, buf->mem);
if (err != 0) {
goto fail;
}
/*
* Alerts the DMA API that when we free this vidmem buf we have to
* clear it to avoid leaking data to userspace.
*/
buf->mem->mem_flags |= NVGPU_MEM_FLAG_USER_MEM;
*vidmem_buf = buf;
return 0;
fail:
/* buf will never be NULL here. */
nvgpu_kfree(g, buf->mem);
nvgpu_kfree(g, buf);
return err;
}
void nvgpu_vidmem_buf_free(struct gk20a *g, struct nvgpu_vidmem_buf *buf)
{
/*
* In some error paths it's convenient to be able to "free" a NULL buf.
*/
if (buf == NULL) {
return;
}
nvgpu_dma_free(g, buf->mem);
	/*
	 * buf->mem itself is not freed here; that is handled by
	 * nvgpu_dma_free(). Since these buffers are cleared in the
	 * background, the nvgpu_mem struct must outlive this call, so
	 * ownership is transferred to the DMA API, which frees the struct.
	 */
nvgpu_kfree(g, buf);
}
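
/*
 * Illustrative sketch only, not part of the original file: the expected
 * pairing of the user-facing alloc/free helpers above. The 1 MB size is an
 * arbitrary example value.
 */
static int nvgpu_vidmem_user_alloc_sketch(struct gk20a *g)
{
	struct nvgpu_vidmem_buf *buf = NULL;
	int err = nvgpu_vidmem_user_alloc(g, (size_t)(1UL << 20), &buf);

	if (err != 0) {
		return err;
	}

	/* buf->mem would normally be handed to the export/mapping path. */

	/* NULL-safe; the backing memory is cleared again on free. */
	nvgpu_vidmem_buf_free(g, buf);
	return 0;
}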

View File

File diff suppressed because it is too large

View File

@@ -0,0 +1,311 @@
/*
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <nvgpu/vm.h>
#include <nvgpu/vm_area.h>
#include <nvgpu/barrier.h>
#include <nvgpu/gk20a.h>
#include <nvgpu/static_analysis.h>
struct nvgpu_vm_area *nvgpu_vm_area_find(struct vm_gk20a *vm, u64 addr)
{
struct nvgpu_vm_area *vm_area;
nvgpu_list_for_each_entry(vm_area, &vm->vm_area_list,
nvgpu_vm_area, vm_area_list) {
if (addr >= vm_area->addr) {
if (addr < nvgpu_safe_add_u64(vm_area->addr,
vm_area->size)) {
return vm_area;
}
}
}
return NULL;
}
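
/*
 * Illustrative sketch only, not part of the original file: checking whether a
 * GPU VA lies inside a reserved area. The list walk in nvgpu_vm_area_find()
 * is not locked internally, so the sketch takes update_gmmu_lock around it.
 */
static bool nvgpu_vm_area_contains_sketch(struct vm_gk20a *vm, u64 addr)
{
	struct nvgpu_vm_area *vm_area;
	bool found;

	nvgpu_mutex_acquire(&vm->update_gmmu_lock);
	vm_area = nvgpu_vm_area_find(vm, addr);
	found = (vm_area != NULL);
	nvgpu_mutex_release(&vm->update_gmmu_lock);

	return found;
}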
int nvgpu_vm_area_validate_buffer(struct vm_gk20a *vm,
u64 map_addr, u64 map_size, u32 pgsz_idx,
struct nvgpu_vm_area **pvm_area)
{
struct gk20a *g = vm->mm->g;
struct nvgpu_vm_area *vm_area;
struct nvgpu_mapped_buf *buffer;
u64 map_end;
/* can wrap around with insane map_size; zero is disallowed too */
if (((U64_MAX - map_size) < map_addr) || (map_size == 0ULL)) {
nvgpu_warn(g, "fixed offset mapping with invalid map_size");
return -EINVAL;
}
map_end = map_addr + map_size;
if ((map_addr &
nvgpu_safe_sub_u64(U64(vm->gmmu_page_sizes[pgsz_idx]), U64(1)))
!= 0ULL) {
nvgpu_err(g, "map offset must be buffer page size aligned 0x%llx",
map_addr);
return -EINVAL;
}
/* Find the space reservation, but it's ok to have none for
* userspace-managed address spaces */
vm_area = nvgpu_vm_area_find(vm, map_addr);
if ((vm_area == NULL) && !vm->userspace_managed) {
nvgpu_warn(g, "fixed offset mapping without space allocation");
return -EINVAL;
}
/* Mapped area should fit inside va, if there's one */
if (vm_area != NULL) {
if (map_end > nvgpu_safe_add_u64(vm_area->addr,
vm_area->size)) {
nvgpu_warn(g,
"fixed offset mapping size overflows va node");
return -EINVAL;
}
}
/* check that this mapping does not collide with existing
* mappings by checking the buffer with the highest GPU VA
* that is less than our buffer end */
buffer = nvgpu_vm_find_mapped_buf_less_than(
vm, map_end);
if (buffer != NULL) {
if (nvgpu_safe_add_u64(buffer->addr, buffer->size) > map_addr) {
nvgpu_warn(g, "overlapping buffer map requested");
return -EINVAL;
}
}
*pvm_area = vm_area;
return 0;
}
static int nvgpu_vm_area_alloc_get_pagesize_index(struct vm_gk20a *vm,
u32 *pgsz_idx_ptr, u32 page_size)
{
u32 pgsz_idx = *pgsz_idx_ptr;
for (; pgsz_idx < GMMU_NR_PAGE_SIZES; pgsz_idx++) {
if (vm->gmmu_page_sizes[pgsz_idx] == page_size) {
break;
}
}
*pgsz_idx_ptr = pgsz_idx;
if (pgsz_idx > GMMU_PAGE_SIZE_BIG) {
return -EINVAL;
}
	/*
	 * pgsz_idx is bounded by the loop and the check above, but add a
	 * speculation barrier so a speculatively out-of-range index cannot
	 * be consumed as a page size index before the bound check resolves.
	 */
nvgpu_speculation_barrier();
if (!vm->big_pages && (pgsz_idx == GMMU_PAGE_SIZE_BIG)) {
return -EINVAL;
}
return 0;
}
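
/*
 * Illustrative sketch only, not part of the original file: resolving a page
 * size index with the helper above. The search resumes from the index passed
 * in, so it must be seeded, here with GMMU_PAGE_SIZE_SMALL as the caller in
 * nvgpu_vm_area_alloc() does.
 */
static int nvgpu_vm_area_pgsz_idx_sketch(struct vm_gk20a *vm, u32 page_size,
					u32 *pgsz_idx)
{
	u32 idx = GMMU_PAGE_SIZE_SMALL;
	int err = nvgpu_vm_area_alloc_get_pagesize_index(vm, &idx, page_size);

	if (err != 0) {
		return err;
	}

	*pgsz_idx = idx;
	return 0;
}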
static int nvgpu_vm_area_alloc_memory(struct nvgpu_allocator *vma, u64 our_addr,
		u64 pages, u32 page_size, u32 flags,
		u64 *vaddr_start_ptr)
{
	u64 vaddr_start = 0;

	if ((flags & NVGPU_VM_AREA_ALLOC_FIXED_OFFSET) != 0U) {
		vaddr_start = nvgpu_alloc_fixed(vma, our_addr,
				pages * (u64)page_size, page_size);
	} else {
		vaddr_start = nvgpu_alloc_pte(vma,
				pages * (u64)page_size, page_size);
	}

	if (vaddr_start == 0ULL) {
		return -ENOMEM;
	}

	*vaddr_start_ptr = vaddr_start;
	return 0;
}
static int nvgpu_vm_area_alloc_gmmu_map(struct vm_gk20a *vm,
struct nvgpu_vm_area *vm_area, u64 vaddr_start,
u32 pgsz_idx, u32 flags)
{
struct gk20a *g = vm->mm->g;
if ((flags & NVGPU_VM_AREA_ALLOC_SPARSE) != 0U) {
u64 map_addr = g->ops.mm.gmmu.map(vm, vaddr_start,
NULL,
0,
vm_area->size,
pgsz_idx,
0,
0,
flags,
gk20a_mem_flag_none,
false,
true,
false,
NULL,
APERTURE_INVALID);
if (map_addr == 0ULL) {
return -ENOMEM;
}
vm_area->sparse = true;
}
nvgpu_list_add_tail(&vm_area->vm_area_list, &vm->vm_area_list);
return 0;
}
int nvgpu_vm_area_alloc(struct vm_gk20a *vm, u64 pages, u32 page_size,
u64 *addr, u32 flags)
{
struct gk20a *g = vm->mm->g;
struct nvgpu_allocator *vma;
struct nvgpu_vm_area *vm_area;
u64 vaddr_start = 0;
u64 our_addr = *addr;
u32 pgsz_idx = GMMU_PAGE_SIZE_SMALL;
/*
* If we have a fixed address then use the passed address in *addr. This
* corresponds to the o_a field in the IOCTL. But since we do not
* support specific alignments in the buddy allocator we ignore the
* field if it isn't a fixed offset.
*/
if ((flags & NVGPU_VM_AREA_ALLOC_FIXED_OFFSET) != 0U) {
our_addr = *addr;
}
nvgpu_log(g, gpu_dbg_map,
"ADD vm_area: pgsz=%#-8x pages=%-9llu a/o=%#-14llx flags=0x%x",
page_size, pages, our_addr, flags);
if (nvgpu_vm_area_alloc_get_pagesize_index(vm, &pgsz_idx,
page_size) != 0) {
return -EINVAL;
}
vm_area = nvgpu_kzalloc(g, sizeof(*vm_area));
if (vm_area == NULL) {
return -ENOMEM;
}
vma = vm->vma[pgsz_idx];
if (nvgpu_vm_area_alloc_memory(vma, our_addr, pages,
page_size, flags, &vaddr_start) != 0) {
goto free_vm_area;
}
vm_area->flags = flags;
vm_area->addr = vaddr_start;
vm_area->size = (u64)page_size * pages;
vm_area->pgsz_idx = pgsz_idx;
nvgpu_init_list_node(&vm_area->buffer_list_head);
nvgpu_init_list_node(&vm_area->vm_area_list);
nvgpu_mutex_acquire(&vm->update_gmmu_lock);
if (nvgpu_vm_area_alloc_gmmu_map(vm, vm_area, vaddr_start,
pgsz_idx, flags) != 0) {
nvgpu_mutex_release(&vm->update_gmmu_lock);
goto free_vaddr;
}
nvgpu_mutex_release(&vm->update_gmmu_lock);
*addr = vaddr_start;
return 0;
free_vaddr:
nvgpu_free(vma, vaddr_start);
free_vm_area:
nvgpu_kfree(g, vm_area);
return -ENOMEM;
}
int nvgpu_vm_area_free(struct vm_gk20a *vm, u64 addr)
{
struct gk20a *g = gk20a_from_vm(vm);
struct nvgpu_mapped_buf *buffer;
struct nvgpu_vm_area *vm_area;
nvgpu_mutex_acquire(&vm->update_gmmu_lock);
vm_area = nvgpu_vm_area_find(vm, addr);
if (vm_area == NULL) {
nvgpu_mutex_release(&vm->update_gmmu_lock);
return 0;
}
nvgpu_list_del(&vm_area->vm_area_list);
nvgpu_log(g, gpu_dbg_map,
"DEL vm_area: pgsz=%#-8x pages=%-9llu "
"addr=%#-14llx flags=0x%x",
vm->gmmu_page_sizes[vm_area->pgsz_idx],
vm_area->size / vm->gmmu_page_sizes[vm_area->pgsz_idx],
vm_area->addr,
vm_area->flags);
/* Decrement the ref count on all buffers in this vm_area. This
* allows userspace to let the kernel free mappings that are
* only used by this vm_area. */
while (!nvgpu_list_empty(&vm_area->buffer_list_head)) {
buffer = nvgpu_list_first_entry(&vm_area->buffer_list_head,
nvgpu_mapped_buf, buffer_list);
nvgpu_list_del(&buffer->buffer_list);
nvgpu_ref_put(&buffer->ref, nvgpu_vm_unmap_ref_internal);
}
/* if this was a sparse mapping, free the va */
if (vm_area->sparse) {
g->ops.mm.gmmu.unmap(vm,
vm_area->addr,
vm_area->size,
vm_area->pgsz_idx,
false,
gk20a_mem_flag_none,
true,
NULL);
}
nvgpu_mutex_release(&vm->update_gmmu_lock);
nvgpu_free(vm->vma[vm_area->pgsz_idx], vm_area->addr);
nvgpu_kfree(g, vm_area);
return 0;
}
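
/*
 * Illustrative sketch only, not part of the original file: reserving a small,
 * sparse VM area and releasing it again with the two entry points above. The
 * page count and flags are arbitrary example values.
 */
static int nvgpu_vm_area_alloc_free_sketch(struct vm_gk20a *vm)
{
	u64 addr = 0ULL;
	u32 page_size = vm->gmmu_page_sizes[GMMU_PAGE_SIZE_SMALL];
	int err;

	err = nvgpu_vm_area_alloc(vm, 16ULL, page_size, &addr,
				NVGPU_VM_AREA_ALLOC_SPARSE);
	if (err != 0) {
		return err;
	}

	/* Fixed-offset maps could now go inside [addr, addr + 16*page_size). */

	return nvgpu_vm_area_free(vm, addr);
}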