gpu: nvgpu: Separate GMMU mapping impl from mm_gk20a.c

Separate the non-chip-specific GMMU mapping implementation out of
mm_gk20a.c. This moves all of the chip-agnostic code into
common/mm/gmmu.c in preparation for rewriting it.

JIRA NVGPU-12
JIRA NVGPU-30

Change-Id: I6f7fdac3422703f5e80bb22ad304dc27bba4814d
Signed-off-by: Alex Waterman <alexw@nvidia.com>
Reviewed-on: http://git-master/r/1480228
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Author: Alex Waterman
Date: 2017-05-11 18:25:47 +01:00
Committed by: mobile promotions
Parent: c21f5bca9a
Commit: 048c6b062a
5 changed files with 539 additions and 530 deletions
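For orientation, here is a minimal sketch of how a caller inside the driver might use the chip-agnostic helpers that this change collects in common/mm/gmmu.c. The nvgpu_gmmu_unmap() signature is taken from the hunks below; the nvgpu_gmmu_map() parameters past `size` (flags, rw_flag, priv, aperture) are assumptions based on this era of the driver, so treat the snippet as illustrative rather than authoritative.

/*
 * Illustration only: map a kernel buffer into a VM with the chip-agnostic
 * helpers from common/mm/gmmu.c. The nvgpu_gmmu_map() arguments after
 * `size` are assumed; check include/nvgpu/gmmu.h in your tree.
 */
static int example_map_and_unmap(struct vm_gk20a *vm, struct nvgpu_mem *mem)
{
	u64 gpu_va;

	/* Non-fixed mapping: the GPU VA is chosen by the VM's allocator. */
	gpu_va = nvgpu_gmmu_map(vm, mem, mem->size,
				0,                   /* no special map flags */
				gk20a_mem_flag_none, /* read/write */
				false,               /* not privileged */
				mem->aperture);
	if (!gpu_va)
		return -ENOMEM;

	/* ... use the mapping ... */

	nvgpu_gmmu_unmap(vm, mem, gpu_va);
	return 0;
}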


@@ -15,14 +15,81 @@
*/
#include <nvgpu/log.h>
#include <nvgpu/list.h>
#include <nvgpu/dma.h>
#include <nvgpu/gmmu.h>
#include <nvgpu/nvgpu_mem.h>
#include <nvgpu/enabled.h>
#include <nvgpu/page_allocator.h>
#include "gk20a/gk20a.h"
#include "gk20a/mm_gk20a.h"
#define gmmu_dbg(g, fmt, args...) \
nvgpu_log(g, gpu_dbg_map, fmt, ##args)
#define gmmu_dbg_v(g, fmt, args...) \
nvgpu_log(g, gpu_dbg_map_v, fmt, ##args)
static int map_gmmu_phys_pages(struct gk20a_mm_entry *entry)
{
FLUSH_CPU_DCACHE(entry->mem.cpu_va,
sg_phys(entry->mem.priv.sgt->sgl),
entry->mem.priv.sgt->sgl->length);
return 0;
}
static void unmap_gmmu_phys_pages(struct gk20a_mm_entry *entry)
{
FLUSH_CPU_DCACHE(entry->mem.cpu_va,
sg_phys(entry->mem.priv.sgt->sgl),
entry->mem.priv.sgt->sgl->length);
}
static int map_gmmu_pages(struct gk20a *g, struct gk20a_mm_entry *entry)
{
gk20a_dbg_fn("");
if (nvgpu_is_enabled(g, NVGPU_IS_FMODEL))
return map_gmmu_phys_pages(entry);
if (IS_ENABLED(CONFIG_ARM64)) {
if (entry->mem.aperture == APERTURE_VIDMEM)
return 0;
FLUSH_CPU_DCACHE(entry->mem.cpu_va,
sg_phys(entry->mem.priv.sgt->sgl),
entry->mem.size);
} else {
int err = nvgpu_mem_begin(g, &entry->mem);
if (err)
return err;
}
return 0;
}
static void unmap_gmmu_pages(struct gk20a *g, struct gk20a_mm_entry *entry)
{
gk20a_dbg_fn("");
if (nvgpu_is_enabled(g, NVGPU_IS_FMODEL)) {
unmap_gmmu_phys_pages(entry);
return;
}
if (IS_ENABLED(CONFIG_ARM64)) {
if (entry->mem.aperture == APERTURE_VIDMEM)
return;
FLUSH_CPU_DCACHE(entry->mem.cpu_va,
sg_phys(entry->mem.priv.sgt->sgl),
entry->mem.size);
} else {
nvgpu_mem_end(g, &entry->mem);
}
}
static int alloc_gmmu_phys_pages(struct vm_gk20a *vm, u32 order,
struct gk20a_mm_entry *entry)
{
@@ -97,6 +164,44 @@ static int nvgpu_alloc_gmmu_pages(struct vm_gk20a *vm, u32 order,
return 0;
}
static void free_gmmu_phys_pages(struct vm_gk20a *vm,
struct gk20a_mm_entry *entry)
{
gk20a_dbg_fn("");
/* note: mem_desc slightly abused (wrt. nvgpu_free_gmmu_pages) */
free_pages((unsigned long)entry->mem.cpu_va, get_order(entry->mem.size));
entry->mem.cpu_va = NULL;
sg_free_table(entry->mem.priv.sgt);
nvgpu_kfree(vm->mm->g, entry->mem.priv.sgt);
entry->mem.priv.sgt = NULL;
entry->mem.size = 0;
entry->mem.aperture = APERTURE_INVALID;
}
void nvgpu_free_gmmu_pages(struct vm_gk20a *vm,
struct gk20a_mm_entry *entry)
{
struct gk20a *g = gk20a_from_vm(vm);
gk20a_dbg_fn("");
if (!entry->mem.size)
return;
if (entry->woffset) /* fake shadow mem */
return;
if (nvgpu_is_enabled(g, NVGPU_IS_FMODEL)) {
free_gmmu_phys_pages(vm, entry);
return;
}
nvgpu_dma_free(g, &entry->mem);
}
/*
* Allocate a phys contig region big enough for a full
* sized gmmu page table for the given gmmu_page_size.
@@ -202,6 +307,9 @@ static u64 __nvgpu_gmmu_map(struct vm_gk20a *vm,
return vaddr;
}
/*
* Convenience wrapper over __nvgpu_gmmu_map() for non-fixed mappings.
*/
u64 nvgpu_gmmu_map(struct vm_gk20a *vm,
struct nvgpu_mem *mem,
u64 size,
@@ -246,3 +354,412 @@ void nvgpu_gmmu_unmap(struct vm_gk20a *vm, struct nvgpu_mem *mem, u64 gpu_va)
nvgpu_mutex_release(&vm->update_gmmu_lock);
}
static int update_gmmu_level_locked(struct vm_gk20a *vm,
struct gk20a_mm_entry *pte,
enum gmmu_pgsz_gk20a pgsz_idx,
struct scatterlist **sgl,
u64 *offset,
u64 *iova,
u64 gpu_va, u64 gpu_end,
u8 kind_v, u64 *ctag,
bool cacheable, bool unmapped_pte,
int rw_flag,
bool sparse,
int lvl,
bool priv,
enum nvgpu_aperture aperture)
{
struct gk20a *g = gk20a_from_vm(vm);
const struct gk20a_mmu_level *l = &vm->mmu_levels[lvl];
const struct gk20a_mmu_level *next_l = &vm->mmu_levels[lvl+1];
int err = 0;
u32 pde_i;
u64 pde_size = 1ULL << (u64)l->lo_bit[pgsz_idx];
struct gk20a_mm_entry *next_pte = NULL, *prev_pte = NULL;
gk20a_dbg_fn("");
pde_i = (gpu_va & ((1ULL << ((u64)l->hi_bit[pgsz_idx]+1)) - 1ULL))
>> (u64)l->lo_bit[pgsz_idx];
gk20a_dbg(gpu_dbg_pte, "size_idx=%d, l: %d, [%llx,%llx], iova=%llx",
pgsz_idx, lvl, gpu_va, gpu_end-1, *iova);
while (gpu_va < gpu_end) {
u64 next = min((gpu_va + pde_size) & ~(pde_size-1), gpu_end);
/* Allocate next level */
if (next_l->update_entry) {
if (!pte->entries) {
int num_entries =
1 <<
(l->hi_bit[pgsz_idx]
- l->lo_bit[pgsz_idx] + 1);
pte->entries =
nvgpu_vzalloc(g,
sizeof(struct gk20a_mm_entry) *
num_entries);
if (!pte->entries)
return -ENOMEM;
pte->pgsz = pgsz_idx;
pte->num_entries = num_entries;
}
prev_pte = next_pte;
next_pte = pte->entries + pde_i;
if (!next_pte->mem.size) {
err = nvgpu_zalloc_gmmu_page_table(vm,
pgsz_idx, next_l, next_pte, prev_pte);
if (err)
return err;
}
}
err = l->update_entry(vm, pte, pde_i, pgsz_idx,
sgl, offset, iova,
kind_v, ctag, cacheable, unmapped_pte,
rw_flag, sparse, priv, aperture);
if (err)
return err;
if (next_l->update_entry) {
/* get cpu access to the ptes */
err = map_gmmu_pages(g, next_pte);
if (err) {
nvgpu_err(g,
"couldn't map ptes for update as=%d",
vm_aspace_id(vm));
return err;
}
err = update_gmmu_level_locked(vm, next_pte,
pgsz_idx,
sgl,
offset,
iova,
gpu_va,
next,
kind_v, ctag, cacheable, unmapped_pte,
rw_flag, sparse, lvl+1, priv, aperture);
unmap_gmmu_pages(g, next_pte);
if (err)
return err;
}
pde_i++;
gpu_va = next;
}
gk20a_dbg_fn("done");
return 0;
}
/*
* This is the true top level GMMU mapping logic. This breaks down the incoming
* scatter gather table and does actual programming of GPU virtual address to
* physical* address.
*
* The update of each level of the page tables is farmed out to chip specific
* implementations. But the logic around that is generic to all chips. Every chip
* has some number of PDE levels and then a PTE level.
*
* Each chunk of the incoming SGT is sent to the chip specific implementation
* of page table update.
*
* [*] Note: the "physical" address may actually be an IO virtual address in the
* case of SMMU usage.
*/
static int update_gmmu_ptes_locked(struct vm_gk20a *vm,
enum gmmu_pgsz_gk20a pgsz_idx,
struct sg_table *sgt,
u64 buffer_offset,
u64 gpu_va, u64 gpu_end,
u8 kind_v, u32 ctag_offset,
bool cacheable, bool unmapped_pte,
int rw_flag,
bool sparse,
bool priv,
enum nvgpu_aperture aperture)
{
struct gk20a *g = gk20a_from_vm(vm);
int ctag_granularity = g->ops.fb.compression_page_size(g);
u64 ctag = (u64)ctag_offset * (u64)ctag_granularity;
u64 iova = 0;
u64 space_to_skip = buffer_offset;
u64 map_size = gpu_end - gpu_va;
u32 page_size = vm->gmmu_page_sizes[pgsz_idx];
int err;
struct scatterlist *sgl = NULL;
struct nvgpu_page_alloc *alloc = NULL;
struct page_alloc_chunk *chunk = NULL;
u64 length;
/* note: here we need to map kernel to small, since the
* low-level mmu code assumes 0 is small and 1 is big pages */
if (pgsz_idx == gmmu_page_size_kernel)
pgsz_idx = gmmu_page_size_small;
if (space_to_skip & (page_size - 1))
return -EINVAL;
err = map_gmmu_pages(g, &vm->pdb);
if (err) {
nvgpu_err(g,
"couldn't map ptes for update as=%d",
vm_aspace_id(vm));
return err;
}
if (aperture == APERTURE_VIDMEM) {
gmmu_dbg_v(g, "vidmem map size_idx=%d, gpu_va=[%llx,%llx]",
pgsz_idx, gpu_va, gpu_end-1);
if (sgt) {
alloc = get_vidmem_page_alloc(sgt->sgl);
nvgpu_list_for_each_entry(chunk, &alloc->alloc_chunks,
page_alloc_chunk, list_entry) {
if (space_to_skip &&
space_to_skip > chunk->length) {
space_to_skip -= chunk->length;
} else {
iova = chunk->base + space_to_skip;
length = chunk->length - space_to_skip;
length = min(length, map_size);
space_to_skip = 0;
err = update_gmmu_level_locked(vm,
&vm->pdb, pgsz_idx,
&sgl,
&space_to_skip,
&iova,
gpu_va, gpu_va + length,
kind_v, &ctag,
cacheable, unmapped_pte,
rw_flag, sparse, 0, priv,
aperture);
if (err)
break;
/* need to set explicit zero here */
space_to_skip = 0;
gpu_va += length;
map_size -= length;
if (!map_size)
break;
}
}
} else {
err = update_gmmu_level_locked(vm, &vm->pdb, pgsz_idx,
&sgl,
&space_to_skip,
&iova,
gpu_va, gpu_end,
kind_v, &ctag,
cacheable, unmapped_pte, rw_flag,
sparse, 0, priv,
aperture);
}
} else {
gmmu_dbg_v(g,
"pgsz=%-6d, gpu_va: %#-12llx +%#-6llx phys: %#-12llx "
"buffer offset: %-4lld, nents: %d",
page_size,
gpu_va, gpu_end - gpu_va,
sgt ? g->ops.mm.get_iova_addr(g, sgt->sgl, 0) : 0ULL,
buffer_offset,
sgt ? sgt->nents : 0);
if (sgt) {
iova = g->ops.mm.get_iova_addr(vm->mm->g, sgt->sgl, 0);
if (!vm->mm->bypass_smmu && iova) {
iova += space_to_skip;
} else {
sgl = sgt->sgl;
gk20a_dbg(gpu_dbg_pte, "chunk address %llx, size %d",
(u64)sg_phys(sgl),
sgl->length);
while (space_to_skip && sgl &&
space_to_skip + page_size > sgl->length) {
space_to_skip -= sgl->length;
sgl = sg_next(sgl);
gk20a_dbg(gpu_dbg_pte, "chunk address %llx, size %d",
(u64)sg_phys(sgl),
sgl->length);
}
iova = sg_phys(sgl) + space_to_skip;
}
}
err = update_gmmu_level_locked(vm, &vm->pdb, pgsz_idx,
&sgl,
&space_to_skip,
&iova,
gpu_va, gpu_end,
kind_v, &ctag,
cacheable, unmapped_pte, rw_flag,
sparse, 0, priv,
aperture);
}
unmap_gmmu_pages(g, &vm->pdb);
mb();
gk20a_dbg_fn("done");
return err;
}
/**
* gk20a_locked_gmmu_map - Map a buffer into the GMMU
*
* This is for non-vGPU chips. It's part of the HAL at the moment but really
* should not be. Chip specific stuff is handled at the PTE/PDE programming
* layer. The rest of the logic is essentially generic for all chips.
*
* To call this function you must have locked the VM lock: vm->update_gmmu_lock.
* However, note: this function is not called directly. It's used through the
* mm.gmmu_lock() HAL. So before calling the mm.gmmu_lock() HAL make sure you
* have the update_gmmu_lock aquired.
*/
u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm,
u64 map_offset,
struct sg_table *sgt,
u64 buffer_offset,
u64 size,
int pgsz_idx,
u8 kind_v,
u32 ctag_offset,
u32 flags,
int rw_flag,
bool clear_ctags,
bool sparse,
bool priv,
struct vm_gk20a_mapping_batch *batch,
enum nvgpu_aperture aperture)
{
int err = 0;
bool allocated = false;
struct gk20a *g = gk20a_from_vm(vm);
int ctag_granularity = g->ops.fb.compression_page_size(g);
u32 ctag_lines = DIV_ROUND_UP_ULL(size, ctag_granularity);
/* Allocate (or validate when map_offset != 0) the virtual address. */
if (!map_offset) {
map_offset = __nvgpu_vm_alloc_va(vm, size,
pgsz_idx);
if (!map_offset) {
nvgpu_err(g, "failed to allocate va space");
err = -ENOMEM;
goto fail_alloc;
}
allocated = true;
}
gmmu_dbg(g,
"gv: 0x%04x_%08x + 0x%-7llx "
"[dma: 0x%02x_%08x, pa: 0x%02x_%08x] "
"pgsz=%-3dKb as=%-2d ctags=%d start=%d "
"kind=0x%x flags=0x%x apt=%s",
u64_hi32(map_offset), u64_lo32(map_offset), size,
sgt ? u64_hi32((u64)sg_dma_address(sgt->sgl)) : 0,
sgt ? u64_lo32((u64)sg_dma_address(sgt->sgl)) : 0,
sgt ? u64_hi32((u64)sg_phys(sgt->sgl)) : 0,
sgt ? u64_lo32((u64)sg_phys(sgt->sgl)) : 0,
vm->gmmu_page_sizes[pgsz_idx] >> 10, vm_aspace_id(vm),
ctag_lines, ctag_offset,
kind_v, flags, nvgpu_aperture_str(aperture));
err = update_gmmu_ptes_locked(vm, pgsz_idx,
sgt,
buffer_offset,
map_offset, map_offset + size,
kind_v,
ctag_offset,
flags &
NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
flags &
NVGPU_AS_MAP_BUFFER_FLAGS_UNMAPPED_PTE,
rw_flag,
sparse,
priv,
aperture);
if (err) {
nvgpu_err(g, "failed to update ptes on map");
goto fail_validate;
}
if (!batch)
g->ops.fb.tlb_invalidate(g, &vm->pdb.mem);
else
batch->need_tlb_invalidate = true;
return map_offset;
fail_validate:
if (allocated)
__nvgpu_vm_free_va(vm, map_offset, pgsz_idx);
fail_alloc:
nvgpu_err(g, "%s: failed with err=%d", __func__, err);
return 0;
}
void gk20a_locked_gmmu_unmap(struct vm_gk20a *vm,
u64 vaddr,
u64 size,
int pgsz_idx,
bool va_allocated,
int rw_flag,
bool sparse,
struct vm_gk20a_mapping_batch *batch)
{
int err = 0;
struct gk20a *g = gk20a_from_vm(vm);
if (va_allocated) {
err = __nvgpu_vm_free_va(vm, vaddr, pgsz_idx);
if (err) {
nvgpu_err(g, "failed to free va");
return;
}
}
/* unmap here needs to know the page size we assigned at mapping */
err = update_gmmu_ptes_locked(vm,
pgsz_idx,
NULL, /* n/a for unmap */
0,
vaddr,
vaddr + size,
0, 0, false /* n/a for unmap */,
false, rw_flag,
sparse, 0,
APERTURE_INVALID); /* don't care for unmap */
if (err)
nvgpu_err(g, "failed to update gmmu ptes on unmap");
/* flush l2 so any dirty lines are written out *now*.
* also as we could potentially be switching this buffer
* from nonvolatile (l2 cacheable) to volatile (l2 non-cacheable) at
* some point in the future we need to invalidate l2. e.g. switching
* from a render buffer unmap (here) to later using the same memory
* for gmmu ptes. note the positioning of this relative to any smmu
* unmapping (below). */
if (!batch) {
gk20a_mm_l2_flush(g, true);
g->ops.fb.tlb_invalidate(g, &vm->pdb.mem);
} else {
if (!batch->gpu_l2_flushed) {
gk20a_mm_l2_flush(g, true);
batch->gpu_l2_flushed = true;
}
batch->need_tlb_invalidate = true;
}
}
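The per-level walk in update_gmmu_level_locked() above pivots on two small pieces of arithmetic: the span of GPU VA covered by one PDE at a level (pde_size) and the index of the PDE covering a given VA (pde_i). The standalone sketch below reproduces that math outside the driver; the bit positions (hi_bit = 37, lo_bit = 26) and the sample VA are hypothetical, chosen only to mirror the shape of the gk20a level tables.

#include <stdint.h>
#include <stdio.h>

/*
 * Standalone illustration of the PDE index math in update_gmmu_level_locked().
 * hi_bit/lo_bit are hypothetical example values; the real driver reads them
 * from vm->mmu_levels[lvl].hi_bit/lo_bit for the active page size.
 */
int main(void)
{
	const uint64_t hi_bit = 37, lo_bit = 26;
	const uint64_t pde_size = 1ULL << lo_bit;   /* VA covered by one PDE */
	uint64_t gpu_va = 0x1234567000ULL;

	/* Mask off bits above this level, then shift down to get the index. */
	uint64_t pde_i = (gpu_va & ((1ULL << (hi_bit + 1)) - 1ULL)) >> lo_bit;

	/* Start of the next PDE; the real code clamps this to gpu_end. */
	uint64_t next = (gpu_va + pde_size) & ~(pde_size - 1ULL);

	printf("pde_i = %llu, next = 0x%llx\n",
	       (unsigned long long)pde_i, (unsigned long long)next);
	return 0;
}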


@@ -18,6 +18,7 @@
#include <nvgpu/dma.h>
#include <nvgpu/vm.h>
#include <nvgpu/vm_area.h>
#include <nvgpu/gmmu.h>
#include <nvgpu/lock.h>
#include <nvgpu/list.h>
#include <nvgpu/rbtree.h>
@@ -34,6 +35,22 @@ int vm_aspace_id(struct vm_gk20a *vm)
return vm->as_share ? vm->as_share->id : -1;
}
static void nvgpu_vm_free_entries(struct vm_gk20a *vm,
struct gk20a_mm_entry *parent,
int level)
{
int i;
if (parent->entries)
for (i = 0; i < parent->num_entries; i++)
nvgpu_vm_free_entries(vm, &parent->entries[i], level+1);
if (parent->mem.size)
nvgpu_free_gmmu_pages(vm, parent);
nvgpu_vfree(vm->mm->g, parent->entries);
parent->entries = NULL;
}
u64 __nvgpu_vm_alloc_va(struct vm_gk20a *vm, u64 size,
enum gmmu_pgsz_gk20a pgsz_idx)
@@ -421,7 +438,7 @@ clean_up_allocators:
clean_up_page_tables:
/* Cleans up nvgpu_vm_init_page_tables() */
nvgpu_vfree(g, vm->pdb.entries);
free_gmmu_pages(vm, &vm->pdb);
nvgpu_free_gmmu_pages(vm, &vm->pdb);
clean_up_vgpu_vm:
#ifdef CONFIG_TEGRA_GR_VIRTUALIZATION
if (g->is_virtual)
@@ -537,7 +554,7 @@ static void __nvgpu_vm_remove(struct vm_gk20a *vm)
if (nvgpu_alloc_initialized(&vm->user_lp))
nvgpu_alloc_destroy(&vm->user_lp);
gk20a_vm_free_entries(vm, &vm->pdb, 0);
nvgpu_vm_free_entries(vm, &vm->pdb, 0);
#ifdef CONFIG_TEGRA_GR_VIRTUALIZATION
if (g->is_virtual)


@@ -124,15 +124,6 @@ struct nvgpu_page_alloc *get_vidmem_page_alloc(struct scatterlist *sgl)
*
*/
static int update_gmmu_ptes_locked(struct vm_gk20a *vm,
enum gmmu_pgsz_gk20a pgsz_idx,
struct sg_table *sgt, u64 buffer_offset,
u64 first_vaddr, u64 last_vaddr,
u8 kind_v, u32 ctag_offset, bool cacheable,
bool umapped_pte, int rw_flag,
bool sparse,
bool priv,
enum nvgpu_aperture aperture);
static int __must_check gk20a_init_system_vm(struct mm_gk20a *mm);
static int __must_check gk20a_init_bar1_vm(struct mm_gk20a *mm);
static int __must_check gk20a_init_hwpm(struct mm_gk20a *mm);
@@ -781,104 +772,6 @@ void gk20a_init_mm_ce_context(struct gk20a *g)
#endif
}
static void free_gmmu_phys_pages(struct vm_gk20a *vm,
struct gk20a_mm_entry *entry)
{
gk20a_dbg_fn("");
/* note: mem_desc slightly abused (wrt. free_gmmu_pages) */
free_pages((unsigned long)entry->mem.cpu_va, get_order(entry->mem.size));
entry->mem.cpu_va = NULL;
sg_free_table(entry->mem.priv.sgt);
nvgpu_kfree(vm->mm->g, entry->mem.priv.sgt);
entry->mem.priv.sgt = NULL;
entry->mem.size = 0;
entry->mem.aperture = APERTURE_INVALID;
}
static int map_gmmu_phys_pages(struct gk20a_mm_entry *entry)
{
FLUSH_CPU_DCACHE(entry->mem.cpu_va,
sg_phys(entry->mem.priv.sgt->sgl),
entry->mem.priv.sgt->sgl->length);
return 0;
}
static void unmap_gmmu_phys_pages(struct gk20a_mm_entry *entry)
{
FLUSH_CPU_DCACHE(entry->mem.cpu_va,
sg_phys(entry->mem.priv.sgt->sgl),
entry->mem.priv.sgt->sgl->length);
}
void free_gmmu_pages(struct vm_gk20a *vm,
struct gk20a_mm_entry *entry)
{
struct gk20a *g = gk20a_from_vm(vm);
gk20a_dbg_fn("");
if (!entry->mem.size)
return;
if (entry->woffset) /* fake shadow mem */
return;
if (nvgpu_is_enabled(g, NVGPU_IS_FMODEL)) {
free_gmmu_phys_pages(vm, entry);
return;
}
nvgpu_dma_free(g, &entry->mem);
}
int map_gmmu_pages(struct gk20a *g, struct gk20a_mm_entry *entry)
{
gk20a_dbg_fn("");
if (nvgpu_is_enabled(g, NVGPU_IS_FMODEL))
return map_gmmu_phys_pages(entry);
if (IS_ENABLED(CONFIG_ARM64)) {
if (entry->mem.aperture == APERTURE_VIDMEM)
return 0;
FLUSH_CPU_DCACHE(entry->mem.cpu_va,
sg_phys(entry->mem.priv.sgt->sgl),
entry->mem.size);
} else {
int err = nvgpu_mem_begin(g, &entry->mem);
if (err)
return err;
}
return 0;
}
void unmap_gmmu_pages(struct gk20a *g, struct gk20a_mm_entry *entry)
{
gk20a_dbg_fn("");
if (nvgpu_is_enabled(g, NVGPU_IS_FMODEL)) {
unmap_gmmu_phys_pages(entry);
return;
}
if (IS_ENABLED(CONFIG_ARM64)) {
if (entry->mem.aperture == APERTURE_VIDMEM)
return;
FLUSH_CPU_DCACHE(entry->mem.cpu_va,
sg_phys(entry->mem.priv.sgt->sgl),
entry->mem.size);
} else {
nvgpu_mem_end(g, &entry->mem);
}
}
int gk20a_mm_pde_coverage_bit_count(struct vm_gk20a *vm)
{
return vm->mmu_levels[0].lo_bit[0];
@@ -909,21 +802,6 @@ static u32 pte_from_index(u32 i)
return i * gmmu_pte__size_v() / sizeof(u32);
}
u32 pte_index_from_vaddr(struct vm_gk20a *vm,
u64 addr, enum gmmu_pgsz_gk20a pgsz_idx)
{
u32 ret;
/* mask off pde part */
addr = addr & ((1ULL << gk20a_mm_pde_coverage_bit_count(vm)) - 1ULL);
/* shift over to get pte index. note assumption that pte index
* doesn't leak over into the high 32b */
ret = (u32)(addr >> ilog2(vm->gmmu_page_sizes[pgsz_idx]));
gk20a_dbg(gpu_dbg_pte, "addr=0x%llx pte_i=0x%x", addr, ret);
return ret;
}
int nvgpu_vm_get_buffers(struct vm_gk20a *vm,
struct nvgpu_mapped_buf ***mapped_buffers,
int *num_buffers)
@@ -1096,141 +974,6 @@ int setup_buffer_kind_and_compression(struct vm_gk20a *vm,
return 0;
}
u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm,
u64 map_offset,
struct sg_table *sgt,
u64 buffer_offset,
u64 size,
int pgsz_idx,
u8 kind_v,
u32 ctag_offset,
u32 flags,
int rw_flag,
bool clear_ctags,
bool sparse,
bool priv,
struct vm_gk20a_mapping_batch *batch,
enum nvgpu_aperture aperture)
{
int err = 0;
bool allocated = false;
struct gk20a *g = gk20a_from_vm(vm);
int ctag_granularity = g->ops.fb.compression_page_size(g);
u32 ctag_lines = DIV_ROUND_UP_ULL(size, ctag_granularity);
/* Allocate (or validate when map_offset != 0) the virtual address. */
if (!map_offset) {
map_offset = __nvgpu_vm_alloc_va(vm, size,
pgsz_idx);
if (!map_offset) {
nvgpu_err(g, "failed to allocate va space");
err = -ENOMEM;
goto fail_alloc;
}
allocated = true;
}
gk20a_dbg(gpu_dbg_map,
"gv: 0x%04x_%08x + 0x%-7llx "
"[dma: 0x%02x_%08x, pa: 0x%02x_%08x] "
"pgsz=%-3dKb as=%-2d ctags=%d start=%d "
"kind=0x%x flags=0x%x apt=%s",
u64_hi32(map_offset), u64_lo32(map_offset), size,
sgt ? u64_hi32((u64)sg_dma_address(sgt->sgl)) : 0,
sgt ? u64_lo32((u64)sg_dma_address(sgt->sgl)) : 0,
sgt ? u64_hi32((u64)sg_phys(sgt->sgl)) : 0,
sgt ? u64_lo32((u64)sg_phys(sgt->sgl)) : 0,
vm->gmmu_page_sizes[pgsz_idx] >> 10, vm_aspace_id(vm),
ctag_lines, ctag_offset,
kind_v, flags, nvgpu_aperture_str(aperture));
err = update_gmmu_ptes_locked(vm, pgsz_idx,
sgt,
buffer_offset,
map_offset, map_offset + size,
kind_v,
ctag_offset,
flags &
NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
flags &
NVGPU_AS_MAP_BUFFER_FLAGS_UNMAPPED_PTE,
rw_flag,
sparse,
priv,
aperture);
if (err) {
nvgpu_err(g, "failed to update ptes on map");
goto fail_validate;
}
if (!batch)
g->ops.fb.tlb_invalidate(g, &vm->pdb.mem);
else
batch->need_tlb_invalidate = true;
return map_offset;
fail_validate:
if (allocated)
__nvgpu_vm_free_va(vm, map_offset, pgsz_idx);
fail_alloc:
nvgpu_err(g, "%s: failed with err=%d", __func__, err);
return 0;
}
void gk20a_locked_gmmu_unmap(struct vm_gk20a *vm,
u64 vaddr,
u64 size,
int pgsz_idx,
bool va_allocated,
int rw_flag,
bool sparse,
struct vm_gk20a_mapping_batch *batch)
{
int err = 0;
struct gk20a *g = gk20a_from_vm(vm);
if (va_allocated) {
err = __nvgpu_vm_free_va(vm, vaddr, pgsz_idx);
if (err) {
nvgpu_err(g, "failed to free va");
return;
}
}
/* unmap here needs to know the page size we assigned at mapping */
err = update_gmmu_ptes_locked(vm,
pgsz_idx,
NULL, /* n/a for unmap */
0,
vaddr,
vaddr + size,
0, 0, false /* n/a for unmap */,
false, rw_flag,
sparse, 0,
APERTURE_INVALID); /* don't care for unmap */
if (err)
nvgpu_err(g, "failed to update gmmu ptes on unmap");
/* flush l2 so any dirty lines are written out *now*.
* also as we could potentially be switching this buffer
* from nonvolatile (l2 cacheable) to volatile (l2 non-cacheable) at
* some point in the future we need to invalidate l2. e.g. switching
* from a render buffer unmap (here) to later using the same memory
* for gmmu ptes. note the positioning of this relative to any smmu
* unmapping (below). */
if (!batch) {
gk20a_mm_l2_flush(g, true);
g->ops.fb.tlb_invalidate(g, &vm->pdb.mem);
} else {
if (!batch->gpu_l2_flushed) {
gk20a_mm_l2_flush(g, true);
batch->gpu_l2_flushed = true;
}
batch->need_tlb_invalidate = true;
}
}
enum nvgpu_aperture gk20a_dmabuf_aperture(struct gk20a *g,
struct dma_buf *dmabuf)
{
@@ -2036,254 +1779,6 @@ static int update_gmmu_pte_locked(struct vm_gk20a *vm,
return 0;
}
static int update_gmmu_level_locked(struct vm_gk20a *vm,
struct gk20a_mm_entry *pte,
enum gmmu_pgsz_gk20a pgsz_idx,
struct scatterlist **sgl,
u64 *offset,
u64 *iova,
u64 gpu_va, u64 gpu_end,
u8 kind_v, u64 *ctag,
bool cacheable, bool unmapped_pte,
int rw_flag,
bool sparse,
int lvl,
bool priv,
enum nvgpu_aperture aperture)
{
struct gk20a *g = gk20a_from_vm(vm);
const struct gk20a_mmu_level *l = &vm->mmu_levels[lvl];
const struct gk20a_mmu_level *next_l = &vm->mmu_levels[lvl+1];
int err = 0;
u32 pde_i;
u64 pde_size = 1ULL << (u64)l->lo_bit[pgsz_idx];
struct gk20a_mm_entry *next_pte = NULL, *prev_pte = NULL;
gk20a_dbg_fn("");
pde_i = (gpu_va & ((1ULL << ((u64)l->hi_bit[pgsz_idx]+1)) - 1ULL))
>> (u64)l->lo_bit[pgsz_idx];
gk20a_dbg(gpu_dbg_pte, "size_idx=%d, l: %d, [%llx,%llx], iova=%llx",
pgsz_idx, lvl, gpu_va, gpu_end-1, *iova);
while (gpu_va < gpu_end) {
u64 next = min((gpu_va + pde_size) & ~(pde_size-1), gpu_end);
/* Allocate next level */
if (next_l->update_entry) {
if (!pte->entries) {
int num_entries =
1 <<
(l->hi_bit[pgsz_idx]
- l->lo_bit[pgsz_idx] + 1);
pte->entries =
nvgpu_vzalloc(g,
sizeof(struct gk20a_mm_entry) *
num_entries);
if (!pte->entries)
return -ENOMEM;
pte->pgsz = pgsz_idx;
pte->num_entries = num_entries;
}
prev_pte = next_pte;
next_pte = pte->entries + pde_i;
if (!next_pte->mem.size) {
err = nvgpu_zalloc_gmmu_page_table(vm,
pgsz_idx, next_l, next_pte, prev_pte);
if (err)
return err;
}
}
err = l->update_entry(vm, pte, pde_i, pgsz_idx,
sgl, offset, iova,
kind_v, ctag, cacheable, unmapped_pte,
rw_flag, sparse, priv, aperture);
if (err)
return err;
if (next_l->update_entry) {
/* get cpu access to the ptes */
err = map_gmmu_pages(g, next_pte);
if (err) {
nvgpu_err(g,
"couldn't map ptes for update as=%d",
vm_aspace_id(vm));
return err;
}
err = update_gmmu_level_locked(vm, next_pte,
pgsz_idx,
sgl,
offset,
iova,
gpu_va,
next,
kind_v, ctag, cacheable, unmapped_pte,
rw_flag, sparse, lvl+1, priv, aperture);
unmap_gmmu_pages(g, next_pte);
if (err)
return err;
}
pde_i++;
gpu_va = next;
}
gk20a_dbg_fn("done");
return 0;
}
static int update_gmmu_ptes_locked(struct vm_gk20a *vm,
enum gmmu_pgsz_gk20a pgsz_idx,
struct sg_table *sgt,
u64 buffer_offset,
u64 gpu_va, u64 gpu_end,
u8 kind_v, u32 ctag_offset,
bool cacheable, bool unmapped_pte,
int rw_flag,
bool sparse,
bool priv,
enum nvgpu_aperture aperture)
{
struct gk20a *g = gk20a_from_vm(vm);
int ctag_granularity = g->ops.fb.compression_page_size(g);
u64 ctag = (u64)ctag_offset * (u64)ctag_granularity;
u64 iova = 0;
u64 space_to_skip = buffer_offset;
u64 map_size = gpu_end - gpu_va;
u32 page_size = vm->gmmu_page_sizes[pgsz_idx];
int err;
struct scatterlist *sgl = NULL;
struct nvgpu_page_alloc *alloc = NULL;
struct page_alloc_chunk *chunk = NULL;
u64 length;
/* note: here we need to map kernel to small, since the
* low-level mmu code assumes 0 is small and 1 is big pages */
if (pgsz_idx == gmmu_page_size_kernel)
pgsz_idx = gmmu_page_size_small;
if (space_to_skip & (page_size - 1))
return -EINVAL;
err = map_gmmu_pages(g, &vm->pdb);
if (err) {
nvgpu_err(g,
"couldn't map ptes for update as=%d",
vm_aspace_id(vm));
return err;
}
if (aperture == APERTURE_VIDMEM) {
gk20a_dbg(gpu_dbg_map_v, "vidmem map size_idx=%d, gpu_va=[%llx,%llx], alloc=%llx",
pgsz_idx, gpu_va, gpu_end-1, iova);
if (sgt) {
alloc = get_vidmem_page_alloc(sgt->sgl);
nvgpu_list_for_each_entry(chunk, &alloc->alloc_chunks,
page_alloc_chunk, list_entry) {
if (space_to_skip &&
space_to_skip > chunk->length) {
space_to_skip -= chunk->length;
} else {
iova = chunk->base + space_to_skip;
length = chunk->length - space_to_skip;
length = min(length, map_size);
space_to_skip = 0;
err = update_gmmu_level_locked(vm,
&vm->pdb, pgsz_idx,
&sgl,
&space_to_skip,
&iova,
gpu_va, gpu_va + length,
kind_v, &ctag,
cacheable, unmapped_pte,
rw_flag, sparse, 0, priv,
aperture);
if (err)
break;
/* need to set explicit zero here */
space_to_skip = 0;
gpu_va += length;
map_size -= length;
if (!map_size)
break;
}
}
} else {
err = update_gmmu_level_locked(vm, &vm->pdb, pgsz_idx,
&sgl,
&space_to_skip,
&iova,
gpu_va, gpu_end,
kind_v, &ctag,
cacheable, unmapped_pte, rw_flag,
sparse, 0, priv,
aperture);
}
} else {
gk20a_dbg(gpu_dbg_pte, "size_idx=%d, iova=%llx, buffer offset %lld, nents %d",
pgsz_idx,
sgt ? g->ops.mm.get_iova_addr(vm->mm->g, sgt->sgl, 0)
: 0ULL,
buffer_offset,
sgt ? sgt->nents : 0);
gk20a_dbg(gpu_dbg_map_v, "size_idx=%d, gpu_va=[%llx,%llx], iova=%llx",
pgsz_idx, gpu_va, gpu_end-1, iova);
if (sgt) {
iova = g->ops.mm.get_iova_addr(vm->mm->g, sgt->sgl, 0);
if (!vm->mm->bypass_smmu && iova) {
iova += space_to_skip;
} else {
sgl = sgt->sgl;
gk20a_dbg(gpu_dbg_pte, "chunk address %llx, size %d",
(u64)sg_phys(sgl),
sgl->length);
while (space_to_skip && sgl &&
space_to_skip + page_size > sgl->length) {
space_to_skip -= sgl->length;
sgl = sg_next(sgl);
gk20a_dbg(gpu_dbg_pte, "chunk address %llx, size %d",
(u64)sg_phys(sgl),
sgl->length);
}
iova = sg_phys(sgl) + space_to_skip;
}
}
err = update_gmmu_level_locked(vm, &vm->pdb, pgsz_idx,
&sgl,
&space_to_skip,
&iova,
gpu_va, gpu_end,
kind_v, &ctag,
cacheable, unmapped_pte, rw_flag,
sparse, 0, priv,
aperture);
}
unmap_gmmu_pages(g, &vm->pdb);
smp_mb();
gk20a_dbg_fn("done");
return err;
}
/* NOTE! mapped_buffers lock must be held */
void nvgpu_vm_unmap_locked(struct nvgpu_mapped_buf *mapped_buffer,
struct vm_gk20a_mapping_batch *batch)
@@ -2341,22 +1836,6 @@ void nvgpu_vm_unmap_locked(struct nvgpu_mapped_buf *mapped_buffer,
return;
}
void gk20a_vm_free_entries(struct vm_gk20a *vm,
struct gk20a_mm_entry *parent,
int level)
{
int i;
if (parent->entries)
for (i = 0; i < parent->num_entries; i++)
gk20a_vm_free_entries(vm, &parent->entries[i], level+1);
if (parent->mem.size)
free_gmmu_pages(vm, parent);
nvgpu_vfree(vm->mm->g, parent->entries);
parent->entries = NULL;
}
const struct gk20a_mmu_level gk20a_mm_levels_64k[] = {
{.hi_bit = {NV_GMMU_VA_RANGE-1, NV_GMMU_VA_RANGE-1},
.lo_bit = {26, 26},


@@ -433,17 +433,10 @@ int gk20a_dmabuf_alloc_drvdata(struct dma_buf *dmabuf, struct device *dev);
int gk20a_dmabuf_get_state(struct dma_buf *dmabuf, struct gk20a *g,
u64 offset, struct gk20a_buffer_state **state);
int map_gmmu_pages(struct gk20a *g, struct gk20a_mm_entry *entry);
void unmap_gmmu_pages(struct gk20a *g, struct gk20a_mm_entry *entry);
void pde_range_from_vaddr_range(struct vm_gk20a *vm,
u64 addr_lo, u64 addr_hi,
u32 *pde_lo, u32 *pde_hi);
int gk20a_mm_pde_coverage_bit_count(struct vm_gk20a *vm);
u32 pte_index_from_vaddr(struct vm_gk20a *vm,
u64 addr, enum gmmu_pgsz_gk20a pgsz_idx);
void free_gmmu_pages(struct vm_gk20a *vm,
struct gk20a_mm_entry *entry);
u32 gk20a_mm_get_physical_addr_bits(struct gk20a *g);
struct gpu_ops;


@@ -105,4 +105,7 @@ void nvgpu_gmmu_unmap(struct vm_gk20a *vm,
struct nvgpu_mem *mem,
u64 gpu_va);
void nvgpu_free_gmmu_pages(struct vm_gk20a *vm,
struct gk20a_mm_entry *entry);
#endif