linux-nvgpu/drivers/gpu/nvgpu/gm20b/mm_gm20b.c
commit 2eb6dcb469 (Terje Bergstrom)
gpu: nvgpu: Implement 64k large page support
Implement support for the 64kB large page size. Add an API to create an
address space via IOCTL so that we can accept flags, and assign one
flag for enabling the 64kB large page size.

Also add APIs to set the per-context large page size. This is possible
only on Maxwell, so return an error if the caller tries to set the
large page size on Kepler.

The default large page size is still 128kB.

Change-Id: I20b51c8f6d4a984acae8411ace3de9000c78e82f
Signed-off-by: Terje Bergstrom <tbergstrom@nvidia.com>
2015-03-18 12:11:46 -07:00
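The per-context large page size introduced here is programmed through the mm HAL hook that gm20b_init_mm() fills in at the bottom of this file. The snippet below is a minimal, illustrative sketch of how such a caller might drive that hook when setting up a context's instance block; the wrapper function name, the NULL check, and the use of SZ_128K (from linux/sizes.h) are assumptions for illustration, while g->ops.mm.set_big_page_size and SZ_64K come from the code in this file.

/*
 * Illustrative sketch only, not part of the driver: shows how the
 * per-context hook might be driven from context setup code. The
 * wrapper name and the NULL check are assumptions; the real call
 * sites live in the gk20a channel and address-space code.
 */
#include <linux/sizes.h>
#include "gk20a/gk20a.h"

static void example_set_ctx_big_page_size(struct gk20a *g,
					   void *inst_ptr, bool want_64k)
{
	/* Kepler has no per-context hook; a 64kB request is expected
	 * to be rejected there (the commit returns an error). */
	if (!g->ops.mm.set_big_page_size)
		return;

	/* 64kB when requested, otherwise the 128kB default. */
	g->ops.mm.set_big_page_size(g, inst_ptr,
				    want_64k ? SZ_64K : SZ_128K);
}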


/*
 * GM20B MMU
 *
 * Copyright (c) 2014, NVIDIA CORPORATION. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 */

#include <linux/pm_runtime.h>

#include "gk20a/gk20a.h"
#include "mm_gm20b.h"
#include "hw_gmmu_gm20b.h"
#include "hw_fb_gm20b.h"
#include "hw_gr_gm20b.h"
#include "hw_ram_gm20b.h"
static int allocate_gmmu_ptes_sparse(struct vm_gk20a *vm,
				enum gmmu_pgsz_gk20a pgsz_idx,
				u64 first_vaddr, u64 last_vaddr,
				bool clear, bool refplus)
{
	int err;
	u32 pte_lo, pte_hi;
	u32 pde_lo, pde_hi;
	u32 pte_w[2] = {0, 0}; /* invalid pte */
	u64 addr = 0;
	u32 pte_cur;
	void *pte_kv_cur;
	struct page_table_gk20a *pte;

	gk20a_dbg_fn("");

	pde_range_from_vaddr_range(vm, first_vaddr, last_vaddr,
				   &pde_lo, &pde_hi);

	gk20a_dbg(gpu_dbg_pte, "size_idx=%d, pde_lo=%d, pde_hi=%d",
			pgsz_idx, pde_lo, pde_hi);

	/* Expect ptes of the same pde */
	BUG_ON(pde_lo != pde_hi);

	pte = vm->pdes.ptes[pgsz_idx] + pde_lo;
	if (refplus)
		pte->ref_cnt++;

	pte_lo = pte_index_from_vaddr(vm, first_vaddr, pgsz_idx);
	pte_hi = pte_index_from_vaddr(vm, last_vaddr, pgsz_idx);

	/* get cpu access to the ptes */
	err = map_gmmu_pages(pte->ref, pte->sgt, &pte_kv_cur, pte->size);
	if (err)
		goto fail;

	gk20a_dbg(gpu_dbg_pte, "pte_lo=%d, pte_hi=%d", pte_lo, pte_hi);
	for (pte_cur = pte_lo; pte_cur <= pte_hi; pte_cur++) {
		pte_w[0] = gmmu_pte_valid_false_f();
		pte_w[1] = clear ? 0 : gmmu_pte_vol_true_f();

		gk20a_dbg(gpu_dbg_pte,
			  "pte_cur=%d addr=%llx refs=%d"
			  " [0x%08x,0x%08x]",
			  pte_cur, addr,
			  pte->ref_cnt, pte_w[1], pte_w[0]);

		gk20a_mem_wr32(pte_kv_cur + pte_cur*8, 0, pte_w[0]);
		gk20a_mem_wr32(pte_kv_cur + pte_cur*8, 1, pte_w[1]);
	}

	unmap_gmmu_pages(pte->ref, pte->sgt, pte_kv_cur);

	smp_mb();
	vm->tlb_dirty = true;
	gk20a_dbg_fn("set tlb dirty");

	return 0;
fail:
	return err;
}
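
/* Return true when the PDE with index 'pde' lies entirely inside
 * [vaddr_lo, vaddr_hi]. */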
static bool gm20b_vm_is_pde_in_range(struct vm_gk20a *vm, u64 vaddr_lo,
				     u64 vaddr_hi, u32 pde)
{
	u64 pde_vaddr_lo, pde_vaddr_hi;

	gk20a_dbg_fn("");

	pde_vaddr_lo = (u64)pde << vm->pde_stride_shift;
	pde_vaddr_hi = pde_vaddr_lo |
			((0x1UL << (vm->pde_stride_shift)) - 1);

	return ((vaddr_lo <= pde_vaddr_lo) && (vaddr_hi >= pde_vaddr_hi));
}
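
/*
 * Mark 'num_pages' pages of size 'pgsz_idx' starting at 'vaddr' as sparse.
 * PDEs fully covered by the range have all of their PTEs written; for the
 * partially covered leading and trailing PDEs only the PTEs inside the
 * range are touched. Ends with an L2 flush so the updates reach memory.
 */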
static int gm20b_vm_put_sparse(struct vm_gk20a *vm, u64 vaddr,
			       u32 num_pages, u32 pgsz_idx, bool refplus)
{
	struct mm_gk20a *mm = vm->mm;
	u32 pgsz = vm->gmmu_page_sizes[pgsz_idx];
	u32 pde_shift = vm->pde_stride_shift;
	u64 vaddr_hi;
	u64 vaddr_pde_start;
	u32 i;
	u32 pde_lo, pde_hi;
	int err;

	gk20a_dbg_fn("");

	vaddr_hi = vaddr + pgsz * num_pages - 1;
	pde_range_from_vaddr_range(vm,
				   vaddr,
				   vaddr_hi,
				   &pde_lo, &pde_hi);

	gk20a_dbg_info("vaddr: 0x%llx, vaddr_hi: 0x%llx, pde_lo: 0x%x, "
			"pde_hi: 0x%x, pgsz: %d, pde_stride_shift: %d",
			vaddr, vaddr_hi, pde_lo, pde_hi, pgsz,
			vm->pde_stride_shift);

	for (i = pde_lo; i <= pde_hi; i++) {
		/* Mark all ptes as sparse. */
		err = validate_gmmu_page_table_gk20a_locked(vm, i,
							    pgsz_idx);
		if (err) {
			gk20a_err(dev_from_vm(vm),
				"failed to validate page table %d: %d",
				i, err);
			goto fail;
		}

		if (gm20b_vm_is_pde_in_range(vm, vaddr, vaddr_hi, i)) {
			/* entire pde is marked as sparse */
			vaddr_pde_start = (u64)i << pde_shift;
			allocate_gmmu_ptes_sparse(vm, pgsz_idx,
				vaddr_pde_start,
				PDE_ADDR_END(vaddr_pde_start,
					pde_shift), false, refplus);
		} else {
			/* Handle leading and trailing ranges that do not
			 * cover an entire pde. */
			if (pde_lo == pde_hi)
				allocate_gmmu_ptes_sparse(vm, pgsz_idx, vaddr,
					vaddr_hi, false, refplus);
			else if (i == pde_lo)
				allocate_gmmu_ptes_sparse(vm, pgsz_idx, vaddr,
					PDE_ADDR_END(vaddr, pde_shift), false,
					refplus);
			else
				allocate_gmmu_ptes_sparse(vm, pgsz_idx,
					PDE_ADDR_START(vaddr_hi, pde_shift),
					vaddr_hi, false,
					refplus);
		}
	}

	gk20a_mm_l2_flush(mm->g, true);

	return 0;
fail:
	WARN_ON(1);
	return err;
}
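
/*
 * Poll FB_MMU_VPR_INFO until the fetch bit deasserts. On silicon the
 * wait is bounded by a jiffies timeout; on pre-silicon platforms a
 * simple loop counter is used instead. Returns -ETIME on timeout.
 */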
static int gm20b_mm_mmu_vpr_info_fetch_wait(struct gk20a *g,
					    const unsigned int msec)
{
	unsigned long timeout;

	if (tegra_platform_is_silicon())
		timeout = jiffies + msecs_to_jiffies(msec);
	else
		timeout = msecs_to_jiffies(msec);

	while (1) {
		u32 val;

		val = gk20a_readl(g, fb_mmu_vpr_info_r());
		if (fb_mmu_vpr_info_fetch_v(val) ==
		    fb_mmu_vpr_info_fetch_false_v())
			break;

		if (tegra_platform_is_silicon()) {
			if (WARN_ON(time_after(jiffies, timeout)))
				return -ETIME;
		} else if (--timeout == 0)
			return -ETIME;
	}

	return 0;
}
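
/*
 * Kick off a VPR info fetch and wait for it to complete. Returns 0
 * without touching the hardware when runtime PM reports the GPU is not
 * powered, and -ETIME if either wait times out.
 */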
int gm20b_mm_mmu_vpr_info_fetch(struct gk20a *g)
{
	int ret = 0;

	gk20a_busy_noresume(g->dev);
#ifdef CONFIG_PM_RUNTIME
	if (!pm_runtime_active(&g->dev->dev))
		goto fail;
#endif

	if (gm20b_mm_mmu_vpr_info_fetch_wait(g, VPR_INFO_FETCH_WAIT)) {
		ret = -ETIME;
		goto fail;
	}

	gk20a_writel(g, fb_mmu_vpr_info_r(),
			fb_mmu_vpr_info_fetch_true_v());
	ret = gm20b_mm_mmu_vpr_info_fetch_wait(g, VPR_INFO_FETCH_WAIT);

fail:
	pm_runtime_put(&g->dev->dev);
	return ret;
}
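
/*
 * Drop the sparse mappings set up by gm20b_vm_put_sparse for
 * [vaddr, vaddr + size): release one reference on each covered page
 * table and, once a table's count reaches zero, free its pages and
 * update the corresponding PDE.
 */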
void gm20b_vm_clear_sparse(struct vm_gk20a *vm, u64 vaddr,
			   u64 size, u32 pgsz_idx)
{
	u64 vaddr_hi;
	u32 pde_lo, pde_hi, pde_i;

	gk20a_dbg_fn("");

	vaddr_hi = vaddr + size - 1;
	pde_range_from_vaddr_range(vm,
				   vaddr,
				   vaddr_hi,
				   &pde_lo, &pde_hi);

	gk20a_dbg_info("vaddr: 0x%llx, vaddr_hi: 0x%llx, pde_lo: 0x%x, "
			"pde_hi: 0x%x, pgsz_idx: %d, pde_stride_shift: %d",
			vaddr, vaddr_hi, pde_lo, pde_hi, pgsz_idx,
			vm->pde_stride_shift);

	for (pde_i = pde_lo; pde_i <= pde_hi; pde_i++) {
		struct page_table_gk20a *pte = vm->pdes.ptes[pgsz_idx] + pde_i;

		pte->ref_cnt--;
		if (pte->ref_cnt == 0) {
			free_gmmu_pages(vm, pte->ref, pte->sgt,
				vm->page_table_sizing[pgsz_idx].order,
				pte->size);
			pte->ref = NULL;
			update_gmmu_pde_locked(vm, pde_i);
		}
	}
}
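
/* Report whether GPC MMU debug mode is currently enabled in hardware. */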
bool gm20b_mm_mmu_debug_mode_enabled(struct gk20a *g)
{
	u32 debug_ctrl = gk20a_readl(g, gr_gpcs_pri_mmu_debug_ctrl_r());

	return gr_gpcs_pri_mmu_debug_ctrl_debug_v(debug_ctrl) ==
		gr_gpcs_pri_mmu_debug_ctrl_debug_enabled_v();
}
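
/*
 * Program the big page size field of a context's instance block:
 * 64kB when 'size' is SZ_64K, otherwise the 128kB default.
 */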
void gm20b_mm_set_big_page_size(struct gk20a *g, void *inst_ptr, int size)
{
	u32 val;

	gk20a_dbg_fn("");

	gk20a_dbg_info("big page size %d\n", size);
	val = gk20a_mem_rd32(inst_ptr, ram_in_big_page_size_w());
	val &= ~ram_in_big_page_size_m();

	if (size == SZ_64K)
		val |= ram_in_big_page_size_64kb_f();
	else
		val |= ram_in_big_page_size_128kb_f();

	gk20a_mem_wr32(inst_ptr, ram_in_big_page_size_w(), val);
	gk20a_dbg_fn("done");
}
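
/* Hook up the GM20B mm HAL: the sparse, debug-mode and big page size
 * entries are GM20B-specific, the remaining entries reuse GK20A code. */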
void gm20b_init_mm(struct gpu_ops *gops)
{
	gops->mm.set_sparse = gm20b_vm_put_sparse;
	gops->mm.clear_sparse = gm20b_vm_clear_sparse;
	gops->mm.is_debug_mode_enabled = gm20b_mm_mmu_debug_mode_enabled;
	gops->mm.gmmu_map = gk20a_locked_gmmu_map;
	gops->mm.gmmu_unmap = gk20a_locked_gmmu_unmap;
	gops->mm.vm_remove = gk20a_vm_remove_support;
	gops->mm.vm_alloc_share = gk20a_vm_alloc_share;
	gops->mm.vm_bind_channel = gk20a_vm_bind_channel;
	gops->mm.fb_flush = gk20a_mm_fb_flush;
	gops->mm.l2_invalidate = gk20a_mm_l2_invalidate;
	gops->mm.l2_flush = gk20a_mm_l2_flush;
	gops->mm.tlb_invalidate = gk20a_mm_tlb_invalidate;
	gops->mm.set_big_page_size = gm20b_mm_set_big_page_size;
}