Files
linux-nvgpu/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
Alex Waterman d630f1d99f gpu: nvgpu: Unify the small and large page address spaces
The basic structure of this patch is to turn the small page allocator
and the large page allocator into pointers (where they used to be
embedded structs) and then point both at the same underlying allocator,
since the buddy allocator has supported mixed page sizes since its
inception.

For the rest of the driver, some changes had to be made to actually
support mixed page sizes in a single address space.

1. Unifying the allocation page size determination

   Since the allocation and map operations happen at distinct
   times, both the mapping and the allocation of GVA space must
   agree on page size. This is because the allocator has to keep
   allocations of different page sizes in separate PDEs, to avoid
   having to support mixed page sizes within a single PDE.

   To this end, a function __get_pte_size() was introduced that is
   used by both the buddy allocator (balloc) code and the core GPU
   MM code. It determines the page size based only on the length
   of the mapping/allocation (see the sketch at the end of this
   description).

2. Fixed address allocation + page size

   Similar to regular mappings/GVA allocations, page size
   determination for fixed address mappings had to be modified.
   In the past the address of the mapping determined the page
   size, since the address space was split by address (low
   addresses used small pages, high addresses large pages). Since
   that is no longer the case, the page size field in the reserve
   memory ioctl is now honored by the mapping code. When, for
   instance, CUDA makes a memory reservation it specifies small
   or large pages. When CUDA then requests mappings within that
   address range, the page size is looked up in the reserved
   memory struct.

   Fixed address reservations were also modified to always
   allocate at PDE granularity (64M or 128M, depending on the
   large page size). This prevents non-fixed allocations from
   ending up in the same PDE and causing kernel panics or GMMU
   faults.

3. The rest...

   The rest of the changes are just by-products of the above.
   Lots of places required minor updates to use a pointer to
   the GVA allocator struct instead of the struct itself.

Lastly, this change is not truly complete. More work remains to be done
to fully remove the notion that there ever were separate address spaces
for different page sizes. What remains after this patch is cleanup and
proper documentation.
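
As an illustration of point 1, below is a minimal sketch of purely
length-based page size selection. It is not the actual __get_pte_size()
implementation; it assumes a vm->big_page_size field and that length
alignment is the only criterion:

   static enum gmmu_pgsz_gk20a pte_size_sketch(struct vm_gk20a *vm,
                                               u64 size)
   {
           /* Hypothetical rule: big pages only on VMs that support
            * them and only for big-page-aligned lengths. */
           if (!vm->big_pages || (size & (vm->big_page_size - 1)))
                   return gmmu_page_size_small;
           return gmmu_page_size_big;
   }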

Bug 1396644
Bug 1729947

Change-Id: If51ab396a37ba16c69e434adb47edeef083dce57
Signed-off-by: Alex Waterman <alexw@nvidia.com>
Reviewed-on: http://git-master/r/1265300
GVS: Gerrit_Virtual_Submit
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
2017-01-31 16:23:07 -08:00

/*
* GK20A memory management
*
* Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <linux/delay.h>
#include <linux/highmem.h>
#include <linux/log2.h>
#include <linux/nvhost.h>
#include <linux/pm_runtime.h>
#include <linux/scatterlist.h>
#include <linux/nvmap.h>
#include <soc/tegra/chip-id.h>
#include <linux/vmalloc.h>
#include <linux/dma-buf.h>
#include <linux/lcm.h>
#include <linux/fdtable.h>
#include <uapi/linux/nvgpu.h>
#include <trace/events/gk20a.h>
#include <nvgpu/timers.h>
#include <nvgpu/allocator.h>
#include <nvgpu/page_allocator.h>
#include "gk20a.h"
#include "mm_gk20a.h"
#include "fence_gk20a.h"
#include "kind_gk20a.h"
#include "semaphore_gk20a.h"
#include <nvgpu/hw/gk20a/hw_gmmu_gk20a.h>
#include <nvgpu/hw/gk20a/hw_fb_gk20a.h>
#include <nvgpu/hw/gk20a/hw_bus_gk20a.h>
#include <nvgpu/hw/gk20a/hw_ram_gk20a.h>
#include <nvgpu/hw/gk20a/hw_pram_gk20a.h>
#include <nvgpu/hw/gk20a/hw_mc_gk20a.h>
#include <nvgpu/hw/gk20a/hw_flush_gk20a.h>
#include <nvgpu/hw/gk20a/hw_ltc_gk20a.h>
/*
* Flip this to force all gk20a_mem* accesses via PRAMIN from the start of the
boot, even for buffers that would work via cpu_va. At runtime, the flag is
* in debugfs, called "force_pramin".
*/
#define GK20A_FORCE_PRAMIN_DEFAULT false
#if defined(CONFIG_GK20A_VIDMEM)
static void gk20a_vidmem_clear_mem_worker(struct work_struct *work);
#endif
static inline void
set_vidmem_page_alloc(struct scatterlist *sgl, u64 addr)
{
/* set bit 0 to indicate vidmem allocation */
sg_dma_address(sgl) = (addr | 1ULL);
}
static inline bool
is_vidmem_page_alloc(u64 addr)
{
return !!(addr & 1ULL);
}
static inline struct nvgpu_page_alloc *
get_vidmem_page_alloc(struct scatterlist *sgl)
{
u64 addr;
addr = sg_dma_address(sgl);
if (is_vidmem_page_alloc(addr))
addr = addr & ~1ULL;
else
WARN_ON(1);
return (struct nvgpu_page_alloc *)(uintptr_t)addr;
}
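/*
* Set up CPU access to a sysmem mem_desc by vmapping its pages
* write-combined into mem->cpu_va. Vidmem buffers (and all buffers when
* force_pramin is set) are accessed through PRAMIN instead, so this is a
* no-op for them. Must be balanced with gk20a_mem_end().
*/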
int gk20a_mem_begin(struct gk20a *g, struct mem_desc *mem)
{
void *cpu_va;
if (mem->aperture != APERTURE_SYSMEM || g->mm.force_pramin)
return 0;
if (WARN_ON(mem->cpu_va)) {
gk20a_warn(dev_from_gk20a(g), "nested %s", __func__);
return -EBUSY;
}
cpu_va = vmap(mem->pages,
PAGE_ALIGN(mem->size) >> PAGE_SHIFT,
0, pgprot_writecombine(PAGE_KERNEL));
if (WARN_ON(!cpu_va))
return -ENOMEM;
mem->cpu_va = cpu_va;
return 0;
}
void gk20a_mem_end(struct gk20a *g, struct mem_desc *mem)
{
if (mem->aperture != APERTURE_SYSMEM || g->mm.force_pramin)
return;
vunmap(mem->cpu_va);
mem->cpu_va = NULL;
}
/* WARNING: returns pramin_window_lock taken, complement with pramin_exit() */
static u32 gk20a_pramin_enter(struct gk20a *g, struct mem_desc *mem,
struct page_alloc_chunk *chunk, u32 w)
{
u64 bufbase = chunk->base;
u64 addr = bufbase + w * sizeof(u32);
u32 hi = (u32)((addr & ~(u64)0xfffff)
>> bus_bar0_window_target_bar0_window_base_shift_v());
u32 lo = (u32)(addr & 0xfffff);
u32 win = gk20a_aperture_mask(g, mem,
bus_bar0_window_target_sys_mem_noncoherent_f(),
bus_bar0_window_target_vid_mem_f()) |
bus_bar0_window_base_f(hi);
gk20a_dbg(gpu_dbg_mem,
"0x%08x:%08x begin for %p,%p at [%llx,%llx] (sz %llx)",
hi, lo, mem, chunk, bufbase,
bufbase + chunk->length, chunk->length);
WARN_ON(!bufbase);
spin_lock(&g->mm.pramin_window_lock);
if (g->mm.pramin_window != win) {
gk20a_writel(g, bus_bar0_window_r(), win);
gk20a_readl(g, bus_bar0_window_r());
g->mm.pramin_window = win;
}
return lo;
}
static void gk20a_pramin_exit(struct gk20a *g, struct mem_desc *mem,
struct page_alloc_chunk *chunk)
{
gk20a_dbg(gpu_dbg_mem, "end for %p,%p", mem, chunk);
spin_unlock(&g->mm.pramin_window_lock);
}
/*
* Batch innerloop for the function below once per each PRAMIN range (some
* 4B..1MB at a time). "start" reg goes as-is to gk20a_{readl,writel}.
*/
typedef void (*pramin_access_batch_fn)(struct gk20a *g, u32 start, u32 words,
u32 **arg);
/*
* The PRAMIN range is 1 MB, must change base addr if a buffer crosses that.
* This same loop is used for read/write/memset. Offset and size in bytes.
* One call to "loop" is done per range, with "arg" supplied.
*/
static inline void pramin_access_batched(struct gk20a *g, struct mem_desc *mem,
u32 offset, u32 size, pramin_access_batch_fn loop, u32 **arg)
{
struct nvgpu_page_alloc *alloc = NULL;
struct page_alloc_chunk *chunk = NULL;
u32 byteoff, start_reg, until_end, n;
alloc = get_vidmem_page_alloc(mem->sgt->sgl);
list_for_each_entry(chunk, &alloc->alloc_chunks, list_entry) {
if (offset >= chunk->length)
offset -= chunk->length;
else
break;
}
offset /= sizeof(u32);
while (size) {
byteoff = gk20a_pramin_enter(g, mem, chunk, offset);
start_reg = pram_data032_r(byteoff / sizeof(u32));
until_end = SZ_1M - (byteoff & (SZ_1M - 1));
n = min3(size, until_end, (u32)(chunk->length - offset));
loop(g, start_reg, n / sizeof(u32), arg);
/* read back to synchronize accesses */
gk20a_readl(g, start_reg);
gk20a_pramin_exit(g, mem, chunk);
size -= n;
if (n == (chunk->length - offset)) {
chunk = list_next_entry(chunk, list_entry);
offset = 0;
} else {
offset += n / sizeof(u32);
}
}
}
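/*
* The three batch callbacks below implement read, write and fill of one
* contiguous PRAMIN range; pramin_access_batched() above walks a buffer's
* chunks and invokes one of them once per range.
*/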
static inline void pramin_access_batch_rd_n(struct gk20a *g, u32 start,
u32 words, u32 **arg)
{
u32 r = start, *dest_u32 = *arg;
if (!g->regs) {
__gk20a_warn_on_no_regs();
return;
}
while (words--) {
*dest_u32++ = gk20a_readl(g, r);
r += sizeof(u32);
}
*arg = dest_u32;
}
static inline void pramin_access_batch_wr_n(struct gk20a *g, u32 start,
u32 words, u32 **arg)
{
u32 r = start, *src_u32 = *arg;
if (!g->regs) {
__gk20a_warn_on_no_regs();
return;
}
while (words--) {
writel_relaxed(*src_u32++, g->regs + r);
r += sizeof(u32);
}
*arg = src_u32;
}
static inline void pramin_access_batch_set(struct gk20a *g, u32 start,
u32 words, u32 **arg)
{
u32 r = start, repeat = **arg;
if (!g->regs) {
__gk20a_warn_on_no_regs();
return;
}
while (words--) {
writel_relaxed(repeat, g->regs + r);
r += sizeof(u32);
}
}
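/*
* Word/block accessors for mem_desc contents: sysmem buffers are accessed
* through mem->cpu_va, while vidmem buffers (and all buffers when
* force_pramin is set) go through batched PRAMIN accesses.
*/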
u32 gk20a_mem_rd32(struct gk20a *g, struct mem_desc *mem, u32 w)
{
u32 data = 0;
if (mem->aperture == APERTURE_SYSMEM && !g->mm.force_pramin) {
u32 *ptr = mem->cpu_va;
WARN_ON(!ptr);
data = ptr[w];
#ifdef CONFIG_TEGRA_SIMULATION_PLATFORM
gk20a_dbg(gpu_dbg_mem, " %p = 0x%x", ptr + w, data);
#endif
} else if (mem->aperture == APERTURE_VIDMEM || g->mm.force_pramin) {
u32 value;
u32 *p = &value;
pramin_access_batched(g, mem, w * sizeof(u32), sizeof(u32),
pramin_access_batch_rd_n, &p);
data = value;
} else {
WARN_ON("Accessing unallocated mem_desc");
}
return data;
}
u32 gk20a_mem_rd(struct gk20a *g, struct mem_desc *mem, u32 offset)
{
WARN_ON(offset & 3);
return gk20a_mem_rd32(g, mem, offset / sizeof(u32));
}
void gk20a_mem_rd_n(struct gk20a *g, struct mem_desc *mem,
u32 offset, void *dest, u32 size)
{
WARN_ON(offset & 3);
WARN_ON(size & 3);
if (mem->aperture == APERTURE_SYSMEM && !g->mm.force_pramin) {
u8 *src = (u8 *)mem->cpu_va + offset;
WARN_ON(!mem->cpu_va);
memcpy(dest, src, size);
#ifdef CONFIG_TEGRA_SIMULATION_PLATFORM
if (size)
gk20a_dbg(gpu_dbg_mem, " %p = 0x%x ... [%d bytes]",
src, *dest, size);
#endif
} else if (mem->aperture == APERTURE_VIDMEM || g->mm.force_pramin) {
u32 *dest_u32 = dest;
pramin_access_batched(g, mem, offset, size,
pramin_access_batch_rd_n, &dest_u32);
} else {
WARN_ON("Accessing unallocated mem_desc");
}
}
void gk20a_mem_wr32(struct gk20a *g, struct mem_desc *mem, u32 w, u32 data)
{
if (mem->aperture == APERTURE_SYSMEM && !g->mm.force_pramin) {
u32 *ptr = mem->cpu_va;
WARN_ON(!ptr);
#ifdef CONFIG_TEGRA_SIMULATION_PLATFORM
gk20a_dbg(gpu_dbg_mem, " %p = 0x%x", ptr + w, data);
#endif
ptr[w] = data;
} else if (mem->aperture == APERTURE_VIDMEM || g->mm.force_pramin) {
u32 value = data;
u32 *p = &value;
pramin_access_batched(g, mem, w * sizeof(u32), sizeof(u32),
pramin_access_batch_wr_n, &p);
if (!mem->skip_wmb)
wmb();
} else {
WARN_ON("Accessing unallocated mem_desc");
}
}
void gk20a_mem_wr(struct gk20a *g, struct mem_desc *mem, u32 offset, u32 data)
{
WARN_ON(offset & 3);
gk20a_mem_wr32(g, mem, offset / sizeof(u32), data);
}
void gk20a_mem_wr_n(struct gk20a *g, struct mem_desc *mem, u32 offset,
void *src, u32 size)
{
WARN_ON(offset & 3);
WARN_ON(size & 3);
if (mem->aperture == APERTURE_SYSMEM && !g->mm.force_pramin) {
u8 *dest = (u8 *)mem->cpu_va + offset;
WARN_ON(!mem->cpu_va);
#ifdef CONFIG_TEGRA_SIMULATION_PLATFORM
if (size)
gk20a_dbg(gpu_dbg_mem, " %p = 0x%x ... [%d bytes]",
dest, *src, size);
#endif
memcpy(dest, src, size);
} else if (mem->aperture == APERTURE_VIDMEM || g->mm.force_pramin) {
u32 *src_u32 = src;
pramin_access_batched(g, mem, offset, size,
pramin_access_batch_wr_n, &src_u32);
if (!mem->skip_wmb)
wmb();
} else {
WARN_ON("Accessing unallocated mem_desc");
}
}
void gk20a_memset(struct gk20a *g, struct mem_desc *mem, u32 offset,
u32 c, u32 size)
{
WARN_ON(offset & 3);
WARN_ON(size & 3);
WARN_ON(c & ~0xff);
c &= 0xff;
if (mem->aperture == APERTURE_SYSMEM && !g->mm.force_pramin) {
u8 *dest = (u8 *)mem->cpu_va + offset;
WARN_ON(!mem->cpu_va);
#ifdef CONFIG_TEGRA_SIMULATION_PLATFORM
if (size)
gk20a_dbg(gpu_dbg_mem, " %p = 0x%x [times %d]",
dest, c, size);
#endif
memset(dest, c, size);
} else if (mem->aperture == APERTURE_VIDMEM || g->mm.force_pramin) {
u32 repeat_value = c | (c << 8) | (c << 16) | (c << 24);
u32 *p = &repeat_value;
pramin_access_batched(g, mem, offset, size,
pramin_access_batch_set, &p);
if (!mem->skip_wmb)
wmb();
} else {
WARN_ON("Accessing unallocated mem_desc");
}
}
/*
* GPU mapping life cycle
* ======================
*
* Kernel mappings
* ---------------
*
* Kernel mappings are created through vm.map(..., false):
*
* - Mappings to the same allocations are reused and refcounted.
* - This path does not support deferred unmapping (i.e. kernel must wait for
* all hw operations on the buffer to complete before unmapping).
* - References to dmabuf are owned and managed by the (kernel) clients of
* the gk20a_vm layer.
*
*
* User space mappings
* -------------------
*
* User space mappings are created through as.map_buffer -> vm.map(..., true):
*
* - Mappings to the same allocations are reused and refcounted.
* - This path supports deferred unmapping (i.e. we delay the actual unmapping
* until all hw operations have completed).
* - References to dmabuf are owned and managed by the vm_gk20a
* layer itself. vm.map acquires these refs, and sets
* mapped_buffer->own_mem_ref to record that we must release the refs when we
* actually unmap.
*
*/
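/*
* Illustrative sketch only (not part of the driver): a kernel-internal
* mapping of a dmabuf goes through gk20a_vm_map(), defined later in this
* file, with user_mapped = false, roughly:
*
*     u64 gpu_va = gk20a_vm_map(vm, dmabuf, 0, 0, kind, NULL,
*                               false, rw_flag, 0, 0, NULL);
*
* where offset_align = 0 lets the VM pick the GPU VA, and the sgt and
* batch arguments are optional. The user space path reaches the same
* function with user_mapped = true from the as.map_buffer ioctl handling.
*/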
static inline int vm_aspace_id(struct vm_gk20a *vm)
{
/* -1 is bar1 or pmu, etc. */
return vm->as_share ? vm->as_share->id : -1;
}
static inline u32 hi32(u64 f)
{
return (u32)(f >> 32);
}
static inline u32 lo32(u64 f)
{
return (u32)(f & 0xffffffff);
}
static struct mapped_buffer_node *find_mapped_buffer_locked(
struct rb_root *root, u64 addr);
static struct mapped_buffer_node *find_mapped_buffer_reverse_locked(
struct rb_root *root, struct dma_buf *dmabuf,
u32 kind);
static int update_gmmu_ptes_locked(struct vm_gk20a *vm,
enum gmmu_pgsz_gk20a pgsz_idx,
struct sg_table *sgt, u64 buffer_offset,
u64 first_vaddr, u64 last_vaddr,
u8 kind_v, u32 ctag_offset, bool cacheable,
bool umapped_pte, int rw_flag,
bool sparse,
bool priv,
enum gk20a_aperture aperture);
static int __must_check gk20a_init_system_vm(struct mm_gk20a *mm);
static int __must_check gk20a_init_bar1_vm(struct mm_gk20a *mm);
static int __must_check gk20a_init_hwpm(struct mm_gk20a *mm);
static int __must_check gk20a_init_cde_vm(struct mm_gk20a *mm);
static int __must_check gk20a_init_ce_vm(struct mm_gk20a *mm);
static struct gk20a *gk20a_vidmem_buf_owner(struct dma_buf *dmabuf);
struct gk20a_dmabuf_priv {
struct mutex lock;
struct gk20a_comptag_allocator *comptag_allocator;
struct gk20a_comptags comptags;
struct dma_buf_attachment *attach;
struct sg_table *sgt;
int pin_count;
struct list_head states;
u64 buffer_id;
};
struct gk20a_vidmem_buf {
struct gk20a *g;
struct mem_desc *mem;
struct dma_buf *dmabuf;
void *dmabuf_priv;
void (*dmabuf_priv_delete)(void *);
};
static void gk20a_vm_remove_support_nofree(struct vm_gk20a *vm);
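/*
* Comptag lines are handed out from a simple bitmap allocator. Comptag
* line 0 is reserved, so bitmap index 0 corresponds to comptag line 1
* (hence the +1/-1 offset adjustments below).
*/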
static int gk20a_comptaglines_alloc(struct gk20a_comptag_allocator *allocator,
u32 *offset, u32 len)
{
unsigned long addr;
int err = 0;
mutex_lock(&allocator->lock);
addr = bitmap_find_next_zero_area(allocator->bitmap, allocator->size,
0, len, 0);
if (addr < allocator->size) {
/* number zero is reserved; bitmap base is 1 */
*offset = 1 + addr;
bitmap_set(allocator->bitmap, addr, len);
} else {
err = -ENOMEM;
}
mutex_unlock(&allocator->lock);
return err;
}
static void gk20a_comptaglines_free(struct gk20a_comptag_allocator *allocator,
u32 offset, u32 len)
{
/* number zero is reserved; bitmap base is 1 */
u32 addr = offset - 1;
WARN_ON(offset == 0);
WARN_ON(addr > allocator->size);
WARN_ON(addr + len > allocator->size);
mutex_lock(&allocator->lock);
bitmap_clear(allocator->bitmap, addr, len);
mutex_unlock(&allocator->lock);
}
static void gk20a_mm_delete_priv(void *_priv)
{
struct gk20a_buffer_state *s, *s_tmp;
struct gk20a_dmabuf_priv *priv = _priv;
if (!priv)
return;
if (priv->comptags.lines) {
BUG_ON(!priv->comptag_allocator);
gk20a_comptaglines_free(priv->comptag_allocator,
priv->comptags.offset,
priv->comptags.allocated_lines);
}
/* Free buffer states */
list_for_each_entry_safe(s, s_tmp, &priv->states, list) {
gk20a_fence_put(s->fence);
list_del(&s->list);
kfree(s);
}
kfree(priv);
}
struct sg_table *gk20a_mm_pin(struct device *dev, struct dma_buf *dmabuf)
{
struct gk20a_dmabuf_priv *priv;
priv = dma_buf_get_drvdata(dmabuf, dev);
if (WARN_ON(!priv))
return ERR_PTR(-EINVAL);
mutex_lock(&priv->lock);
if (priv->pin_count == 0) {
priv->attach = dma_buf_attach(dmabuf, dev);
if (IS_ERR(priv->attach)) {
mutex_unlock(&priv->lock);
return (struct sg_table *)priv->attach;
}
priv->sgt = dma_buf_map_attachment(priv->attach,
DMA_BIDIRECTIONAL);
if (IS_ERR(priv->sgt)) {
dma_buf_detach(dmabuf, priv->attach);
mutex_unlock(&priv->lock);
return priv->sgt;
}
}
priv->pin_count++;
mutex_unlock(&priv->lock);
return priv->sgt;
}
void gk20a_mm_unpin(struct device *dev, struct dma_buf *dmabuf,
struct sg_table *sgt)
{
struct gk20a_dmabuf_priv *priv = dma_buf_get_drvdata(dmabuf, dev);
dma_addr_t dma_addr;
if (IS_ERR(priv) || !priv)
return;
mutex_lock(&priv->lock);
WARN_ON(priv->sgt != sgt);
priv->pin_count--;
WARN_ON(priv->pin_count < 0);
dma_addr = sg_dma_address(priv->sgt->sgl);
if (priv->pin_count == 0) {
dma_buf_unmap_attachment(priv->attach, priv->sgt,
DMA_BIDIRECTIONAL);
dma_buf_detach(dmabuf, priv->attach);
}
mutex_unlock(&priv->lock);
}
void gk20a_get_comptags(struct device *dev, struct dma_buf *dmabuf,
struct gk20a_comptags *comptags)
{
struct gk20a_dmabuf_priv *priv = dma_buf_get_drvdata(dmabuf, dev);
if (!comptags)
return;
if (!priv) {
memset(comptags, 0, sizeof(*comptags));
return;
}
*comptags = priv->comptags;
}
static int gk20a_alloc_comptags(struct gk20a *g,
struct device *dev,
struct dma_buf *dmabuf,
struct gk20a_comptag_allocator *allocator,
u32 lines, bool user_mappable,
u64 *ctag_map_win_size,
u32 *ctag_map_win_ctagline)
{
struct gk20a_dmabuf_priv *priv = dma_buf_get_drvdata(dmabuf, dev);
u32 ctaglines_allocsize;
u32 ctagline_align;
u32 offset;
u32 alignment_lines;
const u32 aggregate_cacheline_sz =
g->gr.cacheline_size * g->gr.slices_per_ltc *
g->ltc_count;
const u32 small_pgsz = 4096;
int err;
if (!priv)
return -ENOSYS;
if (!lines)
return -EINVAL;
if (!user_mappable) {
ctaglines_allocsize = lines;
ctagline_align = 1;
} else {
/*
* For security, align the allocation on a page, and reserve
* whole pages. Unfortunately, we cannot ask the allocator to
* align here, since compbits per cacheline is not always a
* power of two. So, we just have to allocate enough extra that
* we're guaranteed to find a ctagline inside the allocation so
* that: 1) it is the first ctagline in a cacheline that starts
* at a page boundary, and 2) we can add enough overallocation
* that the ctaglines of the succeeding allocation are on
* different page than ours.
*/
ctagline_align =
(lcm(aggregate_cacheline_sz, small_pgsz) /
aggregate_cacheline_sz) *
g->gr.comptags_per_cacheline;
ctaglines_allocsize =
/* for alignment */
ctagline_align +
/* lines rounded up to cachelines */
DIV_ROUND_UP(lines, g->gr.comptags_per_cacheline) *
g->gr.comptags_per_cacheline +
/* trail-padding */
DIV_ROUND_UP(aggregate_cacheline_sz, small_pgsz) *
g->gr.comptags_per_cacheline;
if (ctaglines_allocsize < lines)
return -EINVAL; /* integer overflow */
}
/* store the allocator so we can use it when we free the ctags */
priv->comptag_allocator = allocator;
err = gk20a_comptaglines_alloc(allocator, &offset,
ctaglines_allocsize);
if (err)
return err;
/*
* offset needs to be at the start of a page/cacheline boundary;
* prune the preceding ctaglines that were allocated for alignment.
*/
alignment_lines =
DIV_ROUND_UP(offset, ctagline_align) * ctagline_align - offset;
if (alignment_lines) {
gk20a_comptaglines_free(allocator, offset, alignment_lines);
offset += alignment_lines;
ctaglines_allocsize -= alignment_lines;
}
/*
* check if we can prune the trailing, too; we just need to reserve
* whole pages and ctagcachelines.
*/
if (user_mappable) {
u32 needed_cachelines =
DIV_ROUND_UP(lines, g->gr.comptags_per_cacheline);
u32 needed_bytes = round_up(needed_cachelines *
aggregate_cacheline_sz,
small_pgsz);
u32 first_unneeded_cacheline =
DIV_ROUND_UP(needed_bytes, aggregate_cacheline_sz);
u32 needed_ctaglines = first_unneeded_cacheline *
g->gr.comptags_per_cacheline;
u64 win_size;
if (needed_ctaglines < ctaglines_allocsize) {
gk20a_comptaglines_free(allocator,
offset + needed_ctaglines,
ctaglines_allocsize - needed_ctaglines);
ctaglines_allocsize = needed_ctaglines;
}
*ctag_map_win_ctagline = offset;
win_size =
DIV_ROUND_UP(lines, g->gr.comptags_per_cacheline) *
aggregate_cacheline_sz;
*ctag_map_win_size = round_up(win_size, small_pgsz);
}
priv->comptags.offset = offset;
priv->comptags.lines = lines;
priv->comptags.allocated_lines = ctaglines_allocsize;
priv->comptags.user_mappable = user_mappable;
return 0;
}
static int gk20a_init_mm_reset_enable_hw(struct gk20a *g)
{
gk20a_dbg_fn("");
if (g->ops.fb.reset)
g->ops.fb.reset(g);
if (g->ops.clock_gating.slcg_fb_load_gating_prod)
g->ops.clock_gating.slcg_fb_load_gating_prod(g,
g->slcg_enabled);
if (g->ops.clock_gating.slcg_ltc_load_gating_prod)
g->ops.clock_gating.slcg_ltc_load_gating_prod(g,
g->slcg_enabled);
if (g->ops.clock_gating.blcg_fb_load_gating_prod)
g->ops.clock_gating.blcg_fb_load_gating_prod(g,
g->blcg_enabled);
if (g->ops.clock_gating.blcg_ltc_load_gating_prod)
g->ops.clock_gating.blcg_ltc_load_gating_prod(g,
g->blcg_enabled);
if (g->ops.fb.init_fs_state)
g->ops.fb.init_fs_state(g);
return 0;
}
void gk20a_remove_vm(struct vm_gk20a *vm, struct mem_desc *inst_block)
{
struct gk20a *g = vm->mm->g;
gk20a_dbg_fn("");
gk20a_free_inst_block(g, inst_block);
gk20a_vm_remove_support_nofree(vm);
}
static void gk20a_vidmem_destroy(struct gk20a *g)
{
#if defined(CONFIG_GK20A_VIDMEM)
if (nvgpu_alloc_initialized(&g->mm.vidmem.allocator))
nvgpu_alloc_destroy(&g->mm.vidmem.allocator);
#endif
}
static void gk20a_remove_mm_ce_support(struct mm_gk20a *mm)
{
struct gk20a *g = gk20a_from_mm(mm);
struct gk20a_platform *platform = gk20a_get_platform(g->dev);
if (mm->vidmem.ce_ctx_id != (u32)~0)
gk20a_ce_delete_context(g->dev, mm->vidmem.ce_ctx_id);
mm->vidmem.ce_ctx_id = (u32)~0;
if (platform->has_ce)
gk20a_vm_remove_support_nofree(&mm->ce.vm);
}
static void gk20a_remove_mm_support(struct mm_gk20a *mm)
{
struct gk20a *g = gk20a_from_mm(mm);
if (g->ops.mm.remove_bar2_vm)
g->ops.mm.remove_bar2_vm(g);
if (g->ops.mm.is_bar1_supported(g))
gk20a_remove_vm(&mm->bar1.vm, &mm->bar1.inst_block);
gk20a_remove_vm(&mm->pmu.vm, &mm->pmu.inst_block);
gk20a_free_inst_block(gk20a_from_mm(mm), &mm->hwpm.inst_block);
gk20a_vm_remove_support_nofree(&mm->cde.vm);
gk20a_vidmem_destroy(g);
}
static int gk20a_alloc_sysmem_flush(struct gk20a *g)
{
return gk20a_gmmu_alloc_sys(g, SZ_4K, &g->mm.sysmem_flush);
}
static void gk20a_init_pramin(struct mm_gk20a *mm)
{
mm->pramin_window = 0;
spin_lock_init(&mm->pramin_window_lock);
mm->force_pramin = GK20A_FORCE_PRAMIN_DEFAULT;
}
#if defined(CONFIG_GK20A_VIDMEM)
static int gk20a_vidmem_clear_all(struct gk20a *g)
{
struct mm_gk20a *mm = &g->mm;
struct gk20a_fence *gk20a_fence_out = NULL;
u64 region2_base = 0;
int err = 0;
if (mm->vidmem.ce_ctx_id == (u32)~0)
return -EINVAL;
err = gk20a_ce_execute_ops(g->dev,
mm->vidmem.ce_ctx_id,
0,
mm->vidmem.base,
mm->vidmem.bootstrap_base - mm->vidmem.base,
0x00000000,
NVGPU_CE_DST_LOCATION_LOCAL_FB,
NVGPU_CE_MEMSET,
NULL,
0,
NULL);
if (err) {
gk20a_err(g->dev,
"Failed to clear vidmem region 1 : %d", err);
return err;
}
region2_base = mm->vidmem.bootstrap_base + mm->vidmem.bootstrap_size;
err = gk20a_ce_execute_ops(g->dev,
mm->vidmem.ce_ctx_id,
0,
region2_base,
mm->vidmem.size - region2_base,
0x00000000,
NVGPU_CE_DST_LOCATION_LOCAL_FB,
NVGPU_CE_MEMSET,
NULL,
0,
&gk20a_fence_out);
if (err) {
gk20a_err(g->dev,
"Failed to clear vidmem region 2 : %d", err);
return err;
}
if (gk20a_fence_out) {
struct nvgpu_timeout timeout;
nvgpu_timeout_init(g, &timeout,
gk20a_get_gr_idle_timeout(g),
NVGPU_TIMER_CPU_TIMER);
do {
err = gk20a_fence_wait(gk20a_fence_out,
gk20a_get_gr_idle_timeout(g));
} while (err == -ERESTARTSYS &&
!nvgpu_timeout_expired(&timeout));
gk20a_fence_put(gk20a_fence_out);
if (err) {
gk20a_err(g->dev,
"fence wait failed for CE execute ops");
return err;
}
}
mm->vidmem.cleared = true;
return 0;
}
#endif
static int gk20a_init_vidmem(struct mm_gk20a *mm)
{
#if defined(CONFIG_GK20A_VIDMEM)
struct gk20a *g = mm->g;
struct device *d = dev_from_gk20a(g);
size_t size = g->ops.mm.get_vidmem_size ?
g->ops.mm.get_vidmem_size(g) : 0;
u64 bootstrap_base, bootstrap_size, base;
u64 default_page_size = SZ_64K;
int err;
static struct nvgpu_alloc_carveout wpr_co =
NVGPU_CARVEOUT("wpr-region", 0, SZ_16M);
if (!size)
return 0;
wpr_co.base = size - SZ_256M;
bootstrap_base = wpr_co.base;
bootstrap_size = SZ_16M;
base = default_page_size;
/*
* Bootstrap allocator for use before the CE is initialized (CE
* initialization requires vidmem but we want to use the CE to zero
* out vidmem before allocating it...)
*/
err = nvgpu_page_allocator_init(g, &g->mm.vidmem.bootstrap_allocator,
"vidmem-bootstrap",
bootstrap_base, bootstrap_size,
SZ_4K, 0);
err = nvgpu_page_allocator_init(g, &g->mm.vidmem.allocator,
"vidmem",
base, size - base,
default_page_size,
GPU_ALLOC_4K_VIDMEM_PAGES);
if (err) {
gk20a_err(d, "Failed to register vidmem for size %zu: %d",
size, err);
return err;
}
/* Reserve bootstrap region in vidmem allocator */
nvgpu_alloc_reserve_carveout(&g->mm.vidmem.allocator, &wpr_co);
mm->vidmem.base = base;
mm->vidmem.size = size - base;
mm->vidmem.bootstrap_base = bootstrap_base;
mm->vidmem.bootstrap_size = bootstrap_size;
mutex_init(&mm->vidmem.first_clear_mutex);
INIT_WORK(&mm->vidmem.clear_mem_worker, gk20a_vidmem_clear_mem_worker);
atomic64_set(&mm->vidmem.bytes_pending, 0);
INIT_LIST_HEAD(&mm->vidmem.clear_list_head);
mutex_init(&mm->vidmem.clear_list_mutex);
gk20a_dbg_info("registered vidmem: %zu MB", size / SZ_1M);
#endif
return 0;
}
int gk20a_init_mm_setup_sw(struct gk20a *g)
{
struct mm_gk20a *mm = &g->mm;
int err;
struct gk20a_platform *platform = gk20a_get_platform(g->dev);
gk20a_dbg_fn("");
if (mm->sw_ready) {
gk20a_dbg_fn("skip init");
return 0;
}
mm->g = g;
mutex_init(&mm->l2_op_lock);
/*TBD: make channel vm size configurable */
mm->channel.user_size = NV_MM_DEFAULT_USER_SIZE -
NV_MM_DEFAULT_KERNEL_SIZE;
mm->channel.kernel_size = NV_MM_DEFAULT_KERNEL_SIZE;
gk20a_dbg_info("channel vm size: user %dMB kernel %dMB",
(int)(mm->channel.user_size >> 20),
(int)(mm->channel.kernel_size >> 20));
gk20a_init_pramin(mm);
mm->vidmem.ce_ctx_id = (u32)~0;
err = gk20a_init_vidmem(mm);
if (err)
return err;
/*
* this requires fixed allocations in vidmem which must be
* allocated before all other buffers
*/
if (g->ops.pmu.alloc_blob_space && g->mm.vidmem_is_vidmem) {
err = g->ops.pmu.alloc_blob_space(g, 0, &g->acr.ucode_blob);
if (err)
return err;
}
err = gk20a_alloc_sysmem_flush(g);
if (err)
return err;
if (g->ops.mm.is_bar1_supported(g)) {
err = gk20a_init_bar1_vm(mm);
if (err)
return err;
}
if (g->ops.mm.init_bar2_vm) {
err = g->ops.mm.init_bar2_vm(g);
if (err)
return err;
}
err = gk20a_init_system_vm(mm);
if (err)
return err;
err = gk20a_init_hwpm(mm);
if (err)
return err;
err = gk20a_init_cde_vm(mm);
if (err)
return err;
if (platform->has_ce) {
err = gk20a_init_ce_vm(mm);
if (err)
return err;
}
/* set vm_alloc_share op here as gk20a_as_alloc_share needs it */
g->ops.mm.vm_alloc_share = gk20a_vm_alloc_share;
mm->remove_support = gk20a_remove_mm_support;
mm->remove_ce_support = gk20a_remove_mm_ce_support;
mm->sw_ready = true;
gk20a_dbg_fn("done");
return 0;
}
/* make sure gk20a_init_mm_support is called before */
int gk20a_init_mm_setup_hw(struct gk20a *g)
{
struct mm_gk20a *mm = &g->mm;
int err;
gk20a_dbg_fn("");
g->ops.fb.set_mmu_page_size(g);
if (g->ops.fb.set_use_full_comp_tag_line)
mm->use_full_comp_tag_line =
g->ops.fb.set_use_full_comp_tag_line(g);
gk20a_writel(g, fb_niso_flush_sysmem_addr_r(),
g->ops.mm.get_iova_addr(g, g->mm.sysmem_flush.sgt->sgl, 0)
>> 8);
if (g->ops.mm.bar1_bind)
g->ops.mm.bar1_bind(g, &mm->bar1.inst_block);
if (g->ops.mm.init_bar2_mm_hw_setup) {
err = g->ops.mm.init_bar2_mm_hw_setup(g);
if (err)
return err;
}
if (gk20a_mm_fb_flush(g) || gk20a_mm_fb_flush(g))
return -EBUSY;
gk20a_dbg_fn("done");
return 0;
}
static int gk20a_mm_bar1_bind(struct gk20a *g, struct mem_desc *bar1_inst)
{
u64 iova = gk20a_mm_inst_block_addr(g, bar1_inst);
u32 ptr_v = (u32)(iova >> bar1_instance_block_shift_gk20a());
gk20a_dbg_info("bar1 inst block ptr: 0x%08x", ptr_v);
gk20a_writel(g, bus_bar1_block_r(),
gk20a_aperture_mask(g, bar1_inst,
bus_bar1_block_target_sys_mem_ncoh_f(),
bus_bar1_block_target_vid_mem_f()) |
bus_bar1_block_mode_virtual_f() |
bus_bar1_block_ptr_f(ptr_v));
return 0;
}
int gk20a_init_mm_support(struct gk20a *g)
{
u32 err;
err = gk20a_init_mm_reset_enable_hw(g);
if (err)
return err;
err = gk20a_init_mm_setup_sw(g);
if (err)
return err;
if (g->ops.mm.init_mm_setup_hw)
err = g->ops.mm.init_mm_setup_hw(g);
return err;
}
void gk20a_init_mm_ce_context(struct gk20a *g)
{
#if defined(CONFIG_GK20A_VIDMEM)
if (g->mm.vidmem.size && (g->mm.vidmem.ce_ctx_id == (u32)~0)) {
g->mm.vidmem.ce_ctx_id =
gk20a_ce_create_context_with_cb(g->dev,
gk20a_fifo_get_fast_ce_runlist_id(g),
-1,
-1,
-1,
NULL);
if (g->mm.vidmem.ce_ctx_id == (u32)~0)
gk20a_err(g->dev,
"Failed to allocate CE context for vidmem page clearing support");
}
#endif
}
static int alloc_gmmu_phys_pages(struct vm_gk20a *vm, u32 order,
struct gk20a_mm_entry *entry)
{
u32 num_pages = 1 << order;
u32 len = num_pages * PAGE_SIZE;
int err;
struct page *pages;
gk20a_dbg_fn("");
/* note: mem_desc slightly abused (wrt. alloc_gmmu_pages) */
pages = alloc_pages(GFP_KERNEL, order);
if (!pages) {
gk20a_dbg(gpu_dbg_pte, "alloc_pages failed");
goto err_out;
}
entry->mem.sgt = kzalloc(sizeof(*entry->mem.sgt), GFP_KERNEL);
if (!entry->mem.sgt) {
gk20a_dbg(gpu_dbg_pte, "cannot allocate sg table");
goto err_alloced;
}
err = sg_alloc_table(entry->mem.sgt, 1, GFP_KERNEL);
if (err) {
gk20a_dbg(gpu_dbg_pte, "sg_alloc_table failed");
goto err_sg_table;
}
sg_set_page(entry->mem.sgt->sgl, pages, len, 0);
entry->mem.cpu_va = page_address(pages);
memset(entry->mem.cpu_va, 0, len);
entry->mem.size = len;
entry->mem.aperture = APERTURE_SYSMEM;
FLUSH_CPU_DCACHE(entry->mem.cpu_va, sg_phys(entry->mem.sgt->sgl), len);
return 0;
err_sg_table:
kfree(entry->mem.sgt);
err_alloced:
__free_pages(pages, order);
err_out:
return -ENOMEM;
}
static void free_gmmu_phys_pages(struct vm_gk20a *vm,
struct gk20a_mm_entry *entry)
{
gk20a_dbg_fn("");
/* note: mem_desc slightly abused (wrt. free_gmmu_pages) */
free_pages((unsigned long)entry->mem.cpu_va, get_order(entry->mem.size));
entry->mem.cpu_va = NULL;
sg_free_table(entry->mem.sgt);
kfree(entry->mem.sgt);
entry->mem.sgt = NULL;
entry->mem.size = 0;
entry->mem.aperture = APERTURE_INVALID;
}
static int map_gmmu_phys_pages(struct gk20a_mm_entry *entry)
{
FLUSH_CPU_DCACHE(entry->mem.cpu_va,
sg_phys(entry->mem.sgt->sgl),
entry->mem.sgt->sgl->length);
return 0;
}
static void unmap_gmmu_phys_pages(struct gk20a_mm_entry *entry)
{
FLUSH_CPU_DCACHE(entry->mem.cpu_va,
sg_phys(entry->mem.sgt->sgl),
entry->mem.sgt->sgl->length);
}
static int alloc_gmmu_pages(struct vm_gk20a *vm, u32 order,
struct gk20a_mm_entry *entry)
{
struct device *d = dev_from_vm(vm);
struct gk20a *g = gk20a_from_vm(vm);
u32 num_pages = 1 << order;
u32 len = num_pages * PAGE_SIZE;
int err;
struct gk20a_platform *platform = dev_get_drvdata(g->dev);
gk20a_dbg_fn("");
if (platform->is_fmodel)
return alloc_gmmu_phys_pages(vm, order, entry);
/*
* On arm32 we're limited by vmalloc space, so we do not map pages by
* default.
*/
if (IS_ENABLED(CONFIG_ARM64))
err = gk20a_gmmu_alloc(g, len, &entry->mem);
else
err = gk20a_gmmu_alloc_attr(g, DMA_ATTR_NO_KERNEL_MAPPING,
len, &entry->mem);
if (err) {
gk20a_err(d, "memory allocation failed");
return -ENOMEM;
}
return 0;
}
void free_gmmu_pages(struct vm_gk20a *vm,
struct gk20a_mm_entry *entry)
{
struct gk20a *g = gk20a_from_vm(vm);
struct gk20a_platform *platform = dev_get_drvdata(g->dev);
gk20a_dbg_fn("");
if (!entry->mem.size)
return;
if (entry->woffset) /* fake shadow mem */
return;
if (platform->is_fmodel) {
free_gmmu_phys_pages(vm, entry);
return;
}
/*
* On arm32 we're limited by vmalloc space, so we do not map pages by
* default.
*/
if (IS_ENABLED(CONFIG_ARM64))
gk20a_gmmu_free(g, &entry->mem);
else
gk20a_gmmu_free_attr(g, DMA_ATTR_NO_KERNEL_MAPPING,
&entry->mem);
}
int map_gmmu_pages(struct gk20a *g, struct gk20a_mm_entry *entry)
{
struct gk20a_platform *platform = dev_get_drvdata(g->dev);
gk20a_dbg_fn("");
if (platform->is_fmodel)
return map_gmmu_phys_pages(entry);
if (IS_ENABLED(CONFIG_ARM64)) {
if (entry->mem.aperture == APERTURE_VIDMEM)
return 0;
FLUSH_CPU_DCACHE(entry->mem.cpu_va,
sg_phys(entry->mem.sgt->sgl),
entry->mem.size);
} else {
int err = gk20a_mem_begin(g, &entry->mem);
if (err)
return err;
}
return 0;
}
void unmap_gmmu_pages(struct gk20a *g, struct gk20a_mm_entry *entry)
{
struct gk20a_platform *platform = dev_get_drvdata(g->dev);
gk20a_dbg_fn("");
if (platform->is_fmodel) {
unmap_gmmu_phys_pages(entry);
return;
}
if (IS_ENABLED(CONFIG_ARM64)) {
if (entry->mem.aperture == APERTURE_VIDMEM)
return;
FLUSH_CPU_DCACHE(entry->mem.cpu_va,
sg_phys(entry->mem.sgt->sgl),
entry->mem.size);
} else {
gk20a_mem_end(g, &entry->mem);
}
}
/*
* Allocate a phys contig region big enough for a full
* sized gmmu page table for the given gmmu_page_size.
* The whole range is zeroed so it's "invalid" and will fault.
*
* If a previous entry is supplied, its memory will be used for
* suballocation for this next entry too, if there is space.
*/
static int gk20a_zalloc_gmmu_page_table(struct vm_gk20a *vm,
enum gmmu_pgsz_gk20a pgsz_idx,
const struct gk20a_mmu_level *l,
struct gk20a_mm_entry *entry,
struct gk20a_mm_entry *prev_entry)
{
int err = -ENOMEM;
int order;
struct gk20a *g = gk20a_from_vm(vm);
u32 bytes;
gk20a_dbg_fn("");
/* allocate enough pages for the table */
order = l->hi_bit[pgsz_idx] - l->lo_bit[pgsz_idx] + 1;
order += ilog2(l->entry_size);
bytes = 1 << order;
order -= PAGE_SHIFT;
if (order < 0 && prev_entry) {
/* try to suballocate from previous chunk */
u32 capacity = prev_entry->mem.size / bytes;
u32 prev = prev_entry->woffset * sizeof(u32) / bytes;
u32 free = capacity - prev - 1;
gk20a_dbg(gpu_dbg_pte, "cap %d prev %d free %d bytes %d",
capacity, prev, free, bytes);
if (free) {
memcpy(&entry->mem, &prev_entry->mem,
sizeof(entry->mem));
entry->woffset = prev_entry->woffset
+ bytes / sizeof(u32);
err = 0;
}
}
if (err) {
/* no suballoc space */
order = max(0, order);
err = alloc_gmmu_pages(vm, order, entry);
entry->woffset = 0;
}
gk20a_dbg(gpu_dbg_pte, "entry = 0x%p, addr=%08llx, size %d, woff %x",
entry,
(entry->mem.sgt && entry->mem.aperture == APERTURE_SYSMEM) ?
g->ops.mm.get_iova_addr(g, entry->mem.sgt->sgl, 0)
: 0,
order, entry->woffset);
if (err)
return err;
entry->pgsz = pgsz_idx;
entry->mem.skip_wmb = true;
return err;
}
int gk20a_mm_pde_coverage_bit_count(struct vm_gk20a *vm)
{
return vm->mmu_levels[0].lo_bit[0];
}
/* given address range (inclusive) determine the pdes crossed */
void pde_range_from_vaddr_range(struct vm_gk20a *vm,
u64 addr_lo, u64 addr_hi,
u32 *pde_lo, u32 *pde_hi)
{
int pde_shift = gk20a_mm_pde_coverage_bit_count(vm);
*pde_lo = (u32)(addr_lo >> pde_shift);
*pde_hi = (u32)(addr_hi >> pde_shift);
gk20a_dbg(gpu_dbg_pte, "addr_lo=0x%llx addr_hi=0x%llx pde_ss=%d",
addr_lo, addr_hi, pde_shift);
gk20a_dbg(gpu_dbg_pte, "pde_lo=%d pde_hi=%d",
*pde_lo, *pde_hi);
}
static u32 pde_from_index(u32 i)
{
return i * gmmu_pde__size_v() / sizeof(u32);
}
static u32 pte_from_index(u32 i)
{
return i * gmmu_pte__size_v() / sizeof(u32);
}
u32 pte_index_from_vaddr(struct vm_gk20a *vm,
u64 addr, enum gmmu_pgsz_gk20a pgsz_idx)
{
u32 ret;
/* mask off pde part */
addr = addr & ((1ULL << gk20a_mm_pde_coverage_bit_count(vm)) - 1ULL);
/* shift over to get pte index. note assumption that pte index
* doesn't leak over into the high 32b */
ret = (u32)(addr >> ilog2(vm->gmmu_page_sizes[pgsz_idx]));
gk20a_dbg(gpu_dbg_pte, "addr=0x%llx pte_i=0x%x", addr, ret);
return ret;
}
static struct vm_reserved_va_node *addr_to_reservation(struct vm_gk20a *vm,
u64 addr)
{
struct vm_reserved_va_node *va_node;
list_for_each_entry(va_node, &vm->reserved_va_list, reserved_va_list)
if (addr >= va_node->vaddr_start &&
addr < (u64)va_node->vaddr_start + (u64)va_node->size)
return va_node;
return NULL;
}
int gk20a_vm_get_buffers(struct vm_gk20a *vm,
struct mapped_buffer_node ***mapped_buffers,
int *num_buffers)
{
struct mapped_buffer_node *mapped_buffer;
struct mapped_buffer_node **buffer_list;
struct rb_node *node;
int i = 0;
if (vm->userspace_managed) {
*mapped_buffers = NULL;
*num_buffers = 0;
return 0;
}
mutex_lock(&vm->update_gmmu_lock);
buffer_list = nvgpu_kalloc(sizeof(*buffer_list) *
vm->num_user_mapped_buffers, true);
if (!buffer_list) {
mutex_unlock(&vm->update_gmmu_lock);
return -ENOMEM;
}
node = rb_first(&vm->mapped_buffers);
while (node) {
mapped_buffer =
container_of(node, struct mapped_buffer_node, node);
if (mapped_buffer->user_mapped) {
buffer_list[i] = mapped_buffer;
kref_get(&mapped_buffer->ref);
i++;
}
node = rb_next(&mapped_buffer->node);
}
BUG_ON(i != vm->num_user_mapped_buffers);
*num_buffers = vm->num_user_mapped_buffers;
*mapped_buffers = buffer_list;
mutex_unlock(&vm->update_gmmu_lock);
return 0;
}
static void gk20a_vm_unmap_locked_kref(struct kref *ref)
{
struct mapped_buffer_node *mapped_buffer =
container_of(ref, struct mapped_buffer_node, ref);
gk20a_vm_unmap_locked(mapped_buffer, mapped_buffer->vm->kref_put_batch);
}
void gk20a_vm_mapping_batch_start(struct vm_gk20a_mapping_batch *mapping_batch)
{
memset(mapping_batch, 0, sizeof(*mapping_batch));
mapping_batch->gpu_l2_flushed = false;
mapping_batch->need_tlb_invalidate = false;
}
void gk20a_vm_mapping_batch_finish_locked(
struct vm_gk20a *vm, struct vm_gk20a_mapping_batch *mapping_batch)
{
/* hanging kref_put batch pointer? */
WARN_ON(vm->kref_put_batch == mapping_batch);
if (mapping_batch->need_tlb_invalidate) {
struct gk20a *g = gk20a_from_vm(vm);
g->ops.mm.tlb_invalidate(vm);
}
}
void gk20a_vm_mapping_batch_finish(struct vm_gk20a *vm,
struct vm_gk20a_mapping_batch *mapping_batch)
{
mutex_lock(&vm->update_gmmu_lock);
gk20a_vm_mapping_batch_finish_locked(vm, mapping_batch);
mutex_unlock(&vm->update_gmmu_lock);
}
void gk20a_vm_put_buffers(struct vm_gk20a *vm,
struct mapped_buffer_node **mapped_buffers,
int num_buffers)
{
int i;
struct vm_gk20a_mapping_batch batch;
if (num_buffers == 0)
return;
mutex_lock(&vm->update_gmmu_lock);
gk20a_vm_mapping_batch_start(&batch);
vm->kref_put_batch = &batch;
for (i = 0; i < num_buffers; ++i)
kref_put(&mapped_buffers[i]->ref,
gk20a_vm_unmap_locked_kref);
vm->kref_put_batch = NULL;
gk20a_vm_mapping_batch_finish_locked(vm, &batch);
mutex_unlock(&vm->update_gmmu_lock);
nvgpu_kfree(mapped_buffers);
}
static void gk20a_vm_unmap_user(struct vm_gk20a *vm, u64 offset,
struct vm_gk20a_mapping_batch *batch)
{
struct device *d = dev_from_vm(vm);
int retries = 10000; /* 50 ms */
struct mapped_buffer_node *mapped_buffer;
mutex_lock(&vm->update_gmmu_lock);
mapped_buffer = find_mapped_buffer_locked(&vm->mapped_buffers, offset);
if (!mapped_buffer) {
mutex_unlock(&vm->update_gmmu_lock);
gk20a_err(d, "invalid addr to unmap 0x%llx", offset);
return;
}
if (mapped_buffer->flags & NVGPU_AS_MAP_BUFFER_FLAGS_FIXED_OFFSET) {
mutex_unlock(&vm->update_gmmu_lock);
while (retries >= 0 || !tegra_platform_is_silicon()) {
if (atomic_read(&mapped_buffer->ref.refcount) == 1)
break;
retries--;
udelay(5);
}
if (retries < 0 && tegra_platform_is_silicon())
gk20a_err(d, "sync-unmap failed on 0x%llx",
offset);
mutex_lock(&vm->update_gmmu_lock);
}
if (mapped_buffer->user_mapped == 0) {
mutex_unlock(&vm->update_gmmu_lock);
gk20a_err(d, "addr already unmapped from user 0x%llx", offset);
return;
}
mapped_buffer->user_mapped--;
if (mapped_buffer->user_mapped == 0)
vm->num_user_mapped_buffers--;
vm->kref_put_batch = batch;
kref_put(&mapped_buffer->ref, gk20a_vm_unmap_locked_kref);
vm->kref_put_batch = NULL;
mutex_unlock(&vm->update_gmmu_lock);
}
u64 gk20a_vm_alloc_va(struct vm_gk20a *vm,
u64 size,
enum gmmu_pgsz_gk20a gmmu_pgsz_idx)
{
struct nvgpu_allocator *vma = vm->vma[gmmu_pgsz_idx];
u64 offset;
u64 gmmu_page_size = vm->gmmu_page_sizes[gmmu_pgsz_idx];
if (gmmu_pgsz_idx >= gmmu_nr_page_sizes) {
dev_warn(dev_from_vm(vm),
"invalid page size requested in gk20a vm alloc");
return 0;
}
if ((gmmu_pgsz_idx == gmmu_page_size_big) && !vm->big_pages) {
dev_warn(dev_from_vm(vm),
"unsupported page size requested");
return 0;
}
/* Be certain we round up to gmmu_page_size if needed */
size = (size + ((u64)gmmu_page_size - 1)) & ~((u64)gmmu_page_size - 1);
gk20a_dbg_info("size=0x%llx @ pgsz=%dKB", size,
vm->gmmu_page_sizes[gmmu_pgsz_idx]>>10);
offset = nvgpu_alloc(vma, size);
if (!offset) {
gk20a_err(dev_from_vm(vm),
"%s oom: sz=0x%llx", vma->name, size);
return 0;
}
gk20a_dbg_fn("%s found addr: 0x%llx", vma->name, offset);
return offset;
}
int gk20a_vm_free_va(struct vm_gk20a *vm,
u64 offset, u64 size,
enum gmmu_pgsz_gk20a pgsz_idx)
{
struct nvgpu_allocator *vma = vm->vma[pgsz_idx];
gk20a_dbg_info("%s free addr=0x%llx, size=0x%llx",
vma->name, offset, size);
nvgpu_free(vma, offset);
return 0;
}
static int insert_mapped_buffer(struct rb_root *root,
struct mapped_buffer_node *mapped_buffer)
{
struct rb_node **new_node = &(root->rb_node), *parent = NULL;
/* Figure out where to put new node */
while (*new_node) {
struct mapped_buffer_node *cmp_with =
container_of(*new_node, struct mapped_buffer_node,
node);
parent = *new_node;
if (cmp_with->addr > mapped_buffer->addr) /* u64 cmp */
new_node = &((*new_node)->rb_left);
else if (cmp_with->addr != mapped_buffer->addr) /* u64 cmp */
new_node = &((*new_node)->rb_right);
else
return -EINVAL; /* no fair dup'ing */
}
/* Add new node and rebalance tree. */
rb_link_node(&mapped_buffer->node, parent, new_node);
rb_insert_color(&mapped_buffer->node, root);
return 0;
}
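/*
* In-order walk of the mapped-buffer tree looking for an existing mapping
* of the same dmabuf with the same kind; used to reuse and refcount a
* duplicate mapping instead of creating a new one.
*/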
static struct mapped_buffer_node *find_mapped_buffer_reverse_locked(
struct rb_root *root, struct dma_buf *dmabuf,
u32 kind)
{
struct rb_node *node = rb_first(root);
while (node) {
struct mapped_buffer_node *mapped_buffer =
container_of(node, struct mapped_buffer_node, node);
if (mapped_buffer->dmabuf == dmabuf &&
kind == mapped_buffer->kind)
return mapped_buffer;
node = rb_next(&mapped_buffer->node);
}
return NULL;
}
static struct mapped_buffer_node *find_mapped_buffer_locked(
struct rb_root *root, u64 addr)
{
struct rb_node *node = root->rb_node;
while (node) {
struct mapped_buffer_node *mapped_buffer =
container_of(node, struct mapped_buffer_node, node);
if (mapped_buffer->addr > addr) /* u64 cmp */
node = node->rb_left;
else if (mapped_buffer->addr != addr) /* u64 cmp */
node = node->rb_right;
else
return mapped_buffer;
}
return NULL;
}
static struct mapped_buffer_node *find_mapped_buffer_range_locked(
struct rb_root *root, u64 addr)
{
struct rb_node *node = root->rb_node;
while (node) {
struct mapped_buffer_node *m =
container_of(node, struct mapped_buffer_node, node);
if (m->addr <= addr && m->addr + m->size > addr)
return m;
else if (m->addr > addr) /* u64 cmp */
node = node->rb_left;
else
node = node->rb_right;
}
return NULL;
}
/* find the first mapped buffer with GPU VA less than addr */
static struct mapped_buffer_node *find_mapped_buffer_less_than_locked(
struct rb_root *root, u64 addr)
{
struct rb_node *node = root->rb_node;
struct mapped_buffer_node *ret = NULL;
while (node) {
struct mapped_buffer_node *mapped_buffer =
container_of(node, struct mapped_buffer_node, node);
if (mapped_buffer->addr >= addr)
node = node->rb_left;
else {
ret = mapped_buffer;
node = node->rb_right;
}
}
return ret;
}
#define BFR_ATTRS (sizeof(nvmap_bfr_param)/sizeof(nvmap_bfr_param[0]))
struct buffer_attrs {
struct sg_table *sgt;
u64 size;
u64 align;
u32 ctag_offset;
u32 ctag_lines;
u32 ctag_allocated_lines;
int pgsz_idx;
u8 kind_v;
u8 uc_kind_v;
bool ctag_user_mappable;
};
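/*
* The page size for a buffer is derived from its size via __get_pte_size(),
* keeping the GVA allocator and the mapping code in agreement on page size.
*/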
static void gmmu_select_page_size(struct vm_gk20a *vm,
struct buffer_attrs *bfr)
{
bfr->pgsz_idx = __get_pte_size(vm, 0, bfr->size);
}
static int setup_buffer_kind_and_compression(struct vm_gk20a *vm,
u32 flags,
struct buffer_attrs *bfr,
enum gmmu_pgsz_gk20a pgsz_idx)
{
bool kind_compressible;
struct gk20a *g = gk20a_from_vm(vm);
struct device *d = dev_from_gk20a(g);
int ctag_granularity = g->ops.fb.compression_page_size(g);
if (unlikely(bfr->kind_v == gmmu_pte_kind_invalid_v()))
bfr->kind_v = gmmu_pte_kind_pitch_v();
if (unlikely(!gk20a_kind_is_supported(bfr->kind_v))) {
gk20a_err(d, "kind 0x%x not supported", bfr->kind_v);
return -EINVAL;
}
bfr->uc_kind_v = gmmu_pte_kind_invalid_v();
/* find a suitable uncompressed kind if it becomes necessary later */
kind_compressible = gk20a_kind_is_compressible(bfr->kind_v);
if (kind_compressible) {
bfr->uc_kind_v = gk20a_get_uncompressed_kind(bfr->kind_v);
if (unlikely(bfr->uc_kind_v == gmmu_pte_kind_invalid_v())) {
/* shouldn't happen, but it is worth cross-checking */
gk20a_err(d, "comptag kind 0x%x can't be"
" downgraded to uncompressed kind",
bfr->kind_v);
return -EINVAL;
}
}
/* comptags only supported for suitable kinds, 128KB pagesize */
if (kind_compressible &&
vm->gmmu_page_sizes[pgsz_idx] < g->ops.fb.compressible_page_size(g)) {
/*
gk20a_warn(d, "comptags specified"
" but pagesize being used doesn't support it");*/
/* it is safe to fall back to uncompressed as
functionality is not harmed */
bfr->kind_v = bfr->uc_kind_v;
kind_compressible = false;
}
if (kind_compressible)
bfr->ctag_lines = DIV_ROUND_UP_ULL(bfr->size, ctag_granularity);
else
bfr->ctag_lines = 0;
return 0;
}
static int validate_fixed_buffer(struct vm_gk20a *vm,
struct buffer_attrs *bfr,
u64 map_offset, u64 map_size,
struct vm_reserved_va_node **pva_node)
{
struct device *dev = dev_from_vm(vm);
struct vm_reserved_va_node *va_node;
struct mapped_buffer_node *buffer;
u64 map_end = map_offset + map_size;
/* can wrap around with insane map_size; zero is disallowed too */
if (map_end <= map_offset) {
gk20a_warn(dev, "fixed offset mapping with invalid map_size");
return -EINVAL;
}
if (map_offset & (vm->gmmu_page_sizes[bfr->pgsz_idx] - 1)) {
gk20a_err(dev, "map offset must be buffer page size aligned 0x%llx",
map_offset);
return -EINVAL;
}
/* Find the space reservation, but it's ok to have none for
* userspace-managed address spaces */
va_node = addr_to_reservation(vm, map_offset);
if (!va_node && !vm->userspace_managed) {
gk20a_warn(dev, "fixed offset mapping without space allocation");
return -EINVAL;
}
/* Mapped area should fit inside va, if there's one */
if (va_node && map_end > va_node->vaddr_start + va_node->size) {
gk20a_warn(dev, "fixed offset mapping size overflows va node");
return -EINVAL;
}
/* check that this mapping does not collide with existing
* mappings by checking the buffer with the highest GPU VA
* that is less than our buffer end */
buffer = find_mapped_buffer_less_than_locked(
&vm->mapped_buffers, map_offset + map_size);
if (buffer && buffer->addr + buffer->size > map_offset) {
gk20a_warn(dev, "overlapping buffer map requested");
return -EINVAL;
}
*pva_node = va_node;
return 0;
}
u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm,
u64 map_offset,
struct sg_table *sgt,
u64 buffer_offset,
u64 size,
int pgsz_idx,
u8 kind_v,
u32 ctag_offset,
u32 flags,
int rw_flag,
bool clear_ctags,
bool sparse,
bool priv,
struct vm_gk20a_mapping_batch *batch,
enum gk20a_aperture aperture)
{
int err = 0;
bool allocated = false;
struct device *d = dev_from_vm(vm);
struct gk20a *g = gk20a_from_vm(vm);
int ctag_granularity = g->ops.fb.compression_page_size(g);
u32 ctag_lines = DIV_ROUND_UP_ULL(size, ctag_granularity);
/* Allocate (or validate when map_offset != 0) the virtual address. */
if (!map_offset) {
map_offset = gk20a_vm_alloc_va(vm, size,
pgsz_idx);
if (!map_offset) {
gk20a_err(d, "failed to allocate va space");
err = -ENOMEM;
goto fail_alloc;
}
allocated = true;
}
gk20a_dbg(gpu_dbg_map,
"gv: 0x%04x_%08x + 0x%-7llx "
"[dma: 0x%02x_%08x, pa: 0x%02x_%08x] "
"pgsz=%-3dKb as=%-2d ctags=%d start=%d "
"kind=0x%x flags=0x%x apt=%s",
hi32(map_offset), lo32(map_offset), size,
sgt ? hi32((u64)sg_dma_address(sgt->sgl)) : 0,
sgt ? lo32((u64)sg_dma_address(sgt->sgl)) : 0,
sgt ? hi32((u64)sg_phys(sgt->sgl)) : 0,
sgt ? lo32((u64)sg_phys(sgt->sgl)) : 0,
vm->gmmu_page_sizes[pgsz_idx] >> 10, vm_aspace_id(vm),
ctag_lines, ctag_offset,
kind_v, flags, gk20a_aperture_str(aperture));
err = update_gmmu_ptes_locked(vm, pgsz_idx,
sgt,
buffer_offset,
map_offset, map_offset + size,
kind_v,
ctag_offset,
flags &
NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
flags &
NVGPU_AS_MAP_BUFFER_FLAGS_UNMAPPED_PTE,
rw_flag,
sparse,
priv,
aperture);
if (err) {
gk20a_err(d, "failed to update ptes on map");
goto fail_validate;
}
if (!batch)
g->ops.mm.tlb_invalidate(vm);
else
batch->need_tlb_invalidate = true;
return map_offset;
fail_validate:
if (allocated)
gk20a_vm_free_va(vm, map_offset, size, pgsz_idx);
fail_alloc:
gk20a_err(d, "%s: failed with err=%d\n", __func__, err);
return 0;
}
void gk20a_locked_gmmu_unmap(struct vm_gk20a *vm,
u64 vaddr,
u64 size,
int pgsz_idx,
bool va_allocated,
int rw_flag,
bool sparse,
struct vm_gk20a_mapping_batch *batch)
{
int err = 0;
struct gk20a *g = gk20a_from_vm(vm);
if (va_allocated) {
err = gk20a_vm_free_va(vm, vaddr, size, pgsz_idx);
if (err) {
dev_err(dev_from_vm(vm),
"failed to free va");
return;
}
}
/* unmap here needs to know the page size we assigned at mapping */
err = update_gmmu_ptes_locked(vm,
pgsz_idx,
NULL, /* n/a for unmap */
0,
vaddr,
vaddr + size,
0, 0, false /* n/a for unmap */,
false, rw_flag,
sparse, 0,
APERTURE_INVALID); /* don't care for unmap */
if (err)
dev_err(dev_from_vm(vm),
"failed to update gmmu ptes on unmap");
/* flush l2 so any dirty lines are written out *now*.
* also as we could potentially be switching this buffer
* from nonvolatile (l2 cacheable) to volatile (l2 non-cacheable) at
* some point in the future we need to invalidate l2. e.g. switching
* from a render buffer unmap (here) to later using the same memory
* for gmmu ptes. note the positioning of this relative to any smmu
* unmapping (below). */
if (!batch) {
gk20a_mm_l2_flush(g, true);
g->ops.mm.tlb_invalidate(vm);
} else {
if (!batch->gpu_l2_flushed) {
gk20a_mm_l2_flush(g, true);
batch->gpu_l2_flushed = true;
}
batch->need_tlb_invalidate = true;
}
}
static enum gk20a_aperture gk20a_dmabuf_aperture(struct gk20a *g,
struct dma_buf *dmabuf)
{
struct gk20a *buf_owner = gk20a_vidmem_buf_owner(dmabuf);
if (buf_owner == NULL) {
/* Not nvgpu-allocated, assume system memory */
return APERTURE_SYSMEM;
} else if (WARN_ON(buf_owner == g && !g->mm.vidmem_is_vidmem)) {
/* Looks like our video memory, but this gpu doesn't support
* it. Warn about a bug and bail out */
gk20a_warn(dev_from_gk20a(g),
"dmabuf is our vidmem but we don't have local vidmem");
return APERTURE_INVALID;
} else if (buf_owner != g) {
/* Someone else's vidmem */
return APERTURE_INVALID;
} else {
/* Yay, buf_owner == g */
return APERTURE_VIDMEM;
}
}
static u64 gk20a_vm_map_duplicate_locked(struct vm_gk20a *vm,
struct dma_buf *dmabuf,
u64 offset_align,
u32 flags,
int kind,
struct sg_table **sgt,
bool user_mapped,
int rw_flag)
{
struct gk20a *g = gk20a_from_vm(vm);
struct mapped_buffer_node *mapped_buffer = NULL;
mapped_buffer =
find_mapped_buffer_reverse_locked(&vm->mapped_buffers,
dmabuf, kind);
if (!mapped_buffer)
return 0;
if (mapped_buffer->flags != flags)
return 0;
if (flags & NVGPU_AS_MAP_BUFFER_FLAGS_FIXED_OFFSET &&
mapped_buffer->addr != offset_align)
return 0;
BUG_ON(mapped_buffer->vm != vm);
/* mark the buffer as used */
if (user_mapped) {
if (mapped_buffer->user_mapped == 0)
vm->num_user_mapped_buffers++;
mapped_buffer->user_mapped++;
/* If the mapping comes from user space, we own
* the handle ref. Since we reuse an
* existing mapping here, we need to give back those
* refs once in order not to leak.
*/
if (mapped_buffer->own_mem_ref)
dma_buf_put(mapped_buffer->dmabuf);
else
mapped_buffer->own_mem_ref = true;
}
kref_get(&mapped_buffer->ref);
gk20a_dbg(gpu_dbg_map,
"gv: 0x%04x_%08x + 0x%-7zu "
"[dma: 0x%02x_%08x, pa: 0x%02x_%08x] "
"pgsz=%-3dKb as=%-2d ctags=%d start=%d "
"flags=0x%x apt=%s (reused)",
hi32(mapped_buffer->addr), lo32(mapped_buffer->addr),
dmabuf->size,
hi32((u64)sg_dma_address(mapped_buffer->sgt->sgl)),
lo32((u64)sg_dma_address(mapped_buffer->sgt->sgl)),
hi32((u64)sg_phys(mapped_buffer->sgt->sgl)),
lo32((u64)sg_phys(mapped_buffer->sgt->sgl)),
vm->gmmu_page_sizes[mapped_buffer->pgsz_idx] >> 10,
vm_aspace_id(vm),
mapped_buffer->ctag_lines, mapped_buffer->ctag_offset,
mapped_buffer->flags,
gk20a_aperture_str(gk20a_dmabuf_aperture(g, dmabuf)));
if (sgt)
*sgt = mapped_buffer->sgt;
return mapped_buffer->addr;
}
#if defined(CONFIG_GK20A_VIDMEM)
static struct sg_table *gk20a_vidbuf_map_dma_buf(
struct dma_buf_attachment *attach, enum dma_data_direction dir)
{
struct gk20a_vidmem_buf *buf = attach->dmabuf->priv;
return buf->mem->sgt;
}
static void gk20a_vidbuf_unmap_dma_buf(struct dma_buf_attachment *attach,
struct sg_table *sgt,
enum dma_data_direction dir)
{
}
static void gk20a_vidbuf_release(struct dma_buf *dmabuf)
{
struct gk20a_vidmem_buf *buf = dmabuf->priv;
gk20a_dbg_fn("");
if (buf->dmabuf_priv)
buf->dmabuf_priv_delete(buf->dmabuf_priv);
gk20a_gmmu_free(buf->g, buf->mem);
kfree(buf);
}
static void *gk20a_vidbuf_kmap(struct dma_buf *dmabuf, unsigned long page_num)
{
WARN_ON("Not supported");
return NULL;
}
static void *gk20a_vidbuf_kmap_atomic(struct dma_buf *dmabuf,
unsigned long page_num)
{
WARN_ON("Not supported");
return NULL;
}
static int gk20a_vidbuf_mmap(struct dma_buf *dmabuf, struct vm_area_struct *vma)
{
return -EINVAL;
}
static int gk20a_vidbuf_set_private(struct dma_buf *dmabuf,
struct device *dev, void *priv, void (*delete)(void *priv))
{
struct gk20a_vidmem_buf *buf = dmabuf->priv;
buf->dmabuf_priv = priv;
buf->dmabuf_priv_delete = delete;
return 0;
}
static void *gk20a_vidbuf_get_private(struct dma_buf *dmabuf,
struct device *dev)
{
struct gk20a_vidmem_buf *buf = dmabuf->priv;
return buf->dmabuf_priv;
}
static const struct dma_buf_ops gk20a_vidbuf_ops = {
.map_dma_buf = gk20a_vidbuf_map_dma_buf,
.unmap_dma_buf = gk20a_vidbuf_unmap_dma_buf,
.release = gk20a_vidbuf_release,
.kmap_atomic = gk20a_vidbuf_kmap_atomic,
.kmap = gk20a_vidbuf_kmap,
.mmap = gk20a_vidbuf_mmap,
.set_drvdata = gk20a_vidbuf_set_private,
.get_drvdata = gk20a_vidbuf_get_private,
};
static struct dma_buf *gk20a_vidbuf_export(struct gk20a_vidmem_buf *buf)
{
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 4, 0)
DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
exp_info.priv = buf;
exp_info.ops = &gk20a_vidbuf_ops;
exp_info.size = buf->mem->size;
exp_info.flags = O_RDWR;
return dma_buf_export(&exp_info);
#else
return dma_buf_export(buf, &gk20a_vidbuf_ops, buf->mem->size,
O_RDWR, NULL);
#endif
}
#endif
static struct gk20a *gk20a_vidmem_buf_owner(struct dma_buf *dmabuf)
{
#if defined(CONFIG_GK20A_VIDMEM)
struct gk20a_vidmem_buf *buf = dmabuf->priv;
if (dmabuf->ops != &gk20a_vidbuf_ops)
return NULL;
return buf->g;
#else
return NULL;
#endif
}
int gk20a_vidmem_buf_alloc(struct gk20a *g, size_t bytes)
{
#if defined(CONFIG_GK20A_VIDMEM)
struct gk20a_vidmem_buf *buf;
int err = 0, fd;
gk20a_dbg_fn("");
buf = kzalloc(sizeof(*buf), GFP_KERNEL);
if (!buf)
return -ENOMEM;
buf->g = g;
if (!g->mm.vidmem.cleared) {
mutex_lock(&g->mm.vidmem.first_clear_mutex);
if (!g->mm.vidmem.cleared) {
err = gk20a_vidmem_clear_all(g);
if (err) {
gk20a_err(g->dev,
"failed to clear whole vidmem");
goto err_kfree;
}
}
mutex_unlock(&g->mm.vidmem.first_clear_mutex);
}
buf->mem = kzalloc(sizeof(struct mem_desc), GFP_KERNEL);
if (!buf->mem)
goto err_kfree;
buf->mem->user_mem = true;
err = gk20a_gmmu_alloc_vid(g, bytes, buf->mem);
if (err)
goto err_memfree;
buf->dmabuf = gk20a_vidbuf_export(buf);
if (IS_ERR(buf->dmabuf)) {
err = PTR_ERR(buf->dmabuf);
goto err_bfree;
}
fd = __alloc_fd(current->files, 1024, sysctl_nr_open, O_RDWR);
if (fd < 0) {
/* ->release frees what we have done */
dma_buf_put(buf->dmabuf);
return fd;
}
/* fclose() on this drops one ref, freeing the dma buf */
fd_install(fd, buf->dmabuf->file);
return fd;
err_bfree:
gk20a_gmmu_free(g, buf->mem);
err_memfree:
kfree(buf->mem);
err_kfree:
kfree(buf);
return err;
#else
return -ENOSYS;
#endif
}
int gk20a_vidmem_get_space(struct gk20a *g, u64 *space)
{
#if defined(CONFIG_GK20A_VIDMEM)
struct nvgpu_allocator *allocator = &g->mm.vidmem.allocator;
gk20a_dbg_fn("");
if (!nvgpu_alloc_initialized(allocator))
return -ENOSYS;
mutex_lock(&g->mm.vidmem.clear_list_mutex);
*space = nvgpu_alloc_space(allocator) +
atomic64_read(&g->mm.vidmem.bytes_pending);
mutex_unlock(&g->mm.vidmem.clear_list_mutex);
return 0;
#else
return -ENOSYS;
#endif
}
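/*
 * Read or write a vidmem dma-buf on behalf of the ACCESS_FB_MEMORY debug
 * ioctl. Only buffers that actually live in APERTURE_VIDMEM are accepted.
 */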
int gk20a_vidbuf_access_memory(struct gk20a *g, struct dma_buf *dmabuf,
void *buffer, u64 offset, u64 size, u32 cmd)
{
#if defined(CONFIG_GK20A_VIDMEM)
struct gk20a_vidmem_buf *vidmem_buf;
struct mem_desc *mem;
int err = 0;
if (gk20a_dmabuf_aperture(g, dmabuf) != APERTURE_VIDMEM)
return -EINVAL;
vidmem_buf = dmabuf->priv;
mem = vidmem_buf->mem;
switch (cmd) {
case NVGPU_DBG_GPU_IOCTL_ACCESS_FB_MEMORY_CMD_READ:
gk20a_mem_rd_n(g, mem, offset, buffer, size);
break;
case NVGPU_DBG_GPU_IOCTL_ACCESS_FB_MEMORY_CMD_WRITE:
gk20a_mem_wr_n(g, mem, offset, buffer, size);
break;
default:
err = -EINVAL;
}
return err;
#else
return -ENOSYS;
#endif
}
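/*
 * Work out the largest page alignment a buffer can support: for vidmem this
 * is the minimum alignment over all of its allocation chunks, for sysmem it
 * comes from the IOMMU address, or from every physical chunk when the SMMU
 * is bypassed or unused.
 */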
static u64 gk20a_mm_get_align(struct gk20a *g, struct scatterlist *sgl,
enum gk20a_aperture aperture)
{
u64 align = 0, chunk_align = 0;
u64 buf_addr;
if (aperture == APERTURE_VIDMEM) {
struct nvgpu_page_alloc *alloc = get_vidmem_page_alloc(sgl);
struct page_alloc_chunk *chunk = NULL;
list_for_each_entry(chunk, &alloc->alloc_chunks, list_entry) {
chunk_align = 1ULL << __ffs(chunk->base | chunk->length);
if (align)
align = min(align, chunk_align);
else
align = chunk_align;
}
return align;
}
buf_addr = (u64)sg_dma_address(sgl);
if (g->mm.bypass_smmu || buf_addr == DMA_ERROR_CODE || !buf_addr) {
while (sgl) {
buf_addr = (u64)sg_phys(sgl);
chunk_align = 1ULL << __ffs(buf_addr | (u64)sgl->length);
if (align)
align = min(align, chunk_align);
else
align = chunk_align;
sgl = sg_next(sgl);
}
return align;
}
align = 1ULL << __ffs(buf_addr);
return align;
}
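/*
 * Map a dma-buf into this VM. The flow is roughly: reuse a compatible
 * existing mapping if there is one, pin the buffer, pick a GMMU page size,
 * validate the fixed GPU VA if one was requested, set up kind/compression
 * state, program the PTEs and finally insert the mapping into the VM's
 * mapped-buffer tree. Returns the GPU VA on success and 0 on failure.
 */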
u64 gk20a_vm_map(struct vm_gk20a *vm,
struct dma_buf *dmabuf,
u64 offset_align,
u32 flags /*NVGPU_AS_MAP_BUFFER_FLAGS_*/,
int kind,
struct sg_table **sgt,
bool user_mapped,
int rw_flag,
u64 buffer_offset,
u64 mapping_size,
struct vm_gk20a_mapping_batch *batch)
{
struct gk20a *g = gk20a_from_vm(vm);
struct gk20a_comptag_allocator *ctag_allocator = &g->gr.comp_tags;
struct device *d = dev_from_vm(vm);
struct mapped_buffer_node *mapped_buffer = NULL;
bool inserted = false, va_allocated = false;
u32 gmmu_page_size = 0;
u64 map_offset = 0;
int err = 0;
struct buffer_attrs bfr = {NULL};
struct gk20a_comptags comptags;
bool clear_ctags = false;
struct scatterlist *sgl;
u64 ctag_map_win_size = 0;
u32 ctag_map_win_ctagline = 0;
struct vm_reserved_va_node *va_node = NULL;
u32 ctag_offset;
enum gk20a_aperture aperture;
if (user_mapped && vm->userspace_managed &&
!(flags & NVGPU_AS_MAP_BUFFER_FLAGS_FIXED_OFFSET)) {
gk20a_err(d,
"%s: non-fixed-offset mapping not available on userspace managed address spaces",
__func__);
		return 0; /* failure is signalled by returning 0, not -errno */
}
mutex_lock(&vm->update_gmmu_lock);
/* check if this buffer is already mapped */
if (!vm->userspace_managed) {
map_offset = gk20a_vm_map_duplicate_locked(
vm, dmabuf, offset_align,
flags, kind, sgt,
user_mapped, rw_flag);
if (map_offset) {
mutex_unlock(&vm->update_gmmu_lock);
return map_offset;
}
}
/* pin buffer to get phys/iovmm addr */
bfr.sgt = gk20a_mm_pin(d, dmabuf);
if (IS_ERR(bfr.sgt)) {
/* Falling back to physical is actually possible
* here in many cases if we use 4K phys pages in the
* gmmu. However we have some regions which require
* contig regions to work properly (either phys-contig
* or contig through smmu io_vaspace). Until we can
* track the difference between those two cases we have
* to fail the mapping when we run out of SMMU space.
*/
gk20a_warn(d, "oom allocating tracking buffer");
goto clean_up;
}
if (sgt)
*sgt = bfr.sgt;
bfr.kind_v = kind;
bfr.size = dmabuf->size;
sgl = bfr.sgt->sgl;
aperture = gk20a_dmabuf_aperture(g, dmabuf);
if (aperture == APERTURE_INVALID) {
err = -EINVAL;
goto clean_up;
}
if (flags & NVGPU_AS_MAP_BUFFER_FLAGS_FIXED_OFFSET)
map_offset = offset_align;
bfr.align = gk20a_mm_get_align(g, sgl, aperture);
bfr.pgsz_idx = __get_pte_size(vm, map_offset,
min_t(u64, bfr.size, bfr.align));
mapping_size = mapping_size ? mapping_size : bfr.size;
if (vm->big_pages)
gmmu_select_page_size(vm, &bfr);
else
bfr.pgsz_idx = gmmu_page_size_small;
/* If FIX_OFFSET is set, pgsz is determined at address allocation
* time. The alignment at address alloc time must be the same as
* the alignment determined by gmmu_select_page_size().
*/
if (flags & NVGPU_AS_MAP_BUFFER_FLAGS_FIXED_OFFSET) {
int pgsz_idx = __get_pte_size(vm, offset_align, mapping_size);
if (pgsz_idx > bfr.pgsz_idx) {
gk20a_err(d, "%llx buffer pgsz %d, VA pgsz %d",
offset_align, bfr.pgsz_idx, pgsz_idx);
err = -EINVAL;
goto clean_up;
}
bfr.pgsz_idx = min(bfr.pgsz_idx, pgsz_idx);
}
/* validate/adjust bfr attributes */
if (unlikely(bfr.pgsz_idx == -1)) {
gk20a_err(d, "unsupported page size detected");
goto clean_up;
}
if (unlikely(bfr.pgsz_idx < gmmu_page_size_small ||
bfr.pgsz_idx > gmmu_page_size_big)) {
BUG_ON(1);
err = -EINVAL;
goto clean_up;
}
gmmu_page_size = vm->gmmu_page_sizes[bfr.pgsz_idx];
/* Check if we should use a fixed offset for mapping this buffer */
if (flags & NVGPU_AS_MAP_BUFFER_FLAGS_FIXED_OFFSET) {
err = validate_fixed_buffer(vm, &bfr,
offset_align, mapping_size,
&va_node);
if (err)
goto clean_up;
map_offset = offset_align;
va_allocated = false;
} else
va_allocated = true;
if (sgt)
*sgt = bfr.sgt;
err = setup_buffer_kind_and_compression(vm, flags, &bfr, bfr.pgsz_idx);
if (unlikely(err)) {
gk20a_err(d, "failure setting up kind and compression");
goto clean_up;
}
/* bar1 and pmu vm don't need ctag */
if (!vm->enable_ctag)
bfr.ctag_lines = 0;
gk20a_get_comptags(d, dmabuf, &comptags);
/* ensure alignment to compression page size if compression enabled */
if (bfr.ctag_offset)
mapping_size = ALIGN(mapping_size,
g->ops.fb.compression_page_size(g));
if (bfr.ctag_lines && !comptags.lines) {
const bool user_mappable =
!!(flags & NVGPU_AS_MAP_BUFFER_FLAGS_MAPPABLE_COMPBITS);
/* allocate compression resources if needed */
err = gk20a_alloc_comptags(g, d, dmabuf, ctag_allocator,
bfr.ctag_lines, user_mappable,
&ctag_map_win_size,
&ctag_map_win_ctagline);
if (err) {
/* ok to fall back here if we ran out */
/* TBD: we can partially alloc ctags as well... */
bfr.kind_v = bfr.uc_kind_v;
} else {
gk20a_get_comptags(d, dmabuf, &comptags);
if (g->ops.ltc.cbc_ctrl)
g->ops.ltc.cbc_ctrl(g, gk20a_cbc_op_clear,
comptags.offset,
comptags.offset +
comptags.allocated_lines - 1);
else
clear_ctags = true;
}
}
/* store the comptag info */
bfr.ctag_offset = comptags.offset;
bfr.ctag_lines = comptags.lines;
bfr.ctag_allocated_lines = comptags.allocated_lines;
bfr.ctag_user_mappable = comptags.user_mappable;
/*
* Calculate comptag index for this mapping. Differs in
* case of partial mapping.
*/
ctag_offset = comptags.offset;
if (ctag_offset)
ctag_offset += buffer_offset >>
ilog2(g->ops.fb.compression_page_size(g));
/* update gmmu ptes */
map_offset = g->ops.mm.gmmu_map(vm, map_offset,
bfr.sgt,
buffer_offset, /* sg offset */
mapping_size,
bfr.pgsz_idx,
bfr.kind_v,
ctag_offset,
flags, rw_flag,
clear_ctags,
false,
false,
batch,
aperture);
if (!map_offset)
goto clean_up;
#if defined(NVHOST_DEBUG)
{
int i;
struct scatterlist *sg = NULL;
gk20a_dbg(gpu_dbg_pte, "for_each_sg(bfr.sgt->sgl, sg, bfr.sgt->nents, i)");
	for_each_sg(bfr.sgt->sgl, sg, bfr.sgt->nents, i) {
u64 da = sg_dma_address(sg);
u64 pa = sg_phys(sg);
u64 len = sg->length;
gk20a_dbg(gpu_dbg_pte, "i=%d pa=0x%x,%08x da=0x%x,%08x len=0x%x,%08x",
i, hi32(pa), lo32(pa), hi32(da), lo32(da),
hi32(len), lo32(len));
}
}
#endif
/* keep track of the buffer for unmapping */
/* TBD: check for multiple mapping of same buffer */
mapped_buffer = kzalloc(sizeof(*mapped_buffer), GFP_KERNEL);
if (!mapped_buffer) {
gk20a_warn(d, "oom allocating tracking buffer");
goto clean_up;
}
mapped_buffer->dmabuf = dmabuf;
mapped_buffer->sgt = bfr.sgt;
mapped_buffer->addr = map_offset;
mapped_buffer->size = mapping_size;
mapped_buffer->pgsz_idx = bfr.pgsz_idx;
mapped_buffer->ctag_offset = bfr.ctag_offset;
mapped_buffer->ctag_lines = bfr.ctag_lines;
mapped_buffer->ctag_allocated_lines = bfr.ctag_allocated_lines;
mapped_buffer->ctags_mappable = bfr.ctag_user_mappable;
mapped_buffer->ctag_map_win_size = ctag_map_win_size;
mapped_buffer->ctag_map_win_ctagline = ctag_map_win_ctagline;
mapped_buffer->vm = vm;
mapped_buffer->flags = flags;
mapped_buffer->kind = kind;
mapped_buffer->va_allocated = va_allocated;
mapped_buffer->user_mapped = user_mapped ? 1 : 0;
mapped_buffer->own_mem_ref = user_mapped;
INIT_LIST_HEAD(&mapped_buffer->unmap_list);
INIT_LIST_HEAD(&mapped_buffer->va_buffers_list);
kref_init(&mapped_buffer->ref);
err = insert_mapped_buffer(&vm->mapped_buffers, mapped_buffer);
if (err) {
gk20a_err(d, "failed to insert into mapped buffer tree");
goto clean_up;
}
inserted = true;
if (user_mapped)
vm->num_user_mapped_buffers++;
gk20a_dbg_info("allocated va @ 0x%llx", map_offset);
if (va_node) {
list_add_tail(&mapped_buffer->va_buffers_list,
&va_node->va_buffers_list);
mapped_buffer->va_node = va_node;
}
mutex_unlock(&vm->update_gmmu_lock);
return map_offset;
clean_up:
if (inserted) {
rb_erase(&mapped_buffer->node, &vm->mapped_buffers);
if (user_mapped)
vm->num_user_mapped_buffers--;
}
kfree(mapped_buffer);
if (va_allocated)
gk20a_vm_free_va(vm, map_offset, bfr.size, bfr.pgsz_idx);
if (!IS_ERR(bfr.sgt))
gk20a_mm_unpin(d, dmabuf, bfr.sgt);
mutex_unlock(&vm->update_gmmu_lock);
gk20a_dbg_info("err=%d\n", err);
return 0;
}
int gk20a_vm_get_compbits_info(struct vm_gk20a *vm,
u64 mapping_gva,
u64 *compbits_win_size,
u32 *compbits_win_ctagline,
u32 *mapping_ctagline,
u32 *flags)
{
struct mapped_buffer_node *mapped_buffer;
struct device *d = dev_from_vm(vm);
mutex_lock(&vm->update_gmmu_lock);
mapped_buffer = find_mapped_buffer_locked(&vm->mapped_buffers, mapping_gva);
	if (!mapped_buffer || !mapped_buffer->user_mapped) {
mutex_unlock(&vm->update_gmmu_lock);
gk20a_err(d, "%s: bad offset 0x%llx", __func__, mapping_gva);
return -EFAULT;
}
*compbits_win_size = 0;
*compbits_win_ctagline = 0;
*mapping_ctagline = 0;
*flags = 0;
if (mapped_buffer->ctag_offset)
*flags |= NVGPU_AS_GET_BUFFER_COMPBITS_INFO_FLAGS_HAS_COMPBITS;
	if (mapped_buffer->ctags_mappable) {
*flags |= NVGPU_AS_GET_BUFFER_COMPBITS_INFO_FLAGS_MAPPABLE;
*compbits_win_size = mapped_buffer->ctag_map_win_size;
*compbits_win_ctagline = mapped_buffer->ctag_map_win_ctagline;
*mapping_ctagline = mapped_buffer->ctag_offset;
}
mutex_unlock(&vm->update_gmmu_lock);
return 0;
}
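/*
 * Map the compbits backing store window for an existing mapping so userspace
 * can access the buffer's compression bits. The window is mapped read-only
 * with small pages on first use; subsequent calls return the existing
 * window address.
 */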
int gk20a_vm_map_compbits(struct vm_gk20a *vm,
u64 mapping_gva,
u64 *compbits_win_gva,
u64 *mapping_iova,
u32 flags)
{
struct mapped_buffer_node *mapped_buffer;
struct gk20a *g = gk20a_from_vm(vm);
struct device *d = dev_from_vm(vm);
const bool fixed_mapping =
(flags & NVGPU_AS_MAP_BUFFER_COMPBITS_FLAGS_FIXED_OFFSET) != 0;
if (vm->userspace_managed && !fixed_mapping) {
gk20a_err(d,
"%s: non-fixed-offset mapping is not available on userspace managed address spaces",
__func__);
return -EFAULT;
}
if (fixed_mapping && !vm->userspace_managed) {
gk20a_err(d,
"%s: fixed-offset mapping is available only on userspace managed address spaces",
__func__);
return -EFAULT;
}
mutex_lock(&vm->update_gmmu_lock);
mapped_buffer =
find_mapped_buffer_locked(&vm->mapped_buffers, mapping_gva);
if (!mapped_buffer || !mapped_buffer->user_mapped) {
mutex_unlock(&vm->update_gmmu_lock);
gk20a_err(d, "%s: bad offset 0x%llx", __func__, mapping_gva);
return -EFAULT;
}
if (!mapped_buffer->ctags_mappable) {
mutex_unlock(&vm->update_gmmu_lock);
gk20a_err(d, "%s: comptags not mappable, offset 0x%llx",
__func__, mapping_gva);
return -EFAULT;
}
if (!mapped_buffer->ctag_map_win_addr) {
const u32 small_pgsz_index = 0; /* small pages, 4K */
const u32 aggregate_cacheline_sz =
g->gr.cacheline_size * g->gr.slices_per_ltc *
g->ltc_count;
/* first aggregate cacheline to map */
u32 cacheline_start; /* inclusive */
/* offset of the start cacheline (will be page aligned) */
u64 cacheline_offset_start;
if (!mapped_buffer->ctag_map_win_size) {
mutex_unlock(&vm->update_gmmu_lock);
gk20a_err(d,
"%s: mapping 0x%llx does not have "
"mappable comptags",
__func__, mapping_gva);
return -EFAULT;
}
cacheline_start = mapped_buffer->ctag_offset /
g->gr.comptags_per_cacheline;
cacheline_offset_start =
(u64)cacheline_start * aggregate_cacheline_sz;
if (fixed_mapping) {
struct buffer_attrs bfr;
int err;
struct vm_reserved_va_node *va_node = NULL;
memset(&bfr, 0, sizeof(bfr));
bfr.pgsz_idx = small_pgsz_index;
err = validate_fixed_buffer(
vm, &bfr, *compbits_win_gva,
mapped_buffer->ctag_map_win_size, &va_node);
if (err) {
mutex_unlock(&vm->update_gmmu_lock);
return err;
}
if (va_node) {
				/* this would create a dangling GPU VA
				 * pointer if the space is freed
				 * before the buffer is unmapped */
mutex_unlock(&vm->update_gmmu_lock);
gk20a_err(d,
"%s: comptags cannot be mapped into allocated space",
__func__);
return -EINVAL;
}
}
mapped_buffer->ctag_map_win_addr =
g->ops.mm.gmmu_map(
vm,
!fixed_mapping ? 0 : *compbits_win_gva, /* va */
g->gr.compbit_store.mem.sgt,
cacheline_offset_start, /* sg offset */
mapped_buffer->ctag_map_win_size, /* size */
small_pgsz_index,
0, /* kind */
0, /* ctag_offset */
NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
gk20a_mem_flag_read_only,
false, /* clear_ctags */
false, /* sparse */
false, /* priv */
NULL, /* mapping_batch handle */
g->gr.compbit_store.mem.aperture);
if (!mapped_buffer->ctag_map_win_addr) {
mutex_unlock(&vm->update_gmmu_lock);
gk20a_err(d,
"%s: failed to map comptags for mapping 0x%llx",
__func__, mapping_gva);
return -ENOMEM;
}
} else if (fixed_mapping && *compbits_win_gva &&
mapped_buffer->ctag_map_win_addr != *compbits_win_gva) {
mutex_unlock(&vm->update_gmmu_lock);
gk20a_err(d,
"%s: re-requesting comptags map into mismatching address. buffer offset 0x"
"%llx, existing comptag map at 0x%llx, requested remap 0x%llx",
__func__, mapping_gva,
mapped_buffer->ctag_map_win_addr, *compbits_win_gva);
return -EINVAL;
}
*mapping_iova = gk20a_mm_iova_addr(g, mapped_buffer->sgt->sgl, 0);
*compbits_win_gva = mapped_buffer->ctag_map_win_addr;
mutex_unlock(&vm->update_gmmu_lock);
return 0;
}
/*
* Core GMMU map function for the kernel to use. If @addr is 0 then the GPU
* VA will be allocated for you. If addr is non-zero then the buffer will be
* mapped at @addr.
*/
static u64 __gk20a_gmmu_map(struct vm_gk20a *vm,
struct sg_table **sgt,
u64 addr,
u64 size,
u32 flags,
int rw_flag,
bool priv,
enum gk20a_aperture aperture)
{
struct gk20a *g = gk20a_from_vm(vm);
u64 vaddr;
mutex_lock(&vm->update_gmmu_lock);
vaddr = g->ops.mm.gmmu_map(vm, addr,
*sgt, /* sg table */
0, /* sg offset */
size,
gmmu_page_size_kernel,
0, /* kind */
0, /* ctag_offset */
flags, rw_flag,
false, /* clear_ctags */
false, /* sparse */
priv, /* priv */
NULL, /* mapping_batch handle */
aperture);
mutex_unlock(&vm->update_gmmu_lock);
if (!vaddr) {
gk20a_err(dev_from_vm(vm), "failed to allocate va space");
return 0;
}
return vaddr;
}
u64 gk20a_gmmu_map(struct vm_gk20a *vm,
struct sg_table **sgt,
u64 size,
u32 flags,
int rw_flag,
bool priv,
enum gk20a_aperture aperture)
{
return __gk20a_gmmu_map(vm, sgt, 0, size, flags, rw_flag, priv,
aperture);
}
/*
* Like gk20a_gmmu_map() except it works on a fixed address instead.
*/
u64 gk20a_gmmu_fixed_map(struct vm_gk20a *vm,
struct sg_table **sgt,
u64 addr,
u64 size,
u32 flags,
int rw_flag,
bool priv,
enum gk20a_aperture aperture)
{
return __gk20a_gmmu_map(vm, sgt, addr, size, flags, rw_flag, priv,
aperture);
}
int gk20a_gmmu_alloc(struct gk20a *g, size_t size, struct mem_desc *mem)
{
return gk20a_gmmu_alloc_attr(g, 0, size, mem);
}
int gk20a_gmmu_alloc_attr(struct gk20a *g, enum dma_attr attr, size_t size,
struct mem_desc *mem)
{
if (g->mm.vidmem_is_vidmem) {
int err = gk20a_gmmu_alloc_attr_vid(g, attr, size, mem);
if (!err)
return 0;
/*
* Fall back to sysmem (which may then also fail) in case
* vidmem is exhausted.
*/
}
return gk20a_gmmu_alloc_attr_sys(g, attr, size, mem);
}
int gk20a_gmmu_alloc_sys(struct gk20a *g, size_t size, struct mem_desc *mem)
{
return gk20a_gmmu_alloc_attr_sys(g, 0, size, mem);
}
int gk20a_gmmu_alloc_attr_sys(struct gk20a *g, enum dma_attr attr,
size_t size, struct mem_desc *mem)
{
struct device *d = dev_from_gk20a(g);
int err;
dma_addr_t iova;
gk20a_dbg_fn("");
if (attr) {
DEFINE_DMA_ATTRS(attrs);
dma_set_attr(attr, &attrs);
if (attr == DMA_ATTR_NO_KERNEL_MAPPING) {
mem->pages = dma_alloc_attrs(d,
size, &iova, GFP_KERNEL, &attrs);
if (!mem->pages)
return -ENOMEM;
} else {
mem->cpu_va = dma_alloc_attrs(d,
size, &iova, GFP_KERNEL, &attrs);
if (!mem->cpu_va)
return -ENOMEM;
}
} else {
mem->cpu_va = dma_alloc_coherent(d, size, &iova, GFP_KERNEL);
if (!mem->cpu_va)
return -ENOMEM;
}
if (attr == DMA_ATTR_NO_KERNEL_MAPPING)
err = gk20a_get_sgtable_from_pages(d, &mem->sgt, mem->pages,
iova, size);
else {
err = gk20a_get_sgtable(d, &mem->sgt, mem->cpu_va, iova, size);
memset(mem->cpu_va, 0, size);
}
if (err)
goto fail_free;
mem->size = size;
mem->aperture = APERTURE_SYSMEM;
gk20a_dbg_fn("done");
return 0;
fail_free:
dma_free_coherent(d, size, mem->cpu_va, iova);
mem->cpu_va = NULL;
mem->sgt = NULL;
return err;
}
static void gk20a_gmmu_free_attr_sys(struct gk20a *g, enum dma_attr attr,
struct mem_desc *mem)
{
struct device *d = dev_from_gk20a(g);
if (mem->cpu_va || mem->pages) {
if (attr) {
DEFINE_DMA_ATTRS(attrs);
dma_set_attr(attr, &attrs);
if (attr == DMA_ATTR_NO_KERNEL_MAPPING) {
if (mem->pages)
dma_free_attrs(d, mem->size, mem->pages,
sg_dma_address(mem->sgt->sgl),
&attrs);
} else {
if (mem->cpu_va)
dma_free_attrs(d, mem->size,
mem->cpu_va,
sg_dma_address(mem->sgt->sgl),
&attrs);
}
} else {
if (mem->cpu_va)
dma_free_coherent(d, mem->size, mem->cpu_va,
sg_dma_address(mem->sgt->sgl));
}
mem->cpu_va = NULL;
mem->pages = NULL;
}
if (mem->sgt)
gk20a_free_sgtable(&mem->sgt);
mem->size = 0;
mem->aperture = APERTURE_INVALID;
}
#if defined(CONFIG_GK20A_VIDMEM)
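/*
 * Clear a vidmem allocation by running a CE memset over each of its chunks
 * and waiting on the fence of the last submission.
 */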
static int gk20a_gmmu_clear_vidmem_mem(struct gk20a *g, struct mem_desc *mem)
{
struct gk20a_fence *gk20a_fence_out = NULL;
struct gk20a_fence *gk20a_last_fence = NULL;
struct nvgpu_page_alloc *alloc = NULL;
struct page_alloc_chunk *chunk = NULL;
int err = 0;
if (g->mm.vidmem.ce_ctx_id == (u32)~0)
return -EINVAL;
alloc = get_vidmem_page_alloc(mem->sgt->sgl);
list_for_each_entry(chunk, &alloc->alloc_chunks, list_entry) {
if (gk20a_last_fence)
gk20a_fence_put(gk20a_last_fence);
err = gk20a_ce_execute_ops(g->dev,
g->mm.vidmem.ce_ctx_id,
0,
chunk->base,
chunk->length,
0x00000000,
NVGPU_CE_DST_LOCATION_LOCAL_FB,
NVGPU_CE_MEMSET,
NULL,
0,
&gk20a_fence_out);
if (err) {
gk20a_err(g->dev,
"Failed gk20a_ce_execute_ops[%d]", err);
return err;
}
gk20a_last_fence = gk20a_fence_out;
}
if (gk20a_last_fence) {
struct nvgpu_timeout timeout;
nvgpu_timeout_init(g, &timeout,
gk20a_get_gr_idle_timeout(g),
NVGPU_TIMER_CPU_TIMER);
do {
err = gk20a_fence_wait(gk20a_last_fence,
gk20a_get_gr_idle_timeout(g));
} while (err == -ERESTARTSYS &&
!nvgpu_timeout_expired(&timeout));
gk20a_fence_put(gk20a_last_fence);
if (err)
gk20a_err(g->dev,
"fence wait failed for CE execute ops");
}
return err;
}
#endif
int gk20a_gmmu_alloc_vid(struct gk20a *g, size_t size, struct mem_desc *mem)
{
return gk20a_gmmu_alloc_attr_vid(g, 0, size, mem);
}
int gk20a_gmmu_alloc_attr_vid(struct gk20a *g, enum dma_attr attr,
size_t size, struct mem_desc *mem)
{
return gk20a_gmmu_alloc_attr_vid_at(g, attr, size, mem, 0);
}
#if defined(CONFIG_GK20A_VIDMEM)
static u64 __gk20a_gmmu_alloc(struct nvgpu_allocator *allocator, dma_addr_t at,
size_t size)
{
u64 addr = 0;
if (at)
addr = nvgpu_alloc_fixed(allocator, at, size, 0);
else
addr = nvgpu_alloc(allocator, size);
return addr;
}
#endif
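/*
 * Allocate kernel vidmem, optionally at the fixed offset @at. Until the
 * initial full-vidmem clear has happened the bootstrap allocator is used.
 */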
int gk20a_gmmu_alloc_attr_vid_at(struct gk20a *g, enum dma_attr attr,
size_t size, struct mem_desc *mem, dma_addr_t at)
{
#if defined(CONFIG_GK20A_VIDMEM)
u64 addr;
int err;
struct nvgpu_allocator *vidmem_alloc = g->mm.vidmem.cleared ?
&g->mm.vidmem.allocator :
&g->mm.vidmem.bootstrap_allocator;
int before_pending;
gk20a_dbg_fn("");
if (!nvgpu_alloc_initialized(&g->mm.vidmem.allocator))
return -ENOSYS;
	/* the only dma attribute supported here is DMA_ATTR_NO_KERNEL_MAPPING,
	 * which is moot since vidmem allocations never get a kernel mapping */
WARN_ON(attr != 0 && attr != DMA_ATTR_NO_KERNEL_MAPPING);
mutex_lock(&g->mm.vidmem.clear_list_mutex);
before_pending = atomic64_read(&g->mm.vidmem.bytes_pending);
addr = __gk20a_gmmu_alloc(vidmem_alloc, at, size);
mutex_unlock(&g->mm.vidmem.clear_list_mutex);
if (!addr) {
/*
* If memory is known to be freed soon, let the user know that
* it may be available after a while.
*/
if (before_pending)
return -EAGAIN;
else
return -ENOMEM;
}
if (at)
mem->fixed = true;
else
mem->fixed = false;
mem->sgt = kzalloc(sizeof(struct sg_table), GFP_KERNEL);
if (!mem->sgt) {
err = -ENOMEM;
goto fail_physfree;
}
err = sg_alloc_table(mem->sgt, 1, GFP_KERNEL);
if (err)
goto fail_kfree;
set_vidmem_page_alloc(mem->sgt->sgl, addr);
sg_set_page(mem->sgt->sgl, NULL, size, 0);
mem->size = size;
mem->aperture = APERTURE_VIDMEM;
mem->allocator = vidmem_alloc;
INIT_LIST_HEAD(&mem->clear_list_entry);
gk20a_dbg_fn("done at 0x%llx size %zu", addr, size);
return 0;
fail_kfree:
kfree(mem->sgt);
fail_physfree:
nvgpu_free(&g->mm.vidmem.allocator, addr);
return err;
#else
return -ENOSYS;
#endif
}
static void gk20a_gmmu_free_attr_vid(struct gk20a *g, enum dma_attr attr,
struct mem_desc *mem)
{
#if defined(CONFIG_GK20A_VIDMEM)
bool was_empty;
if (mem->user_mem) {
mutex_lock(&g->mm.vidmem.clear_list_mutex);
was_empty = list_empty(&g->mm.vidmem.clear_list_head);
list_add_tail(&mem->clear_list_entry,
&g->mm.vidmem.clear_list_head);
atomic64_add(mem->size, &g->mm.vidmem.bytes_pending);
mutex_unlock(&g->mm.vidmem.clear_list_mutex);
if (was_empty) {
cancel_work_sync(&g->mm.vidmem.clear_mem_worker);
schedule_work(&g->mm.vidmem.clear_mem_worker);
}
} else {
gk20a_memset(g, mem, 0, 0, mem->size);
nvgpu_free(mem->allocator,
(u64)get_vidmem_page_alloc(mem->sgt->sgl));
gk20a_free_sgtable(&mem->sgt);
mem->size = 0;
mem->aperture = APERTURE_INVALID;
}
#endif
}
void gk20a_gmmu_free_attr(struct gk20a *g, enum dma_attr attr,
struct mem_desc *mem)
{
switch (mem->aperture) {
case APERTURE_SYSMEM:
return gk20a_gmmu_free_attr_sys(g, attr, mem);
case APERTURE_VIDMEM:
return gk20a_gmmu_free_attr_vid(g, attr, mem);
default:
break; /* like free() on "null" memory */
}
}
void gk20a_gmmu_free(struct gk20a *g, struct mem_desc *mem)
{
return gk20a_gmmu_free_attr(g, 0, mem);
}
/*
* If mem is in VIDMEM, return base address in vidmem
* else return IOVA address for SYSMEM
*/
u64 gk20a_mem_get_base_addr(struct gk20a *g, struct mem_desc *mem,
u32 flags)
{
struct nvgpu_page_alloc *alloc;
u64 addr;
if (mem->aperture == APERTURE_VIDMEM) {
alloc = get_vidmem_page_alloc(mem->sgt->sgl);
/* This API should not be used with > 1 chunks */
WARN_ON(alloc->nr_chunks != 1);
addr = alloc->base;
} else {
addr = g->ops.mm.get_iova_addr(g, mem->sgt->sgl, flags);
}
return addr;
}
#if defined(CONFIG_GK20A_VIDMEM)
static struct mem_desc *get_pending_mem_desc(struct mm_gk20a *mm)
{
struct mem_desc *mem = NULL;
mutex_lock(&mm->vidmem.clear_list_mutex);
mem = list_first_entry_or_null(&mm->vidmem.clear_list_head,
struct mem_desc, clear_list_entry);
if (mem)
list_del_init(&mem->clear_list_entry);
mutex_unlock(&mm->vidmem.clear_list_mutex);
return mem;
}
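/*
 * Deferred-clear worker: pop pending user vidmem buffers off the clear list,
 * zero them with the CE and only then return their space to the allocator.
 */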
static void gk20a_vidmem_clear_mem_worker(struct work_struct *work)
{
struct mm_gk20a *mm = container_of(work, struct mm_gk20a,
vidmem.clear_mem_worker);
struct gk20a *g = mm->g;
struct mem_desc *mem;
while ((mem = get_pending_mem_desc(mm)) != NULL) {
gk20a_gmmu_clear_vidmem_mem(g, mem);
nvgpu_free(mem->allocator,
(u64)get_vidmem_page_alloc(mem->sgt->sgl));
gk20a_free_sgtable(&mem->sgt);
WARN_ON(atomic64_sub_return(mem->size,
&g->mm.vidmem.bytes_pending) < 0);
mem->size = 0;
mem->aperture = APERTURE_INVALID;
kfree(mem);
}
}
#endif
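/*
 * Pick the register field value that matches an aperture. Note that on iGPUs
 * (vidmem_is_vidmem == false) sysmem is presented to the GPU as vidmem.
 */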
u32 __gk20a_aperture_mask(struct gk20a *g, enum gk20a_aperture aperture,
u32 sysmem_mask, u32 vidmem_mask)
{
switch (aperture) {
case APERTURE_SYSMEM:
/* sysmem for dgpus; some igpus consider system memory vidmem */
return g->mm.vidmem_is_vidmem ? sysmem_mask : vidmem_mask;
case APERTURE_VIDMEM:
/* for dgpus only */
return vidmem_mask;
case APERTURE_INVALID:
WARN_ON("Bad aperture");
}
return 0;
}
u32 gk20a_aperture_mask(struct gk20a *g, struct mem_desc *mem,
u32 sysmem_mask, u32 vidmem_mask)
{
return __gk20a_aperture_mask(g, mem->aperture,
sysmem_mask, vidmem_mask);
}
int gk20a_gmmu_alloc_map(struct vm_gk20a *vm, size_t size,
struct mem_desc *mem)
{
return gk20a_gmmu_alloc_map_attr(vm, 0, size, mem);
}
int gk20a_gmmu_alloc_map_attr(struct vm_gk20a *vm,
enum dma_attr attr, size_t size, struct mem_desc *mem)
{
if (vm->mm->vidmem_is_vidmem) {
		int err = gk20a_gmmu_alloc_map_attr_vid(vm, attr, size, mem);
if (!err)
return 0;
/*
* Fall back to sysmem (which may then also fail) in case
* vidmem is exhausted.
*/
}
	return gk20a_gmmu_alloc_map_attr_sys(vm, attr, size, mem);
}
int gk20a_gmmu_alloc_map_sys(struct vm_gk20a *vm, size_t size,
struct mem_desc *mem)
{
return gk20a_gmmu_alloc_map_attr_sys(vm, 0, size, mem);
}
int gk20a_gmmu_alloc_map_attr_sys(struct vm_gk20a *vm,
enum dma_attr attr, size_t size, struct mem_desc *mem)
{
int err = gk20a_gmmu_alloc_attr_sys(vm->mm->g, attr, size, mem);
if (err)
return err;
mem->gpu_va = gk20a_gmmu_map(vm, &mem->sgt, size, 0,
gk20a_mem_flag_none, false,
mem->aperture);
if (!mem->gpu_va) {
err = -ENOMEM;
goto fail_free;
}
return 0;
fail_free:
gk20a_gmmu_free(vm->mm->g, mem);
return err;
}
int gk20a_gmmu_alloc_map_vid(struct vm_gk20a *vm, size_t size, struct mem_desc *mem)
{
return gk20a_gmmu_alloc_map_attr_vid(vm, 0, size, mem);
}
int gk20a_gmmu_alloc_map_attr_vid(struct vm_gk20a *vm,
enum dma_attr attr, size_t size, struct mem_desc *mem)
{
int err = gk20a_gmmu_alloc_attr_vid(vm->mm->g, attr, size, mem);
if (err)
return err;
mem->gpu_va = gk20a_gmmu_map(vm, &mem->sgt, size, 0,
gk20a_mem_flag_none, false,
mem->aperture);
if (!mem->gpu_va) {
err = -ENOMEM;
goto fail_free;
}
return 0;
fail_free:
gk20a_gmmu_free(vm->mm->g, mem);
return err;
}
void gk20a_gmmu_unmap_free(struct vm_gk20a *vm, struct mem_desc *mem)
{
if (mem->gpu_va)
gk20a_gmmu_unmap(vm, mem->gpu_va, mem->size, gk20a_mem_flag_none);
mem->gpu_va = 0;
gk20a_gmmu_free(vm->mm->g, mem);
}
dma_addr_t gk20a_mm_gpuva_to_iova_base(struct vm_gk20a *vm, u64 gpu_vaddr)
{
struct mapped_buffer_node *buffer;
dma_addr_t addr = 0;
struct gk20a *g = gk20a_from_vm(vm);
mutex_lock(&vm->update_gmmu_lock);
buffer = find_mapped_buffer_locked(&vm->mapped_buffers, gpu_vaddr);
if (buffer)
addr = g->ops.mm.get_iova_addr(g, buffer->sgt->sgl,
buffer->flags);
mutex_unlock(&vm->update_gmmu_lock);
return addr;
}
void gk20a_gmmu_unmap(struct vm_gk20a *vm,
u64 vaddr,
u64 size,
int rw_flag)
{
struct gk20a *g = gk20a_from_vm(vm);
mutex_lock(&vm->update_gmmu_lock);
g->ops.mm.gmmu_unmap(vm,
vaddr,
size,
gmmu_page_size_kernel,
true, /*va_allocated */
rw_flag,
false,
NULL);
mutex_unlock(&vm->update_gmmu_lock);
}
phys_addr_t gk20a_get_phys_from_iova(struct device *d,
u64 dma_addr)
{
phys_addr_t phys;
u64 iova;
struct dma_iommu_mapping *mapping = to_dma_iommu_mapping(d);
if (!mapping)
return dma_addr;
iova = dma_addr & PAGE_MASK;
phys = iommu_iova_to_phys(mapping->domain, iova);
return phys;
}
/* get sg_table from already allocated buffer */
int gk20a_get_sgtable(struct device *d, struct sg_table **sgt,
void *cpuva, u64 iova,
size_t size)
{
int err = 0;
*sgt = kzalloc(sizeof(struct sg_table), GFP_KERNEL);
if (!(*sgt)) {
dev_err(d, "failed to allocate memory\n");
err = -ENOMEM;
goto fail;
}
err = dma_get_sgtable(d, *sgt,
cpuva, iova,
size);
if (err) {
dev_err(d, "failed to create sg table\n");
goto fail;
}
sg_dma_address((*sgt)->sgl) = iova;
return 0;
fail:
if (*sgt) {
kfree(*sgt);
*sgt = NULL;
}
return err;
}
int gk20a_get_sgtable_from_pages(struct device *d, struct sg_table **sgt,
struct page **pages, u64 iova,
size_t size)
{
int err = 0;
*sgt = kzalloc(sizeof(struct sg_table), GFP_KERNEL);
if (!(*sgt)) {
dev_err(d, "failed to allocate memory\n");
err = -ENOMEM;
goto fail;
}
err = sg_alloc_table_from_pages(*sgt, pages,
DIV_ROUND_UP(size, PAGE_SIZE), 0, size, GFP_KERNEL);
if (err) {
dev_err(d, "failed to allocate sg_table\n");
goto fail;
}
sg_dma_address((*sgt)->sgl) = iova;
return 0;
fail:
if (*sgt) {
kfree(*sgt);
*sgt = NULL;
}
return err;
}
void gk20a_free_sgtable(struct sg_table **sgt)
{
sg_free_table(*sgt);
kfree(*sgt);
*sgt = NULL;
}
u64 gk20a_mm_smmu_vaddr_translate(struct gk20a *g, dma_addr_t iova)
{
/* ensure it is not vidmem allocation */
WARN_ON(is_vidmem_page_alloc((u64)iova));
if (device_is_iommuable(dev_from_gk20a(g)) &&
g->ops.mm.get_physical_addr_bits)
return iova | 1ULL << g->ops.mm.get_physical_addr_bits(g);
return iova;
}
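/*
 * Return the address the GMMU should use for a sysmem scatterlist entry: the
 * SMMU virtual address when the device sits behind an IOMMU and the buffer
 * was mapped through it, otherwise the plain physical address.
 */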
u64 gk20a_mm_iova_addr(struct gk20a *g, struct scatterlist *sgl,
u32 flags)
{
if (!device_is_iommuable(dev_from_gk20a(g)))
return sg_phys(sgl);
if (sg_dma_address(sgl) == 0)
return sg_phys(sgl);
if (sg_dma_address(sgl) == DMA_ERROR_CODE)
return 0;
return gk20a_mm_smmu_vaddr_translate(g, sg_dma_address(sgl));
}
void gk20a_pde_wr32(struct gk20a *g, struct gk20a_mm_entry *entry,
size_t w, size_t data)
{
gk20a_mem_wr32(g, &entry->mem, entry->woffset + w, data);
}
u64 gk20a_pde_addr(struct gk20a *g, struct gk20a_mm_entry *entry)
{
u64 base;
if (g->mm.has_physical_mode)
base = sg_phys(entry->mem.sgt->sgl);
else
base = gk20a_mem_get_base_addr(g, &entry->mem, 0);
return base + entry->woffset * sizeof(u32);
}
/* for gk20a the "video memory" apertures here are misnomers. */
static inline u32 big_valid_pde0_bits(struct gk20a *g,
struct gk20a_mm_entry *entry)
{
u64 pte_addr = gk20a_pde_addr(g, entry);
u32 pde0_bits =
gk20a_aperture_mask(g, &entry->mem,
gmmu_pde_aperture_big_sys_mem_ncoh_f(),
gmmu_pde_aperture_big_video_memory_f()) |
gmmu_pde_address_big_sys_f(
(u32)(pte_addr >> gmmu_pde_address_shift_v()));
return pde0_bits;
}
static inline u32 small_valid_pde1_bits(struct gk20a *g,
struct gk20a_mm_entry *entry)
{
u64 pte_addr = gk20a_pde_addr(g, entry);
u32 pde1_bits =
gk20a_aperture_mask(g, &entry->mem,
gmmu_pde_aperture_small_sys_mem_ncoh_f(),
gmmu_pde_aperture_small_video_memory_f()) |
gmmu_pde_vol_small_true_f() | /* tbd: why? */
gmmu_pde_address_small_sys_f(
(u32)(pte_addr >> gmmu_pde_address_shift_v()));
return pde1_bits;
}
/* Given the current state of the ptes associated with a pde,
determine value and write it out. There's no checking
here to determine whether or not a change was actually
made. So, superfluous updates will cause unnecessary
pde invalidations.
*/
static int update_gmmu_pde_locked(struct vm_gk20a *vm,
struct gk20a_mm_entry *pte,
u32 i, u32 gmmu_pgsz_idx,
struct scatterlist **sgl,
u64 *offset,
u64 *iova,
u32 kind_v, u64 *ctag,
				   bool cacheable, bool unmapped_pte,
int rw_flag, bool sparse, bool priv,
enum gk20a_aperture aperture)
{
struct gk20a *g = gk20a_from_vm(vm);
bool small_valid, big_valid;
struct gk20a_mm_entry *entry = vm->pdb.entries + i;
u32 pde_v[2] = {0, 0};
u32 pde;
gk20a_dbg_fn("");
small_valid = entry->mem.size && entry->pgsz == gmmu_page_size_small;
big_valid = entry->mem.size && entry->pgsz == gmmu_page_size_big;
pde_v[0] = gmmu_pde_size_full_f();
pde_v[0] |= big_valid ?
big_valid_pde0_bits(g, entry) :
gmmu_pde_aperture_big_invalid_f();
pde_v[1] |= (small_valid ?
small_valid_pde1_bits(g, entry) :
(gmmu_pde_aperture_small_invalid_f() |
gmmu_pde_vol_small_false_f()))
|
(big_valid ? (gmmu_pde_vol_big_true_f()) :
gmmu_pde_vol_big_false_f());
pde = pde_from_index(i);
gk20a_pde_wr32(g, &vm->pdb, pde + 0, pde_v[0]);
gk20a_pde_wr32(g, &vm->pdb, pde + 1, pde_v[1]);
gk20a_dbg(gpu_dbg_pte, "pde:%d,sz=%d = 0x%x,0x%08x",
i, gmmu_pgsz_idx, pde_v[1], pde_v[0]);
return 0;
}
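/*
 * Write out a single PTE: valid, sparse or invalid state plus address,
 * aperture, kind, comptag line, protection and cacheability. Also advances
 * the iova and steps to the next scatterlist chunk once the current one is
 * used up.
 */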
static int update_gmmu_pte_locked(struct vm_gk20a *vm,
struct gk20a_mm_entry *pte,
u32 i, u32 gmmu_pgsz_idx,
struct scatterlist **sgl,
u64 *offset,
u64 *iova,
u32 kind_v, u64 *ctag,
bool cacheable, bool unmapped_pte,
int rw_flag, bool sparse, bool priv,
enum gk20a_aperture aperture)
{
struct gk20a *g = gk20a_from_vm(vm);
int ctag_shift = ilog2(g->ops.fb.compression_page_size(g));
u32 page_size = vm->gmmu_page_sizes[gmmu_pgsz_idx];
u32 pte_w[2] = {0, 0}; /* invalid pte */
if (*iova) {
u32 pte_valid = unmapped_pte ?
gmmu_pte_valid_false_f() :
gmmu_pte_valid_true_f();
u32 iova_v = *iova >> gmmu_pte_address_shift_v();
u32 pte_addr = aperture == APERTURE_SYSMEM ?
gmmu_pte_address_sys_f(iova_v) :
gmmu_pte_address_vid_f(iova_v);
pte_w[0] = pte_valid | pte_addr;
if (priv)
pte_w[0] |= gmmu_pte_privilege_true_f();
pte_w[1] = __gk20a_aperture_mask(g, aperture,
gmmu_pte_aperture_sys_mem_ncoh_f(),
gmmu_pte_aperture_video_memory_f()) |
gmmu_pte_kind_f(kind_v) |
gmmu_pte_comptagline_f((u32)(*ctag >> ctag_shift));
if (*ctag && vm->mm->use_full_comp_tag_line && *iova & 0x10000)
pte_w[1] |= gmmu_pte_comptagline_f(
1 << (gmmu_pte_comptagline_s() - 1));
if (rw_flag == gk20a_mem_flag_read_only) {
pte_w[0] |= gmmu_pte_read_only_true_f();
pte_w[1] |=
gmmu_pte_write_disable_true_f();
} else if (rw_flag ==
gk20a_mem_flag_write_only) {
pte_w[1] |=
gmmu_pte_read_disable_true_f();
}
if (!unmapped_pte) {
if (!cacheable)
pte_w[1] |=
gmmu_pte_vol_true_f();
} else {
/* Store cacheable value behind
* gmmu_pte_write_disable_true_f */
if (!cacheable)
pte_w[1] |=
gmmu_pte_write_disable_true_f();
}
gk20a_dbg(gpu_dbg_pte,
"pte=%d iova=0x%llx kind=%d ctag=%d vol=%d [0x%08x, 0x%08x]",
i, *iova,
kind_v, (u32)(*ctag >> ctag_shift), !cacheable,
pte_w[1], pte_w[0]);
if (*ctag)
*ctag += page_size;
} else if (sparse) {
pte_w[0] = gmmu_pte_valid_false_f();
pte_w[1] |= gmmu_pte_vol_true_f();
} else {
gk20a_dbg(gpu_dbg_pte, "pte_cur=%d [0x0,0x0]", i);
}
gk20a_pde_wr32(g, pte, pte_from_index(i) + 0, pte_w[0]);
gk20a_pde_wr32(g, pte, pte_from_index(i) + 1, pte_w[1]);
if (*iova) {
*iova += page_size;
*offset += page_size;
if (*sgl && *offset + page_size > (*sgl)->length) {
u64 new_iova;
*sgl = sg_next(*sgl);
if (*sgl) {
new_iova = sg_phys(*sgl);
gk20a_dbg(gpu_dbg_pte, "chunk address %llx, size %d",
new_iova, (*sgl)->length);
if (new_iova) {
*offset = 0;
*iova = new_iova;
}
}
}
}
return 0;
}
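/*
 * Recursively walk one page table level for the VA range [gpu_va, gpu_end).
 * Lower-level tables are allocated on demand and each entry is written
 * through the level's update_entry() hook.
 */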
static int update_gmmu_level_locked(struct vm_gk20a *vm,
struct gk20a_mm_entry *pte,
enum gmmu_pgsz_gk20a pgsz_idx,
struct scatterlist **sgl,
u64 *offset,
u64 *iova,
u64 gpu_va, u64 gpu_end,
u8 kind_v, u64 *ctag,
bool cacheable, bool unmapped_pte,
int rw_flag,
bool sparse,
int lvl,
bool priv,
enum gk20a_aperture aperture)
{
struct gk20a *g = gk20a_from_vm(vm);
const struct gk20a_mmu_level *l = &vm->mmu_levels[lvl];
const struct gk20a_mmu_level *next_l = &vm->mmu_levels[lvl+1];
int err = 0;
u32 pde_i;
u64 pde_size = 1ULL << (u64)l->lo_bit[pgsz_idx];
struct gk20a_mm_entry *next_pte = NULL, *prev_pte = NULL;
gk20a_dbg_fn("");
pde_i = (gpu_va & ((1ULL << ((u64)l->hi_bit[pgsz_idx]+1)) - 1ULL))
>> (u64)l->lo_bit[pgsz_idx];
gk20a_dbg(gpu_dbg_pte, "size_idx=%d, l: %d, [%llx,%llx], iova=%llx",
pgsz_idx, lvl, gpu_va, gpu_end-1, *iova);
while (gpu_va < gpu_end) {
u64 next = min((gpu_va + pde_size) & ~(pde_size-1), gpu_end);
/* Allocate next level */
if (next_l->update_entry) {
if (!pte->entries) {
int num_entries =
1 <<
(l->hi_bit[pgsz_idx]
- l->lo_bit[pgsz_idx] + 1);
pte->entries =
vzalloc(sizeof(struct gk20a_mm_entry) *
num_entries);
if (!pte->entries)
return -ENOMEM;
pte->pgsz = pgsz_idx;
pte->num_entries = num_entries;
}
prev_pte = next_pte;
next_pte = pte->entries + pde_i;
if (!next_pte->mem.size) {
err = gk20a_zalloc_gmmu_page_table(vm,
pgsz_idx, next_l, next_pte, prev_pte);
if (err)
return err;
}
}
err = l->update_entry(vm, pte, pde_i, pgsz_idx,
sgl, offset, iova,
kind_v, ctag, cacheable, unmapped_pte,
rw_flag, sparse, priv, aperture);
if (err)
return err;
if (next_l->update_entry) {
/* get cpu access to the ptes */
err = map_gmmu_pages(g, next_pte);
if (err) {
gk20a_err(dev_from_vm(vm),
"couldn't map ptes for update as=%d",
vm_aspace_id(vm));
return err;
}
err = update_gmmu_level_locked(vm, next_pte,
pgsz_idx,
sgl,
offset,
iova,
gpu_va,
next,
kind_v, ctag, cacheable, unmapped_pte,
rw_flag, sparse, lvl+1, priv, aperture);
unmap_gmmu_pages(g, next_pte);
if (err)
return err;
}
pde_i++;
gpu_va = next;
}
gk20a_dbg_fn("done");
return 0;
}
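/*
 * Program (or clear) the PTEs covering [gpu_va, gpu_end). Vidmem buffers are
 * walked chunk by chunk; sysmem buffers use the IOMMU address when available
 * and fall back to walking the scatterlist physically. @buffer_offset selects
 * where inside the buffer the mapping starts.
 */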
static int update_gmmu_ptes_locked(struct vm_gk20a *vm,
enum gmmu_pgsz_gk20a pgsz_idx,
struct sg_table *sgt,
u64 buffer_offset,
u64 gpu_va, u64 gpu_end,
u8 kind_v, u32 ctag_offset,
bool cacheable, bool unmapped_pte,
int rw_flag,
bool sparse,
bool priv,
enum gk20a_aperture aperture)
{
struct gk20a *g = gk20a_from_vm(vm);
int ctag_granularity = g->ops.fb.compression_page_size(g);
u64 ctag = (u64)ctag_offset * (u64)ctag_granularity;
u64 iova = 0;
u64 space_to_skip = buffer_offset;
u64 map_size = gpu_end - gpu_va;
u32 page_size = vm->gmmu_page_sizes[pgsz_idx];
int err;
struct scatterlist *sgl = NULL;
struct nvgpu_page_alloc *alloc = NULL;
struct page_alloc_chunk *chunk = NULL;
u64 length;
/* note: here we need to map kernel to small, since the
* low-level mmu code assumes 0 is small and 1 is big pages */
if (pgsz_idx == gmmu_page_size_kernel)
pgsz_idx = gmmu_page_size_small;
if (space_to_skip & (page_size - 1))
return -EINVAL;
err = map_gmmu_pages(g, &vm->pdb);
if (err) {
gk20a_err(dev_from_vm(vm),
"couldn't map ptes for update as=%d",
vm_aspace_id(vm));
return err;
}
if (aperture == APERTURE_VIDMEM) {
gk20a_dbg(gpu_dbg_map_v, "vidmem map size_idx=%d, gpu_va=[%llx,%llx], alloc=%llx",
pgsz_idx, gpu_va, gpu_end-1, iova);
if (sgt) {
alloc = get_vidmem_page_alloc(sgt->sgl);
list_for_each_entry(chunk, &alloc->alloc_chunks,
list_entry) {
if (space_to_skip &&
space_to_skip > chunk->length) {
space_to_skip -= chunk->length;
} else {
iova = chunk->base + space_to_skip;
length = chunk->length - space_to_skip;
length = min(length, map_size);
space_to_skip = 0;
err = update_gmmu_level_locked(vm,
&vm->pdb, pgsz_idx,
&sgl,
&space_to_skip,
&iova,
gpu_va, gpu_va + length,
kind_v, &ctag,
cacheable, unmapped_pte,
rw_flag, sparse, 0, priv,
aperture);
if (err)
break;
/* need to set explicit zero here */
space_to_skip = 0;
gpu_va += length;
map_size -= length;
if (!map_size)
break;
}
}
} else {
err = update_gmmu_level_locked(vm, &vm->pdb, pgsz_idx,
&sgl,
&space_to_skip,
&iova,
gpu_va, gpu_end,
kind_v, &ctag,
cacheable, unmapped_pte, rw_flag,
sparse, 0, priv,
aperture);
}
} else {
gk20a_dbg(gpu_dbg_pte, "size_idx=%d, iova=%llx, buffer offset %lld, nents %d",
pgsz_idx,
sgt ? g->ops.mm.get_iova_addr(vm->mm->g, sgt->sgl, 0)
: 0ULL,
buffer_offset,
sgt ? sgt->nents : 0);
gk20a_dbg(gpu_dbg_map_v, "size_idx=%d, gpu_va=[%llx,%llx], iova=%llx",
pgsz_idx, gpu_va, gpu_end-1, iova);
if (sgt) {
iova = g->ops.mm.get_iova_addr(vm->mm->g, sgt->sgl, 0);
if (!vm->mm->bypass_smmu && iova) {
iova += space_to_skip;
} else {
sgl = sgt->sgl;
gk20a_dbg(gpu_dbg_pte, "chunk address %llx, size %d",
(u64)sg_phys(sgl),
sgl->length);
while (space_to_skip && sgl &&
space_to_skip + page_size > sgl->length) {
space_to_skip -= sgl->length;
sgl = sg_next(sgl);
gk20a_dbg(gpu_dbg_pte, "chunk address %llx, size %d",
(u64)sg_phys(sgl),
sgl->length);
}
iova = sg_phys(sgl) + space_to_skip;
}
}
err = update_gmmu_level_locked(vm, &vm->pdb, pgsz_idx,
&sgl,
&space_to_skip,
&iova,
gpu_va, gpu_end,
kind_v, &ctag,
cacheable, unmapped_pte, rw_flag,
sparse, 0, priv,
aperture);
}
unmap_gmmu_pages(g, &vm->pdb);
smp_mb();
gk20a_dbg_fn("done");
return err;
}
/* NOTE! mapped_buffers lock must be held */
void gk20a_vm_unmap_locked(struct mapped_buffer_node *mapped_buffer,
struct vm_gk20a_mapping_batch *batch)
{
struct vm_gk20a *vm = mapped_buffer->vm;
struct gk20a *g = vm->mm->g;
if (mapped_buffer->ctag_map_win_addr) {
/* unmap compbits */
g->ops.mm.gmmu_unmap(vm,
mapped_buffer->ctag_map_win_addr,
mapped_buffer->ctag_map_win_size,
0, /* page size 4k */
true, /* va allocated */
gk20a_mem_flag_none,
false, /* not sparse */
batch); /* batch handle */
}
g->ops.mm.gmmu_unmap(vm,
mapped_buffer->addr,
mapped_buffer->size,
mapped_buffer->pgsz_idx,
mapped_buffer->va_allocated,
gk20a_mem_flag_none,
mapped_buffer->va_node ?
mapped_buffer->va_node->sparse : false,
batch);
gk20a_dbg(gpu_dbg_map,
"gv: 0x%04x_%08x pgsz=%-3dKb as=%-2d own_mem_ref=%d",
hi32(mapped_buffer->addr), lo32(mapped_buffer->addr),
vm->gmmu_page_sizes[mapped_buffer->pgsz_idx] >> 10,
vm_aspace_id(vm),
mapped_buffer->own_mem_ref);
gk20a_mm_unpin(dev_from_vm(vm), mapped_buffer->dmabuf,
mapped_buffer->sgt);
/* remove from mapped buffer tree and remove list, free */
rb_erase(&mapped_buffer->node, &vm->mapped_buffers);
if (!list_empty(&mapped_buffer->va_buffers_list))
list_del(&mapped_buffer->va_buffers_list);
/* keep track of mapped buffers */
if (mapped_buffer->user_mapped)
vm->num_user_mapped_buffers--;
if (mapped_buffer->own_mem_ref)
dma_buf_put(mapped_buffer->dmabuf);
kfree(mapped_buffer);
return;
}
void gk20a_vm_unmap(struct vm_gk20a *vm, u64 offset)
{
struct device *d = dev_from_vm(vm);
struct mapped_buffer_node *mapped_buffer;
mutex_lock(&vm->update_gmmu_lock);
mapped_buffer = find_mapped_buffer_locked(&vm->mapped_buffers, offset);
if (!mapped_buffer) {
mutex_unlock(&vm->update_gmmu_lock);
gk20a_err(d, "invalid addr to unmap 0x%llx", offset);
return;
}
kref_put(&mapped_buffer->ref, gk20a_vm_unmap_locked_kref);
mutex_unlock(&vm->update_gmmu_lock);
}
static void gk20a_vm_free_entries(struct vm_gk20a *vm,
struct gk20a_mm_entry *parent,
int level)
{
int i;
if (parent->entries)
for (i = 0; i < parent->num_entries; i++)
gk20a_vm_free_entries(vm, &parent->entries[i], level+1);
if (parent->mem.size)
free_gmmu_pages(vm, parent);
vfree(parent->entries);
parent->entries = NULL;
}
static void gk20a_vm_remove_support_nofree(struct vm_gk20a *vm)
{
struct mapped_buffer_node *mapped_buffer;
struct vm_reserved_va_node *va_node, *va_node_tmp;
struct rb_node *node;
gk20a_dbg_fn("");
/*
* Do this outside of the update_gmmu_lock since unmapping the semaphore
 * pool involves unmapping a GMMU mapping which means acquiring the
* update_gmmu_lock.
*/
if (!gk20a_platform_has_syncpoints(gk20a_from_vm(vm)->dev)) {
if (vm->sema_pool) {
gk20a_semaphore_pool_unmap(vm->sema_pool, vm);
gk20a_semaphore_pool_put(vm->sema_pool);
}
}
mutex_lock(&vm->update_gmmu_lock);
/* TBD: add a flag here for the unmap code to recognize teardown
* and short-circuit any otherwise expensive operations. */
node = rb_first(&vm->mapped_buffers);
while (node) {
mapped_buffer =
container_of(node, struct mapped_buffer_node, node);
gk20a_vm_unmap_locked(mapped_buffer, NULL);
node = rb_first(&vm->mapped_buffers);
}
/* destroy remaining reserved memory areas */
list_for_each_entry_safe(va_node, va_node_tmp, &vm->reserved_va_list,
reserved_va_list) {
list_del(&va_node->reserved_va_list);
kfree(va_node);
}
gk20a_deinit_vm(vm);
mutex_unlock(&vm->update_gmmu_lock);
}
void gk20a_vm_remove_support(struct vm_gk20a *vm)
{
gk20a_vm_remove_support_nofree(vm);
/* vm is not used anymore. release it. */
kfree(vm);
}
static void gk20a_vm_remove_support_kref(struct kref *ref)
{
struct vm_gk20a *vm = container_of(ref, struct vm_gk20a, ref);
struct gk20a *g = gk20a_from_vm(vm);
g->ops.mm.vm_remove(vm);
}
void gk20a_vm_get(struct vm_gk20a *vm)
{
kref_get(&vm->ref);
}
void gk20a_vm_put(struct vm_gk20a *vm)
{
kref_put(&vm->ref, gk20a_vm_remove_support_kref);
}
const struct gk20a_mmu_level gk20a_mm_levels_64k[] = {
{.hi_bit = {NV_GMMU_VA_RANGE-1, NV_GMMU_VA_RANGE-1},
.lo_bit = {26, 26},
.update_entry = update_gmmu_pde_locked,
.entry_size = 8},
{.hi_bit = {25, 25},
.lo_bit = {12, 16},
.update_entry = update_gmmu_pte_locked,
.entry_size = 8},
{.update_entry = NULL}
};
const struct gk20a_mmu_level gk20a_mm_levels_128k[] = {
{.hi_bit = {NV_GMMU_VA_RANGE-1, NV_GMMU_VA_RANGE-1},
.lo_bit = {27, 27},
.update_entry = update_gmmu_pde_locked,
.entry_size = 8},
{.hi_bit = {26, 26},
.lo_bit = {12, 17},
.update_entry = update_gmmu_pte_locked,
.entry_size = 8},
{.update_entry = NULL}
};
/*
* Initialize a semaphore pool. Just return successfully if we do not need
* semaphores (i.e when sync-pts are active).
*/
static int gk20a_init_sema_pool(struct vm_gk20a *vm)
{
struct gk20a_semaphore_sea *sema_sea;
struct mm_gk20a *mm = vm->mm;
struct gk20a *g = mm->g;
int err;
/*
* Don't waste the memory on semaphores if we don't need them.
*/
if (gk20a_platform_has_syncpoints(g->dev))
return 0;
if (vm->sema_pool)
return 0;
sema_sea = gk20a_semaphore_sea_create(g);
if (!sema_sea)
return -ENOMEM;
vm->sema_pool = gk20a_semaphore_pool_alloc(sema_sea);
if (!vm->sema_pool) {
gk20a_vm_put(vm);
return -ENOMEM;
}
/*
* Allocate a chunk of GPU VA space for mapping the semaphores. We will
* do a fixed alloc in the kernel VM so that all channels have the same
* RO address range for the semaphores.
*
* !!! TODO: cleanup.
*/
sema_sea->gpu_va = nvgpu_alloc_fixed(&vm->kernel,
vm->va_limit -
mm->channel.kernel_size,
512 * PAGE_SIZE,
SZ_4K);
	if (!sema_sea->gpu_va) {
		gk20a_vm_put(vm);
		return -ENOMEM;
	}
err = gk20a_semaphore_pool_map(vm->sema_pool, vm);
	if (err) {
		gk20a_semaphore_pool_unmap(vm->sema_pool, vm);
		nvgpu_free(vm->vma[gmmu_page_size_small],
			   vm->sema_pool->gpu_va);
		gk20a_vm_put(vm);
		return err;
	}
return 0;
}
/*
* Determine if the passed address space can support big pages or not.
*/
int gk20a_big_pages_possible(struct vm_gk20a *vm, u64 base, u64 size)
{
u64 mask = ((u64)vm->big_page_size << 10) - 1;
if (base & mask || size & mask)
return 0;
return 1;
}
/*
* Attempt to find a reserved memory area to determine PTE size for the passed
 * mapping. If no reserved area can be found, fall back to small pages.
*/
enum gmmu_pgsz_gk20a __get_pte_size_fixed_map(struct vm_gk20a *vm,
u64 base, u64 size)
{
struct vm_reserved_va_node *node;
node = addr_to_reservation(vm, base);
if (!node)
return gmmu_page_size_small;
return node->pgsz_idx;
}
/**
* gk20a_init_vm() - Initialize an address space.
*
* @mm - Parent MM.
* @vm - The VM to init.
* @big_page_size - Size of big pages associated with this VM.
* @low_hole - The size of the low hole (unaddressable memory at the bottom of
 *             the address space).
* @kernel_reserved - Space reserved for kernel only allocations.
* @aperture_size - Total size of the aperture.
 * @big_pages - Initial big page setting; may be turned off later if the
 *              resulting VMA ranges cannot hold big pages.
* @name - Name of the address space.
*
* This function initializes an address space according to the following map:
*
* +--+ 0x0
* | |
* +--+ @low_hole
* | |
* ~ ~ This is the "user" section.
* | |
* +--+ @aperture_size - @kernel_reserved
* | |
* ~ ~ This is the "kernel" section.
* | |
* +--+ @aperture_size
*
 * The user section is therefore whatever is left over after the @low_hole and
 * @kernel_reserved memory have been portioned out. The @kernel_reserved is
 * always present at the top of the memory space and the @low_hole is always at
* the bottom.
*
* For certain address spaces a "user" section makes no sense (bar1, etc) so in
* such cases the @kernel_reserved and @low_hole should sum to exactly
* @aperture_size.
*/
int gk20a_init_vm(struct mm_gk20a *mm,
struct vm_gk20a *vm,
u32 big_page_size,
u64 low_hole,
u64 kernel_reserved,
u64 aperture_size,
bool big_pages,
bool userspace_managed,
char *name)
{
int err, i;
char alloc_name[32];
u64 user_vma_start, user_vma_limit, kernel_vma_start, kernel_vma_limit;
u32 pde_lo, pde_hi;
struct gk20a *g = mm->g;
/* note: this must match gmmu_pgsz_gk20a enum */
u32 gmmu_page_sizes[gmmu_nr_page_sizes] = { SZ_4K, big_page_size, SZ_4K };
if (WARN_ON(kernel_reserved + low_hole > aperture_size))
return -ENOMEM;
vm->mm = mm;
/* Set up vma pointers. */
vm->vma[0] = &vm->user;
vm->vma[1] = &vm->user;
vm->vma[2] = &vm->kernel;
vm->va_start = low_hole;
vm->va_limit = aperture_size;
vm->big_pages = big_pages;
vm->big_page_size = gmmu_page_sizes[gmmu_page_size_big];
vm->userspace_managed = userspace_managed;
vm->mmu_levels = vm->mm->g->ops.mm.get_mmu_levels(vm->mm->g,
vm->big_page_size);
for (i = 0; i < gmmu_nr_page_sizes; i++)
vm->gmmu_page_sizes[i] = gmmu_page_sizes[i];
gk20a_dbg_info("small page-size (%dKB)",
vm->gmmu_page_sizes[gmmu_page_size_small] >> 10);
gk20a_dbg_info("big page-size (%dKB) (%s)\n",
vm->gmmu_page_sizes[gmmu_page_size_big] >> 10, name);
gk20a_dbg_info("kernel page-size (%dKB)",
vm->gmmu_page_sizes[gmmu_page_size_kernel] >> 10);
pde_range_from_vaddr_range(vm,
0, vm->va_limit-1,
&pde_lo, &pde_hi);
vm->pdb.entries = vzalloc(sizeof(struct gk20a_mm_entry) *
(pde_hi + 1));
vm->pdb.num_entries = pde_hi + 1;
if (!vm->pdb.entries)
return -ENOMEM;
gk20a_dbg_info("init space for %s va_limit=0x%llx num_pdes=%d",
name, vm->va_limit, pde_hi + 1);
/* allocate the page table directory */
err = gk20a_zalloc_gmmu_page_table(vm, 0, &vm->mmu_levels[0],
&vm->pdb, NULL);
if (err)
goto clean_up_pdes;
/* setup vma limits */
user_vma_start = low_hole;
user_vma_limit = vm->va_limit - kernel_reserved;
kernel_vma_start = vm->va_limit - kernel_reserved;
kernel_vma_limit = vm->va_limit;
gk20a_dbg_info(
"user_vma=[0x%llx,0x%llx) kernel_vma=[0x%llx,0x%llx)\n",
user_vma_start, user_vma_limit,
kernel_vma_start, kernel_vma_limit);
WARN_ON(user_vma_start > user_vma_limit);
WARN_ON(kernel_vma_start >= kernel_vma_limit);
/*
* A "user" area only makes sense for the GVA spaces. For VMs where
* there is no "user" area user_vma_start will be equal to
* user_vma_limit (i.e a 0 sized space). In such a situation the kernel
* area must be non-zero in length.
*/
if (user_vma_start > user_vma_limit ||
kernel_vma_start >= kernel_vma_limit) {
err = -EINVAL;
goto clean_up_pdes;
}
/*
* Attempt to make a separate VM for fixed allocations.
*/
if (g->separate_fixed_allocs &&
user_vma_start < user_vma_limit) {
		if (g->separate_fixed_allocs >= user_vma_limit) {
			err = -EINVAL;
			goto clean_up_pdes;
		}
snprintf(alloc_name, sizeof(alloc_name),
"gk20a_%s-fixed", name);
err = __nvgpu_buddy_allocator_init(g, &vm->fixed,
vm, alloc_name,
user_vma_start,
g->separate_fixed_allocs,
SZ_4K,
GPU_BALLOC_MAX_ORDER,
GPU_ALLOC_GVA_SPACE);
if (err)
goto clean_up_ptes;
/* Make sure to update the user vma size. */
user_vma_start = g->separate_fixed_allocs;
}
if (user_vma_start < user_vma_limit) {
snprintf(alloc_name, sizeof(alloc_name), "gk20a_%s", name);
if (!gk20a_big_pages_possible(vm, user_vma_start,
user_vma_limit - user_vma_start))
vm->big_pages = false;
err = __nvgpu_buddy_allocator_init(g, &vm->user,
vm, alloc_name,
user_vma_start,
user_vma_limit -
user_vma_start,
SZ_4K,
GPU_BALLOC_MAX_ORDER,
GPU_ALLOC_GVA_SPACE);
if (err)
goto clean_up_ptes;
} else {
/*
* Make these allocator pointers point to the kernel allocator
* since we still use the legacy notion of page size to choose
* the allocator.
*/
vm->vma[0] = &vm->kernel;
vm->vma[1] = &vm->kernel;
}
snprintf(alloc_name, sizeof(alloc_name), "gk20a_%s-sys", name);
if (!gk20a_big_pages_possible(vm, kernel_vma_start,
kernel_vma_limit - kernel_vma_start))
vm->big_pages = false;
err = __nvgpu_buddy_allocator_init(g, &vm->kernel,
vm, alloc_name,
kernel_vma_start,
kernel_vma_limit - kernel_vma_start,
SZ_4K,
GPU_BALLOC_MAX_ORDER,
GPU_ALLOC_GVA_SPACE);
if (err)
goto clean_up_user_allocator;
vm->mapped_buffers = RB_ROOT;
mutex_init(&vm->update_gmmu_lock);
kref_init(&vm->ref);
INIT_LIST_HEAD(&vm->reserved_va_list);
/*
* This is only necessary for channel address spaces. The best way to
* distinguish channel address spaces from other address spaces is by
* size - if the address space is 4GB or less, it's not a channel.
*/
if (vm->va_limit > SZ_4G) {
err = gk20a_init_sema_pool(vm);
if (err)
goto clean_up_user_allocator;
}
return 0;
clean_up_user_allocator:
if (user_vma_start < user_vma_limit)
nvgpu_alloc_destroy(&vm->user);
clean_up_ptes:
free_gmmu_pages(vm, &vm->pdb);
clean_up_pdes:
vfree(vm->pdb.entries);
return err;
}
/* address space interfaces for the gk20a module */
int gk20a_vm_alloc_share(struct gk20a_as_share *as_share, u32 big_page_size,
u32 flags)
{
struct gk20a_as *as = as_share->as;
struct gk20a *g = gk20a_from_as(as);
struct mm_gk20a *mm = &g->mm;
struct vm_gk20a *vm;
char name[32];
int err;
const bool userspace_managed =
(flags & NVGPU_GPU_IOCTL_ALLOC_AS_FLAGS_USERSPACE_MANAGED) != 0;
gk20a_dbg_fn("");
if (big_page_size == 0) {
big_page_size =
gk20a_get_platform(g->dev)->default_big_page_size;
} else {
if (!is_power_of_2(big_page_size))
return -EINVAL;
if (!(big_page_size & g->gpu_characteristics.available_big_page_sizes))
return -EINVAL;
}
vm = kzalloc(sizeof(*vm), GFP_KERNEL);
if (!vm)
return -ENOMEM;
as_share->vm = vm;
vm->as_share = as_share;
vm->enable_ctag = true;
snprintf(name, sizeof(name), "as_%d", as_share->id);
err = gk20a_init_vm(mm, vm, big_page_size,
big_page_size << 10,
mm->channel.kernel_size,
mm->channel.user_size + mm->channel.kernel_size,
!mm->disable_bigpage, userspace_managed, name);
return err;
}
int gk20a_vm_release_share(struct gk20a_as_share *as_share)
{
struct vm_gk20a *vm = as_share->vm;
gk20a_dbg_fn("");
vm->as_share = NULL;
/* put as reference to vm */
gk20a_vm_put(vm);
as_share->vm = NULL;
return 0;
}
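/*
 * Reserve a chunk of GPU VA space for later fixed-offset mappings. The
 * reservation is tracked as a vm_reserved_va_node and, when requested, is
 * backed by sparse PTEs.
 */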
int gk20a_vm_alloc_space(struct gk20a_as_share *as_share,
struct nvgpu_as_alloc_space_args *args)
{
int err = -ENOMEM;
int pgsz_idx = gmmu_page_size_small;
struct nvgpu_allocator *vma;
struct vm_gk20a *vm = as_share->vm;
struct gk20a *g = vm->mm->g;
struct vm_reserved_va_node *va_node;
u64 vaddr_start = 0;
int page_sizes = gmmu_nr_page_sizes;
gk20a_dbg_fn("flags=0x%x pgsz=0x%x nr_pages=0x%x o/a=0x%llx",
args->flags, args->page_size, args->pages,
args->o_a.offset);
if (!vm->big_pages)
page_sizes--;
for (; pgsz_idx < page_sizes; pgsz_idx++) {
if (vm->gmmu_page_sizes[pgsz_idx] == args->page_size)
break;
}
if (pgsz_idx >= page_sizes) {
err = -EINVAL;
goto clean_up;
}
va_node = kzalloc(sizeof(*va_node), GFP_KERNEL);
if (!va_node) {
err = -ENOMEM;
goto clean_up;
}
vma = vm->vma[pgsz_idx];
if (args->flags & NVGPU_AS_ALLOC_SPACE_FLAGS_FIXED_OFFSET) {
if (nvgpu_alloc_initialized(&vm->fixed))
vma = &vm->fixed;
vaddr_start = nvgpu_alloc_fixed(vma, args->o_a.offset,
(u64)args->pages *
(u64)args->page_size,
args->page_size);
} else {
vaddr_start = nvgpu_alloc(vma,
(u64)args->pages *
(u64)args->page_size);
}
if (!vaddr_start) {
kfree(va_node);
goto clean_up;
}
va_node->vaddr_start = vaddr_start;
va_node->size = (u64)args->page_size * (u64)args->pages;
va_node->pgsz_idx = pgsz_idx;
INIT_LIST_HEAD(&va_node->va_buffers_list);
INIT_LIST_HEAD(&va_node->reserved_va_list);
mutex_lock(&vm->update_gmmu_lock);
/* mark that we need to use sparse mappings here */
if (args->flags & NVGPU_AS_ALLOC_SPACE_FLAGS_SPARSE) {
u64 map_offset = g->ops.mm.gmmu_map(vm, vaddr_start,
NULL,
0,
va_node->size,
pgsz_idx,
0,
0,
args->flags,
gk20a_mem_flag_none,
false,
true,
false,
NULL,
APERTURE_INVALID);
if (!map_offset) {
mutex_unlock(&vm->update_gmmu_lock);
nvgpu_free(vma, vaddr_start);
kfree(va_node);
goto clean_up;
}
va_node->sparse = true;
}
list_add_tail(&va_node->reserved_va_list, &vm->reserved_va_list);
mutex_unlock(&vm->update_gmmu_lock);
args->o_a.offset = vaddr_start;
err = 0;
clean_up:
return err;
}
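/*
 * Release a VA space reservation: drop the references of any buffers still
 * mapped within it, unmap the sparse backing if present and free the VA.
 */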
int gk20a_vm_free_space(struct gk20a_as_share *as_share,
struct nvgpu_as_free_space_args *args)
{
int err = -ENOMEM;
int pgsz_idx;
struct nvgpu_allocator *vma;
struct vm_gk20a *vm = as_share->vm;
struct vm_reserved_va_node *va_node;
struct gk20a *g = gk20a_from_vm(vm);
gk20a_dbg_fn("pgsz=0x%x nr_pages=0x%x o/a=0x%llx", args->page_size,
args->pages, args->offset);
/* determine pagesz idx */
pgsz_idx = __get_pte_size(vm, args->offset,
args->page_size * args->pages);
if (nvgpu_alloc_initialized(&vm->fixed))
vma = &vm->fixed;
else
vma = vm->vma[pgsz_idx];
nvgpu_free(vma, args->offset);
mutex_lock(&vm->update_gmmu_lock);
va_node = addr_to_reservation(vm, args->offset);
if (va_node) {
struct mapped_buffer_node *buffer, *n;
/* Decrement the ref count on all buffers in this va_node. This
* allows userspace to let the kernel free mappings that are
* only used by this va_node. */
list_for_each_entry_safe(buffer, n,
&va_node->va_buffers_list, va_buffers_list) {
list_del_init(&buffer->va_buffers_list);
kref_put(&buffer->ref, gk20a_vm_unmap_locked_kref);
}
list_del(&va_node->reserved_va_list);
/* if this was a sparse mapping, free the va */
if (va_node->sparse)
g->ops.mm.gmmu_unmap(vm,
va_node->vaddr_start,
va_node->size,
va_node->pgsz_idx,
true,
gk20a_mem_flag_none,
true,
NULL);
kfree(va_node);
}
mutex_unlock(&vm->update_gmmu_lock);
err = 0;
return err;
}
int gk20a_vm_bind_channel(struct gk20a_as_share *as_share,
struct channel_gk20a *ch)
{
int err = 0;
struct vm_gk20a *vm = as_share->vm;
gk20a_dbg_fn("");
ch->vm = vm;
err = channel_gk20a_commit_va(ch);
if (err)
ch->vm = NULL;
return err;
}
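/*
 * Lazily attach gk20a per-buffer private data to a dma-buf. The priv struct
 * holds the buffer-state list and a unique buffer id; creation is serialized
 * with a local mutex so racing callers attach it only once.
 */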
int gk20a_dmabuf_alloc_drvdata(struct dma_buf *dmabuf, struct device *dev)
{
struct gk20a_dmabuf_priv *priv;
static DEFINE_MUTEX(priv_lock);
	static u64 priv_count;
priv = dma_buf_get_drvdata(dmabuf, dev);
if (likely(priv))
return 0;
mutex_lock(&priv_lock);
priv = dma_buf_get_drvdata(dmabuf, dev);
if (priv)
goto priv_exist_or_err;
priv = kzalloc(sizeof(*priv), GFP_KERNEL);
if (!priv) {
priv = ERR_PTR(-ENOMEM);
goto priv_exist_or_err;
}
mutex_init(&priv->lock);
INIT_LIST_HEAD(&priv->states);
priv->buffer_id = ++priv_count;
dma_buf_set_drvdata(dmabuf, dev, priv, gk20a_mm_delete_priv);
priv_exist_or_err:
mutex_unlock(&priv_lock);
if (IS_ERR(priv))
return -ENOMEM;
return 0;
}
int gk20a_dmabuf_get_state(struct dma_buf *dmabuf, struct device *dev,
u64 offset, struct gk20a_buffer_state **state)
{
int err = 0;
struct gk20a_dmabuf_priv *priv;
struct gk20a_buffer_state *s;
if (WARN_ON(offset >= (u64)dmabuf->size))
return -EINVAL;
err = gk20a_dmabuf_alloc_drvdata(dmabuf, dev);
if (err)
return err;
priv = dma_buf_get_drvdata(dmabuf, dev);
if (WARN_ON(!priv))
return -ENOSYS;
mutex_lock(&priv->lock);
list_for_each_entry(s, &priv->states, list)
if (s->offset == offset)
goto out;
/* State not found, create state. */
s = kzalloc(sizeof(*s), GFP_KERNEL);
if (!s) {
err = -ENOMEM;
goto out;
}
s->offset = offset;
INIT_LIST_HEAD(&s->list);
mutex_init(&s->lock);
list_add_tail(&s->list, &priv->states);
out:
mutex_unlock(&priv->lock);
if (!err)
*state = s;
return err;
}
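
/*
 * Map a dma-buf (by fd) into this VM. The dma-buf reference taken here is
 * released on unmap, or immediately if the map itself fails. On success
 * *offset_align holds the GPU VA of the mapping.
 */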
int gk20a_vm_map_buffer(struct vm_gk20a *vm,
int dmabuf_fd,
u64 *offset_align,
u32 flags, /*NVGPU_AS_MAP_BUFFER_FLAGS_*/
int kind,
u64 buffer_offset,
u64 mapping_size,
struct vm_gk20a_mapping_batch *batch)
{
int err = 0;
struct dma_buf *dmabuf;
u64 ret_va;
gk20a_dbg_fn("");
/* get ref to the mem handle (released on unmap_locked) */
dmabuf = dma_buf_get(dmabuf_fd);
if (IS_ERR(dmabuf)) {
dev_warn(dev_from_vm(vm), "%s: fd %d is not a dmabuf",
__func__, dmabuf_fd);
return PTR_ERR(dmabuf);
}
err = gk20a_dmabuf_alloc_drvdata(dmabuf, dev_from_vm(vm));
if (err) {
dma_buf_put(dmabuf);
return err;
}
ret_va = gk20a_vm_map(vm, dmabuf, *offset_align,
flags, kind, NULL, true,
gk20a_mem_flag_none,
buffer_offset,
mapping_size,
batch);
*offset_align = ret_va;
if (!ret_va) {
dma_buf_put(dmabuf);
err = -EINVAL;
}
return err;
}
int gk20a_vm_unmap_buffer(struct vm_gk20a *vm, u64 offset,
struct vm_gk20a_mapping_batch *batch)
{
gk20a_dbg_fn("");
gk20a_vm_unmap_user(vm, offset, batch);
return 0;
}
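
/*
 * Tear down a VM: destroy whichever of the kernel/user/fixed allocators
 * were initialized and free the page directory tree.
 */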
void gk20a_deinit_vm(struct vm_gk20a *vm)
{
if (nvgpu_alloc_initialized(&vm->kernel))
nvgpu_alloc_destroy(&vm->kernel);
if (nvgpu_alloc_initialized(&vm->user))
nvgpu_alloc_destroy(&vm->user);
if (nvgpu_alloc_initialized(&vm->fixed))
nvgpu_alloc_destroy(&vm->fixed);
gk20a_vm_free_entries(vm, &vm->pdb, 0);
}
int gk20a_alloc_inst_block(struct gk20a *g, struct mem_desc *inst_block)
{
struct device *dev = dev_from_gk20a(g);
int err;
gk20a_dbg_fn("");
err = gk20a_gmmu_alloc(g, ram_in_alloc_size_v(), inst_block);
if (err) {
gk20a_err(dev, "%s: memory allocation failed\n", __func__);
return err;
}
gk20a_dbg_fn("done");
return 0;
}
void gk20a_free_inst_block(struct gk20a *g, struct mem_desc *inst_block)
{
if (inst_block->size)
gk20a_gmmu_free(g, inst_block);
}
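
/*
 * Return the address the hardware should use to reference an instance
 * block: the raw physical address when the chip has a physical mode,
 * otherwise the base address from gk20a_mem_get_base_addr().
 */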
u64 gk20a_mm_inst_block_addr(struct gk20a *g, struct mem_desc *inst_block)
{
u64 addr;
if (g->mm.has_physical_mode)
addr = gk20a_mem_phys(inst_block);
else
addr = gk20a_mem_get_base_addr(g, inst_block, 0);
return addr;
}
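
/*
 * Set up the BAR1 VM: a small kernel-managed address space (4K low hole,
 * the rest reserved for the kernel) plus the instance block that points
 * the hardware at it.
 */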
static int gk20a_init_bar1_vm(struct mm_gk20a *mm)
{
int err;
struct vm_gk20a *vm = &mm->bar1.vm;
struct gk20a *g = gk20a_from_mm(mm);
struct mem_desc *inst_block = &mm->bar1.inst_block;
u32 big_page_size = gk20a_get_platform(g->dev)->default_big_page_size;
mm->bar1.aperture_size = bar1_aperture_size_mb_gk20a() << 20;
gk20a_dbg_info("bar1 vm size = 0x%x", mm->bar1.aperture_size);
gk20a_init_vm(mm, vm,
big_page_size,
SZ_4K, /* Low hole */
mm->bar1.aperture_size - SZ_4K, /* Kernel reserved. */
mm->bar1.aperture_size,
true, false,
"bar1");
err = gk20a_alloc_inst_block(g, inst_block);
if (err)
goto clean_up_va;
g->ops.mm.init_inst_block(inst_block, vm, big_page_size);
return 0;
clean_up_va:
gk20a_deinit_vm(vm);
return err;
}
/* pmu vm, share channel_vm interfaces */
static int gk20a_init_system_vm(struct mm_gk20a *mm)
{
int err;
struct vm_gk20a *vm = &mm->pmu.vm;
struct gk20a *g = gk20a_from_mm(mm);
struct mem_desc *inst_block = &mm->pmu.inst_block;
u32 big_page_size = gk20a_get_platform(g->dev)->default_big_page_size;
u32 low_hole, aperture_size;
/*
* No user region - so we will pass that as zero sized.
*/
low_hole = SZ_4K * 16;
aperture_size = GK20A_PMU_VA_SIZE * 2;
mm->pmu.aperture_size = GK20A_PMU_VA_SIZE;
gk20a_dbg_info("pmu vm size = 0x%x", mm->pmu.aperture_size);
gk20a_init_vm(mm, vm, big_page_size,
low_hole,
aperture_size - low_hole,
aperture_size,
true,
false,
"system");
err = gk20a_alloc_inst_block(g, inst_block);
if (err)
goto clean_up_va;
g->ops.mm.init_inst_block(inst_block, vm, big_page_size);
return 0;
clean_up_va:
gk20a_deinit_vm(vm);
return err;
}
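
/* HWPM shares the PMU's VM; it only needs its own instance block. */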
static int gk20a_init_hwpm(struct mm_gk20a *mm)
{
int err;
struct vm_gk20a *vm = &mm->pmu.vm;
struct gk20a *g = gk20a_from_mm(mm);
struct mem_desc *inst_block = &mm->hwpm.inst_block;
err = gk20a_alloc_inst_block(g, inst_block);
if (err)
return err;
g->ops.mm.init_inst_block(inst_block, vm, 0);
return 0;
}
static int gk20a_init_cde_vm(struct mm_gk20a *mm)
{
struct vm_gk20a *vm = &mm->cde.vm;
struct gk20a *g = gk20a_from_mm(mm);
u32 big_page_size = gk20a_get_platform(g->dev)->default_big_page_size;
return gk20a_init_vm(mm, vm, big_page_size,
SZ_4K * 16,
NV_MM_DEFAULT_KERNEL_SIZE,
NV_MM_DEFAULT_KERNEL_SIZE + NV_MM_DEFAULT_USER_SIZE,
false, false, "cde");
}
static int gk20a_init_ce_vm(struct mm_gk20a *mm)
{
struct vm_gk20a *vm = &mm->ce.vm;
struct gk20a *g = gk20a_from_mm(mm);
u32 big_page_size = gk20a_get_platform(g->dev)->default_big_page_size;
return gk20a_init_vm(mm, vm, big_page_size,
SZ_4K * 16,
NV_MM_DEFAULT_KERNEL_SIZE,
NV_MM_DEFAULT_KERNEL_SIZE + NV_MM_DEFAULT_USER_SIZE,
false, false, "ce");
}
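
/*
 * Program the page directory base (aperture target, volatile bit and
 * address) into an instance block.
 */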
void gk20a_mm_init_pdb(struct gk20a *g, struct mem_desc *inst_block,
struct vm_gk20a *vm)
{
u64 pdb_addr = gk20a_mem_get_base_addr(g, &vm->pdb.mem, 0);
u32 pdb_addr_lo = u64_lo32(pdb_addr >> ram_in_base_shift_v());
u32 pdb_addr_hi = u64_hi32(pdb_addr);
gk20a_dbg_info("pde pa=0x%llx", pdb_addr);
gk20a_mem_wr32(g, inst_block, ram_in_page_dir_base_lo_w(),
gk20a_aperture_mask(g, &vm->pdb.mem,
ram_in_page_dir_base_target_sys_mem_ncoh_f(),
ram_in_page_dir_base_target_vid_mem_f()) |
ram_in_page_dir_base_vol_true_f() |
ram_in_page_dir_base_lo_f(pdb_addr_lo));
gk20a_mem_wr32(g, inst_block, ram_in_page_dir_base_hi_w(),
ram_in_page_dir_base_hi_f(pdb_addr_hi));
}
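
/*
 * Fill an instance block: PDB pointer, address limit and, when supported,
 * the big page size.
 */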
void gk20a_init_inst_block(struct mem_desc *inst_block, struct vm_gk20a *vm,
u32 big_page_size)
{
struct gk20a *g = gk20a_from_vm(vm);
gk20a_dbg_info("inst block phys = 0x%llx, kv = 0x%p",
gk20a_mm_inst_block_addr(g, inst_block), inst_block->cpu_va);
g->ops.mm.init_pdb(g, inst_block, vm);
gk20a_mem_wr32(g, inst_block, ram_in_adr_limit_lo_w(),
u64_lo32(vm->va_limit - 1) & ~0xfff);
gk20a_mem_wr32(g, inst_block, ram_in_adr_limit_hi_w(),
ram_in_adr_limit_hi_f(u64_hi32(vm->va_limit - 1)));
if (big_page_size && g->ops.mm.set_big_page_size)
g->ops.mm.set_big_page_size(g, inst_block, big_page_size);
}
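
/*
 * Issue an FB flush and poll for completion; returns -EBUSY if the retry
 * timeout expires first, and is a no-op (returning 0) when the GPU is
 * powered off.
 */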
int gk20a_mm_fb_flush(struct gk20a *g)
{
struct mm_gk20a *mm = &g->mm;
u32 data;
int ret = 0;
struct nvgpu_timeout timeout;
gk20a_dbg_fn("");
gk20a_busy_noresume(g->dev);
if (!g->power_on) {
pm_runtime_put_noidle(g->dev);
return 0;
}
nvgpu_timeout_init(g, &timeout, 100, NVGPU_TIMER_RETRY_TIMER);
mutex_lock(&mm->l2_op_lock);
	/*
	 * Make sure all previous writes are committed to the L2. There's no
	 * guarantee that writes are to DRAM. This will be a sysmembar internal
	 * to the L2.
	 */
trace_gk20a_mm_fb_flush(dev_name(g->dev));
gk20a_writel(g, flush_fb_flush_r(),
flush_fb_flush_pending_busy_f());
do {
data = gk20a_readl(g, flush_fb_flush_r());
if (flush_fb_flush_outstanding_v(data) ==
flush_fb_flush_outstanding_true_v() ||
flush_fb_flush_pending_v(data) ==
flush_fb_flush_pending_busy_v()) {
gk20a_dbg_info("fb_flush 0x%x", data);
udelay(5);
} else
break;
} while (!nvgpu_timeout_expired(&timeout));
if (nvgpu_timeout_peek_expired(&timeout)) {
if (g->ops.fb.dump_vpr_wpr_info)
g->ops.fb.dump_vpr_wpr_info(g);
ret = -EBUSY;
}
trace_gk20a_mm_fb_flush_done(dev_name(g->dev));
mutex_unlock(&mm->l2_op_lock);
pm_runtime_put_noidle(g->dev);
return ret;
}
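
/* Caller must hold mm->l2_op_lock. */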
static void gk20a_mm_l2_invalidate_locked(struct gk20a *g)
{
u32 data;
struct nvgpu_timeout timeout;
trace_gk20a_mm_l2_invalidate(dev_name(g->dev));
nvgpu_timeout_init(g, &timeout, 200, NVGPU_TIMER_RETRY_TIMER);
	/*
	 * Invalidate any clean lines from the L2 so subsequent reads go to
	 * DRAM. Dirty lines are not affected by this operation.
	 */
gk20a_writel(g, flush_l2_system_invalidate_r(),
flush_l2_system_invalidate_pending_busy_f());
do {
data = gk20a_readl(g, flush_l2_system_invalidate_r());
if (flush_l2_system_invalidate_outstanding_v(data) ==
flush_l2_system_invalidate_outstanding_true_v() ||
flush_l2_system_invalidate_pending_v(data) ==
flush_l2_system_invalidate_pending_busy_v()) {
gk20a_dbg_info("l2_system_invalidate 0x%x",
data);
udelay(5);
} else
break;
} while (!nvgpu_timeout_expired(&timeout));
if (nvgpu_timeout_peek_expired(&timeout))
gk20a_warn(dev_from_gk20a(g),
"l2_system_invalidate too many retries");
trace_gk20a_mm_l2_invalidate_done(dev_name(g->dev));
}
void gk20a_mm_l2_invalidate(struct gk20a *g)
{
struct mm_gk20a *mm = &g->mm;
gk20a_busy_noresume(g->dev);
if (g->power_on) {
mutex_lock(&mm->l2_op_lock);
gk20a_mm_l2_invalidate_locked(g);
mutex_unlock(&mm->l2_op_lock);
}
pm_runtime_put_noidle(g->dev);
}
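
/*
 * Flush dirty L2 lines to memory, optionally invalidating the clean lines
 * afterwards. Becomes a no-op when the GPU is powered off.
 */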
void gk20a_mm_l2_flush(struct gk20a *g, bool invalidate)
{
struct mm_gk20a *mm = &g->mm;
u32 data;
struct nvgpu_timeout timeout;
gk20a_dbg_fn("");
gk20a_busy_noresume(g->dev);
if (!g->power_on)
goto hw_was_off;
nvgpu_timeout_init(g, &timeout, 2000, NVGPU_TIMER_RETRY_TIMER);
mutex_lock(&mm->l2_op_lock);
trace_gk20a_mm_l2_flush(dev_name(g->dev));
	/*
	 * Flush all dirty lines from the L2 to DRAM. Lines are left in the L2
	 * as clean, so subsequent reads might hit in the L2.
	 */
gk20a_writel(g, flush_l2_flush_dirty_r(),
flush_l2_flush_dirty_pending_busy_f());
do {
data = gk20a_readl(g, flush_l2_flush_dirty_r());
if (flush_l2_flush_dirty_outstanding_v(data) ==
flush_l2_flush_dirty_outstanding_true_v() ||
flush_l2_flush_dirty_pending_v(data) ==
flush_l2_flush_dirty_pending_busy_v()) {
gk20a_dbg_info("l2_flush_dirty 0x%x", data);
udelay(5);
} else
break;
} while (!nvgpu_timeout_expired_msg(&timeout,
"l2_flush_dirty too many retries"));
trace_gk20a_mm_l2_flush_done(dev_name(g->dev));
if (invalidate)
gk20a_mm_l2_invalidate_locked(g);
mutex_unlock(&mm->l2_op_lock);
hw_was_off:
pm_runtime_put_noidle(g->dev);
}
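
/* Clean compression tag (CBC) data out to the L2. */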
void gk20a_mm_cbc_clean(struct gk20a *g)
{
struct mm_gk20a *mm = &g->mm;
u32 data;
struct nvgpu_timeout timeout;
gk20a_dbg_fn("");
gk20a_busy_noresume(g->dev);
if (!g->power_on)
goto hw_was_off;
nvgpu_timeout_init(g, &timeout, 200, NVGPU_TIMER_RETRY_TIMER);
mutex_lock(&mm->l2_op_lock);
/* Flush all dirty lines from the CBC to L2 */
gk20a_writel(g, flush_l2_clean_comptags_r(),
flush_l2_clean_comptags_pending_busy_f());
do {
data = gk20a_readl(g, flush_l2_clean_comptags_r());
if (flush_l2_clean_comptags_outstanding_v(data) ==
flush_l2_clean_comptags_outstanding_true_v() ||
flush_l2_clean_comptags_pending_v(data) ==
flush_l2_clean_comptags_pending_busy_v()) {
gk20a_dbg_info("l2_clean_comptags 0x%x", data);
udelay(5);
} else
break;
} while (!nvgpu_timeout_expired_msg(&timeout,
"l2_clean_comptags too many retries"));
mutex_unlock(&mm->l2_op_lock);
hw_was_off:
pm_runtime_put_noidle(g->dev);
}
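
/*
 * Translate a GPU VA back to the dma-buf that backs it and the offset of
 * the VA within that mapping.
 */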
int gk20a_vm_find_buffer(struct vm_gk20a *vm, u64 gpu_va,
struct dma_buf **dmabuf,
u64 *offset)
{
struct mapped_buffer_node *mapped_buffer;
gk20a_dbg_fn("gpu_va=0x%llx", gpu_va);
mutex_lock(&vm->update_gmmu_lock);
mapped_buffer = find_mapped_buffer_range_locked(&vm->mapped_buffers,
gpu_va);
if (!mapped_buffer) {
mutex_unlock(&vm->update_gmmu_lock);
return -EINVAL;
}
*dmabuf = mapped_buffer->dmabuf;
*offset = gpu_va - mapped_buffer->addr;
mutex_unlock(&vm->update_gmmu_lock);
return 0;
}
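
/*
 * Invalidate the GMMU TLB for this VM's page directory; serialized by a
 * local tlb_lock and skipped entirely when the GPU is powered off.
 */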
void gk20a_mm_tlb_invalidate(struct vm_gk20a *vm)
{
struct gk20a *g = gk20a_from_vm(vm);
struct nvgpu_timeout timeout;
u32 addr_lo;
u32 data;
static DEFINE_MUTEX(tlb_lock);
gk20a_dbg_fn("");
	/*
	 * Pagetables are considered SW state and are preserved after
	 * prepare_poweroff. When gk20a deinit releases those pagetables,
	 * common code in the VM unmap path calls TLB invalidate, which
	 * touches HW. Use the power_on flag to skip the invalidation while
	 * GPU power is turned off.
	 */
if (!g->power_on)
return;
addr_lo = u64_lo32(gk20a_mem_get_base_addr(g, &vm->pdb.mem, 0) >> 12);
mutex_lock(&tlb_lock);
trace_gk20a_mm_tlb_invalidate(dev_name(g->dev));
nvgpu_timeout_init(g, &timeout, 1000, NVGPU_TIMER_RETRY_TIMER);
do {
data = gk20a_readl(g, fb_mmu_ctrl_r());
if (fb_mmu_ctrl_pri_fifo_space_v(data) != 0)
break;
udelay(2);
} while (!nvgpu_timeout_expired_msg(&timeout,
"wait mmu fifo space"));
if (nvgpu_timeout_peek_expired(&timeout))
goto out;
nvgpu_timeout_init(g, &timeout, 1000, NVGPU_TIMER_RETRY_TIMER);
gk20a_writel(g, fb_mmu_invalidate_pdb_r(),
fb_mmu_invalidate_pdb_addr_f(addr_lo) |
gk20a_aperture_mask(g, &vm->pdb.mem,
fb_mmu_invalidate_pdb_aperture_sys_mem_f(),
fb_mmu_invalidate_pdb_aperture_vid_mem_f()));
gk20a_writel(g, fb_mmu_invalidate_r(),
fb_mmu_invalidate_all_va_true_f() |
fb_mmu_invalidate_trigger_true_f());
do {
data = gk20a_readl(g, fb_mmu_ctrl_r());
if (fb_mmu_ctrl_pri_fifo_empty_v(data) !=
fb_mmu_ctrl_pri_fifo_empty_false_f())
break;
udelay(2);
} while (!nvgpu_timeout_expired_msg(&timeout,
"wait mmu invalidate"));
trace_gk20a_mm_tlb_invalidate_done(dev_name(g->dev));
out:
mutex_unlock(&tlb_lock);
}
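
/*
 * Quiesce the MM before suspend: stop the vidmem clear worker, clean the
 * CBC and flush the L2 without invalidating it.
 */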
int gk20a_mm_suspend(struct gk20a *g)
{
gk20a_dbg_fn("");
cancel_work_sync(&g->mm.vidmem.clear_mem_worker);
g->ops.mm.cbc_clean(g);
g->ops.mm.l2_flush(g, false);
gk20a_dbg_fn("done");
return 0;
}
bool gk20a_mm_mmu_debug_mode_enabled(struct gk20a *g)
{
u32 debug_ctrl = gk20a_readl(g, fb_mmu_debug_ctrl_r());
return fb_mmu_debug_ctrl_debug_v(debug_ctrl) ==
fb_mmu_debug_ctrl_debug_enabled_v();
}
static void gk20a_mm_mmu_set_debug_mode(struct gk20a *g, bool enable)
{
u32 reg_val, debug_ctrl;
reg_val = gk20a_readl(g, fb_mmu_debug_ctrl_r());
if (enable) {
debug_ctrl = fb_mmu_debug_ctrl_debug_enabled_f();
g->mmu_debug_ctrl = true;
} else {
debug_ctrl = fb_mmu_debug_ctrl_debug_disabled_f();
g->mmu_debug_ctrl = false;
}
reg_val = set_field(reg_val,
fb_mmu_debug_ctrl_debug_m(), debug_ctrl);
gk20a_writel(g, fb_mmu_debug_ctrl_r(), reg_val);
}
u32 gk20a_mm_get_physical_addr_bits(struct gk20a *g)
{
return 34;
}
const struct gk20a_mmu_level *gk20a_mm_get_mmu_levels(struct gk20a *g,
u32 big_page_size)
{
return (big_page_size == SZ_64K) ?
gk20a_mm_levels_64k : gk20a_mm_levels_128k;
}
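
/*
 * Report the driver-assigned buffer_id and size for a dma-buf fd,
 * allocating the drvdata on first use.
 */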
int gk20a_mm_get_buffer_info(struct device *dev, int dmabuf_fd,
u64 *buffer_id, u64 *buffer_len)
{
struct dma_buf *dmabuf;
struct gk20a_dmabuf_priv *priv;
int err = 0;
dmabuf = dma_buf_get(dmabuf_fd);
if (IS_ERR(dmabuf)) {
dev_warn(dev, "%s: fd %d is not a dmabuf", __func__, dmabuf_fd);
return PTR_ERR(dmabuf);
}
err = gk20a_dmabuf_alloc_drvdata(dmabuf, dev);
if (err) {
dev_warn(dev, "Failed to allocate dmabuf drvdata (err = %d)",
err);
goto clean_up;
}
priv = dma_buf_get_drvdata(dmabuf, dev);
if (likely(priv)) {
*buffer_id = priv->buffer_id;
*buffer_len = dmabuf->size;
}
clean_up:
dma_buf_put(dmabuf);
return err;
}
static bool gk20a_mm_is_bar1_supported(struct gk20a *g)
{
return true;
}
#ifdef CONFIG_DEBUG_FS
void gk20a_mm_debugfs_init(struct device *dev)
{
struct gk20a_platform *platform = dev_get_drvdata(dev);
struct dentry *gpu_root = platform->debugfs;
struct gk20a *g = gk20a_get_platform(dev)->g;
debugfs_create_x64("separate_fixed_allocs", 0664, gpu_root,
&g->separate_fixed_allocs);
debugfs_create_bool("force_pramin", 0664, gpu_root,
&g->mm.force_pramin);
}
#endif
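
/* Populate the gk20a memory-management entry points in gpu_ops.mm. */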
void gk20a_init_mm(struct gpu_ops *gops)
{
gops->mm.is_debug_mode_enabled = gk20a_mm_mmu_debug_mode_enabled;
gops->mm.set_debug_mode = gk20a_mm_mmu_set_debug_mode;
gops->mm.gmmu_map = gk20a_locked_gmmu_map;
gops->mm.gmmu_unmap = gk20a_locked_gmmu_unmap;
gops->mm.vm_remove = gk20a_vm_remove_support;
gops->mm.vm_alloc_share = gk20a_vm_alloc_share;
gops->mm.vm_bind_channel = gk20a_vm_bind_channel;
gops->mm.fb_flush = gk20a_mm_fb_flush;
gops->mm.l2_invalidate = gk20a_mm_l2_invalidate;
gops->mm.l2_flush = gk20a_mm_l2_flush;
gops->mm.cbc_clean = gk20a_mm_cbc_clean;
gops->mm.tlb_invalidate = gk20a_mm_tlb_invalidate;
gops->mm.get_iova_addr = gk20a_mm_iova_addr;
gops->mm.get_physical_addr_bits = gk20a_mm_get_physical_addr_bits;
gops->mm.get_mmu_levels = gk20a_mm_get_mmu_levels;
gops->mm.init_pdb = gk20a_mm_init_pdb;
gops->mm.init_mm_setup_hw = gk20a_init_mm_setup_hw;
gops->mm.bar1_bind = gk20a_mm_bar1_bind;
gops->mm.init_inst_block = gk20a_init_inst_block;
gops->mm.is_bar1_supported = gk20a_mm_is_bar1_supported;
}