Files
linux-nv-oot/drivers/video/tegra/nvmap/nvmap_cache.c
Ketan Patil 9feb2a4347 video: tegra: nvmap: Add multithreaded cache flush support
On TOT, NvMap does a page-by-page cache flush, i.e. it takes the virtual
address of each page present in the buffer and then performs a cache
flush on it using dcache_by_line_op. This results in very poor
performance for larger buffers: ~70% of the time taken by
NvRmMemHandleAllocAttr is consumed in the cache flush.
Address this perf issue using a multithreaded cache flush:
- Use a threshold of 32768 pages, derived from perf experiments and
from discussions with the CUDA team about their use cases.
- When a cache flush of >= 32768 pages is requested, vmap the pages
into a contiguous kernel VA range and create n kernel threads, where
n is the number of online CPUs.
- Divide that VA range among the threads; each thread performs the
cache flush on the sub-range assigned to it (illustrated below).
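For illustration only (the identifiers below are shorthand, not verbatim
driver code), the per-thread split works out as:

    set_size      = (numpages / n) << PAGE_SHIFT;   /* bytes per thread */
    last_set_size = (numpages - (numpages / n) * (n - 1)) << PAGE_SHIFT;
    /* thread i flushes set_size bytes starting at vaddr + i * set_size;
     * the last thread uses last_set_size so remainder pages are covered */

e.g. 32770 pages split across 8 online CPUs gives seven 16 MB sub-ranges
plus a final 16 MB + 8 KiB sub-range; the 32768-page threshold itself
corresponds to a 128 MB buffer with 4 KiB pages.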

This logic results in the following % improvement for alloc tests.
-----------------------------------
Buffer Size in MB | % improvement |
----------------------------------|
128               |   52          |
256               |   56          |
512               |   57          |
1024              |   58          |
1536              |   57          |
2048              |   58          |
2560              |   57          |
3072              |   58          |
3584              |   58          |
4096              |   58          |
4608              |   58          |
5120              |   58          |
-----------------------------------

Bug 4628529

Change-Id: I803ef5245ff9283fdc3afc497a6b642c97e89c06
Signed-off-by: Ketan Patil <ketanp@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nv-oot/+/3187871
Reviewed-by: Krishna Reddy <vdumpa@nvidia.com>
GVS: buildbot_gerritrpt <buildbot_gerritrpt@nvidia.com>
2025-07-24 10:19:14 +00:00

// SPDX-License-Identifier: GPL-2.0-only
/*
 * SPDX-FileCopyrightText: Copyright (c) 2011-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */
#define pr_fmt(fmt) "nvmap: %s() " fmt, __func__
#include <linux/debugfs.h>
#include <linux/io.h>
#include <linux/kthread.h>
#include <linux/libnvdimm.h>
#include <linux/of.h>
#include <linux/rtmutex.h>
#include <linux/sys_soc.h>
#include <linux/version.h>
#include <linux/vmalloc.h>
#include <soc/tegra/fuse.h>
__weak struct arm64_ftr_reg arm64_ftr_reg_ctrel0;
#include <trace/events/nvmap.h>
#include "nvmap_dev.h"
#include "nvmap_alloc.h"
#include "nvmap_alloc_int.h"
#include "nvmap_handle.h"
#include "nvmap_dmabuf.h"
#include "nvmap_debug.h"
#ifdef CONFIG_ARM64
#define PG_PROT_KERNEL PAGE_KERNEL
#else
#define PG_PROT_KERNEL pgprot_kernel
#endif
extern void __clean_dcache_area_poc(void *addr, size_t len);
/*
 * FIXME:
 *
 * __clean_dcache_page() is only available on ARM64 (well, we haven't
 * implemented it on ARMv7).
 */
void nvmap_clean_cache_page(struct page *page)
{
	__clean_dcache_area_poc(page_address(page), PAGE_SIZE);
}
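/*
 * Worker body for the multithreaded flush path: clean the contiguous
 * kernel VA sub-range described by its nvmap_cache_thread argument to
 * the point of coherency.
 */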
static int threaded_cache_flush(void *arg)
{
	struct nvmap_cache_thread *t_data = (struct nvmap_cache_thread *)arg;

	__clean_dcache_area_poc((void *)t_data->va_start, t_data->size);
	return 0;
}
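/*
 * Clean the data cache for an array of pages. Requests of at least
 * THRESHOLD_PAGES_CACHE_FLUSH pages are vmap()ed into one contiguous VA
 * range and split across one kthread per online CPU; smaller requests,
 * or any failure in the threaded path, fall back to a page-by-page clean.
 */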
void nvmap_clean_cache(struct page **pages, int numpages)
{
	int i = 0;

	/* Not technically a flush but that's what nvmap knows about. */
	nvmap_stats_inc(NS_CFLUSH_DONE, numpages << PAGE_SHIFT);
	trace_nvmap_cache_flush(numpages << PAGE_SHIFT,
			nvmap_stats_read(NS_ALLOC),
			nvmap_stats_read(NS_CFLUSH_RQ),
			nvmap_stats_read(NS_CFLUSH_DONE));

	/*
	 * If the request covers THRESHOLD_PAGES_CACHE_FLUSH pages or more,
	 * do a threaded cache flush where the number of threads equals the
	 * number of online CPUs.
	 */
	if (numpages >= THRESHOLD_PAGES_CACHE_FLUSH) {
		/* Map pages in kernel VA space */
		void *vaddr;
		int online_cpus = num_online_cpus();
		struct nvmap_cache_thread **td_array =
				nvmap_altalloc(online_cpus * sizeof(*td_array));
		int created_threads = 0, j;
		size_t set_size, last_set_size;

		if (!td_array) {
			pr_err("td_array allocation failed\n");
			goto page_by_page_flush;
		}

		vaddr = vmap(pages, numpages, VM_MAP, PAGE_KERNEL);
		if (vaddr == NULL) {
			pr_err("vmap failed\n");
			nvmap_altfree(td_array, online_cpus * sizeof(*td_array));
			goto page_by_page_flush;
		}

		set_size = ((unsigned long long)numpages / online_cpus) << PAGE_SHIFT;
		/*
		 * The last thread flushes all remaining pages, as numpages
		 * may not always be divisible by the number of online CPUs.
		 */
		last_set_size = (unsigned long long)(numpages -
				((set_size >> PAGE_SHIFT) * (online_cpus - 1))) << PAGE_SHIFT;

		for (i = 0; i < online_cpus; i++) {
			td_array[i] = nvmap_altalloc(sizeof(struct nvmap_cache_thread));
			if (!td_array[i]) {
				pr_err("failed to allocate memory for nvmap_cache_thread\n");
				goto stop_threads;
			}
			td_array[i]->thread_id = i + 1;
			td_array[i]->size = (i == online_cpus - 1) ? last_set_size : set_size;
			td_array[i]->va_start = vaddr + i * set_size;
			td_array[i]->task = kthread_run(
					threaded_cache_flush, td_array[i],
					"nvmap_cache_flush_thread_%d", i);
			if (IS_ERR(td_array[i]->task)) {
				pr_err("failed to create kernel thread:%d\n", i);
				goto stop_threads;
			}
			get_task_struct(td_array[i]->task);
			created_threads++;
		}

stop_threads:
		for (j = 0; j < created_threads; j++) {
			if (!IS_ERR_OR_NULL(td_array[j]->task)) {
				kthread_stop(td_array[j]->task);
				put_task_struct(td_array[j]->task);
			}
		}
		while (--i >= 0) {
			nvmap_altfree(td_array[i], sizeof(struct nvmap_cache_thread));
		}
		vunmap(vaddr);
		nvmap_altfree(td_array, online_cpus * sizeof(*td_array));
		if (created_threads != online_cpus)
			goto page_by_page_flush;
		return;
	}

page_by_page_flush:
	for (i = 0; i < numpages; i++)
		nvmap_clean_cache_page(pages[i]);
}
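/*
 * Dispatch an inner (CPU) cache maintenance operation on a kernel VA
 * range: write-back + invalidate, invalidate, or write-back (clean).
 */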
void inner_cache_maint(unsigned int op, void *vaddr, size_t size)
{
	if (op == NVMAP_CACHE_OP_WB_INV)
		arch_invalidate_pmem(vaddr, size);
	else if (op == NVMAP_CACHE_OP_INV)
		__dma_map_area_from_device(vaddr, size);
	else
		__dma_map_area_to_device(vaddr, size);
}
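/*
 * Cache maintenance over [start, end) of a page-allocated handle. When a
 * kernel mapping of the handle is available, the inner operation is done
 * in a single pass over that mapping; otherwise, and for outer
 * maintenance, the range is walked page by page by physical address.
 */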
static int heap_page_cache_maint(
	struct nvmap_handle *h, unsigned long start, unsigned long end,
	unsigned int op, bool inner, bool outer, bool clean_only_dirty)
{
	unsigned long difference;

	if (check_sub_overflow(end, start, &difference))
		return -EOVERFLOW;

	/* Don't perform cache maint for RO mapped buffers */
	if (h->from_va && h->is_ro)
		return 0;

	if (h->userflags & NVMAP_HANDLE_CACHE_SYNC) {
		/*
		 * zap user VA->PA mappings so that any access to the pages
		 * will result in a fault and can be marked dirty
		 */
		nvmap_handle_mkclean(h, start, difference);
	}

	if (inner) {
		if (h->vaddr == NULL) {
			if (__nvmap_mmap(h))
				__nvmap_munmap(h, h->vaddr);
			else
				goto per_page_cache_maint;
		}
		/* Fast inner cache maintenance using single mapping */
		inner_cache_maint(op, h->vaddr + start, difference);
		if (!outer)
			return 0;
		/* Skip per-page inner maintenance in loop below */
		inner = false;
	}
per_page_cache_maint:
	while (start < end) {
		struct page *page;
		phys_addr_t paddr;
		unsigned long next;
		unsigned long off;
		size_t size;
		int ret;
		phys_addr_t sum;

		page = nvmap_to_page(h->pgalloc.pages[start >> PAGE_SHIFT]);
		next = min(((start + PAGE_SIZE) & PAGE_MASK), end);
		off = start & ~PAGE_MASK;
		size = next - start;
		if (check_add_overflow((phys_addr_t)page_to_phys(page), (phys_addr_t)off, &sum))
			return -EOVERFLOW;
		paddr = sum;
		if (check_add_overflow(paddr, (phys_addr_t)size, &sum))
			return -EOVERFLOW;
		ret = nvmap_cache_maint_phys_range(op, paddr, sum,
				inner, outer);
		WARN_ON(ret != 0);

		start = next;
	}
	return 0;
}
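/* Describes one cache maintenance request on an nvmap handle. */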
struct cache_maint_op {
	phys_addr_t start;
	phys_addr_t end;
	unsigned int op;
	struct nvmap_handle *h;
	bool inner;
	bool outer;
	bool clean_only_dirty;
};
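/*
 * Inner cache maintenance over a physical address range, ioremap()ing and
 * flushing one page at a time. Outer maintenance is a no-op here.
 */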
int nvmap_cache_maint_phys_range(unsigned int op, phys_addr_t pstart,
				 phys_addr_t pend, int inner, int outer)
{
	void __iomem *io_addr;
	phys_addr_t loop;

	if (!inner)
		goto do_outer;

	loop = pstart;
	while (loop < pend) {
		phys_addr_t next = (loop + PAGE_SIZE) & PAGE_MASK;
		void *base;

		next = min(next, pend);
#if defined(CONFIG_GENERIC_IOREMAP)
		io_addr = ioremap_prot(loop, PAGE_SIZE, pgprot_val(PAGE_KERNEL));
#else
		io_addr = __ioremap(loop, PAGE_SIZE, PG_PROT_KERNEL);
#endif
		if (io_addr == NULL)
			return -ENOMEM;

		base = (__force void *)io_addr + (loop & ~PAGE_MASK);
		inner_cache_maint(op, base, next - loop);
		iounmap(io_addr);
		loop = next;
	}
do_outer:
	return 0;
}
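/*
 * Execute the cache maintenance described by @cache_work. Uncacheable and
 * write-combined handles are skipped; page-allocated handles go through
 * heap_page_cache_maint(), others use the handle's kernel mapping or fall
 * back to physical-range maintenance on the carveout.
 */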
static int do_cache_maint(struct cache_maint_op *cache_work)
{
	phys_addr_t pstart = cache_work->start;
	phys_addr_t pend = cache_work->end;
	int err = 0;
	struct nvmap_handle *h = cache_work->h;
	unsigned int op = cache_work->op;
	phys_addr_t difference;

	if (!h || !h->alloc)
		return -EFAULT;

	wmb();
	if (h->flags == NVMAP_HANDLE_UNCACHEABLE ||
	    h->flags == NVMAP_HANDLE_WRITE_COMBINE || pstart == pend)
		goto out;

	trace_nvmap_cache_maint(h->owner, h, pstart, pend, op, pend - pstart);
	if (pstart > h->size || pend > h->size) {
		pr_warn("cache maintenance outside handle\n");
		err = -EINVAL;
		goto out;
	}

	if (h->heap_pgalloc) {
		err = heap_page_cache_maint(h, pstart, pend, op, true,
				(h->flags == NVMAP_HANDLE_INNER_CACHEABLE) ?
				false : true, cache_work->clean_only_dirty);
		if (err != 0)
			err = -EOVERFLOW;
		goto out;
	}

	if (!h->vaddr) {
		if (__nvmap_mmap(h))
			__nvmap_munmap(h, h->vaddr);
		else
			goto per_page_phy_cache_maint;
	}
	inner_cache_maint(op, h->vaddr + pstart, pend - pstart);
	goto out;

per_page_phy_cache_maint:
	pstart += h->carveout->base;
	pend += h->carveout->base;
	err = nvmap_cache_maint_phys_range(op, pstart, pend, true,
			h->flags != NVMAP_HANDLE_INNER_CACHEABLE);

out:
	if (!err && !check_sub_overflow(pend, pstart, &difference)) {
		nvmap_stats_inc(NS_CFLUSH_DONE, difference);
		trace_nvmap_cache_flush(difference,
			nvmap_stats_read(NS_ALLOC),
			nvmap_stats_read(NS_CFLUSH_RQ),
			nvmap_stats_read(NS_CFLUSH_DONE));
	}
	return 0;
}
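/* Derive inner/outer cacheability of a handle from its allocation flags. */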
static void nvmap_handle_get_cacheability(struct nvmap_handle *h,
					  bool *inner, bool *outer)
{
	*inner = h->flags == NVMAP_HANDLE_CACHEABLE ||
		 h->flags == NVMAP_HANDLE_INNER_CACHEABLE;
	*outer = h->flags == NVMAP_HANDLE_CACHEABLE;
}
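/*
 * Validate a cache maintenance request against the handle, normalise the
 * operation (a plain invalidate is promoted to write-back + invalidate)
 * and hand it off to do_cache_maint().
 */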
int __nvmap_do_cache_maint(struct nvmap_client *client,
			   struct nvmap_handle *h,
			   unsigned long start, unsigned long end,
			   unsigned int op, bool clean_only_dirty)
{
	int err;
	struct cache_maint_op cache_op;

	h = nvmap_handle_get(h);
	if (!h)
		return -EFAULT;

	if ((start >= h->size) || (end > h->size)) {
		pr_debug("%s start: %ld end: %ld h->size: %zu\n", __func__,
			 start, end, h->size);
		nvmap_handle_put(h);
		return -EFAULT;
	}

	if (!(h->heap_type & nvmap_dev->cpu_access_mask)) {
		pr_debug("%s heap_type %u access_mask 0x%x\n", __func__,
			 h->heap_type, nvmap_dev->cpu_access_mask);
		nvmap_handle_put(h);
		return -EPERM;
	}

	nvmap_kmaps_inc(h);
	if (op == NVMAP_CACHE_OP_INV)
		op = NVMAP_CACHE_OP_WB_INV;

	/* clean only dirty is applicable only for Write Back operation */
	if (op != NVMAP_CACHE_OP_WB)
		clean_only_dirty = false;

	cache_op.h = h;
	cache_op.start = start ? start : 0;
	cache_op.end = end ? end : h->size;
	cache_op.op = op;
	nvmap_handle_get_cacheability(h, &cache_op.inner, &cache_op.outer);
	cache_op.clean_only_dirty = clean_only_dirty;
	nvmap_stats_inc(NS_CFLUSH_RQ, end - start);
	err = do_cache_maint(&cache_op);
	nvmap_kmaps_dec(h);
	nvmap_handle_put(h);
	return err;
}
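/*
 * Ioctl-level entry point: translate the user VA range in @op into an
 * offset range within the handle backing that VMA and forward it to
 * __nvmap_do_cache_maint().
 */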
int __nvmap_cache_maint(struct nvmap_client *client,
			struct nvmap_cache_op_64 *op)
{
	struct vm_area_struct *vma;
	struct nvmap_vma_priv *priv;
	struct nvmap_handle *handle;
	unsigned long start;
	unsigned long end;
	unsigned long sum;
	int err = 0;

	if (!op->addr || op->op < NVMAP_CACHE_OP_WB ||
	    op->op > NVMAP_CACHE_OP_WB_INV)
		return -EINVAL;

	handle = nvmap_handle_get_from_id(client, op->handle);
	if (IS_ERR_OR_NULL(handle))
		return -EINVAL;

	nvmap_acquire_mmap_read_lock(current->mm);

	vma = find_vma(current->active_mm, (unsigned long)op->addr);
	if (vma == NULL || is_nvmap_vma(vma) == 0 ||
	    (ulong)op->addr < vma->vm_start ||
	    (ulong)op->addr >= vma->vm_end ||
	    op->len > vma->vm_end - (ulong)op->addr) {
		err = -EADDRNOTAVAIL;
		goto out;
	}

	priv = (struct nvmap_vma_priv *)vma->vm_private_data;

	if (priv->handle != handle) {
		err = -EFAULT;
		goto out;
	}

	start = (unsigned long)op->addr - vma->vm_start +
		(vma->vm_pgoff << PAGE_SHIFT);
	if (check_add_overflow(start, (unsigned long)op->len, &sum)) {
		err = -EOVERFLOW;
		goto out;
	}
	end = sum;

	err = __nvmap_do_cache_maint(client, priv->handle, start, end, op->op,
				     false);
out:
	nvmap_release_mmap_read_lock(current->mm);
	nvmap_handle_put(handle);
	return err;
}