From 9feb2a4347d9ba756bc30669e3c8035765fe8bc4 Mon Sep 17 00:00:00 2001
From: Ketan Patil
Date: Mon, 5 Aug 2024 04:09:13 +0000
Subject: [PATCH] video: tegra: nvmap: Add multithreaded cache flush support

On TOT, NvMap does a page-by-page cache flush, i.e. it takes the virtual
address of each page present in the buffer and performs a cache flush on
it using dcache_by_line_op. This results in very poor performance for
larger buffers: ~70% of the time taken by NvRmMemHandleAllocAttr is
consumed in the cache flush.

Address this perf issue using a multithreaded cache flush:

- Use a threshold value of 32768 pages, which is derived from perf
  experiments and from discussions with the CUDA team about their
  use cases.
- When a cache flush request for >= 32768 pages is made, vmap the pages
  to map them into a contiguous VA range and create n kernel threads,
  where n is the number of online CPUs.
- Divide the above VA range among the threads; each thread then does
  the cache flush on the VA range assigned to it.

This logic results in the following % improvement for alloc tests:

-----------------------------------
Buffer Size in MB | % improvement |
------------------|---------------|
       128        |      52       |
       256        |      56       |
       512        |      57       |
      1024        |      58       |
      1536        |      57       |
      2048        |      58       |
      2560        |      57       |
      3072        |      58       |
      3584        |      58       |
      4096        |      58       |
      4608        |      58       |
      5120        |      58       |
-----------------------------------

Bug 4628529

Change-Id: I803ef5245ff9283fdc3afc497a6b642c97e89c06
Signed-off-by: Ketan Patil
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nv-oot/+/3187871
Reviewed-by: Krishna Reddy
GVS: buildbot_gerritrpt
---
 drivers/video/tegra/nvmap/nvmap_alloc.c     |   5 +-
 drivers/video/tegra/nvmap/nvmap_alloc_int.h |  15 ++-
 drivers/video/tegra/nvmap/nvmap_cache.c     | 106 ++++++++++++++++++--
 3 files changed, 115 insertions(+), 11 deletions(-)
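Note (illustration only, not part of the applied change): the work
division described above is an equal split of the mapped VA range, with
the remainder folded into the last worker. Below is a minimal user-space
sketch of the same scheme, with pthreads standing in for kthreads; all
names in it are hypothetical. Build with: cc sketch.c -lpthread

#include <pthread.h>
#include <stddef.h>
#include <stdlib.h>

#define NTHREADS 4

struct range {
	char *start;
	size_t len;
};

/* Stand-in for the per-thread flush over one VA range; the kernel
 * patch calls __clean_dcache_area_poc() here instead. */
static void *flush_range(void *arg)
{
	struct range *r = arg;

	(void)r;
	return NULL;
}

static void parallel_flush(char *base, size_t len)
{
	pthread_t tid[NTHREADS];
	struct range r[NTHREADS];
	size_t set = len / NTHREADS;
	int i;

	for (i = 0; i < NTHREADS; i++) {
		r[i].start = base + (size_t)i * set;
		/* The last worker also takes the remainder. */
		r[i].len = (i == NTHREADS - 1) ? len - set * (NTHREADS - 1) : set;
		pthread_create(&tid[i], NULL, flush_range, &r[i]);
	}
	for (i = 0; i < NTHREADS; i++)
		pthread_join(tid[i], NULL);
}

int main(void)
{
	size_t len = 1 << 20;
	char *buf = malloc(len);

	if (!buf)
		return 1;
	parallel_flush(buf, len);
	free(buf);
	return 0;
}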
diff --git a/drivers/video/tegra/nvmap/nvmap_alloc.c b/drivers/video/tegra/nvmap/nvmap_alloc.c
index 27596e42..addc5086 100644
--- a/drivers/video/tegra/nvmap/nvmap_alloc.c
+++ b/drivers/video/tegra/nvmap/nvmap_alloc.c
@@ -104,6 +104,7 @@ static int handle_page_alloc(struct nvmap_client *client,
 	gfp_t gfp = GFP_NVMAP | __GFP_ZERO;
 	u64 result;
 #ifdef CONFIG_ARM64_4K_PAGES
+	int cc_index = 0;
 #ifdef NVMAP_CONFIG_PAGE_POOLS
 	int pages_per_big_pg = NVMAP_PP_BIG_PAGE_SIZE >> PAGE_SHIFT;
 #else
@@ -133,6 +134,7 @@ static int handle_page_alloc(struct nvmap_client *client,
 		pages_per_big_pg = nvmap_dev->pool->pages_per_big_pg;
 #endif
 	/* Try to allocate big pages from page allocator */
+	cc_index = page_index;
 	for (i = page_index; i < nr_page && pages_per_big_pg > 1 &&
 		(nr_page - i) >= pages_per_big_pg;
 		i += pages_per_big_pg, page_index += pages_per_big_pg) {
@@ -151,9 +153,10 @@ static int handle_page_alloc(struct nvmap_client *client,
 
 		for (idx = 0; idx < pages_per_big_pg; idx++)
 			pages[i + idx] = nth_page(page, idx);
 
-		nvmap_clean_cache(&pages[i], pages_per_big_pg);
 	}
 
+	nvmap_clean_cache(&pages[cc_index], page_index - cc_index);
+
 	if (check_add_overflow(nvmap_big_page_allocs, (u64)page_index, &result))
 		goto fail;
diff --git a/drivers/video/tegra/nvmap/nvmap_alloc_int.h b/drivers/video/tegra/nvmap/nvmap_alloc_int.h
index 2793f301..559a9323 100644
--- a/drivers/video/tegra/nvmap/nvmap_alloc_int.h
+++ b/drivers/video/tegra/nvmap/nvmap_alloc_int.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
+/* SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
 
 #ifndef __NVMAP_ALLOC_INT_H
 #define __NVMAP_ALLOC_INT_H
@@ -14,6 +14,19 @@
 #define NVMAP_PP_BIG_PAGE_SIZE	(0x10000)
 #endif /* CONFIG_ARM64_4K_PAGES */
 
+/*
+ * Indicate the threshold number of pages after which
+ * the multithreaded cache flush will be used.
+ */
+#define THRESHOLD_PAGES_CACHE_FLUSH	32768
+
+struct nvmap_cache_thread {
+	pid_t thread_id;
+	void *va_start;
+	size_t size;
+	struct task_struct *task;
+};
+
 struct dma_coherent_mem_replica {
 	void *virt_base;
 	dma_addr_t device_base;
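A worked example of the split that the nvmap_cache.c changes below
implement (numbers purely illustrative): with numpages = 100000 and 12
online CPUs, set_size covers 100000 / 12 = 8333 pages per thread, and
last_set_size covers the remaining 100000 - 11 * 8333 = 8337 pages, so
the per-thread ranges cover the whole mapping exactly once.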
diff --git a/drivers/video/tegra/nvmap/nvmap_cache.c b/drivers/video/tegra/nvmap/nvmap_cache.c
index d1bc0463..90eea88d 100644
--- a/drivers/video/tegra/nvmap/nvmap_cache.c
+++ b/drivers/video/tegra/nvmap/nvmap_cache.c
@@ -1,23 +1,23 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2011-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2011-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  */
 
 #define pr_fmt(fmt) "nvmap: %s() " fmt, __func__
 
-#include
 #include
-#include
-#include
-#include
-
+#include
+#include
 #include
-
+#include
+#include
 #include
+#include
+#include
+#include
 
 __weak struct arm64_ftr_reg arm64_ftr_reg_ctrel0;
 #include
-#include
 #include "nvmap_dev.h"
 #include "nvmap_alloc.h"
 #include "nvmap_alloc_int.h"
@@ -44,9 +44,17 @@ void nvmap_clean_cache_page(struct page *page)
 	__clean_dcache_area_poc(page_address(page), PAGE_SIZE);
 }
 
+static int threaded_cache_flush(void *arg)
+{
+	struct nvmap_cache_thread *t_data = (struct nvmap_cache_thread *)arg;
+
+	__clean_dcache_area_poc(t_data->va_start, t_data->size);
+	return 0;
+}
+
 void nvmap_clean_cache(struct page **pages, int numpages)
 {
-	int i;
+	int i = 0;
 
 	/* Not technically a flush but that's what nvmap knows about. */
 	nvmap_stats_inc(NS_CFLUSH_DONE, numpages << PAGE_SHIFT);
@@ -55,6 +63,86 @@ void nvmap_clean_cache(struct page **pages, int numpages)
 			nvmap_stats_read(NS_CFLUSH_RQ),
 			nvmap_stats_read(NS_CFLUSH_DONE));
 
+	/*
+	 * If numpages is at least THRESHOLD_PAGES_CACHE_FLUSH, then do a
+	 * threaded cache flush with as many threads as online CPUs.
+	 */
+	if (numpages >= THRESHOLD_PAGES_CACHE_FLUSH) {
+		/* Map pages in kernel VA space */
+		void *vaddr;
+		int online_cpus = num_online_cpus();
+		struct nvmap_cache_thread **td_array = nvmap_altalloc(online_cpus * sizeof(*td_array));
+		int created_threads = 0, j;
+		size_t set_size, last_set_size;
+
+		if (!td_array) {
+			pr_err("td_array allocation failed\n");
+			goto page_by_page_flush;
+		}
+
+		vaddr = vmap(pages, numpages, VM_MAP, PAGE_KERNEL);
+		if (vaddr == NULL) {
+			pr_err("vmap failed\n");
+			nvmap_altfree(td_array, online_cpus * sizeof(*td_array));
+			goto page_by_page_flush;
+		}
+
+		set_size = ((unsigned long long)numpages / online_cpus) << PAGE_SHIFT;
+
+		/*
+		 * The last thread should flush all the remaining pages,
+		 * as numpages may not always be divisible by the number
+		 * of online CPUs.
+		 */
+		last_set_size = ((unsigned long long)numpages - ((set_size >> PAGE_SHIFT)
+					* (online_cpus - 1))) << PAGE_SHIFT;
+
+		for (i = 0; i < online_cpus; i++) {
+			td_array[i] = nvmap_altalloc(sizeof(struct nvmap_cache_thread));
+			if (!td_array[i]) {
+				pr_err("failed to allocate memory for nvmap_cache_thread\n");
+				goto stop_threads;
+			}
+
+			td_array[i]->thread_id = i + 1;
+			td_array[i]->size = (i == online_cpus - 1) ?
+						last_set_size : set_size;
+			td_array[i]->va_start = vaddr + i * set_size;
+			td_array[i]->task = kthread_run(
+					threaded_cache_flush, td_array[i],
+					"nvmap_cache_flush_thread_%d", i);
+			if (IS_ERR(td_array[i]->task)) {
+				pr_err("failed to create kernel thread:%d\n", i);
+				nvmap_altfree(td_array[i], sizeof(struct nvmap_cache_thread));
+				goto stop_threads;
+			}
+
+			get_task_struct(td_array[i]->task);
+			created_threads++;
+		}
+
+stop_threads:
+		for (j = 0; j < created_threads; j++) {
+			if (!IS_ERR_OR_NULL(td_array[j]->task)) {
+				kthread_stop(td_array[j]->task);
+				put_task_struct(td_array[j]->task);
+			}
+		}
+
+		while (--i >= 0) {
+			nvmap_altfree(td_array[i], sizeof(struct nvmap_cache_thread));
+		}
+
+		vunmap(vaddr);
+		nvmap_altfree(td_array, online_cpus * sizeof(*td_array));
+
+		if (created_threads != online_cpus)
+			goto page_by_page_flush;
+
+		return;
+	}
+
+page_by_page_flush:
 	for (i = 0; i < numpages; i++)
 		nvmap_clean_cache_page(pages[i]);
 }
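The spawn/join protocol used in nvmap_clean_cache() reduces to the
minimal kernel-style sketch below (hypothetical function names;
kthread_run(), kthread_stop(), get_task_struct() and put_task_struct()
are the real APIs). Taking a task reference immediately after
kthread_run() is what keeps the task_struct valid even if a worker
finishes before kthread_stop() reaps it:

#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/sched/task.h>

static int demo_worker(void *arg)
{
	/* Do the work, then exit; kthread_stop() reaps us afterwards. */
	return 0;
}

static int run_and_join(void *arg)
{
	struct task_struct *task;

	task = kthread_run(demo_worker, arg, "demo_worker");
	if (IS_ERR(task))
		return PTR_ERR(task);

	/* Hold a reference so the task_struct cannot be freed between
	 * the worker exiting and kthread_stop() being called. */
	get_task_struct(task);

	/* kthread_stop() waits for demo_worker() to return. */
	kthread_stop(task);
	put_task_struct(task);
	return 0;
}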