diff --git a/drivers/video/tegra/nvmap/nvmap_alloc.c b/drivers/video/tegra/nvmap/nvmap_alloc.c
index 27596e42..addc5086 100644
--- a/drivers/video/tegra/nvmap/nvmap_alloc.c
+++ b/drivers/video/tegra/nvmap/nvmap_alloc.c
@@ -104,6 +104,7 @@ static int handle_page_alloc(struct nvmap_client *client,
 	gfp_t gfp = GFP_NVMAP | __GFP_ZERO;
 	u64 result;
 #ifdef CONFIG_ARM64_4K_PAGES
+	int cc_index = 0;
 #ifdef NVMAP_CONFIG_PAGE_POOLS
 	int pages_per_big_pg = NVMAP_PP_BIG_PAGE_SIZE >> PAGE_SHIFT;
 #else
@@ -133,6 +134,7 @@ static int handle_page_alloc(struct nvmap_client *client,
 	pages_per_big_pg = nvmap_dev->pool->pages_per_big_pg;
 #endif
 	/* Try to allocate big pages from page allocator */
+	cc_index = page_index;
 	for (i = page_index; i < nr_page && pages_per_big_pg > 1 &&
 		(nr_page - i) >= pages_per_big_pg;
 		i += pages_per_big_pg, page_index += pages_per_big_pg) {
@@ -151,9 +153,10 @@ static int handle_page_alloc(struct nvmap_client *client,
 		for (idx = 0; idx < pages_per_big_pg; idx++)
 			pages[i + idx] = nth_page(page, idx);
 
-		nvmap_clean_cache(&pages[i], pages_per_big_pg);
 	}
 
+	nvmap_clean_cache(&pages[cc_index], page_index - cc_index);
+
 	if (check_add_overflow(nvmap_big_page_allocs, (u64)page_index, &result))
 		goto fail;
 
diff --git a/drivers/video/tegra/nvmap/nvmap_alloc_int.h b/drivers/video/tegra/nvmap/nvmap_alloc_int.h
index 2793f301..559a9323 100644
--- a/drivers/video/tegra/nvmap/nvmap_alloc_int.h
+++ b/drivers/video/tegra/nvmap/nvmap_alloc_int.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
+/* SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
 
 #ifndef __NVMAP_ALLOC_INT_H
 #define __NVMAP_ALLOC_INT_H
@@ -14,6 +14,19 @@
 #define NVMAP_PP_BIG_PAGE_SIZE	(0x10000)
 #endif /* CONFIG_ARM64_4K_PAGES */
 
+/*
+ * Indicate the threshold number of pages after which
+ * the multithreaded cache flush will be used.
+ */
+#define THRESHOLD_PAGES_CACHE_FLUSH	32768
+
+struct nvmap_cache_thread {
+	pid_t thread_id;
+	void *va_start;
+	size_t size;
+	struct task_struct *task;
+};
+
 struct dma_coherent_mem_replica {
 	void *virt_base;
 	dma_addr_t device_base;
diff --git a/drivers/video/tegra/nvmap/nvmap_cache.c b/drivers/video/tegra/nvmap/nvmap_cache.c
index d1bc0463..90eea88d 100644
--- a/drivers/video/tegra/nvmap/nvmap_cache.c
+++ b/drivers/video/tegra/nvmap/nvmap_cache.c
@@ -1,23 +1,23 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2011-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2011-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  */
 
 #define pr_fmt(fmt)	"nvmap: %s() " fmt, __func__
 
-#include
 #include
-#include
-#include
-#include
-
+#include
+#include
 #include
-
+#include
+#include
+#include
+#include
+#include
+#include
 __weak struct arm64_ftr_reg arm64_ftr_reg_ctrel0;
 
 #include
-#include
 #include "nvmap_dev.h"
 #include "nvmap_alloc.h"
 #include "nvmap_alloc_int.h"
@@ -44,9 +44,17 @@ void nvmap_clean_cache_page(struct page *page)
 	__clean_dcache_area_poc(page_address(page), PAGE_SIZE);
 }
 
+static int threaded_cache_flush(void *arg)
+{
+	struct nvmap_cache_thread *t_data = (struct nvmap_cache_thread *)arg;
+
+	__clean_dcache_area_poc((void *)t_data->va_start, t_data->size);
+	return 0;
+}
+
 void nvmap_clean_cache(struct page **pages, int numpages)
 {
-	int i;
+	int i = 0;
 
 	/* Not technically a flush but that's what nvmap knows about. */
 	nvmap_stats_inc(NS_CFLUSH_DONE, numpages << PAGE_SHIFT);
@@ -55,6 +63,85 @@ void nvmap_clean_cache(struct page **pages, int numpages)
 		nvmap_stats_read(NS_CFLUSH_RQ),
 		nvmap_stats_read(NS_CFLUSH_DONE));
 
+	/*
+	 * If there are at least THRESHOLD_PAGES_CACHE_FLUSH pages, do a threaded
+	 * cache flush where the number of threads equals the number of online CPUs.
+	 */
+	if (numpages >= THRESHOLD_PAGES_CACHE_FLUSH) {
+		/* Map pages in kernel VA space */
+		void *vaddr;
+		int online_cpus = num_online_cpus();
+		struct nvmap_cache_thread **td_array = nvmap_altalloc(online_cpus *
+							sizeof(*td_array));
+		int created_threads = 0, j;
+		size_t set_size, last_set_size;
+
+		if (!td_array) {
+			pr_err("td_array allocation failed\n");
+			goto page_by_page_flush;
+		}
+
+		vaddr = vmap(pages, numpages, VM_MAP, PAGE_KERNEL);
+		if (vaddr == NULL) {
+			pr_err("vmap failed\n");
+			nvmap_altfree(td_array, online_cpus * sizeof(*td_array));
+			goto page_by_page_flush;
+		}
+
+		set_size = ((unsigned long long)numpages / online_cpus) << PAGE_SHIFT;
+
+		/*
+		 * The last thread should flush the entire remaining
+		 * pages, as numpages may not be always divisible by
+		 * number of online_cpus.
+		 */
+		last_set_size = ((unsigned long long)numpages << PAGE_SHIFT) -
+					(set_size * (online_cpus - 1));
+
+		for (i = 0; i < online_cpus; i++) {
+			td_array[i] = nvmap_altalloc(sizeof(struct nvmap_cache_thread));
+			if (!td_array[i]) {
+				pr_err("failed to allocate memory for nvmap_cache_thread\n");
+				goto stop_threads;
+			}
+
+			td_array[i]->thread_id = i + 1;
+			td_array[i]->size = (i == online_cpus - 1) ? last_set_size : set_size;
+			td_array[i]->va_start = vaddr + i * set_size;
+			td_array[i]->task = kthread_run(
+					threaded_cache_flush, td_array[i],
+					"nvmap_cache_flush_thread_%d", i);
+			if (IS_ERR(td_array[i]->task)) {
+				pr_err("failed to create kernel thread:%d\n", i);
+				goto stop_threads;
+			}
+
+			get_task_struct(td_array[i]->task);
+			created_threads++;
+		}
+
+stop_threads:
+		for (j = 0; j < created_threads; j++) {
+			if (!IS_ERR_OR_NULL(td_array[j]->task)) {
+				kthread_stop(td_array[j]->task);
+				put_task_struct(td_array[j]->task);
+			}
+		}
+
+		while (--i >= 0) {
+			nvmap_altfree(td_array[i], sizeof(struct nvmap_cache_thread));
+		}
+
+		vunmap(vaddr);
+		nvmap_altfree(td_array, online_cpus * sizeof(*td_array));
+
+		if (created_threads != online_cpus)
+			goto page_by_page_flush;
+
+		return;
+	}
+
+page_by_page_flush:
 	for (i = 0; i < numpages; i++)
 		nvmap_clean_cache_page(pages[i]);
 }
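The threaded path in the last hunk divides the vmap()'d range into one contiguous byte run per online CPU: each of the first online_cpus - 1 workers cleans set_size bytes, and the last worker also absorbs the remainder, since numpages need not be divisible by the CPU count. The userspace sketch below reproduces only that partitioning arithmetic so the remainder handling can be checked in isolation; it assumes 4 KiB pages, and split_flush_range / flush_chunk are illustrative names, not part of the driver.

#include <stdio.h>
#include <stddef.h>

#define PAGE_SHIFT 12	/* 4 KiB pages, as with CONFIG_ARM64_4K_PAGES */

/* Per-worker slice of the mapped range (an offset here, vaddr + offset in the patch). */
struct flush_chunk {
	size_t offset;	/* byte offset into the vmap()'d range */
	size_t size;	/* bytes this worker cleans            */
};

/*
 * Tile numpages pages across nthreads workers: the first nthreads - 1 workers
 * each get (numpages / nthreads) pages worth of bytes, the last worker gets
 * its share plus whatever is left over.
 */
static void split_flush_range(size_t numpages, int nthreads,
			      struct flush_chunk *chunks)
{
	size_t set_size = (numpages / nthreads) << PAGE_SHIFT;
	size_t total    = numpages << PAGE_SHIFT;
	size_t last     = total - set_size * (size_t)(nthreads - 1);
	int i;

	for (i = 0; i < nthreads; i++) {
		chunks[i].offset = (size_t)i * set_size;
		chunks[i].size   = (i == nthreads - 1) ? last : set_size;
	}
}

int main(void)
{
	struct flush_chunk chunks[4];
	size_t numpages = 32770;	/* just above the 32768-page threshold, not divisible by 4 */
	size_t covered = 0;
	int i;

	split_flush_range(numpages, 4, chunks);
	for (i = 0; i < 4; i++) {
		printf("worker %d: offset=%zu size=%zu bytes\n",
		       i, chunks[i].offset, chunks[i].size);
		covered += chunks[i].size;
	}
	printf("covered %zu bytes, expected %zu\n", covered, numpages << PAGE_SHIFT);
	return 0;
}

Built with any C compiler, the final line should report that the chunks cover exactly numpages << PAGE_SHIFT bytes, which is the property the "last thread flushes the remaining pages" comment in the patch relies on.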