video: tegra: nvmap: Add multithreaded cache flush support

On TOT, NvMap does a page-by-page cache flush, i.e. it takes the virtual
address of each page present in the buffer and then performs a cache
flush on it using dcache_by_line_op. This results in very poor
performance for larger buffers: ~70% of the time taken by
NvRmMemHandleAllocAttr is consumed by the cache flush.
Address this perf issue with a multithreaded cache flush:
- Use a threshold of 32768 pages, derived from perf experiments and
  from discussions with the CUDA team about their use cases.
- When a cache flush of >= 32768 pages is requested, vmap the pages
  into a contiguous VA range and create n kernel threads, where n is
  the number of online CPUs.
- Divide that VA range among the threads; each thread flushes the
  sub-range assigned to it (see the sketch below).
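
For illustration only (not part of this change): a minimal userspace
model of the range split, assuming 4K pages and a fixed worker count.
The names here (touch_chunk, struct chunk, NWORKERS) are made up for
the sketch, and memset() merely stands in for the driver's
__clean_dcache_area_poc(); the real kernel implementation is in
nvmap_clean_cache() below.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PAGE_SIZE 4096UL
#define NWORKERS 8

struct chunk {
	char *start;
	size_t len;
};

/* Stand-in for the per-thread __clean_dcache_area_poc(va_start, size) */
static void *touch_chunk(void *arg)
{
	struct chunk *c = arg;

	memset(c->start, 0, c->len);
	return NULL;
}

int main(void)
{
	size_t numpages = 32768, per_worker = numpages / NWORKERS;
	char *buf = malloc(numpages * PAGE_SIZE);
	pthread_t tid[NWORKERS];
	struct chunk c[NWORKERS];
	int i;

	if (!buf)
		return 1;

	for (i = 0; i < NWORKERS; i++) {
		c[i].start = buf + (size_t)i * per_worker * PAGE_SIZE;
		/* the last worker also takes the remainder pages */
		c[i].len = (i == NWORKERS - 1 ?
			    numpages - per_worker * i : per_worker) * PAGE_SIZE;
		pthread_create(&tid[i], NULL, touch_chunk, &c[i]);
	}
	for (i = 0; i < NWORKERS; i++)
		pthread_join(tid[i], NULL);

	printf("touched %zu pages with %d workers\n", numpages, NWORKERS);
	free(buf);
	return 0;
}

(Build the model with: cc -pthread model.c)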

This logic results in the following % improvement for alloc tests:
-----------------------------------
Buffer Size in MB | % improvement |
----------------------------------|
128               |   52          |
256               |   56          |
512               |   57          |
1024              |   58          |
1536              |   57          |
2048              |   58          |
2560              |   57          |
3072              |   58          |
3584              |   58          |
4096              |   58          |
4608              |   58          |
5120              |   58          |
-----------------------------------

Bug 4628529

Change-Id: I803ef5245ff9283fdc3afc497a6b642c97e89c06
Signed-off-by: Ketan Patil <ketanp@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nv-oot/+/3187871
Reviewed-by: Krishna Reddy <vdumpa@nvidia.com>
GVS: buildbot_gerritrpt <buildbot_gerritrpt@nvidia.com>
Author: Ketan Patil
Date: 2024-08-05 04:09:13 +00:00
Committed by: Jon Hunter
parent a57d56284d
commit 9feb2a4347
3 changed files with 114 additions and 11 deletions


@@ -104,6 +104,7 @@ static int handle_page_alloc(struct nvmap_client *client,
 	gfp_t gfp = GFP_NVMAP | __GFP_ZERO;
 	u64 result;
 #ifdef CONFIG_ARM64_4K_PAGES
+	int cc_index = 0;
 #ifdef NVMAP_CONFIG_PAGE_POOLS
 	int pages_per_big_pg = NVMAP_PP_BIG_PAGE_SIZE >> PAGE_SHIFT;
 #else
@@ -133,6 +134,7 @@ static int handle_page_alloc(struct nvmap_client *client,
 	pages_per_big_pg = nvmap_dev->pool->pages_per_big_pg;
 #endif
 	/* Try to allocate big pages from page allocator */
+	cc_index = page_index;
 	for (i = page_index;
 	     i < nr_page && pages_per_big_pg > 1 && (nr_page - i) >= pages_per_big_pg;
 	     i += pages_per_big_pg, page_index += pages_per_big_pg) {
@@ -151,9 +153,10 @@ static int handle_page_alloc(struct nvmap_client *client,
 		for (idx = 0; idx < pages_per_big_pg; idx++)
 			pages[i + idx] = nth_page(page, idx);
-		nvmap_clean_cache(&pages[i], pages_per_big_pg);
 	}
+	nvmap_clean_cache(&pages[cc_index], page_index - cc_index);
 	if (check_add_overflow(nvmap_big_page_allocs, (u64)page_index, &result))
 		goto fail;


@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
+/* SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
 #ifndef __NVMAP_ALLOC_INT_H
 #define __NVMAP_ALLOC_INT_H
@@ -14,6 +14,19 @@
 #define NVMAP_PP_BIG_PAGE_SIZE (0x10000)
 #endif /* CONFIG_ARM64_4K_PAGES */
+/*
+ * Indicate the threshold number of pages after which
+ * the multithreaded cache flush will be used.
+ */
+#define THRESHOLD_PAGES_CACHE_FLUSH 32768
+struct nvmap_cache_thread {
+	pid_t thread_id;
+	void *va_start;
+	size_t size;
+	struct task_struct *task;
+};
 struct dma_coherent_mem_replica {
 	void *virt_base;
 	dma_addr_t device_base;


@@ -1,23 +1,23 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2011-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2011-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  */
 #define pr_fmt(fmt) "nvmap: %s() " fmt, __func__
-#include <linux/io.h>
-#include <linux/debugfs.h>
-#include <linux/of.h>
-#include <linux/version.h>
-#include <soc/tegra/fuse.h>
+#include <linux/io.h>
+#include <linux/kthread.h>
+#include <linux/libnvdimm.h>
+#include <linux/of.h>
+#include <linux/rtmutex.h>
+#include <linux/sys_soc.h>
+#include <linux/version.h>
+#include <linux/vmalloc.h>
+#include <soc/tegra/fuse.h>
 __weak struct arm64_ftr_reg arm64_ftr_reg_ctrel0;
 #include <trace/events/nvmap.h>
-#include <linux/rtmutex.h>
 #include "nvmap_dev.h"
 #include "nvmap_alloc.h"
 #include "nvmap_alloc_int.h"
@@ -44,9 +44,17 @@ void nvmap_clean_cache_page(struct page *page)
 	__clean_dcache_area_poc(page_address(page), PAGE_SIZE);
 }
+static int threaded_cache_flush(void *arg)
+{
+	struct nvmap_cache_thread *t_data = (struct nvmap_cache_thread *)arg;
+	__clean_dcache_area_poc((void *)t_data->va_start, t_data->size);
+	return 0;
+}
 void nvmap_clean_cache(struct page **pages, int numpages)
 {
-	int i;
+	int i = 0;
 	/* Not technically a flush but that's what nvmap knows about. */
 	nvmap_stats_inc(NS_CFLUSH_DONE, numpages << PAGE_SHIFT);
@@ -55,6 +63,85 @@ void nvmap_clean_cache(struct page **pages, int numpages)
 		nvmap_stats_read(NS_CFLUSH_RQ),
 		nvmap_stats_read(NS_CFLUSH_DONE));
+	/*
+	 * If pages are more than THRESHOLD_PAGES_CACHE_FLUSH, then do threaded cache flush
+	 * where number of threads equal to number of online cpus
+	 */
+	if (numpages >= THRESHOLD_PAGES_CACHE_FLUSH) {
+		/* Map pages in kernel VA space */
+		void *vaddr;
+		int online_cpus = num_online_cpus();
+		struct nvmap_cache_thread **td_array = nvmap_altalloc(online_cpus *
+							sizeof(*td_array));
+		int created_threads = 0, j;
+		size_t set_size, last_set_size;
+		if (!td_array) {
+			pr_err("td_array allocation failed\n");
+			goto page_by_page_flush;
+		}
+		vaddr = vmap(pages, numpages, VM_MAP, PAGE_KERNEL);
+		if (vaddr == NULL) {
+			pr_err("vmap failed\n");
+			nvmap_altfree(td_array, online_cpus * sizeof(*td_array));
+			goto page_by_page_flush;
+		}
+		set_size = ((unsigned long long)numpages / online_cpus) << PAGE_SHIFT;
+		/*
+		 * The last thread should flush the entire remaining
+		 * pages, as numpages may not be always divisible by
+		 * number of online_cpus.
+		 */
+		last_set_size = (unsigned long long) (numpages - (set_size
+					* (online_cpus - 1))) << PAGE_SHIFT;
+		for (i = 0; i < online_cpus; i++) {
+			td_array[i] = nvmap_altalloc(sizeof(struct nvmap_cache_thread));
+			if (!td_array[i]) {
+				pr_err("failed to allocate memory for nvmap_cache_thread\n");
+				goto stop_threads;
+			}
+			td_array[i]->thread_id = i + 1;
+			td_array[i]->size = (i == online_cpus - 1) ? last_set_size : set_size;
+			td_array[i]->va_start = vaddr + i * set_size;
+			td_array[i]->task = kthread_run(
+					threaded_cache_flush, td_array[i],
+					"nvmap_cache_flush_thread_%d", i);
+			if (IS_ERR(td_array[i]->task)) {
+				pr_err("failed to create kernel thread:%d\n", i);
+				goto stop_threads;
+			}
+			get_task_struct(td_array[i]->task);
+			created_threads++;
+		}
+stop_threads:
+		for (j = 0; j < created_threads; j++) {
+			if (!IS_ERR_OR_NULL(td_array[j]->task)) {
+				kthread_stop(td_array[j]->task);
+				put_task_struct(td_array[j]->task);
+			}
+		}
+		while (--i >= 0) {
+			nvmap_altfree(td_array[i], sizeof(struct nvmap_cache_thread));
+		}
+		vunmap(vaddr);
+		nvmap_altfree(td_array, online_cpus * sizeof(*td_array));
+		if (created_threads != online_cpus)
+			goto page_by_page_flush;
+		return;
+	}
+page_by_page_flush:
 	for (i = 0; i < numpages; i++)
 		nvmap_clean_cache_page(pages[i]);
 }