From 9feb2a4347d9ba756bc30669e3c8035765fe8bc4 Mon Sep 17 00:00:00 2001
From: Ketan Patil
Date: Mon, 5 Aug 2024 04:09:13 +0000
Subject: [PATCH] video: tegra: nvmap: Add multithreaded cache flush support

On TOT, NvMap does a page-by-page cache flush, i.e. it takes the virtual
address of each page present in the buffer and performs a cache flush on
it using dcache_by_line_op. This results in very poor performance for
larger buffers: ~70% of the time taken by NvRmMemHandleAllocAttr is
consumed in the cache flush.

Address this perf issue using a multithreaded cache flush:

- Use a threshold value of 32768 pages, which is derived from perf
  experiments and from discussions with the CUDA team about their
  use cases.
- When a cache flush request for >= 32768 pages is made, vmap the pages
  to map them into a contiguous VA range and create n kernel threads,
  where n is the number of online CPUs.
- Divide the above VA range among the threads; each thread then does
  the cache flush on the VA range assigned to it.

This logic results in the following % improvement for alloc tests:

-----------------------------------
Buffer Size in MB | % improvement |
------------------|---------------|
       128        |      52       |
       256        |      56       |
       512        |      57       |
      1024        |      58       |
      1536        |      57       |
      2048        |      58       |
      2560        |      57       |
      3072        |      58       |
      3584        |      58       |
      4096        |      58       |
      4608        |      58       |
      5120        |      58       |
-----------------------------------

Bug 4628529

Change-Id: I803ef5245ff9283fdc3afc497a6b642c97e89c06
Signed-off-by: Ketan Patil
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nv-oot/+/3187871
Reviewed-by: Krishna Reddy
GVS: buildbot_gerritrpt
---
 drivers/video/tegra/nvmap/nvmap_alloc.c     |   5 +-
 drivers/video/tegra/nvmap/nvmap_alloc_int.h |  15 ++-
 drivers/video/tegra/nvmap/nvmap_cache.c     | 106 ++++++++++++++++++--
 3 files changed, 115 insertions(+), 11 deletions(-)
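Note (illustration only, not part of the applied change): the work
division described above is an equal split of the mapped VA range, with
the remainder folded into the last worker. Below is a minimal user-space
sketch of the same scheme, with pthreads standing in for kthreads; all
names in it are hypothetical. Build with: cc sketch.c -lpthread

#include <pthread.h>
#include <stddef.h>
#include <stdlib.h>

#define NTHREADS 4

struct range {
	char *start;
	size_t len;
};

/* Stand-in for the per-thread flush over one VA range; the kernel
 * patch calls __clean_dcache_area_poc() here instead. */
static void *flush_range(void *arg)
{
	struct range *r = arg;

	(void)r;
	return NULL;
}

static void parallel_flush(char *base, size_t len)
{
	pthread_t tid[NTHREADS];
	struct range r[NTHREADS];
	size_t set = len / NTHREADS;
	int i;

	for (i = 0; i < NTHREADS; i++) {
		r[i].start = base + (size_t)i * set;
		/* The last worker also takes the remainder. */
		r[i].len = (i == NTHREADS - 1) ? len - set * (NTHREADS - 1) : set;
		pthread_create(&tid[i], NULL, flush_range, &r[i]);
	}
	for (i = 0; i < NTHREADS; i++)
		pthread_join(tid[i], NULL);
}

int main(void)
{
	size_t len = 1 << 20;
	char *buf = malloc(len);

	if (!buf)
		return 1;
	parallel_flush(buf, len);
	free(buf);
	return 0;
}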
diff --git a/drivers/video/tegra/nvmap/nvmap_alloc.c b/drivers/video/tegra/nvmap/nvmap_alloc.c
index 27596e42..addc5086 100644
--- a/drivers/video/tegra/nvmap/nvmap_alloc.c
+++ b/drivers/video/tegra/nvmap/nvmap_alloc.c
@@ -104,6 +104,7 @@ static int handle_page_alloc(struct nvmap_client *client,
 	gfp_t gfp = GFP_NVMAP | __GFP_ZERO;
 	u64 result;
 #ifdef CONFIG_ARM64_4K_PAGES
+	int cc_index = 0;
 #ifdef NVMAP_CONFIG_PAGE_POOLS
 	int pages_per_big_pg = NVMAP_PP_BIG_PAGE_SIZE >> PAGE_SHIFT;
 #else
@@ -133,6 +134,7 @@ static int handle_page_alloc(struct nvmap_client *client,
 		pages_per_big_pg = nvmap_dev->pool->pages_per_big_pg;
 #endif
 	/* Try to allocate big pages from page allocator */
+	cc_index = page_index;
 	for (i = page_index; i < nr_page && pages_per_big_pg > 1 &&
 		(nr_page - i) >= pages_per_big_pg;
 		i += pages_per_big_pg, page_index += pages_per_big_pg) {
@@ -151,9 +153,10 @@ static int handle_page_alloc(struct nvmap_client *client,
 
 		for (idx = 0; idx < pages_per_big_pg; idx++)
 			pages[i + idx] = nth_page(page, idx);
 
-		nvmap_clean_cache(&pages[i], pages_per_big_pg);
 	}
 
+	nvmap_clean_cache(&pages[cc_index], page_index - cc_index);
+
 	if (check_add_overflow(nvmap_big_page_allocs, (u64)page_index, &result))
 		goto fail;
diff --git a/drivers/video/tegra/nvmap/nvmap_alloc_int.h b/drivers/video/tegra/nvmap/nvmap_alloc_int.h
index 2793f301..559a9323 100644
--- a/drivers/video/tegra/nvmap/nvmap_alloc_int.h
+++ b/drivers/video/tegra/nvmap/nvmap_alloc_int.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
+/* SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
 
 #ifndef __NVMAP_ALLOC_INT_H
 #define __NVMAP_ALLOC_INT_H
@@ -14,6 +14,19 @@
 #define NVMAP_PP_BIG_PAGE_SIZE	(0x10000)
 #endif /* CONFIG_ARM64_4K_PAGES */
 
+/*
+ * Indicate the threshold number of pages after which
+ * the multithreaded cache flush will be used.
+ */
+#define THRESHOLD_PAGES_CACHE_FLUSH	32768
+
+struct nvmap_cache_thread {
+	pid_t thread_id;
+	void *va_start;
+	size_t size;
+	struct task_struct *task;
+};
+
 struct dma_coherent_mem_replica {
 	void *virt_base;
 	dma_addr_t device_base;
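A worked example of the split that the nvmap_cache.c changes below
implement (numbers purely illustrative): with numpages = 100000 and 12
online CPUs, set_size covers 100000 / 12 = 8333 pages per thread, and
last_set_size covers the remaining 100000 - 11 * 8333 = 8337 pages, so
the per-thread ranges cover the whole mapping exactly once.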
diff --git a/drivers/video/tegra/nvmap/nvmap_cache.c b/drivers/video/tegra/nvmap/nvmap_cache.c
index d1bc0463..90eea88d 100644
--- a/drivers/video/tegra/nvmap/nvmap_cache.c
+++ b/drivers/video/tegra/nvmap/nvmap_cache.c
@@ -1,23 +1,23 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2011-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2011-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  */
 
 #define pr_fmt(fmt) "nvmap: %s() " fmt, __func__
 
-#include
 #include
-#include
-#include
-#include
-
+#include
+#include
 #include
-
+#include
+#include
 #include
+#include
+#include
+#include
 
 __weak struct arm64_ftr_reg arm64_ftr_reg_ctrel0;
 #include
-#include
 #include "nvmap_dev.h"
 #include "nvmap_alloc.h"
 #include "nvmap_alloc_int.h"
@@ -44,9 +44,17 @@ void nvmap_clean_cache_page(struct page *page)
 	__clean_dcache_area_poc(page_address(page), PAGE_SIZE);
 }
 
+static int threaded_cache_flush(void *arg)
+{
+	struct nvmap_cache_thread *t_data = (struct nvmap_cache_thread *)arg;
+
+	__clean_dcache_area_poc(t_data->va_start, t_data->size);
+	return 0;
+}
+
 void nvmap_clean_cache(struct page **pages, int numpages)
 {
-	int i;
+	int i = 0;
 
 	/* Not technically a flush but that's what nvmap knows about. */
 	nvmap_stats_inc(NS_CFLUSH_DONE, numpages << PAGE_SHIFT);
@@ -55,6 +63,86 @@ void nvmap_clean_cache(struct page **pages, int numpages)
 			nvmap_stats_read(NS_CFLUSH_RQ),
 			nvmap_stats_read(NS_CFLUSH_DONE));
 
+	/*
+	 * If numpages is at least THRESHOLD_PAGES_CACHE_FLUSH, then do a
+	 * threaded cache flush with as many threads as online CPUs.
+	 */
+	if (numpages >= THRESHOLD_PAGES_CACHE_FLUSH) {
+		/* Map pages in kernel VA space */
+		void *vaddr;
+		int online_cpus = num_online_cpus();
+		struct nvmap_cache_thread **td_array = nvmap_altalloc(online_cpus * sizeof(*td_array));
+		int created_threads = 0, j;
+		size_t set_size, last_set_size;
+
+		if (!td_array) {
+			pr_err("td_array allocation failed\n");
+			goto page_by_page_flush;
+		}
+
+		vaddr = vmap(pages, numpages, VM_MAP, PAGE_KERNEL);
+		if (vaddr == NULL) {
+			pr_err("vmap failed\n");
+			nvmap_altfree(td_array, online_cpus * sizeof(*td_array));
+			goto page_by_page_flush;
+		}
+
+		set_size = ((unsigned long long)numpages / online_cpus) << PAGE_SHIFT;
+
+		/*
+		 * The last thread should flush all the remaining pages,
+		 * as numpages may not always be divisible by the number
+		 * of online CPUs.
+		 */
+		last_set_size = ((unsigned long long)numpages - ((set_size >> PAGE_SHIFT)
+					* (online_cpus - 1))) << PAGE_SHIFT;
+
+		for (i = 0; i < online_cpus; i++) {
+			td_array[i] = nvmap_altalloc(sizeof(struct nvmap_cache_thread));
+			if (!td_array[i]) {
+				pr_err("failed to allocate memory for nvmap_cache_thread\n");
+				goto stop_threads;
+			}
+
+			td_array[i]->thread_id = i + 1;
+			td_array[i]->size = (i == online_cpus - 1) ?
+						last_set_size : set_size;
+			td_array[i]->va_start = vaddr + i * set_size;
+			td_array[i]->task = kthread_run(
+					threaded_cache_flush, td_array[i],
+					"nvmap_cache_flush_thread_%d", i);
+			if (IS_ERR(td_array[i]->task)) {
+				pr_err("failed to create kernel thread:%d\n", i);
+				nvmap_altfree(td_array[i], sizeof(struct nvmap_cache_thread));
+				goto stop_threads;
+			}
+
+			get_task_struct(td_array[i]->task);
+			created_threads++;
+		}
+
+stop_threads:
+		for (j = 0; j < created_threads; j++) {
+			if (!IS_ERR_OR_NULL(td_array[j]->task)) {
+				kthread_stop(td_array[j]->task);
+				put_task_struct(td_array[j]->task);
+			}
+		}
+
+		while (--i >= 0) {
+			nvmap_altfree(td_array[i], sizeof(struct nvmap_cache_thread));
+		}
+
+		vunmap(vaddr);
+		nvmap_altfree(td_array, online_cpus * sizeof(*td_array));
+
+		if (created_threads != online_cpus)
+			goto page_by_page_flush;
+
+		return;
+	}
+
+page_by_page_flush:
 	for (i = 0; i < numpages; i++)
 		nvmap_clean_cache_page(pages[i]);
 }
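The spawn/join protocol used in nvmap_clean_cache() reduces to the
minimal kernel-style sketch below (hypothetical function names;
kthread_run(), kthread_stop(), get_task_struct() and put_task_struct()
are the real APIs). Taking a task reference immediately after
kthread_run() is what keeps the task_struct valid even if a worker
finishes before kthread_stop() reaps it:

#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/sched/task.h>

static int demo_worker(void *arg)
{
	/* Do the work, then exit; kthread_stop() reaps us afterwards. */
	return 0;
}

static int run_and_join(void *arg)
{
	struct task_struct *task;

	task = kthread_run(demo_worker, arg, "demo_worker");
	if (IS_ERR(task))
		return PTR_ERR(task);

	/* Hold a reference so the task_struct cannot be freed between
	 * the worker exiting and kthread_stop() being called. */
	get_task_struct(task);

	/* kthread_stop() waits for demo_worker() to return. */
	kthread_stop(task);
	put_task_struct(task);
	return 0;
}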