gpu: nvgpu: Implement 64k large page support

Implement support for 64kB large page size. Add an API to create an
address space via IOCTL so that we can accept flags, and assign one
flag for enabling 64kB large page size.

Also add APIs to set the per-context large page size. This is possible
only on Maxwell, so return error if caller tries to set large page
size on Kepler.

Default large page size is still 128kB.

Change-Id: I20b51c8f6d4a984acae8411ace3de9000c78e82f
Signed-off-by: Terje Bergstrom <tbergstrom@nvidia.com>
This commit is contained in:
Terje Bergstrom
2014-10-16 15:15:11 +03:00
committed by Dan Willemsen
parent ecc6f27fd1
commit 2eb6dcb469
15 changed files with 190 additions and 24 deletions

View File

@@ -37,8 +37,8 @@ static void release_as_share_id(struct gk20a_as *as, int id)
return;
}
static int gk20a_as_alloc_share(struct gk20a_as *as,
struct gk20a_as_share **out)
int gk20a_as_alloc_share(struct gk20a_as *as,
u32 flags, struct gk20a_as_share **out)
{
struct gk20a *g = gk20a_from_as(as);
struct gk20a_as_share *as_share;
@@ -56,7 +56,7 @@ static int gk20a_as_alloc_share(struct gk20a_as *as,
as_share->ref_cnt.counter = 1;
/* this will set as_share->vm. */
err = g->ops.mm.vm_alloc_share(as_share);
err = g->ops.mm.vm_alloc_share(as_share, flags);
if (err)
goto failed;
@@ -186,7 +186,7 @@ int gk20a_as_dev_open(struct inode *inode, struct file *filp)
return err;
}
err = gk20a_as_alloc_share(&g->as, &as_share);
err = gk20a_as_alloc_share(&g->as, 0, &as_share);
if (err) {
gk20a_dbg_fn("failed to alloc share");
gk20a_put_client(g);

View File

@@ -42,5 +42,7 @@ int gk20a_as_release_share(struct gk20a_as_share *as_share);
int gk20a_as_dev_open(struct inode *inode, struct file *filp);
int gk20a_as_dev_release(struct inode *inode, struct file *filp);
long gk20a_as_dev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
int gk20a_as_alloc_share(struct gk20a_as *as,
u32 flags, struct gk20a_as_share **out);
#endif

View File

@@ -119,6 +119,10 @@ int channel_gk20a_commit_va(struct channel_gk20a *c)
gk20a_mem_wr32(inst_ptr, ram_in_adr_limit_hi_w(),
ram_in_adr_limit_hi_f(u64_hi32(c->vm->va_limit)));
if (c->g->ops.mm.set_big_page_size)
c->g->ops.mm.set_big_page_size(c->g, inst_ptr,
c->vm->gmmu_page_sizes[gmmu_page_size_big]);
return 0;
}

View File

@@ -16,6 +16,8 @@
#include <linux/highmem.h>
#include <linux/cdev.h>
#include <linux/file.h>
#include <linux/anon_inodes.h>
#include <uapi/linux/nvgpu.h>
#include "gk20a.h"
@@ -148,6 +150,53 @@ static int gk20a_ctrl_mark_compressible_write(
return ret;
}
static int gk20a_ctrl_alloc_as(
struct gk20a *g,
struct nvgpu_alloc_as_args *args)
{
struct platform_device *dev = g->dev;
struct gk20a_as_share *as_share;
int err;
int fd;
struct file *file;
char *name;
err = get_unused_fd_flags(O_RDWR);
if (err < 0)
return err;
fd = err;
name = kasprintf(GFP_KERNEL, "nvhost-%s-fd%d",
dev_name(&dev->dev), fd);
file = anon_inode_getfile(name, g->as.cdev.ops, NULL, O_RDWR);
kfree(name);
if (IS_ERR(file)) {
err = PTR_ERR(file);
goto clean_up;
}
fd_install(fd, file);
err = gk20a_get_client(g);
if (err)
goto clean_up;
err = gk20a_as_alloc_share(&g->as, args->big_page_size, &as_share);
if (err)
goto clean_up_client;
file->private_data = as_share;
args->as_fd = fd;
return 0;
clean_up_client:
gk20a_put_client(g);
clean_up:
put_unused_fd(fd);
return err;
}
long gk20a_ctrl_dev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct platform_device *dev = filp->private_data;
@@ -309,6 +358,10 @@ long gk20a_ctrl_dev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg
err = gk20a_ctrl_mark_compressible_write(g,
(struct nvgpu_gpu_mark_compressible_write_args *)buf);
break;
case NVGPU_GPU_IOCTL_ALLOC_AS:
err = gk20a_ctrl_alloc_as(g,
(struct nvgpu_alloc_as_args *)buf);
break;
default:
dev_dbg(dev_from_gk20a(g), "unrecognized gpu ioctl cmd: 0x%x", cmd);
err = -ENOTTY;

View File

@@ -18,6 +18,7 @@
#include "gk20a.h"
#include "kind_gk20a.h"
#include "hw_mc_gk20a.h"
#include "hw_fb_gk20a.h"
static void fb_gk20a_reset(struct gk20a *g)
{
@@ -29,9 +30,22 @@ static void fb_gk20a_reset(struct gk20a *g)
| mc_enable_hub_enabled_f());
}
/*
 * Program the FB MMU's VM page size field. gk20a always selects the
 * 128 kB big page size here, unconditionally.
 */
static void gk20a_fb_set_mmu_page_size(struct gk20a *g)
{
	u32 reg = gk20a_readl(g, fb_mmu_ctrl_r());

	reg &= ~fb_mmu_ctrl_vm_pg_size_f(~0x0);
	reg |= fb_mmu_ctrl_vm_pg_size_128kb_f();
	gk20a_writel(g, fb_mmu_ctrl_r(), reg);
}
/* Install the gk20a-specific FB HAL entry points and kind tables. */
void gk20a_init_fb(struct gpu_ops *gops)
{
	gops->fb.set_mmu_page_size = gk20a_fb_set_mmu_page_size;
	gops->fb.reset = fb_gk20a_reset;

	/* Populate the uncompressed-kind map and kind attribute tables. */
	gk20a_init_uncompressed_kind_map();
	gk20a_init_kind_attr();
}

View File

@@ -141,6 +141,7 @@ struct gpu_ops {
void (*reset)(struct gk20a *g);
void (*init_uncompressed_kind_map)(struct gk20a *g);
void (*init_kind_attr)(struct gk20a *g);
void (*set_mmu_page_size)(struct gk20a *g);
} fb;
struct {
void (*slcg_bus_load_gating_prod)(struct gk20a *g, bool prod);
@@ -291,13 +292,16 @@ struct gpu_ops {
bool va_allocated,
int rw_flag);
void (*vm_remove)(struct vm_gk20a *vm);
int (*vm_alloc_share)(struct gk20a_as_share *as_share);
int (*vm_alloc_share)(struct gk20a_as_share *as_share,
u32 flags);
int (*vm_bind_channel)(struct gk20a_as_share *as_share,
struct channel_gk20a *ch);
int (*fb_flush)(struct gk20a *g);
void (*l2_invalidate)(struct gk20a *g);
void (*l2_flush)(struct gk20a *g, bool invalidate);
void (*tlb_invalidate)(struct vm_gk20a *vm);
void (*set_big_page_size)(struct gk20a *g,
void *inst_ptr, int size);
} mm;
struct {
int (*prepare_ucode)(struct gk20a *g);

View File

@@ -327,17 +327,7 @@ static int gk20a_init_mm_setup_hw(struct gk20a *g)
gk20a_dbg_fn("");
/* set large page size in fb
* note this is very early on, can we defer it ? */
{
u32 fb_mmu_ctrl = gk20a_readl(g, fb_mmu_ctrl_r());
fb_mmu_ctrl = (fb_mmu_ctrl &
~fb_mmu_ctrl_vm_pg_size_f(~0x0)) |
fb_mmu_ctrl_vm_pg_size_128kb_f();
gk20a_writel(g, fb_mmu_ctrl_r(), fb_mmu_ctrl);
}
g->ops.fb.set_mmu_page_size(g);
inst_pa = (u32)(inst_pa >> bar1_instance_block_shift_gk20a());
gk20a_dbg_info("bar1 inst block ptr: 0x%08x", (u32)inst_pa);
@@ -2173,6 +2163,7 @@ void gk20a_vm_put(struct vm_gk20a *vm)
static int gk20a_init_vm(struct mm_gk20a *mm,
struct vm_gk20a *vm,
u32 big_page_size,
u64 low_hole,
u64 aperture_size,
bool big_pages,
@@ -2184,7 +2175,7 @@ static int gk20a_init_vm(struct mm_gk20a *mm,
size_t vma_size;
/* note: keep the page sizes sorted lowest to highest here */
u32 gmmu_page_sizes[gmmu_nr_page_sizes] = { SZ_4K, SZ_128K };
u32 gmmu_page_sizes[gmmu_nr_page_sizes] = { SZ_4K, big_page_size };
vm->mm = mm;
@@ -2331,7 +2322,7 @@ clean_up_pdes:
}
/* address space interfaces for the gk20a module */
int gk20a_vm_alloc_share(struct gk20a_as_share *as_share)
int gk20a_vm_alloc_share(struct gk20a_as_share *as_share, u32 big_page_size)
{
struct gk20a_as *as = as_share->as;
struct gk20a *g = gk20a_from_as(as);
@@ -2351,8 +2342,15 @@ int gk20a_vm_alloc_share(struct gk20a_as_share *as_share)
vm->enable_ctag = true;
snprintf(name, sizeof(name), "gk20a_as_%d", as_share->id);
err = gk20a_init_vm(mm, vm,
SZ_128K << 10, mm->channel.size, true, name);
if (big_page_size && !g->ops.mm.set_big_page_size)
return -EINVAL;
if (big_page_size == 0)
big_page_size =
gk20a_get_platform(g->dev)->default_big_page_size;
err = gk20a_init_vm(mm, vm, big_page_size, big_page_size << 10,
mm->channel.size, true, name);
return 0;
}
@@ -2709,10 +2707,12 @@ static int gk20a_init_bar1_vm(struct mm_gk20a *mm)
struct device *d = dev_from_gk20a(g);
struct inst_desc *inst_block = &mm->bar1.inst_block;
dma_addr_t iova;
u32 big_page_size = gk20a_get_platform(g->dev)->default_big_page_size;
mm->bar1.aperture_size = bar1_aperture_size_mb_gk20a() << 20;
gk20a_dbg_info("bar1 vm size = 0x%x", mm->bar1.aperture_size);
gk20a_init_vm(mm, vm, SZ_4K, mm->bar1.aperture_size, false, "bar1");
gk20a_init_vm(mm, vm, big_page_size, SZ_4K,
mm->bar1.aperture_size, false, "bar1");
gk20a_dbg_info("pde pa=0x%llx",
(u64)gk20a_mm_iova_addr(vm->pdes.sgt->sgl));
@@ -2761,6 +2761,9 @@ static int gk20a_init_bar1_vm(struct mm_gk20a *mm)
gk20a_mem_wr32(inst_ptr, ram_in_adr_limit_hi_w(),
ram_in_adr_limit_hi_f(u64_hi32(vm->va_limit)));
if (g->ops.mm.set_big_page_size)
g->ops.mm.set_big_page_size(g, inst_ptr, big_page_size);
gk20a_dbg_info("bar1 inst block ptr: %08llx", (u64)inst_pa);
return 0;
@@ -2789,11 +2792,12 @@ static int gk20a_init_system_vm(struct mm_gk20a *mm)
struct device *d = dev_from_gk20a(g);
struct inst_desc *inst_block = &mm->pmu.inst_block;
dma_addr_t iova;
u32 big_page_size = gk20a_get_platform(g->dev)->default_big_page_size;
mm->pmu.aperture_size = GK20A_PMU_VA_SIZE;
gk20a_dbg_info("pmu vm size = 0x%x", mm->pmu.aperture_size);
gk20a_init_vm(mm, vm,
gk20a_init_vm(mm, vm, big_page_size,
SZ_128K << 10, GK20A_PMU_VA_SIZE, false, "system");
gk20a_dbg_info("pde pa=0x%llx",
@@ -2842,6 +2846,9 @@ static int gk20a_init_system_vm(struct mm_gk20a *mm)
gk20a_mem_wr32(inst_ptr, ram_in_adr_limit_hi_w(),
ram_in_adr_limit_hi_f(u64_hi32(vm->va_limit)));
if (g->ops.mm.set_big_page_size)
g->ops.mm.set_big_page_size(g, inst_ptr, big_page_size);
return 0;
clean_up_inst_block:

View File

@@ -512,7 +512,7 @@ int gk20a_vm_free_va(struct vm_gk20a *vm,
/* vm-as interface */
struct nvgpu_as_alloc_space_args;
struct nvgpu_as_free_space_args;
int gk20a_vm_alloc_share(struct gk20a_as_share *as_share);
int gk20a_vm_alloc_share(struct gk20a_as_share *as_share, u32 flags);
int gk20a_vm_release_share(struct gk20a_as_share *as_share);
int gk20a_vm_alloc_space(struct gk20a_as_share *as_share,
struct nvgpu_as_alloc_space_args *args);

View File

@@ -76,6 +76,9 @@ struct gk20a_platform {
/* Adaptive ELPG: true = enable, false = disable */
bool enable_aelpg;
/* Default big page size 64K or 128K */
u32 default_big_page_size;
/* Initialize the platform interface of the gk20a driver.
*
* The platform implementation of this function must

View File

@@ -443,6 +443,7 @@ struct gk20a_platform t132_gk20a_tegra_platform = {
.enable_elpg = true,
.enable_aelpg = true,
.default_big_page_size = SZ_128K,
.probe = gk20a_tegra_probe,
.late_probe = gk20a_tegra_late_probe,
@@ -480,6 +481,8 @@ struct gk20a_platform gk20a_tegra_platform = {
.enable_elpg = true,
.enable_aelpg = true,
.default_big_page_size = SZ_128K,
.probe = gk20a_tegra_probe,
.late_probe = gk20a_tegra_late_probe,
@@ -517,6 +520,8 @@ struct gk20a_platform gm20b_tegra_platform = {
.enable_elpg = true,
.enable_aelpg = true,
.default_big_page_size = SZ_128K,
.probe = gk20a_tegra_probe,
.late_probe = gk20a_tegra_late_probe,

View File

@@ -82,9 +82,18 @@ void gm20b_init_kind_attr(void)
}
}
/*
 * Configure the FB MMU page-size behavior for gm20b: set the
 * use-PDB-big-page-size bit so the big page size is taken per page
 * directory instead of from a single global setting.
 */
static void gm20b_fb_set_mmu_page_size(struct gk20a *g)
{
	u32 ctrl = gk20a_readl(g, fb_mmu_ctrl_r());

	gk20a_writel(g, fb_mmu_ctrl_r(),
		     ctrl | fb_mmu_ctrl_use_pdb_big_page_size_true_f());
}
/* Install the gm20b-specific FB HAL entry points and kind tables. */
void gm20b_init_fb(struct gpu_ops *gops)
{
	gops->fb.set_mmu_page_size = gm20b_fb_set_mmu_page_size;
	gops->fb.init_fs_state = fb_gm20b_init_fs_state;

	/* Populate the uncompressed-kind map and kind attribute tables. */
	gm20b_init_uncompressed_kind_map();
	gm20b_init_kind_attr();
}

View File

@@ -66,6 +66,10 @@ static inline u32 fb_mmu_ctrl_vm_pg_size_128kb_f(void)
{
return 0x0;
}
/* fb_mmu_ctrl VM_PG_SIZE field value selecting 64 kB big pages
 * (generated register accessor). */
static inline u32 fb_mmu_ctrl_vm_pg_size_64kb_f(void)
{
	return 0x1;
}
static inline u32 fb_mmu_ctrl_pri_fifo_empty_v(u32 r)
{
return (r >> 15) & 0x1;
@@ -78,6 +82,18 @@ static inline u32 fb_mmu_ctrl_pri_fifo_space_v(u32 r)
{
return (r >> 16) & 0xff;
}
/* Extract the USE_PDB_BIG_PAGE_SIZE bit (bit 11) from an fb_mmu_ctrl
 * register value. */
static inline u32 fb_mmu_ctrl_use_pdb_big_page_size_v(u32 r)
{
	return (r >> 11) & 0x1;
}
/* Field value (bit 11 set) enabling per-PDB big page size selection. */
static inline u32 fb_mmu_ctrl_use_pdb_big_page_size_true_f(void)
{
	return 0x800;
}
/* Field value (bit 11 clear) disabling per-PDB big page size selection. */
static inline u32 fb_mmu_ctrl_use_pdb_big_page_size_false_f(void)
{
	return 0x0;
}
static inline u32 fb_priv_mmu_phy_secure_r(void)
{
return 0x00100ce4;

View File

@@ -78,6 +78,26 @@ static inline u32 ram_in_page_dir_base_vol_true_f(void)
{
return 0x4;
}
/* Pack a big-page-size selector value into its bit position (bit 11)
 * within the instance block word. */
static inline u32 ram_in_big_page_size_f(u32 v)
{
	return (v & 0x1) << 11;
}
/* Mask covering the big-page-size field (bit 11). */
static inline u32 ram_in_big_page_size_m(void)
{
	return 0x1 << 11;
}
/* 32-bit word offset of the big-page-size field within the instance
 * block. */
static inline u32 ram_in_big_page_size_w(void)
{
	return 128;
}
/* Field value selecting 128 kB big pages. */
static inline u32 ram_in_big_page_size_128kb_f(void)
{
	return 0x0;
}
/* Field value selecting 64 kB big pages. */
static inline u32 ram_in_big_page_size_64kb_f(void)
{
	return 0x800;
}
static inline u32 ram_in_page_dir_base_lo_f(u32 v)
{
return (v & 0xfffff) << 12;

View File

@@ -19,6 +19,7 @@
#include "hw_gmmu_gm20b.h"
#include "hw_fb_gm20b.h"
#include "hw_gr_gm20b.h"
#include "hw_ram_gm20b.h"
static int allocate_gmmu_ptes_sparse(struct vm_gk20a *vm,
enum gmmu_pgsz_gk20a pgsz_idx,
@@ -259,6 +260,25 @@ bool gm20b_mm_mmu_debug_mode_enabled(struct gk20a *g)
gr_gpcs_pri_mmu_debug_ctrl_debug_enabled_v();
}
/*
 * Write the big page size selector into a context's instance block.
 * A size of SZ_64K selects 64 kB big pages; any other value selects
 * the 128 kB encoding.
 */
void gm20b_mm_set_big_page_size(struct gk20a *g, void *inst_ptr, int size)
{
	u32 word;

	gk20a_dbg_fn("");
	gk20a_dbg_info("big page size %d\n", size);

	word = gk20a_mem_rd32(inst_ptr, ram_in_big_page_size_w());
	word &= ~ram_in_big_page_size_m();
	word |= (size == SZ_64K) ? ram_in_big_page_size_64kb_f()
				 : ram_in_big_page_size_128kb_f();
	gk20a_mem_wr32(inst_ptr, ram_in_big_page_size_w(), word);

	gk20a_dbg_fn("done");
}
void gm20b_init_mm(struct gpu_ops *gops)
{
gops->mm.set_sparse = gm20b_vm_put_sparse;
@@ -273,4 +293,5 @@ void gm20b_init_mm(struct gpu_ops *gops)
gops->mm.l2_invalidate = gk20a_mm_l2_invalidate;
gops->mm.l2_flush = gk20a_mm_l2_flush;
gops->mm.tlb_invalidate = gk20a_mm_tlb_invalidate;
gops->mm.set_big_page_size = gm20b_mm_set_big_page_size;
}

View File

@@ -175,6 +175,12 @@ struct nvgpu_gpu_mark_compressible_write_args {
__u32 reserved[3]; /* must be zero */
};
/* Arguments for NVGPU_GPU_IOCTL_ALLOC_AS: allocate a GPU address space. */
struct nvgpu_alloc_as_args {
	__u32 big_page_size;	/* in: big page size in bytes; 0 selects the
				 * platform default */
	__s32 as_fd;		/* out: fd referring to the new address space */
	__u64 reserved; /* must be zero */
};
#define NVGPU_GPU_IOCTL_ZCULL_GET_CTX_SIZE \
_IOR(NVGPU_GPU_IOCTL_MAGIC, 1, struct nvgpu_gpu_zcull_get_ctx_size_args)
#define NVGPU_GPU_IOCTL_ZCULL_GET_INFO \
@@ -189,9 +195,11 @@ struct nvgpu_gpu_mark_compressible_write_args {
_IOWR(NVGPU_GPU_IOCTL_MAGIC, 6, struct nvgpu_gpu_prepare_compressible_read_args)
#define NVGPU_GPU_IOCTL_MARK_COMPRESSIBLE_WRITE \
_IOWR(NVGPU_GPU_IOCTL_MAGIC, 7, struct nvgpu_gpu_mark_compressible_write_args)
#define NVGPU_GPU_IOCTL_ALLOC_AS \
_IOWR(NVGPU_GPU_IOCTL_MAGIC, 8, struct nvgpu_alloc_as_args)
#define NVGPU_GPU_IOCTL_LAST \
_IOC_NR(NVGPU_GPU_IOCTL_MARK_COMPRESSIBLE_WRITE)
_IOC_NR(NVGPU_GPU_IOCTL_ALLOC_AS)
#define NVGPU_GPU_IOCTL_MAX_ARG_SIZE \
sizeof(struct nvgpu_gpu_prepare_compressible_read_args)