gpu: nvgpu: Enable GPU MMIO path
Add support for work submission through GPU MMIO for gpu-next.

Bug 3938139

Change-Id: I69c6b2865e5264e485d8ecec4239c759abdd63d5
Signed-off-by: Dinesh T <dt@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2903841
Tested-by: Martin Radev <mradev@nvidia.com>
Reviewed-by: Martin Radev <mradev@nvidia.com>
Reviewed-by: Vaibhav Kachore <vkachore@nvidia.com>
GVS: Gerrit_Virtual_Submit <buildbot_gerritrpt@nvidia.com>
Committed by: mobile promotions
Parent: 7dbd29ceb6
Commit: c8ceef2d08
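For illustration, a minimal sketch of how usermode could opt into the new path is shown below. The setup-bind flag and the returned *_gpu_va fields come from this change; the ioctl name, the UAPI struct layout, the header path, and the file-descriptor handling are assumptions made for the sketch, not part of this commit.

/*
 * Hypothetical usermode sketch (not part of this commit): request
 * GPU-MMIO mapped resources when binding the channel and read back the
 * GPU virtual addresses the kernel now reports.
 *
 * Assumptions: NVGPU_IOCTL_CHANNEL_SETUP_BIND and
 * struct nvgpu_channel_setup_bind_args match the nvgpu UAPI header, and
 * ch_fd is an already-open channel fd with the gpfifo dmabuf fields
 * filled in by the caller.
 */
#include <sys/ioctl.h>
#include <linux/nvgpu.h>	/* assumed UAPI header location */

static int bind_channel_with_gpu_mmio(int ch_fd,
		struct nvgpu_channel_setup_bind_args *args)
{
	args->flags |= NVGPU_CHANNEL_SETUP_BIND_FLAGS_USERMODE_SUPPORT |
		NVGPU_CHANNEL_SETUP_BIND_FLAGS_USERMODE_GPU_MAP_RESOURCES_SUPPORT;

	if (ioctl(ch_fd, NVGPU_IOCTL_CHANNEL_SETUP_BIND, args) != 0) {
		return -1;
	}

	/*
	 * On success the kernel fills in args->gpfifo_gpu_va,
	 * args->userd_gpu_va and args->usermode_mmio_gpu_va, so work can
	 * be submitted through the GPU-mapped 64 KB MMIO window.
	 */
	return 0;
}
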
@@ -61,6 +61,11 @@ NVGPU_COV_WHITELIST(false_positive, NVGPU_MISRA(Rule, 14_4), "Bug 2623654") \
	} while (false)
#endif

/**
 * Size required to submit work through MMIO.
 */
#define NVGPU_GPU_MMIO_SIZE SZ_64K

static int pd_allocate(struct vm_gk20a *vm,
		       struct nvgpu_gmmu_pd *pd,
		       const struct gk20a_mmu_level *l,

@@ -196,6 +201,140 @@ void nvgpu_gmmu_unmap(struct vm_gk20a *vm, struct nvgpu_mem *mem)
	nvgpu_gmmu_unmap_addr(vm, mem, mem->gpu_va);
}

int nvgpu_channel_setup_mmio_gpu_vas(struct gk20a *g,
				     struct nvgpu_channel *c,
				     u32 gpfifosize)
{
	int err = 0;
	struct nvgpu_sgt *sgt = NULL;
	struct vm_gk20a *vm = c->vm;
	u64 virtual_func_offset = 0U;

	/* Initialize the map sizes for userd, gpummio and gpfifo. */
	c->userd_va_mapsize = SZ_4K;
	c->gpfifo_va_mapsize = gpfifosize;

	sgt = nvgpu_sgt_create_from_mem(g, &c->usermode_userd);
	if (sgt == NULL) {
		return -ENOMEM;
	}

	c->userd_va = nvgpu_gmmu_map_va(vm, sgt, c->userd_va_mapsize,
			APERTURE_SYSMEM, 0);

	nvgpu_sgt_free(g, sgt);
	if (c->userd_va == 0U) {
		return -ENOMEM;
	}

	sgt = nvgpu_sgt_create_from_mem(g, &c->usermode_gpfifo);
	if (sgt == NULL) {
		goto free_userd_va;
	}

	c->gpfifo_va = nvgpu_gmmu_map_va(vm, sgt, gpfifosize, APERTURE_SYSMEM, 0);
	nvgpu_sgt_free(g, sgt);
	if (c->gpfifo_va == 0U) {
		goto free_userd_va;
	}

	nvgpu_mutex_acquire(&vm->gpu_mmio_va_map_lock);
	if (vm->gpummio_va == 0U) {
		virtual_func_offset = g->ops.usermode.base(g);
		vm->gpummio_va_mapsize = NVGPU_GPU_MMIO_SIZE;
		/*
		 * Create an SGT from the VF address with 64KB for the first channel.
		 */
		err = nvgpu_mem_create_from_phys(g, &vm->gpummio_mem,
				virtual_func_offset,
				vm->gpummio_va_mapsize / NVGPU_CPU_PAGE_SIZE);
		if (err < 0) {
			nvgpu_mutex_release(&vm->gpu_mmio_va_map_lock);
			goto free_gpfifo_va;
		}

		sgt = nvgpu_sgt_create_from_mem(g, &vm->gpummio_mem);
		if (sgt == NULL) {
			goto free_mem_and_release_lock;
		}

		vm->gpummio_va = nvgpu_gmmu_map_va(vm, sgt, vm->gpummio_va_mapsize,
				APERTURE_SYSMEM_COH, NVGPU_KIND_SMSKED_MESSAGE);
		nvgpu_sgt_free(g, sgt);
		if (vm->gpummio_va == 0U) {
			goto free_mem_and_release_lock;
		}
	}
	nvgpu_mutex_release(&vm->gpu_mmio_va_map_lock);
	return 0;
free_mem_and_release_lock:
	nvgpu_dma_free(g, &vm->gpummio_mem);
	nvgpu_mutex_release(&vm->gpu_mmio_va_map_lock);

free_gpfifo_va:
	nvgpu_gmmu_unmap_va(c->vm, c->gpfifo_va, c->gpfifo_va_mapsize);
	c->gpfifo_va = 0U;
free_userd_va:
	nvgpu_gmmu_unmap_va(c->vm, c->userd_va, c->userd_va_mapsize);
	c->userd_va = 0U;
	return -ENOMEM;
}

void nvgpu_channel_free_mmio_gpu_vas(struct gk20a *g,
				     struct nvgpu_channel *c)
{
	(void)g;

	if (c->gpfifo_va != 0U) {
		nvgpu_gmmu_unmap_va(c->vm, c->gpfifo_va, c->gpfifo_va_mapsize);
	}

	if (c->userd_va != 0U) {
		nvgpu_gmmu_unmap_va(c->vm, c->userd_va, c->userd_va_mapsize);
	}

	c->userd_va = 0U;
	c->gpfifo_va = 0U;
}

u64 nvgpu_gmmu_map_va(struct vm_gk20a *vm,
		      struct nvgpu_sgt *sgt,
		      u64 size,
		      enum nvgpu_aperture aperture,
		      u8 kind)
{
	struct gk20a *g = gk20a_from_vm(vm);
	u64 gpu_va = 0U;
	u64 vaddr = 0U;
	u64 buffer_offset = 0U;
	u32 ctag_offset = 0U;
	u32 flags = 0U;
	enum gk20a_mem_rw_flag rw_flag = 0;
	bool clear_ctags = false;
	bool sparse = false;
	bool priv = false;
	struct vm_gk20a_mapping_batch *batch = NULL;

	nvgpu_mutex_acquire(&vm->update_gmmu_lock);
	gpu_va = g->ops.mm.gmmu.map(vm, vaddr, sgt /* sg list */,
			buffer_offset, size, GMMU_PAGE_SIZE_SMALL, kind,
			ctag_offset, flags, rw_flag, clear_ctags,
			sparse, priv, batch, aperture);
	nvgpu_mutex_release(&vm->update_gmmu_lock);
	return gpu_va;
}

void nvgpu_gmmu_unmap_va(struct vm_gk20a *vm, u64 gpu_va, u64 size)
{
	struct gk20a *g = gk20a_from_vm(vm);

	nvgpu_mutex_acquire(&vm->update_gmmu_lock);
	g->ops.mm.gmmu.unmap(vm, gpu_va, size, GMMU_PAGE_SIZE_SMALL, false,
			gk20a_mem_flag_none, false, NULL);
	nvgpu_mutex_release(&vm->update_gmmu_lock);
}

int nvgpu_gmmu_init_page_table(struct vm_gk20a *vm)
{
	u32 pdb_size;

@@ -813,6 +813,7 @@ int nvgpu_vm_do_init(struct mm_gk20a *mm,
	vm->mapped_buffers = NULL;

	nvgpu_mutex_init(&vm->syncpt_ro_map_lock);
	nvgpu_mutex_init(&vm->gpu_mmio_va_map_lock);
	nvgpu_mutex_init(&vm->update_gmmu_lock);

	nvgpu_ref_init(&vm->ref);

@@ -838,6 +839,7 @@ int nvgpu_vm_do_init(struct mm_gk20a *mm,
clean_up_gmmu_lock:
	nvgpu_mutex_destroy(&vm->update_gmmu_lock);
	nvgpu_mutex_destroy(&vm->syncpt_ro_map_lock);
	nvgpu_mutex_destroy(&vm->gpu_mmio_va_map_lock);
#endif
clean_up_gpu_vm:
	if (g->ops.mm.vm_as_free_share != NULL) {

@@ -943,6 +945,16 @@ static void nvgpu_vm_remove(struct vm_gk20a *vm)
				vm->syncpt_ro_map_gpu_va);
	}

	nvgpu_mutex_acquire(&vm->gpu_mmio_va_map_lock);
	if (vm->gpummio_va != 0U) {
		nvgpu_gmmu_unmap_va(vm, vm->gpummio_va,
				vm->gpummio_va_mapsize);
		nvgpu_dma_free(g, &vm->gpummio_mem);
		vm->gpummio_va = 0U;
		vm->gpummio_va_mapsize = 0U;
	}
	nvgpu_mutex_release(&vm->gpu_mmio_va_map_lock);

	nvgpu_mutex_acquire(&vm->update_gmmu_lock);

	nvgpu_rbtree_enum_start(0, &node, vm->mapped_buffers);

@@ -988,6 +1000,7 @@ static void nvgpu_vm_remove(struct vm_gk20a *vm)
	nvgpu_mutex_destroy(&vm->update_gmmu_lock);

	nvgpu_mutex_destroy(&vm->syncpt_ro_map_lock);
	nvgpu_mutex_destroy(&vm->gpu_mmio_va_map_lock);
	nvgpu_kfree(g, vm);
}

@@ -63,21 +63,24 @@ struct nvgpu_runlist;
/**
 * Enable VPR support.
 */
#define NVGPU_SETUP_BIND_FLAGS_SUPPORT_VPR BIT32(0)
/**
 * Channel must have deterministic (and low) submit latency.
 * This flag is only valid for kernel mode submit.
 */
#define NVGPU_SETUP_BIND_FLAGS_SUPPORT_DETERMINISTIC BIT32(1)
/**
 * Enable replayable faults.
 */
#define NVGPU_SETUP_BIND_FLAGS_REPLAYABLE_FAULTS_ENABLE BIT32(2)
/**
 * Enable usermode submit (mutually exclusive with kernel_mode submit).
 */
#define NVGPU_SETUP_BIND_FLAGS_USERMODE_SUPPORT BIT32(3)
/**
 * Enable GPU MMIO support
 */
#define NVGPU_SETUP_BIND_FLAGS_USERMODE_GPU_MAP_RESOURCES_SUPPORT BIT32(4)
/**
 * Insert a wait on previous job's completion fence, before gpfifo entries.
 * See also #nvgpu_fence.

@@ -246,6 +249,9 @@ struct nvgpu_setup_bind_args {
	u32 gpfifo_dmabuf_fd;
	u64 gpfifo_dmabuf_offset;
	u32 work_submit_token;
	u64 gpfifo_gpu_va;
	u64 userd_gpu_va;
	u64 usermode_mmio_gpu_va;
	u32 flags;
};

@@ -581,6 +587,10 @@ struct nvgpu_channel {
	 */
	nvgpu_atomic_t sched_exit_wait_for_errbar_refcnt;
#endif
	u64 userd_va;
	u64 gpfifo_va;
	u64 userd_va_mapsize;
	u64 gpfifo_va_mapsize;
};

#ifdef CONFIG_NVGPU_KERNEL_MODE_SUBMIT

@@ -235,6 +235,8 @@ struct gk20a;
		"Multimedia engine support"), \
	DEFINE_FLAG(NVGPU_SUPPORT_SEMA_BASED_GPFIFO_GET, \
		"Semaphore based gpfifo get update support"), \
	DEFINE_FLAG(NVGPU_SUPPORT_GPU_MMIO, \
		"Support for work submit through GPUMMIO"), \
	DEFINE_FLAG(NVGPU_MAX_ENABLED_BITS, "Marks max number of flags"),

/**

@@ -46,6 +46,7 @@ struct nvgpu_mem;
struct nvgpu_sgt;
struct nvgpu_gmmu_pd;
struct vm_gk20a_mapping_batch;
struct nvgpu_channel;

/**
 * Small page size (4KB) index in the page size table

@@ -430,6 +431,81 @@ void nvgpu_gmmu_unmap_addr(struct vm_gk20a *vm,
 */
void nvgpu_gmmu_unmap(struct vm_gk20a *vm, struct nvgpu_mem *mem);

/**
 * @brief Map memory described by an SGT into the GMMU.
 * This is required to add the translations in the GPU page table
 * for the given channel.
 *
 * @param vm [in] Pointer to virtual memory structure.
 * @param sgt [in] Structure for storing the memory information.
 * @param size [in] Size to be mapped to GMMU.
 * @param aperture [in] Information about the type of the given memory.
 * @param kind [in] Kind to be used for mapping.
 *
 * GMMU map:
 * Acquire the VM GMMU lock to avoid races.
 * Call the core map routine to map the given SGT into the GMMU.
 * Release the VM GMMU lock.
 *
 * @return gpu_va.
 */
u64 nvgpu_gmmu_map_va(struct vm_gk20a *vm, struct nvgpu_sgt *sgt,
		u64 size, enum nvgpu_aperture aperture,
		u8 kind);

/**
 * @brief Unmap memory mapped by nvgpu_gmmu_map_va().
 * This is required to remove the translations from the GPU page table.
 *
 * @param vm [in] Pointer to virtual memory structure.
 * @param gpu_va [in] GPU virtual address.
 * @param size [in] Size to be unmapped from GMMU.
 *
 * GMMU unmap:
 * Acquire the VM GMMU lock to avoid races.
 * Call the core unmap routine to remove the translations from the GMMU.
 * Release the VM GMMU lock.
 *
 * @return None.
 */
void nvgpu_gmmu_unmap_va(struct vm_gk20a *vm, u64 gpu_va, u64 size);

/**
 * @brief Set up mappings in the GMMU to enable GPU work submission.
 *
 * @param g [in] Pointer to the super structure G.
 * @param c [in] Structure for storing the channel info.
 * @param gpfifosize [in] Size to create the GPU mapping for the gpfifo.
 *
 * Create the SGT from the channel's userd.
 * Call nvgpu_gmmu_map_va() to map the userd with 4K in the GMMU.
 * Create the SGT from the channel's gpfifo.
 * Call nvgpu_gmmu_map_va() to map the gpfifo with gpfifosize
 * in the GMMU.
 * Create the SGT from the channel's gpummio.
 * Call nvgpu_gmmu_map_va() to map the gpummio with 64K.
 *
 * @return 0 for success, < 0 for failure.
 */
int nvgpu_channel_setup_mmio_gpu_vas(struct gk20a *g,
		struct nvgpu_channel *c,
		u32 gpfifosize);

/**
 * @brief Free the mappings done by nvgpu_channel_setup_mmio_gpu_vas().
 *
 * @param g [in] Pointer to the super structure G.
 * @param c [in] Structure for storing the channel information.
 *
 * Free the mappings done by nvgpu_channel_setup_mmio_gpu_vas().
 *
 * @return None.
 */
void nvgpu_channel_free_mmio_gpu_vas(struct gk20a *g,
		struct nvgpu_channel *c);
/**
 * @brief Compute number of words in a PTE.
 *

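The comments above spell out the acquire/map/release sequence. As a concrete illustration, the sketch below pairs nvgpu_gmmu_map_va() with nvgpu_gmmu_unmap_va() for a channel's USERD buffer, mirroring the implementation added by this change. The wrapper names map_userd_for_mmio() and unmap_userd_for_mmio() are hypothetical; the nvgpu calls, fields, and constants are the ones this commit uses, and the required nvgpu headers are assumed to be included.

/*
 * Illustrative sketch only: map a channel's USERD memory into the GMMU
 * and undo it later. Not part of this commit.
 */
static int map_userd_for_mmio(struct gk20a *g, struct nvgpu_channel *c)
{
	struct nvgpu_sgt *sgt;

	/* Build a scatter-gather table describing the USERD memory. */
	sgt = nvgpu_sgt_create_from_mem(g, &c->usermode_userd);
	if (sgt == NULL) {
		return -ENOMEM;
	}

	/* Map 4 KB into the channel's VM; kind 0 selects the default kind. */
	c->userd_va_mapsize = SZ_4K;
	c->userd_va = nvgpu_gmmu_map_va(c->vm, sgt, c->userd_va_mapsize,
					APERTURE_SYSMEM, 0);
	nvgpu_sgt_free(g, sgt);

	if (c->userd_va == 0U) {
		return -ENOMEM;
	}
	return 0;
}

static void unmap_userd_for_mmio(struct nvgpu_channel *c)
{
	/* Remove the translation installed by map_userd_for_mmio(). */
	nvgpu_gmmu_unmap_va(c->vm, c->userd_va, c->userd_va_mapsize);
	c->userd_va = 0U;
}
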
@@ -338,6 +338,23 @@ struct vm_gk20a {
	 * Protect allocation of sync point map.
	 */
	struct nvgpu_mutex syncpt_ro_map_lock;
	/**
	 * GPU VA required to submit work through MMIO.
	 */
	u64 gpummio_va;
	/**
	 * Size of the gpummio mapping.
	 */
	u64 gpummio_va_mapsize;
	/**
	 * nvgpu_mem to store the physical address information.
	 */
	struct nvgpu_mem gpummio_mem;
	/**
	 * Mutex to protect the gpummio mappings.
	 */
	struct nvgpu_mutex gpu_mmio_va_map_lock;

};

/*

@@ -354,6 +371,7 @@ struct vm_gk20a {
#define NVGPU_VM_MAP_ACCESS_READ_ONLY 1U
#define NVGPU_VM_MAP_ACCESS_READ_WRITE 2U

#define NVGPU_KIND_SMSKED_MESSAGE 0xF
#define NVGPU_KIND_INVALID S16(-1)

/**

@@ -629,6 +629,9 @@ static u32 nvgpu_setup_bind_user_flags_to_common_flags(u32 user_flags)
	if (user_flags & NVGPU_CHANNEL_SETUP_BIND_FLAGS_USERMODE_SUPPORT)
		flags |= NVGPU_SETUP_BIND_FLAGS_USERMODE_SUPPORT;

	if (user_flags & NVGPU_CHANNEL_SETUP_BIND_FLAGS_USERMODE_GPU_MAP_RESOURCES_SUPPORT)
		flags |= NVGPU_SETUP_BIND_FLAGS_USERMODE_GPU_MAP_RESOURCES_SUPPORT;

	return flags;
}

@@ -1328,6 +1331,12 @@ long gk20a_channel_ioctl(struct file *filp,
		err = nvgpu_channel_setup_bind(ch, &setup_bind_args);
		channel_setup_bind_args->work_submit_token =
			setup_bind_args.work_submit_token;
		channel_setup_bind_args->gpfifo_gpu_va =
			setup_bind_args.gpfifo_gpu_va;
		channel_setup_bind_args->userd_gpu_va =
			setup_bind_args.userd_gpu_va;
		channel_setup_bind_args->usermode_mmio_gpu_va =
			setup_bind_args.usermode_mmio_gpu_va;
		gk20a_idle(ch->g);
		break;
	}

@@ -328,6 +328,8 @@ static struct nvgpu_flags_mapping flags_mapping[] = {
		NVGPU_SCHED_EXIT_WAIT_FOR_ERRBAR_SUPPORTED},
	{NVGPU_GPU_FLAGS_MULTI_PROCESS_TSG_SHARING,
		NVGPU_SUPPORT_MULTI_PROCESS_TSG_SHARING},
	{NVGPU_GPU_FLAGS_SUPPORT_GPU_MMIO,
		NVGPU_SUPPORT_GPU_MMIO},
};

static u64 nvgpu_ctrl_ioctl_gpu_characteristics_flags(struct gk20a *g)

@@ -494,6 +494,10 @@ void nvgpu_os_channel_free_usermode_buffers(struct nvgpu_channel *c)
	struct gk20a *g = c->g;
	struct device *dev = dev_from_gk20a(g);

	if (nvgpu_is_enabled(g, NVGPU_SUPPORT_GPU_MMIO)) {
		nvgpu_channel_free_mmio_gpu_vas(g, c);
	}

	if (priv->usermode.gpfifo.dmabuf != NULL) {
		nvgpu_mm_unpin(dev, priv->usermode.gpfifo.dmabuf,
				priv->usermode.gpfifo.attachment,

@@ -560,7 +564,19 @@ static int nvgpu_channel_alloc_usermode_buffers(struct nvgpu_channel *c,
		goto unmap_free_gpfifo;
	}

	if (nvgpu_is_enabled(g, NVGPU_SUPPORT_GPU_MMIO) &&
	    ((args->flags & NVGPU_SETUP_BIND_FLAGS_USERMODE_GPU_MAP_RESOURCES_SUPPORT) != 0U)) {
		err = nvgpu_channel_setup_mmio_gpu_vas(g, c, gpfifo_size);
		if (err < 0) {
			err = -ENOMEM;
			goto unmap_free_gpfifo;
		}
	}

	args->work_submit_token = g->ops.usermode.doorbell_token(c);
	args->gpfifo_gpu_va = c->gpfifo_va;
	args->userd_gpu_va = c->userd_va;
	args->usermode_mmio_gpu_va = c->vm->gpummio_va;

	return 0;
unmap_free_gpfifo: