gpu: nvgpu: Implement compbits mapping

Implement NVGPU_AS_IOCTL_GET_BUFFER_COMPBITS_INFO for requesting info
on compbits-mappable buffers; and NVGPU_AS_IOCTL_MAP_BUFFER_COMPBITS,
which enables mapping compbits to the GPU address space of said
buffers. This, in turn, enables moving the swizzling of comptags between
the GPU and CDEH/CDEV formats into userspace.
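
As an illustration, a minimal userspace sketch of how the two new ioctls
might be used together follows. The struct, field, and flag names come from
the UAPI additions in this change; the <linux/nvgpu.h> header path, the
device node, and the helper function itself are assumptions made for the
example, not part of this change.

/*
 * Hypothetical usage sketch, not part of this change. Assumes the nvgpu
 * UAPI definitions are available via <linux/nvgpu.h> and that as_fd is an
 * open nvgpu address-space fd (e.g. /dev/nvhost-as-gpu) into which
 * mapping_gva has already been mapped.
 */
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/nvgpu.h>

static int map_compbits_window(int as_fd, uint64_t mapping_gva,
			       uint64_t *compbits_gva)
{
	struct nvgpu_as_get_buffer_compbits_info_args info;
	struct nvgpu_as_map_buffer_compbits_args map;

	/* Query whether the buffer has comptags and whether they are mappable. */
	memset(&info, 0, sizeof(info));
	info.mapping_gva = mapping_gva;
	if (ioctl(as_fd, NVGPU_AS_IOCTL_GET_BUFFER_COMPBITS_INFO, &info))
		return -1;

	if (!(info.flags & NVGPU_AS_GET_BUFFER_COMPBITS_INFO_FLAGS_MAPPABLE))
		return -1;

	/* Map the compbits window; fixed-offset mapping is not supported yet. */
	memset(&map, 0, sizeof(map));
	map.mapping_gva = mapping_gva;
	if (ioctl(as_fd, NVGPU_AS_IOCTL_MAP_BUFFER_COMPBITS, &map))
		return -1;

	/* GVA of the read-only compbits window set up by the kernel. */
	*compbits_gva = map.compbits_win_gva;
	return 0;
}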

Compbits mapping is conservative and may map more than what is
strictly needed, for two reasons: 1) the mapping must be done with
small-page alignment (4kB), and 2) GPU comptags are swizzled all
around the aggregate cache line, which means that the whole cache line
must be visible even if only some comptag lines are required from
it. Cache line size is not necessarily a multiple of the small page
size.
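
For concreteness, the resulting window size can be sketched as below. This
mirrors the arithmetic added to gk20a_alloc_comptags() in this change, but
the hardware constants are made-up example values, not taken from any
particular chip (the kernel derives them from g->gr.* and g->ltc_count).

/*
 * Standalone sketch of the conservative compbits window sizing; the
 * constants are illustrative only.
 */
#include <stdint.h>
#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
#define ROUND_UP(x, a)		(DIV_ROUND_UP(x, a) * (a))

int main(void)
{
	const uint64_t small_pgsz = 4096;		/* compbits are mapped with 4kB pages */
	const uint64_t cacheline_size = 512;		/* example CBC cache line size per slice */
	const uint64_t slices_per_ltc = 2;		/* example */
	const uint64_t ltc_count = 2;			/* example */
	const uint64_t comptags_per_cacheline = 128;	/* example */
	const uint64_t ctag_lines = 300;		/* comptag lines used by the buffer */

	/* comptags are swizzled across the whole aggregate cache line */
	const uint64_t aggregate_cacheline_sz =
		cacheline_size * slices_per_ltc * ltc_count;

	/* whole cache lines must be visible, then round up to the page size */
	uint64_t win_size = DIV_ROUND_UP(ctag_lines, comptags_per_cacheline) *
			    aggregate_cacheline_sz;
	win_size = ROUND_UP(win_size, small_pgsz);

	/* 3 cache lines * 2048 B = 6144 B, rounded up to 8192 B */
	printf("compbits window size: %llu bytes\n",
	       (unsigned long long)win_size);
	return 0;
}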

Bug 200077571

Change-Id: I5ae88fe6b616e5ea37d3bff0dff46c07e9c9267e
Signed-off-by: Sami Kiminki <skiminki@nvidia.com>
Reviewed-on: http://git-master/r/719710
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Tested-by: Terje Bergstrom <tbergstrom@nvidia.com>
Author: Sami Kiminki
Date: 2015-03-19 21:28:34 +02:00
Committed by: Ishan Mittal
Parent: 069accc857
Commit: 520ff00e87
5 changed files with 323 additions and 7 deletions


@@ -1,7 +1,7 @@
/*
* GK20A Address Spaces
*
* Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2011-2015, NVIDIA CORPORATION. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
@@ -225,6 +225,31 @@ static int gk20a_as_ioctl_get_va_regions(
return 0;
}
static int gk20a_as_ioctl_get_buffer_compbits_info(
struct gk20a_as_share *as_share,
struct nvgpu_as_get_buffer_compbits_info_args *args)
{
gk20a_dbg_fn("");
return gk20a_vm_get_compbits_info(as_share->vm,
args->mapping_gva,
&args->compbits_win_size,
&args->compbits_win_ctagline,
&args->mapping_ctagline,
&args->flags);
}
static int gk20a_as_ioctl_map_buffer_compbits(
struct gk20a_as_share *as_share,
struct nvgpu_as_map_buffer_compbits_args *args)
{
gk20a_dbg_fn("");
return gk20a_vm_map_compbits(as_share->vm,
args->mapping_gva,
&args->compbits_win_gva,
&args->mapping_iova,
args->flags);
}
int gk20a_as_dev_open(struct inode *inode, struct file *filp)
{
struct gk20a_as_share *as_share;
@@ -334,6 +359,14 @@ long gk20a_as_dev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
err = gk20a_as_ioctl_get_va_regions(as_share,
(struct nvgpu_as_get_va_regions_args *)buf);
break;
case NVGPU_AS_IOCTL_GET_BUFFER_COMPBITS_INFO:
err = gk20a_as_ioctl_get_buffer_compbits_info(as_share,
(struct nvgpu_as_get_buffer_compbits_info_args *)buf);
break;
case NVGPU_AS_IOCTL_MAP_BUFFER_COMPBITS:
err = gk20a_as_ioctl_map_buffer_compbits(as_share,
(struct nvgpu_as_map_buffer_compbits_args *)buf);
break;
default:
dev_dbg(dev_from_gk20a(g), "unrecognized as ioctl: 0x%x", cmd);
err = -ENOTTY;


@@ -2016,8 +2016,13 @@ int gk20a_init_gpu_characteristics(struct gk20a *g)
gpu->max_ltc_per_fbp = g->ops.gr.get_max_ltc_per_fbp(g);
gpu->max_lts_per_ltc = g->ops.gr.get_max_lts_per_ltc(g);
g->ops.gr.get_rop_l2_en_mask(g);
gpu->reserved = 0;
gpu->gr_compbit_store_base_hw = g->gr.compbit_store.base_hw;
gpu->gr_gobs_per_comptagline_per_slice =
g->gr.gobs_per_comptagline_per_slice;
gpu->num_ltc = g->ltc_count;
gpu->lts_per_ltc = g->gr.slices_per_ltc;
gpu->cbc_cache_line_size = g->gr.cacheline_size;
gpu->cbc_comptags_per_line = g->gr.comptags_per_cacheline;
return 0;
}


@@ -221,7 +221,9 @@ static int gk20a_alloc_comptags(struct gk20a *g,
struct device *dev,
struct dma_buf *dmabuf,
struct gk20a_allocator *allocator,
u32 lines, bool user_mappable)
u32 lines, bool user_mappable,
u64 *ctag_map_win_size,
u32 *ctag_map_win_ctagline)
{
struct gk20a_dmabuf_priv *priv = dma_buf_get_drvdata(dmabuf, dev);
u32 offset = 0;
@@ -313,6 +315,8 @@ static int gk20a_alloc_comptags(struct gk20a *g,
first_unneeded_cacheline *
g->gr.comptags_per_cacheline;
u64 win_size;
if (needed_ctaglines < ctaglines_to_allocate) {
/* free alignment lines */
int tmp=
@@ -326,6 +330,14 @@ static int gk20a_alloc_comptags(struct gk20a *g,
ctaglines_to_allocate = needed_ctaglines;
}
*ctag_map_win_ctagline = offset;
win_size =
DIV_ROUND_UP(lines,
g->gr.comptags_per_cacheline) *
aggregate_cacheline_sz;
*ctag_map_win_size = round_up(win_size, small_pgsz);
}
priv->comptags.offset = offset;
@@ -1374,6 +1386,8 @@ u64 gk20a_vm_map(struct vm_gk20a *vm,
bool clear_ctags = false;
struct scatterlist *sgl;
u64 buf_addr;
u64 ctag_map_win_size = 0;
u32 ctag_map_win_ctagline = 0;
mutex_lock(&vm->update_gmmu_lock);
@@ -1501,7 +1515,9 @@ u64 gk20a_vm_map(struct vm_gk20a *vm,
/* allocate compression resources if needed */
err = gk20a_alloc_comptags(g, d, dmabuf, ctag_allocator,
bfr.ctag_lines, user_mappable);
bfr.ctag_lines, user_mappable,
&ctag_map_win_size,
&ctag_map_win_ctagline);
if (err) {
/* ok to fall back here if we ran out */
/* TBD: we can partially alloc ctags as well... */
@@ -1588,6 +1604,8 @@ u64 gk20a_vm_map(struct vm_gk20a *vm,
mapped_buffer->ctag_lines = bfr.ctag_lines;
mapped_buffer->ctag_allocated_lines = bfr.ctag_allocated_lines;
mapped_buffer->ctags_mappable = bfr.ctag_user_mappable;
mapped_buffer->ctag_map_win_size = ctag_map_win_size;
mapped_buffer->ctag_map_win_ctagline = ctag_map_win_ctagline;
mapped_buffer->vm = vm;
mapped_buffer->flags = flags;
mapped_buffer->kind = kind;
@@ -1640,6 +1658,140 @@ clean_up:
return 0;
}
int gk20a_vm_get_compbits_info(struct vm_gk20a *vm,
u64 mapping_gva,
u64 *compbits_win_size,
u32 *compbits_win_ctagline,
u32 *mapping_ctagline,
u32 *flags)
{
struct mapped_buffer_node *mapped_buffer;
struct device *d = dev_from_vm(vm);
mutex_lock(&vm->update_gmmu_lock);
mapped_buffer = find_mapped_buffer_locked(&vm->mapped_buffers, mapping_gva);
if (!mapped_buffer || !mapped_buffer->user_mapped) {
mutex_unlock(&vm->update_gmmu_lock);
gk20a_err(d, "%s: bad offset 0x%llx", __func__, mapping_gva);
return -EFAULT;
}
*compbits_win_size = 0;
*compbits_win_ctagline = 0;
*mapping_ctagline = 0;
*flags = 0;
if (mapped_buffer->ctag_offset)
*flags |= NVGPU_AS_GET_BUFFER_COMPBITS_INFO_FLAGS_HAS_COMPBITS;
if (mapped_buffer->ctags_mappable) {
*flags |= NVGPU_AS_GET_BUFFER_COMPBITS_INFO_FLAGS_MAPPABLE;
*compbits_win_size = mapped_buffer->ctag_map_win_size;
*compbits_win_ctagline = mapped_buffer->ctag_map_win_ctagline;
*mapping_ctagline = mapped_buffer->ctag_offset;
}
mutex_unlock(&vm->update_gmmu_lock);
return 0;
}
int gk20a_vm_map_compbits(struct vm_gk20a *vm,
u64 mapping_gva,
u64 *compbits_win_gva,
u64 *mapping_iova,
u32 flags)
{
struct mapped_buffer_node *mapped_buffer;
struct gk20a *g = gk20a_from_vm(vm);
struct device *d = dev_from_vm(vm);
if (flags & NVGPU_AS_MAP_BUFFER_COMPBITS_FLAGS_FIXED_OFFSET) {
/* This will be implemented later */
gk20a_err(d,
"%s: fixed-offset compbits mapping not yet supported",
__func__);
return -EFAULT;
}
mutex_lock(&vm->update_gmmu_lock);
mapped_buffer = find_mapped_buffer_locked(&vm->mapped_buffers, mapping_gva);
if (!mapped_buffer || !mapped_buffer->user_mapped) {
mutex_unlock(&vm->update_gmmu_lock);
gk20a_err(d, "%s: bad offset 0x%llx", __func__, mapping_gva);
return -EFAULT;
}
if (!mapped_buffer->ctags_mappable) {
mutex_unlock(&vm->update_gmmu_lock);
gk20a_err(d, "%s: comptags not mappable, offset 0x%llx", __func__, mapping_gva);
return -EFAULT;
}
if (!mapped_buffer->ctag_map_win_addr) {
const u32 small_pgsz_index = 0; /* small pages, 4K */
const u32 aggregate_cacheline_sz =
g->gr.cacheline_size * g->gr.slices_per_ltc *
g->ltc_count;
/* first aggregate cacheline to map */
u32 cacheline_start; /* inclusive */
/* offset of the start cacheline (will be page aligned) */
u64 cacheline_offset_start;
if (!mapped_buffer->ctag_map_win_size) {
mutex_unlock(&vm->update_gmmu_lock);
gk20a_err(d,
"%s: mapping 0x%llx does not have "
"mappable comptags",
__func__, mapping_gva);
return -EFAULT;
}
cacheline_start = mapped_buffer->ctag_offset /
g->gr.comptags_per_cacheline;
cacheline_offset_start =
cacheline_start * aggregate_cacheline_sz;
mapped_buffer->ctag_map_win_addr =
g->ops.mm.gmmu_map(
vm,
0,
g->gr.compbit_store.mem.sgt,
cacheline_offset_start, /* sg offset */
mapped_buffer->ctag_map_win_size, /* size */
small_pgsz_index,
0, /* kind */
0, /* ctag_offset */
NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
gk20a_mem_flag_read_only,
false,
false);
if (!mapped_buffer->ctag_map_win_addr) {
mutex_unlock(&vm->update_gmmu_lock);
gk20a_err(d,
"%s: failed to map comptags for mapping 0x%llx",
__func__, mapping_gva);
return -ENOMEM;
}
}
*mapping_iova = gk20a_mm_iova_addr(g, mapped_buffer->sgt->sgl, 0);
*compbits_win_gva = mapped_buffer->ctag_map_win_addr;
mutex_unlock(&vm->update_gmmu_lock);
return 0;
}
u64 gk20a_gmmu_map(struct vm_gk20a *vm,
struct sg_table **sgt,
u64 size,
@@ -2276,6 +2428,18 @@ void gk20a_vm_unmap_locked(struct mapped_buffer_node *mapped_buffer)
struct vm_gk20a *vm = mapped_buffer->vm;
struct gk20a *g = vm->mm->g;
if (mapped_buffer->ctag_map_win_addr) {
/* unmap compbits */
g->ops.mm.gmmu_unmap(vm,
mapped_buffer->ctag_map_win_addr,
mapped_buffer->ctag_map_win_size,
0, /* page size 4k */
true, /* va allocated */
gk20a_mem_flag_none,
false); /* not sparse */
}
g->ops.mm.gmmu_unmap(vm,
mapped_buffer->addr,
mapped_buffer->size,


@@ -186,7 +186,13 @@ struct mapped_buffer_node {
u32 ctag_offset;
u32 ctag_lines;
u32 ctag_allocated_lines;
/* For comptag mapping, these are the mapping window parameters */
bool ctags_mappable;
u64 ctag_map_win_addr; /* non-zero if mapped */
u64 ctag_map_win_size; /* non-zero if ctags_mappable */
u32 ctag_map_win_ctagline; /* ctagline at win start, set if
* ctags_mappable */
u32 flags;
u32 kind;
@@ -504,6 +510,19 @@ u64 gk20a_vm_map(struct vm_gk20a *vm,
u64 buffer_offset,
u64 mapping_size);
int gk20a_vm_get_compbits_info(struct vm_gk20a *vm,
u64 mapping_gva,
u64 *compbits_win_size,
u32 *compbits_win_ctagline,
u32 *mapping_ctagline,
u32 *flags);
int gk20a_vm_map_compbits(struct vm_gk20a *vm,
u64 mapping_gva,
u64 *compbits_win_gva,
u64 *mapping_iova,
u32 flags);
/* unmap handle from kernel */
void gk20a_vm_unmap(struct vm_gk20a *vm, u64 offset);