gpu: nvgpu: cde: CDE swizzling optimizations

Change CDE swizzling shader kernel size to 8x8 to avoid wasted work on
relatively small surfaces.

Map compbit backing store and destination surface as cacheable.

Clean up kernel size calculation.
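
For reference, a minimal sketch of the new kernel/grid sizing, written as
plain user-space C rather than driver code. The roundup helper and the
example surface size are assumptions for illustration only; the constants
mirror the values used in the diff below.

#include <stdio.h>

/* Assumed equivalent of the kernel's roundup(): round x up to a multiple of y. */
static int roundup_to(int x, int y)
{
	return ((x + y - 1) / y) * y;
}

int main(void)
{
	const int width = 1920, height = 1080;       /* hypothetical surface */
	const int xtiles = (width + 7) >> 3;         /* 8-pixel compbit tiles */
	const int ytiles = (height + 7) >> 3;
	const int wgx = 8, wgy = 8;                  /* new 8x8 work group size */
	const int compbits_per_byte = 4;             /* one byte stores 4 compbit pairs */
	const int xalign = compbits_per_byte * wgx;  /* tiles covered per group in x */
	const int yalign = wgy;                      /* tiles covered per group in y */
	const int gridw = roundup_to(xtiles, xalign) / xalign;
	const int gridh = roundup_to(ytiles, yalign) / yalign;

	printf("tiles (%d, %d) grid (%d, %d) invocations (%d, %d)\n",
	       xtiles, ytiles, gridw, gridh, gridw * wgx, gridh * wgy);
	return 0;
}

With an 8x8 group the grid is sized directly from the tile counts, so small
surfaces are no longer padded up to a 16-wide work group and the explicit
multiple-of-work-group check below becomes unnecessary.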

Bug 1546619

Change-Id: Ie97c019b4137d2f2230da6ba3034387b1ab1468a
Signed-off-by: Jussi Rasanen <jrasanen@nvidia.com>
Reviewed-on: http://git-master/r/501158
Reviewed-by: Arto Merilainen <amerilainen@nvidia.com>
Tested-by: Arto Merilainen <amerilainen@nvidia.com>
Author:       Jussi Rasanen
Date:         2014-09-22 12:05:23 +03:00
Committed by: Dan Willemsen
Parent:       ad39ba2b9e
Commit:       47298dae35


@@ -637,7 +637,8 @@ int gk20a_cde_convert(struct gk20a *g, struct dma_buf *src,
 	/* map the destination buffer */
 	get_dma_buf(dst); /* a ref for gk20a_vm_map */
 	dst_vaddr = gk20a_vm_map(g->cde_app.vm, dst, 0,
-				 0, dst_kind, NULL, true,
+				 NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
+				 dst_kind, NULL, true,
 				 gk20a_mem_flag_none,
 				 0, 0);
 	if (!dst_vaddr) {
@@ -654,7 +655,8 @@ int gk20a_cde_convert(struct gk20a *g, struct dma_buf *src,
 	/* map the source buffer to prevent premature release */
 	get_dma_buf(src); /* a ref for gk20a_vm_map */
 	src_vaddr = gk20a_vm_map(g->cde_app.vm, src, 0,
-				 0, dst_kind, NULL, true,
+				 NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
+				 dst_kind, NULL, true,
 				 gk20a_mem_flag_none,
 				 0, 0);
 	if (!src_vaddr) {
@@ -794,7 +796,8 @@ int gk20a_cde_load(struct gk20a_cde_ctx *cde_ctx)
 	/* map backing store to gpu virtual space */
 	vaddr = gk20a_gmmu_map(ch->vm, &gr->compbit_store.sgt,
-			       g->gr.compbit_store.size, 0,
+			       g->gr.compbit_store.size,
+			       NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
			       gk20a_mem_flag_read_only);
	if (!vaddr) {
@@ -991,16 +994,14 @@ static int gk20a_buffer_convert_gpu_to_cde(
 	const int transposed_height = transpose ? width : height;
 	const int xtiles = (transposed_width + 7) >> 3;
 	const int ytiles = (transposed_height + 7) >> 3;
-	const int wgx = 16;
+	const int wgx = 8;
 	const int wgy = 8;
 	const int compbits_per_byte = 4; /* one byte stores 4 compbit pairs */
 	const int dst_stride = 128; /* TODO chip constant */
 	const int xalign = compbits_per_byte * wgx;
 	const int yalign = wgy;
-	const int tilepitch = roundup(xtiles, xalign) / compbits_per_byte;
-	const int ytilesaligned = roundup(ytiles, yalign);
-	const int gridw = roundup(tilepitch, wgx) / wgx;
-	const int gridh = roundup(ytilesaligned, wgy) / wgy;
+	const int gridw = roundup(xtiles, xalign) / xalign;
+	const int gridh = roundup(ytiles, yalign) / yalign;
 
 	if (!g->cde_app.initialised) {
 		err = gk20a_cde_reload(g);
@@ -1015,17 +1016,10 @@ static int gk20a_buffer_convert_gpu_to_cde(
 	gk20a_dbg(gpu_dbg_cde, "w=%d, h=%d, bh_log2=%d, compbits_offset=0x%llx",
 		  width, height, block_height_log2, compbits_offset);
 	gk20a_dbg(gpu_dbg_cde, "resolution (%d, %d) tiles (%d, %d) invocations (%d, %d)",
-		  width, height, xtiles, ytiles, tilepitch, ytilesaligned);
+		  width, height, xtiles, ytiles, gridw*wgx, gridh*wgy);
 	gk20a_dbg(gpu_dbg_cde, "group (%d, %d) grid (%d, %d)",
 		  wgx, wgy, gridw, gridh);
 
-	if (tilepitch % wgx != 0 || ytilesaligned % wgy != 0) {
-		gk20a_warn(&g->dev->dev,
-			   "grid size (%d, %d) is not a multiple of work group size (%d, %d)",
-			   tilepitch, ytilesaligned, wgx, wgy);
-		return -EINVAL;
-	}
-
 	/* Write parameters */
 #define WRITE_PATCH(NAME, VALUE) \
 	params[param++] = (struct gk20a_cde_param){NAME##_ID, 0, VALUE}