diff --git a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
index ad2ee159c..3644c2ef3 100644
--- a/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/cde_gk20a.c
@@ -637,7 +637,8 @@ int gk20a_cde_convert(struct gk20a *g, struct dma_buf *src,
 	/* map the destination buffer */
 	get_dma_buf(dst); /* a ref for gk20a_vm_map */
 	dst_vaddr = gk20a_vm_map(g->cde_app.vm, dst, 0,
-				 0, dst_kind, NULL, true,
+				 NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
+				 dst_kind, NULL, true,
 				 gk20a_mem_flag_none, 0, 0);
 
 	if (!dst_vaddr) {
@@ -654,7 +655,8 @@ int gk20a_cde_convert(struct gk20a *g, struct dma_buf *src,
 	/* map the source buffer to prevent premature release */
 	get_dma_buf(src); /* a ref for gk20a_vm_map */
 	src_vaddr = gk20a_vm_map(g->cde_app.vm, src, 0,
-				 0, dst_kind, NULL, true,
+				 NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
+				 dst_kind, NULL, true,
 				 gk20a_mem_flag_none, 0, 0);
 
 	if (!src_vaddr) {
@@ -794,7 +796,8 @@ int gk20a_cde_load(struct gk20a_cde_ctx *cde_ctx)
 
 	/* map backing store to gpu virtual space */
 	vaddr = gk20a_gmmu_map(ch->vm, &gr->compbit_store.sgt,
-			       g->gr.compbit_store.size, 0,
+			       g->gr.compbit_store.size,
+			       NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
 			       gk20a_mem_flag_read_only);
 
 	if (!vaddr) {
@@ -991,16 +994,14 @@ static int gk20a_buffer_convert_gpu_to_cde(
 	const int transposed_height = transpose ? width : height;
 	const int xtiles = (transposed_width + 7) >> 3;
 	const int ytiles = (transposed_height + 7) >> 3;
-	const int wgx = 16;
+	const int wgx = 8;
 	const int wgy = 8;
 	const int compbits_per_byte = 4; /* one byte stores 4 compbit pairs */
 	const int dst_stride = 128; /* TODO chip constant */
 	const int xalign = compbits_per_byte * wgx;
 	const int yalign = wgy;
-	const int tilepitch = roundup(xtiles, xalign) / compbits_per_byte;
-	const int ytilesaligned = roundup(ytiles, yalign);
-	const int gridw = roundup(tilepitch, wgx) / wgx;
-	const int gridh = roundup(ytilesaligned, wgy) / wgy;
+	const int gridw = roundup(xtiles, xalign) / xalign;
+	const int gridh = roundup(ytiles, yalign) / yalign;
 
 	if (!g->cde_app.initialised) {
 		err = gk20a_cde_reload(g);
@@ -1015,17 +1016,10 @@ static int gk20a_buffer_convert_gpu_to_cde(
 	gk20a_dbg(gpu_dbg_cde, "w=%d, h=%d, bh_log2=%d, compbits_offset=0x%llx",
 		  width, height, block_height_log2, compbits_offset);
 	gk20a_dbg(gpu_dbg_cde, "resolution (%d, %d) tiles (%d, %d) invocations (%d, %d)",
-		  width, height, xtiles, ytiles, tilepitch, ytilesaligned);
+		  width, height, xtiles, ytiles, gridw*wgx, gridh*wgy);
 	gk20a_dbg(gpu_dbg_cde, "group (%d, %d) grid (%d, %d)",
 		  wgx, wgy, gridw, gridh);
 
-	if (tilepitch % wgx != 0 || ytilesaligned % wgy != 0) {
-		gk20a_warn(&g->dev->dev,
-			   "grid size (%d, %d) is not a multiple of work group size (%d, %d)",
-			   tilepitch, ytilesaligned, wgx, wgy);
-		return -EINVAL;
-	}
-
 	/* Write parameters */
 #define WRITE_PATCH(NAME, VALUE) \
 	params[param++] = (struct gk20a_cde_param){NAME##_ID, 0, VALUE}
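
For reference only (not part of the patch): the reworked launch-dimension arithmetic can be sanity-checked in isolation. The sketch below re-implements the new gridw/gridh expressions from gk20a_buffer_convert_gpu_to_cde() with a stand-in roundup() macro assumed to match the kernel's, for a hypothetical non-transposed 1920x1080 surface.

/*
 * Minimal user-space sketch of the new grid-size computation.
 * The 1920x1080 surface is a made-up example; roundup() is assumed
 * to behave like the kernel macro of the same name.
 */
#include <stdio.h>

#define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y))

int main(void)
{
	const int width = 1920, height = 1080;
	const int xtiles = (width + 7) >> 3;	/* 240 tiles, 8 pixels wide each */
	const int ytiles = (height + 7) >> 3;	/* 135 tiles, 8 pixels tall each */
	const int wgx = 8, wgy = 8;
	const int compbits_per_byte = 4;	/* one byte stores 4 compbit pairs */
	const int xalign = compbits_per_byte * wgx;	/* tiles covered per work group in x */
	const int yalign = wgy;			/* tiles covered per work group in y */
	const int gridw = roundup(xtiles, xalign) / xalign;	/* 8 */
	const int gridh = roundup(ytiles, yalign) / yalign;	/* 17 */

	printf("grid (%d, %d), invocations (%d, %d) for (%d, %d) tiles\n",
	       gridw, gridh, gridw * wgx, gridh * wgy, xtiles, ytiles);
	return 0;
}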