diff --git a/drivers/gpu/nvgpu/common/linux/driver_common.c b/drivers/gpu/nvgpu/common/linux/driver_common.c index b4e6a02c0..7c4645a8b 100644 --- a/drivers/gpu/nvgpu/common/linux/driver_common.c +++ b/drivers/gpu/nvgpu/common/linux/driver_common.c @@ -163,6 +163,8 @@ static void nvgpu_init_mm_vars(struct gk20a *g) platform->unified_memory); __nvgpu_set_enabled(g, NVGPU_MM_UNIFY_ADDRESS_SPACES, platform->unify_address_spaces); + __nvgpu_set_enabled(g, NVGPU_MM_CE_TSG_REQUIRED, + platform->tsg_required); nvgpu_mutex_init(&g->mm.tlb_lock); nvgpu_mutex_init(&g->mm.priv_lock); diff --git a/drivers/gpu/nvgpu/common/linux/pci.c b/drivers/gpu/nvgpu/common/linux/pci.c index 92764c215..401080ed5 100644 --- a/drivers/gpu/nvgpu/common/linux/pci.c +++ b/drivers/gpu/nvgpu/common/linux/pci.c @@ -240,6 +240,7 @@ static struct gk20a_platform nvgpu_pci_device[] = { .vbios_min_version = 0x88001e00, .hardcode_sw_threshold = false, .run_preos = true, + .tsg_required = true, }, { /* DEVICE=PG503 SKU 200 ES */ /* ptimer src frequency in hz */ @@ -274,6 +275,7 @@ static struct gk20a_platform nvgpu_pci_device[] = { .vbios_min_version = 0x88001e00, .hardcode_sw_threshold = false, .run_preos = true, + .tsg_required = true, } }; diff --git a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c index fdc1ac619..5314a1beb 100644 --- a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c @@ -28,6 +28,7 @@ #include "gk20a.h" #include +#include #include #include @@ -38,6 +39,14 @@ #include #include +/* + * Copy engine defines line size in pixels + */ +#define MAX_CE_SHIFT 31 /* 4Gpixels -1 */ +#define MAX_CE_MASK ((u32) (~(~0 << MAX_CE_SHIFT))) +#define MAX_CE_ALIGN(a) (a & MAX_CE_MASK) + + static u32 ce2_nonblockpipe_isr(struct gk20a *g, u32 fifo_intr) { gk20a_dbg(gpu_dbg_intr, "ce2 non-blocking pipe interrupt\n"); @@ -192,6 +201,10 @@ static void gk20a_ce_delete_gpu_context(struct gk20a_gpu_ctx *ce_ctx) nvgpu_dma_unmap_free(ce_ctx->vm, &ce_ctx->cmd_buf_mem); } + /* unbind tsg */ + if (ce_ctx->tsg && ce_ctx->ch) + gk20a_tsg_unbind_channel(ce_ctx->ch); + /* free the channel */ if (ce_ctx->ch) gk20a_channel_close(ce_ctx->ch); @@ -206,15 +219,32 @@ static void gk20a_ce_delete_gpu_context(struct gk20a_gpu_ctx *ce_ctx) nvgpu_kfree(ce_ctx->g, ce_ctx); } -static inline unsigned int gk20a_ce_get_method_size(int request_operation) +static inline unsigned int gk20a_ce_get_method_size(int request_operation, + u64 size) { /* failure size */ unsigned int methodsize = UINT_MAX; + unsigned int iterations = 0; + u32 shift; + u64 chunk = size; + u32 height, width; + + while (chunk) { + iterations++; + + shift = MAX_CE_ALIGN(chunk) ? __ffs(MAX_CE_ALIGN(chunk)) : + MAX_CE_SHIFT; + width = chunk >> shift; + height = 1 << shift; + width = MAX_CE_ALIGN(width); + + chunk -= (u64) height * width; + } if (request_operation & NVGPU_CE_PHYS_MODE_TRANSFER) - methodsize = 10 * 2 * sizeof(u32); + methodsize = (2 + (16 * iterations)) * sizeof(u32); else if (request_operation & NVGPU_CE_MEMSET) - methodsize = 9 * 2 * sizeof(u32); + methodsize = (2 + (15 * iterations)) * sizeof(u32); return methodsize; } @@ -243,10 +273,13 @@ static int gk20a_ce_prepare_submit(u64 src_buf, { u32 launch = 0; u32 methodSize = 0; + u64 offset = 0; + u64 chunk_size = 0; + u64 chunk = size; /* failure case handling */ - if ((gk20a_ce_get_method_size(request_operation) > max_cmd_buf_size) || - (!size) || + if ((gk20a_ce_get_method_size(request_operation, size) > + max_cmd_buf_size) || (!size) || (request_operation > NVGPU_CE_MEMSET)) return 0; @@ -254,83 +287,116 @@ static int gk20a_ce_prepare_submit(u64 src_buf, cmd_buf_cpu_va[methodSize++] = 0x20018000; cmd_buf_cpu_va[methodSize++] = dma_copy_class; - if (request_operation & NVGPU_CE_PHYS_MODE_TRANSFER) { - /* setup the source */ - cmd_buf_cpu_va[methodSize++] = 0x20018101; - cmd_buf_cpu_va[methodSize++] = (u64_lo32(src_buf) & - NVGPU_CE_LOWER_ADDRESS_OFFSET_MASK); + /* + * The purpose clear the memory in 2D rectangles. We get the ffs to + * determine the number of lines to copy. The only constraint is that + * maximum number of pixels per line is 4Gpix - 1, which is awkward for + * calculation, so we settle to 2Gpix per line to make calculatione + * more agreable + */ - cmd_buf_cpu_va[methodSize++] = 0x20018100; - cmd_buf_cpu_va[methodSize++] = (u64_hi32(src_buf) & - NVGPU_CE_UPPER_ADDRESS_OFFSET_MASK); + /* The copy engine in 2D mode can have (2^32 - 1) x (2^32 - 1) pixels in + * a single submit, we are going to try to clear a range of up to 2Gpix + * multiple lines. Because we want to copy byte aligned we will be + * setting 1 byte pixels */ - cmd_buf_cpu_va[methodSize++] = 0x20018098; - if (launch_flags & NVGPU_CE_SRC_LOCATION_LOCAL_FB) { - cmd_buf_cpu_va[methodSize++] = 0x00000000; - } else if (launch_flags & NVGPU_CE_SRC_LOCATION_NONCOHERENT_SYSMEM) { - cmd_buf_cpu_va[methodSize++] = 0x00000002; + /* + * per iteration + * <------------------------- 40 bits ------------------------------> + * 1 <------ ffs -------> + * <-----------up to 30 bits-----------> + */ + while (chunk) { + u32 width, height, shift; + + /* + * We will be aligning to bytes, making the maximum number of + * pix per line 2Gb + */ + + shift = MAX_CE_ALIGN(chunk) ? __ffs(MAX_CE_ALIGN(chunk)) : + MAX_CE_SHIFT; + height = chunk >> shift; + width = 1 << shift; + height = MAX_CE_ALIGN(height); + + chunk_size = (u64) height * width; + + /* reset launch flag */ + launch = 0; + + if (request_operation & NVGPU_CE_PHYS_MODE_TRANSFER) { + /* setup the source */ + cmd_buf_cpu_va[methodSize++] = 0x20028100; + cmd_buf_cpu_va[methodSize++] = (u64_hi32(src_buf + + offset) & NVGPU_CE_UPPER_ADDRESS_OFFSET_MASK); + cmd_buf_cpu_va[methodSize++] = (u64_lo32(src_buf + + offset) & NVGPU_CE_LOWER_ADDRESS_OFFSET_MASK); + + cmd_buf_cpu_va[methodSize++] = 0x20018098; + if (launch_flags & NVGPU_CE_SRC_LOCATION_LOCAL_FB) + cmd_buf_cpu_va[methodSize++] = 0x00000000; + else if (launch_flags & + NVGPU_CE_SRC_LOCATION_NONCOHERENT_SYSMEM) + cmd_buf_cpu_va[methodSize++] = 0x00000002; + else + cmd_buf_cpu_va[methodSize++] = 0x00000001; + + launch |= 0x00001000; + } else if (request_operation & NVGPU_CE_MEMSET) { + /* Remap from component A on 1 byte wide pixels */ + cmd_buf_cpu_va[methodSize++] = 0x200181c2; + cmd_buf_cpu_va[methodSize++] = 0x00000004; + + cmd_buf_cpu_va[methodSize++] = 0x200181c0; + cmd_buf_cpu_va[methodSize++] = payload; + + launch |= 0x00000400; } else { - cmd_buf_cpu_va[methodSize++] = 0x00000001; + /* Illegal size */ + return 0; } - launch |= 0x00001000; - } else if (request_operation & NVGPU_CE_MEMSET) { - cmd_buf_cpu_va[methodSize++] = 0x200181c2; - cmd_buf_cpu_va[methodSize++] = 0x00030004; + /* setup the destination/output */ + cmd_buf_cpu_va[methodSize++] = 0x20068102; + cmd_buf_cpu_va[methodSize++] = (u64_hi32(dst_buf + + offset) & NVGPU_CE_UPPER_ADDRESS_OFFSET_MASK); + cmd_buf_cpu_va[methodSize++] = (u64_lo32(dst_buf + + offset) & NVGPU_CE_LOWER_ADDRESS_OFFSET_MASK); + /* Pitch in/out */ + cmd_buf_cpu_va[methodSize++] = width; + cmd_buf_cpu_va[methodSize++] = width; + /* width and line count */ + cmd_buf_cpu_va[methodSize++] = width; + cmd_buf_cpu_va[methodSize++] = height; - cmd_buf_cpu_va[methodSize++] = 0x200181c0; - cmd_buf_cpu_va[methodSize++] = payload; + cmd_buf_cpu_va[methodSize++] = 0x20018099; + if (launch_flags & NVGPU_CE_DST_LOCATION_LOCAL_FB) + cmd_buf_cpu_va[methodSize++] = 0x00000000; + else if (launch_flags & + NVGPU_CE_DST_LOCATION_NONCOHERENT_SYSMEM) + cmd_buf_cpu_va[methodSize++] = 0x00000002; + else + cmd_buf_cpu_va[methodSize++] = 0x00000001; - launch |= 0x00000400; + launch |= 0x00002005; - /* converted into number of words */ - size /= sizeof(u32); + if (launch_flags & NVGPU_CE_SRC_MEMORY_LAYOUT_BLOCKLINEAR) + launch |= 0x00000000; + else + launch |= 0x00000080; + + if (launch_flags & NVGPU_CE_DST_MEMORY_LAYOUT_BLOCKLINEAR) + launch |= 0x00000000; + else + launch |= 0x00000100; + + cmd_buf_cpu_va[methodSize++] = 0x200180c0; + cmd_buf_cpu_va[methodSize++] = launch; + offset += chunk_size; + chunk -= chunk_size; } - /* setup the destination/output */ - cmd_buf_cpu_va[methodSize++] = 0x20018103; - cmd_buf_cpu_va[methodSize++] = (u64_lo32(dst_buf) & NVGPU_CE_LOWER_ADDRESS_OFFSET_MASK); - - cmd_buf_cpu_va[methodSize++] = 0x20018102; - cmd_buf_cpu_va[methodSize++] = (u64_hi32(dst_buf) & NVGPU_CE_UPPER_ADDRESS_OFFSET_MASK); - - cmd_buf_cpu_va[methodSize++] = 0x20018099; - if (launch_flags & NVGPU_CE_DST_LOCATION_LOCAL_FB) { - cmd_buf_cpu_va[methodSize++] = 0x00000000; - } else if (launch_flags & NVGPU_CE_DST_LOCATION_NONCOHERENT_SYSMEM) { - cmd_buf_cpu_va[methodSize++] = 0x00000002; - } else { - cmd_buf_cpu_va[methodSize++] = 0x00000001; - } - - launch |= 0x00002000; - - /* setup the format */ - cmd_buf_cpu_va[methodSize++] = 0x20018107; - cmd_buf_cpu_va[methodSize++] = 1; - cmd_buf_cpu_va[methodSize++] = 0x20018106; - cmd_buf_cpu_va[methodSize++] = u64_lo32(size); - - launch |= 0x00000004; - - if (launch_flags & NVGPU_CE_SRC_MEMORY_LAYOUT_BLOCKLINEAR) - launch |= 0x00000000; - else - launch |= 0x00000080; - - if (launch_flags & NVGPU_CE_DST_MEMORY_LAYOUT_BLOCKLINEAR) - launch |= 0x00000000; - else - launch |= 0x00000100; - - if (launch_flags & NVGPU_CE_DATA_TRANSFER_TYPE_NON_PIPELINED) - launch |= 0x00000002; - else - launch |= 0x00000001; - - cmd_buf_cpu_va[methodSize++] = 0x200180c0; - cmd_buf_cpu_va[methodSize++] = launch; - return methodSize; } @@ -457,6 +523,16 @@ u32 gk20a_ce_create_context_with_cb(struct gk20a *g, ce_ctx->vm = g->mm.ce.vm; + if (nvgpu_is_enabled(g, NVGPU_MM_CE_TSG_REQUIRED)) { + /* allocate a tsg if needed */ + ce_ctx->tsg = gk20a_tsg_open(g); + + if (!ce_ctx->tsg) { + nvgpu_err(g, "ce: gk20a tsg not available"); + goto end; + } + } + /* always kernel client needs privileged channel */ ce_ctx->ch = gk20a_open_new_channel_with_cb(g, gk20a_ce_finished_ctx_cb, ce_ctx, @@ -475,6 +551,14 @@ u32 gk20a_ce_create_context_with_cb(struct gk20a *g, goto end; } + if (nvgpu_is_enabled(g, NVGPU_MM_CE_TSG_REQUIRED)) { + err = gk20a_tsg_bind_channel(ce_ctx->tsg, ce_ctx->ch); + if (err) { + nvgpu_err(g, "ce: unable to bind to tsg"); + goto end; + } + } + /* allocate gpfifo (1024 should be more than enough) */ err = gk20a_channel_alloc_gpfifo(ce_ctx->ch, 1024, 0, 0); if (err) { diff --git a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.h b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.h index f1f9e260a..1dad8952d 100644 --- a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.h @@ -36,8 +36,8 @@ int gk20a_ce2_nonstall_isr(struct gk20a *g, u32 inst_id, u32 pri_base); #define NVGPU_CE_LOWER_ADDRESS_OFFSET_MASK 0xffffffff #define NVGPU_CE_UPPER_ADDRESS_OFFSET_MASK 0xff -#define NVGPU_CE_COMMAND_BUF_SIZE 4096 -#define NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF 128 +#define NVGPU_CE_COMMAND_BUF_SIZE 8192 +#define NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF 256 #define NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING 8 typedef void (*ce_event_callback)(u32 ce_ctx_id, u32 ce_event_flag); @@ -108,6 +108,9 @@ struct gk20a_gpu_ctx { int gpu_ctx_state; ce_event_callback user_event_callback; + /* tsg related data */ + struct tsg_gk20a *tsg; + /* channel related data */ struct channel_gk20a *ch; struct vm_gk20a *vm; diff --git a/drivers/gpu/nvgpu/gk20a/platform_gk20a.h b/drivers/gpu/nvgpu/gk20a/platform_gk20a.h index c2c73b9c6..d4ff17f37 100644 --- a/drivers/gpu/nvgpu/gk20a/platform_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/platform_gk20a.h @@ -233,6 +233,9 @@ struct gk20a_platform { /* unified or split memory with separate vidmem? */ bool unified_memory; + /* true if all channels must be in TSG */ + bool tsg_required; + /* minimum supported VBIOS version */ u32 vbios_min_version; diff --git a/drivers/gpu/nvgpu/include/nvgpu/enabled.h b/drivers/gpu/nvgpu/include/nvgpu/enabled.h index 41758fe70..8c0bb9d33 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/enabled.h +++ b/drivers/gpu/nvgpu/include/nvgpu/enabled.h @@ -42,6 +42,8 @@ struct gk20a; #define NVGPU_MM_HONORS_APERTURE 17 /* unified or split memory with separate vidmem? */ #define NVGPU_MM_UNIFIED_MEMORY 18 +/* kernel mode ce vidmem clearing channels need to be in a tsg */ +#define NVGPU_MM_CE_TSG_REQUIRED 19 /* * Security flags