diff --git a/drivers/gpu/nvgpu/common/linux/ce2.c b/drivers/gpu/nvgpu/common/linux/ce2.c index 97dc6678b..7cb393821 100644 --- a/drivers/gpu/nvgpu/common/linux/ce2.c +++ b/drivers/gpu/nvgpu/common/linux/ce2.c @@ -54,7 +54,6 @@ int gk20a_ce_execute_ops(struct gk20a *g, u64 cmd_buf_gpu_va = 0; u32 methodSize; u32 cmd_buf_read_offset; - u32 fence_index; u32 dma_copy_class; struct nvgpu_gpfifo gpfifo; struct nvgpu_fence fence = {0,0}; @@ -87,38 +86,22 @@ int gk20a_ce_execute_ops(struct gk20a *g, nvgpu_mutex_acquire(&ce_ctx->gpu_ctx_mutex); - ce_ctx->cmd_buf_read_queue_offset %= ce_ctx->cmd_buf_end_queue_offset; + ce_ctx->cmd_buf_read_queue_offset %= NVGPU_CE_MAX_INFLIGHT_JOBS; cmd_buf_read_offset = (ce_ctx->cmd_buf_read_queue_offset * - (NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF / sizeof(u32))); - - /* at end of command buffer has gk20a_fence for command buffer sync */ - fence_index = (cmd_buf_read_offset + - ((NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF / sizeof(u32)) - - (NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING / sizeof(u32)))); - - if (sizeof(struct gk20a_fence *) > NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING) { - ret = -ENOMEM; - goto noop; - } + (NVGPU_CE_MAX_COMMAND_BUFF_BYTES_PER_KICKOFF / sizeof(u32))); cmd_buf_cpu_va = (u32 *)ce_ctx->cmd_buf_mem.cpu_va; - /* 0 is treated as invalid pre-sync */ - if (cmd_buf_cpu_va[fence_index]) { - struct gk20a_fence * ce_cmd_buf_fence_in = NULL; + if (ce_ctx->postfences[ce_ctx->cmd_buf_read_queue_offset]) { + struct gk20a_fence **prev_post_fence = + &ce_ctx->postfences[ce_ctx->cmd_buf_read_queue_offset]; - memcpy((void *)&ce_cmd_buf_fence_in, - (void *)(cmd_buf_cpu_va + fence_index), - sizeof(struct gk20a_fence *)); - ret = gk20a_fence_wait(g, ce_cmd_buf_fence_in, + ret = gk20a_fence_wait(g, *prev_post_fence, gk20a_get_gr_idle_timeout(g)); - gk20a_fence_put(ce_cmd_buf_fence_in); - /* Reset the stored last pre-sync */ - memset((void *)(cmd_buf_cpu_va + fence_index), - 0, - NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING); + gk20a_fence_put(*prev_post_fence); + *prev_post_fence = NULL; if (ret) goto noop; } @@ -130,7 +113,7 @@ int gk20a_ce_execute_ops(struct gk20a *g, dst_buf, size, &cmd_buf_cpu_va[cmd_buf_read_offset], - NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF, + NVGPU_CE_MAX_COMMAND_BUFF_BYTES_PER_KICKOFF, payload, gk20a_get_valid_launch_flags(g, launch_flags), request_operation, @@ -154,10 +137,8 @@ int gk20a_ce_execute_ops(struct gk20a *g, &ce_cmd_buf_fence_out, false, NULL); if (!ret) { - memcpy((void *)(cmd_buf_cpu_va + fence_index), - (void *)&ce_cmd_buf_fence_out, - sizeof(struct gk20a_fence *)); - + ce_ctx->postfences[ce_ctx->cmd_buf_read_queue_offset] = + ce_cmd_buf_fence_out; if (gk20a_fence_out) { gk20a_fence_get(ce_cmd_buf_fence_out); *gk20a_fence_out = ce_cmd_buf_fence_out; diff --git a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c index c4fcca3c5..188789917 100644 --- a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.c @@ -103,39 +103,15 @@ int gk20a_ce2_nonstall_isr(struct gk20a *g, u32 inst_id, u32 pri_base) } /* static CE app api */ -static void gk20a_ce_free_command_buffer_stored_fence(struct gk20a_gpu_ctx *ce_ctx) +static void gk20a_ce_put_fences(struct gk20a_gpu_ctx *ce_ctx) { - u32 cmd_buf_index; - u32 cmd_buf_read_offset; - u32 fence_index; - u32 *cmd_buf_cpu_va; + u32 i; - for (cmd_buf_index = 0; - cmd_buf_index < ce_ctx->cmd_buf_end_queue_offset; - cmd_buf_index++) { - cmd_buf_read_offset = (cmd_buf_index * - (NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF / sizeof(u32))); - - /* at end of command buffer has gk20a_fence for command buffer sync */ - fence_index = (cmd_buf_read_offset + - ((NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF / sizeof(u32)) - - (NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING / sizeof(u32)))); - - cmd_buf_cpu_va = (u32 *)ce_ctx->cmd_buf_mem.cpu_va; - - /* 0 is treated as invalid pre-sync */ - if (cmd_buf_cpu_va[fence_index]) { - struct gk20a_fence * ce_cmd_buf_fence_in = NULL; - - memcpy((void *)&ce_cmd_buf_fence_in, - (void *)(cmd_buf_cpu_va + fence_index), - sizeof(struct gk20a_fence *)); - gk20a_fence_put(ce_cmd_buf_fence_in); - /* Reset the stored last pre-sync */ - memset((void *)(cmd_buf_cpu_va + fence_index), - 0, - NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING); - } + for (i = 0; i < NVGPU_CE_MAX_INFLIGHT_JOBS; i++) { + struct gk20a_fence **fence = &ce_ctx->postfences[i]; + if (*fence) + gk20a_fence_put(*fence); + *fence = NULL; } } @@ -148,8 +124,8 @@ static void gk20a_ce_delete_gpu_context(struct gk20a_gpu_ctx *ce_ctx) nvgpu_mutex_acquire(&ce_ctx->gpu_ctx_mutex); - if (ce_ctx->cmd_buf_mem.cpu_va) { - gk20a_ce_free_command_buffer_stored_fence(ce_ctx); + if (nvgpu_mem_is_valid(&ce_ctx->cmd_buf_mem)) { + gk20a_ce_put_fences(ce_ctx); nvgpu_dma_unmap_free(ce_ctx->vm, &ce_ctx->cmd_buf_mem); } @@ -449,8 +425,6 @@ u32 gk20a_ce_create_context(struct gk20a *g, ce_ctx->g = g; ce_ctx->cmd_buf_read_queue_offset = 0; - ce_ctx->cmd_buf_end_queue_offset = - (NVGPU_CE_COMMAND_BUF_SIZE / NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF); ce_ctx->vm = g->mm.ce.vm; @@ -491,8 +465,11 @@ u32 gk20a_ce_create_context(struct gk20a *g, goto end; } - /* allocate command buffer (4096 should be more than enough) from sysmem*/ - err = nvgpu_dma_alloc_map_sys(ce_ctx->vm, NVGPU_CE_COMMAND_BUF_SIZE, &ce_ctx->cmd_buf_mem); + /* allocate command buffer from sysmem */ + err = nvgpu_dma_alloc_map_sys(ce_ctx->vm, + NVGPU_CE_MAX_INFLIGHT_JOBS * + NVGPU_CE_MAX_COMMAND_BUFF_BYTES_PER_KICKOFF, + &ce_ctx->cmd_buf_mem); if (err) { nvgpu_err(g, "ce: could not allocate command buffer for CE context"); diff --git a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.h b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.h index 0b475f656..1a102070f 100644 --- a/drivers/gpu/nvgpu/gk20a/ce2_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/ce2_gk20a.h @@ -36,9 +36,8 @@ int gk20a_ce2_nonstall_isr(struct gk20a *g, u32 inst_id, u32 pri_base); #define NVGPU_CE_LOWER_ADDRESS_OFFSET_MASK 0xffffffff #define NVGPU_CE_UPPER_ADDRESS_OFFSET_MASK 0xff -#define NVGPU_CE_COMMAND_BUF_SIZE 8192 -#define NVGPU_CE_MAX_COMMAND_BUFF_SIZE_PER_KICKOFF 256 -#define NVGPU_CE_MAX_COMMAND_BUFF_SIZE_FOR_TRACING 8 +#define NVGPU_CE_MAX_INFLIGHT_JOBS 32 +#define NVGPU_CE_MAX_COMMAND_BUFF_BYTES_PER_KICKOFF 256 /* dma launch_flags */ enum { @@ -106,11 +105,11 @@ struct gk20a_gpu_ctx { /* cmd buf mem_desc */ struct nvgpu_mem cmd_buf_mem; + struct gk20a_fence *postfences[NVGPU_CE_MAX_INFLIGHT_JOBS]; struct nvgpu_list_node list; u32 cmd_buf_read_queue_offset; - u32 cmd_buf_end_queue_offset; }; static inline struct gk20a_gpu_ctx *