From a747e3a3ba8fee8323242bd594d571274c1a1497 Mon Sep 17 00:00:00 2001 From: Vinod G Date: Wed, 7 Nov 2018 10:05:07 -0800 Subject: [PATCH] gpu: nvgpu: RTV cb support for gfxp Add support for a new buffer used by graphics preemption (gfxp) on Turing. Add new HAL ops to allocate and commit the RTV circular buffer for gfxp. Add a TU104-specific implementation of the free_gr_ctx HAL op that also frees this buffer. JIRA NVGPUT-98 Change-Id: I4396fd50288db55da5f924fefa96a2e3d170094b Signed-off-by: Vinod G Reviewed-on: https://git-master.nvidia.com/r/1944975 Reviewed-by: mobile promotions Tested-by: mobile promotions --- drivers/gpu/nvgpu/gk20a/gr_gk20a.h | 5 +- drivers/gpu/nvgpu/gm20b/hal_gm20b.c | 2 + drivers/gpu/nvgpu/gp106/hal_gp106.c | 2 + drivers/gpu/nvgpu/gp10b/gr_gp10b.c | 10 +++ drivers/gpu/nvgpu/gp10b/hal_gp10b.c | 2 + drivers/gpu/nvgpu/gv100/hal_gv100.c | 2 + drivers/gpu/nvgpu/gv11b/gr_gv11b.c | 4 + drivers/gpu/nvgpu/gv11b/hal_gv11b.c | 2 + drivers/gpu/nvgpu/include/nvgpu/gk20a.h | 4 + .../include/nvgpu/hw/tu104/hw_gr_tu104.h | 12 +++ drivers/gpu/nvgpu/tu104/gr_tu104.c | 76 ++++++++++++++++++- drivers/gpu/nvgpu/tu104/gr_tu104.h | 14 +++- drivers/gpu/nvgpu/tu104/hal_tu104.c | 14 +++- 13 files changed, 137 insertions(+), 12 deletions(-) diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.h b/drivers/gpu/nvgpu/gk20a/gr_gk20a.h index 98fe3301e..5f4fb6791 100644 --- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.h @@ -81,7 +81,7 @@ enum /* global_ctx_buffer */ { ATTRIBUTE_VPR = 5, GOLDEN_CTX = 6, PRIV_ACCESS_MAP = 7, - /* #8 is reserved */ + RTV_CIRCULAR_BUFFER = 8, FECS_TRACE_BUFFER = 9, NR_GLOBAL_CTX_BUF = 10 }; @@ -93,7 +93,7 @@ enum /*global_ctx_buffer_va */ { ATTRIBUTE_VA = 2, GOLDEN_CTX_VA = 3, PRIV_ACCESS_MAP_VA = 4, - /* #5 is reserved */ + RTV_CIRCULAR_BUFFER_VA = 5, FECS_TRACE_BUFFER_VA = 6, NR_GLOBAL_CTX_BUF_VA = 7 }; @@ -403,6 +403,7 @@ struct nvgpu_gr_ctx { struct nvgpu_mem spill_ctxsw_buffer; struct nvgpu_mem betacb_ctxsw_buffer; struct nvgpu_mem pagepool_ctxsw_buffer; + struct nvgpu_mem 
gfxp_rtvcb_ctxsw_buffer; u32 ctx_id; bool ctx_id_valid; bool cilp_preempt_pending; diff --git a/drivers/gpu/nvgpu/gm20b/hal_gm20b.c b/drivers/gpu/nvgpu/gm20b/hal_gm20b.c index 9fb2ab46d..3ff567165 100644 --- a/drivers/gpu/nvgpu/gm20b/hal_gm20b.c +++ b/drivers/gpu/nvgpu/gm20b/hal_gm20b.c @@ -333,6 +333,8 @@ static const struct gpu_ops gm20b_ops = { .dump_gr_falcon_stats = gk20a_fecs_dump_falcon_stats, .get_fecs_ctx_state_store_major_rev_id = gk20a_gr_get_fecs_ctx_state_store_major_rev_id, + .alloc_gfxp_rtv_cb = NULL, + .commit_gfxp_rtv_cb = NULL, }, .fb = { .init_hw = gm20b_fb_init_hw, diff --git a/drivers/gpu/nvgpu/gp106/hal_gp106.c b/drivers/gpu/nvgpu/gp106/hal_gp106.c index c414f0fdb..412649f08 100644 --- a/drivers/gpu/nvgpu/gp106/hal_gp106.c +++ b/drivers/gpu/nvgpu/gp106/hal_gp106.c @@ -410,6 +410,8 @@ static const struct gpu_ops gp106_ops = { .dump_gr_falcon_stats = gk20a_fecs_dump_falcon_stats, .get_fecs_ctx_state_store_major_rev_id = gk20a_gr_get_fecs_ctx_state_store_major_rev_id, + .alloc_gfxp_rtv_cb = NULL, + .commit_gfxp_rtv_cb = NULL, }, .fb = { .init_hw = gm20b_fb_init_hw, diff --git a/drivers/gpu/nvgpu/gp10b/gr_gp10b.c b/drivers/gpu/nvgpu/gp10b/gr_gp10b.c index fb6513d43..ffefb32b7 100644 --- a/drivers/gpu/nvgpu/gp10b/gr_gp10b.c +++ b/drivers/gpu/nvgpu/gp10b/gr_gp10b.c @@ -1036,6 +1036,14 @@ int gr_gp10b_set_ctxsw_preemption_mode(struct gk20a *g, goto fail_free_betacb; } + if (g->ops.gr.alloc_gfxp_rtv_cb != NULL) { + err = g->ops.gr.alloc_gfxp_rtv_cb(g, gr_ctx, vm); + if (err != 0) { + nvgpu_err(g, "cannot allocate gfxp rtv_cb"); + goto fail_free_pagepool; + } + } + gr_ctx->graphics_preempt_mode = graphics_preempt_mode; break; } @@ -1063,6 +1071,8 @@ int gr_gp10b_set_ctxsw_preemption_mode(struct gk20a *g, return 0; +fail_free_pagepool: + nvgpu_dma_unmap_free(vm, &gr_ctx->pagepool_ctxsw_buffer); fail_free_betacb: nvgpu_dma_unmap_free(vm, &gr_ctx->betacb_ctxsw_buffer); fail_free_spill: diff --git a/drivers/gpu/nvgpu/gp10b/hal_gp10b.c 
b/drivers/gpu/nvgpu/gp10b/hal_gp10b.c index f2e0ed5d4..3f0ad1659 100644 --- a/drivers/gpu/nvgpu/gp10b/hal_gp10b.c +++ b/drivers/gpu/nvgpu/gp10b/hal_gp10b.c @@ -371,6 +371,8 @@ static const struct gpu_ops gp10b_ops = { .dump_gr_falcon_stats = gk20a_fecs_dump_falcon_stats, .get_fecs_ctx_state_store_major_rev_id = gk20a_gr_get_fecs_ctx_state_store_major_rev_id, + .alloc_gfxp_rtv_cb = NULL, + .commit_gfxp_rtv_cb = NULL, }, .fb = { .init_hw = gm20b_fb_init_hw, diff --git a/drivers/gpu/nvgpu/gv100/hal_gv100.c b/drivers/gpu/nvgpu/gv100/hal_gv100.c index 60da8dd6b..52ab46328 100644 --- a/drivers/gpu/nvgpu/gv100/hal_gv100.c +++ b/drivers/gpu/nvgpu/gv100/hal_gv100.c @@ -491,6 +491,8 @@ static const struct gpu_ops gv100_ops = { .dump_gr_falcon_stats = gk20a_fecs_dump_falcon_stats, .get_fecs_ctx_state_store_major_rev_id = gk20a_gr_get_fecs_ctx_state_store_major_rev_id, + .alloc_gfxp_rtv_cb = NULL, + .commit_gfxp_rtv_cb = NULL, }, .fb = { .init_hw = gv11b_fb_init_hw, diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c index a0b373867..f6cc2117b 100644 --- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c +++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c @@ -1771,6 +1771,10 @@ void gr_gv11b_update_ctxsw_preemption_mode(struct gk20a *g, g->gr.gfxp_wfi_timeout_count, true); + if (g->ops.gr.commit_gfxp_rtv_cb != NULL) { + g->ops.gr.commit_gfxp_rtv_cb(g, gr_ctx, true); + } + gr_gk20a_ctx_patch_write_end(g, gr_ctx, true); } diff --git a/drivers/gpu/nvgpu/gv11b/hal_gv11b.c b/drivers/gpu/nvgpu/gv11b/hal_gv11b.c index 2e5e4f9a7..8c80f59f2 100644 --- a/drivers/gpu/nvgpu/gv11b/hal_gv11b.c +++ b/drivers/gpu/nvgpu/gv11b/hal_gv11b.c @@ -453,6 +453,8 @@ static const struct gpu_ops gv11b_ops = { .dump_gr_falcon_stats = gk20a_fecs_dump_falcon_stats, .get_fecs_ctx_state_store_major_rev_id = gk20a_gr_get_fecs_ctx_state_store_major_rev_id, + .alloc_gfxp_rtv_cb = NULL, + .commit_gfxp_rtv_cb = NULL, }, .fb = { .init_hw = gv11b_fb_init_hw, diff --git 
a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h index 1c2651d97..c9d0839d4 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h +++ b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h @@ -535,6 +535,10 @@ struct gpu_ops { void (*set_debug_mode)(struct gk20a *g, bool enable); void (*dump_gr_falcon_stats)(struct gk20a *g); u32 (*get_fecs_ctx_state_store_major_rev_id)(struct gk20a *g); + int (*alloc_gfxp_rtv_cb)(struct gk20a *g, + struct nvgpu_gr_ctx *gr_ctx, struct vm_gk20a *vm); + void (*commit_gfxp_rtv_cb)(struct gk20a *g, + struct nvgpu_gr_ctx *gr_ctx, bool patch); } gr; struct { void (*init_hw)(struct gk20a *g); diff --git a/drivers/gpu/nvgpu/include/nvgpu/hw/tu104/hw_gr_tu104.h b/drivers/gpu/nvgpu/include/nvgpu/hw/tu104/hw_gr_tu104.h index 9fb9b7b93..83fa99889 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/hw/tu104/hw_gr_tu104.h +++ b/drivers/gpu/nvgpu/include/nvgpu/hw/tu104/hw_gr_tu104.h @@ -2302,6 +2302,14 @@ static inline u32 gr_scc_rm_rtv_cb_size_div_256b_f(u32 v) { return (v & 0x7fffU) << 0U; } +static inline u32 gr_scc_rm_rtv_cb_size_div_256b_byte_granularity_v(void) +{ + return 0x00000100U; +} +static inline u32 gr_scc_rm_rtv_cb_size_div_256b_init_f(void) +{ + return 0x0U; +} static inline u32 gr_scc_rm_rtv_cb_size_div_256b_default_f(void) { return 0x800U; @@ -2310,6 +2318,10 @@ static inline u32 gr_scc_rm_rtv_cb_size_div_256b_db_adder_f(void) { return 0x0U; } +static inline u32 gr_scc_rm_rtv_cb_size_div_256b_gfxp_adder_f(void) +{ + return 0x20U; +} static inline u32 gr_gpcs_gcc_rm_rtv_cb_base_r(void) { return 0x00419034U; diff --git a/drivers/gpu/nvgpu/tu104/gr_tu104.c b/drivers/gpu/nvgpu/tu104/gr_tu104.c index ca2adafcc..59354d00f 100644 --- a/drivers/gpu/nvgpu/tu104/gr_tu104.c +++ b/drivers/gpu/nvgpu/tu104/gr_tu104.c @@ -200,7 +200,7 @@ clean_up: static void gr_tu104_commit_rtv_circular_buffer(struct gk20a *g, struct nvgpu_gr_ctx *gr_ctx, - u64 addr, u32 size, bool patch) + u64 addr, u32 size, u32 gfxpAddSize, bool patch) { 
gr_gk20a_ctx_patch_write(g, gr_ctx, gr_scc_rm_rtv_cb_base_r(), gr_scc_rm_rtv_cb_base_addr_39_8_f(addr), patch); @@ -209,7 +209,8 @@ static void gr_tu104_commit_rtv_circular_buffer(struct gk20a *g, gr_gk20a_ctx_patch_write(g, gr_ctx, gr_gpcs_gcc_rm_rtv_cb_base_r(), gr_gpcs_gcc_rm_rtv_cb_base_addr_39_8_f(addr), patch); gr_gk20a_ctx_patch_write(g, gr_ctx, gr_scc_rm_gfxp_reserve_r(), - gr_scc_rm_gfxp_reserve_rtv_cb_size_div_256b_f(0), patch); + gr_scc_rm_gfxp_reserve_rtv_cb_size_div_256b_f(gfxpAddSize), + patch); } int gr_tu104_commit_global_ctx_buffers(struct gk20a *g, @@ -218,6 +219,9 @@ int gr_tu104_commit_global_ctx_buffers(struct gk20a *g, int err; u64 addr; u32 size; + u32 gfxpaddsize = 0; + + nvgpu_log_fn(g, " "); err = gr_gk20a_commit_global_ctx_buffers(g, gr_ctx, patch); if (err != 0) { @@ -239,7 +243,8 @@ int gr_tu104_commit_global_ctx_buffers(struct gk20a *g, size = (gr_scc_rm_rtv_cb_size_div_256b_default_f() + gr_scc_rm_rtv_cb_size_div_256b_db_adder_f()); - gr_tu104_commit_rtv_circular_buffer(g, gr_ctx, addr, size, patch); + gr_tu104_commit_rtv_circular_buffer(g, gr_ctx, addr, size, + gfxpaddsize, patch); if (patch) { gr_gk20a_ctx_patch_write_end(g, gr_ctx, false); @@ -248,6 +253,55 @@ int gr_tu104_commit_global_ctx_buffers(struct gk20a *g, return 0; } +int gr_tu104_alloc_gfxp_rtv_cb(struct gk20a *g, + struct nvgpu_gr_ctx *gr_ctx, struct vm_gk20a *vm) +{ + int err; + u32 rtv_cb_size; + + nvgpu_log_fn(g, " "); + + rtv_cb_size = + (gr_scc_rm_rtv_cb_size_div_256b_default_f() + + gr_scc_rm_rtv_cb_size_div_256b_db_adder_f() + + gr_scc_rm_rtv_cb_size_div_256b_gfxp_adder_f()) * + gr_scc_rm_rtv_cb_size_div_256b_byte_granularity_v(); + + err = gr_gp10b_alloc_buffer(vm, + rtv_cb_size, + &gr_ctx->gfxp_rtvcb_ctxsw_buffer); + + return err; +} + +void gr_tu104_commit_gfxp_rtv_cb(struct gk20a *g, + struct nvgpu_gr_ctx *gr_ctx, bool patch) +{ + u64 addr; + u32 rtv_cb_size; + u32 gfxp_addr_size; + + nvgpu_log_fn(g, " "); + + rtv_cb_size = + 
(gr_scc_rm_rtv_cb_size_div_256b_default_f() + + gr_scc_rm_rtv_cb_size_div_256b_db_adder_f() + + gr_scc_rm_rtv_cb_size_div_256b_gfxp_adder_f()); + gfxp_addr_size = gr_scc_rm_rtv_cb_size_div_256b_gfxp_adder_f(); + + /* GFXP RTV circular buffer */ + addr = (u64)(u64_lo32(gr_ctx->gfxp_rtvcb_ctxsw_buffer.gpu_va) >> + gr_scc_rm_rtv_cb_base_addr_39_8_align_bits_f()) | + (u64)(u64_hi32(gr_ctx->gfxp_rtvcb_ctxsw_buffer.gpu_va) << + (32U - gr_scc_rm_rtv_cb_base_addr_39_8_align_bits_f())); + + + gr_tu104_commit_rtv_circular_buffer(g, gr_ctx, addr, + rtv_cb_size, + gfxp_addr_size, + patch); +} + void gr_tu104_bundle_cb_defaults(struct gk20a *g) { struct gr_gk20a *gr = &g->gr; @@ -270,6 +324,10 @@ void gr_tu104_cb_size_default(struct gk20a *g) } gr->alpha_cb_default_size = gr_gpc0_ppc0_cbm_alpha_cb_size_v_default_v(); + gr->attrib_cb_gfxp_default_size = + gr_gpc0_ppc0_cbm_beta_cb_size_v_gfxp_v(); + gr->attrib_cb_gfxp_size = + gr_gpc0_ppc0_cbm_beta_cb_size_v_gfxp_v(); } int gr_tu104_get_preemption_mode_flags(struct gk20a *g, @@ -290,6 +348,18 @@ int gr_tu104_get_preemption_mode_flags(struct gk20a *g, return 0; } +void gr_tu104_free_gr_ctx(struct gk20a *g, + struct vm_gk20a *vm, struct nvgpu_gr_ctx *gr_ctx) +{ + nvgpu_log_fn(g, " "); + + if (gr_ctx != NULL) { + nvgpu_dma_unmap_free(vm, &gr_ctx->gfxp_rtvcb_ctxsw_buffer); + } + + gr_gk20a_free_gr_ctx(g, vm, gr_ctx); +} + void gr_tu104_enable_gpc_exceptions(struct gk20a *g) { struct gr_gk20a *gr = &g->gr; diff --git a/drivers/gpu/nvgpu/tu104/gr_tu104.h b/drivers/gpu/nvgpu/tu104/gr_tu104.h index 39c0f0db2..5ac378b56 100644 --- a/drivers/gpu/nvgpu/tu104/gr_tu104.h +++ b/drivers/gpu/nvgpu/tu104/gr_tu104.h @@ -27,6 +27,7 @@ struct gk20a; struct nvgpu_preemption_modes_rec; +struct nvgpu_gr_ctx; enum { TURING_CHANNEL_GPFIFO_A = 0xC46F, @@ -55,10 +56,6 @@ enum { #define NVC597_SET_SM_DISP_CTRL 0x10c8U #define NVC597_SET_SHADER_CUT_COLLECTOR 0x10d0U -/* TODO: merge these into global context buffer list in gr_gk20a.h */ -#define 
RTV_CIRCULAR_BUFFER 8 -#define RTV_CIRCULAR_BUFFER_VA 5 - bool gr_tu104_is_valid_class(struct gk20a *g, u32 class_num); bool gr_tu104_is_valid_gfx_class(struct gk20a *g, u32 class_num); bool gr_tu104_is_valid_compute_class(struct gk20a *g, u32 class_num); @@ -88,6 +85,15 @@ int gr_tu104_get_offset_in_gpccs_segment(struct gk20a *g, int gr_tu104_handle_sw_method(struct gk20a *g, u32 addr, u32 class_num, u32 offset, u32 data); +int gr_tu104_alloc_gfxp_rtv_cb(struct gk20a *g, + struct nvgpu_gr_ctx *gr_ctx, struct vm_gk20a *vm); +void gr_tu104_commit_gfxp_rtv_cb(struct gk20a *g, + struct nvgpu_gr_ctx *gr_ctx, bool patch); + +void gr_tu104_free_gr_ctx(struct gk20a *g, + struct vm_gk20a *vm, + struct nvgpu_gr_ctx *gr_ctx); + void gr_tu104_init_sm_dsm_reg_info(void); void gr_tu104_get_sm_dsm_perf_ctrl_regs(struct gk20a *g, u32 *num_sm_dsm_perf_ctrl_regs, u32 **sm_dsm_perf_ctrl_regs, diff --git a/drivers/gpu/nvgpu/tu104/hal_tu104.c b/drivers/gpu/nvgpu/tu104/hal_tu104.c index 95c403032..a06a92c6d 100644 --- a/drivers/gpu/nvgpu/tu104/hal_tu104.c +++ b/drivers/gpu/nvgpu/tu104/hal_tu104.c @@ -387,9 +387,9 @@ static const struct gpu_ops tu104_ops = { .pagepool_default_size = gr_gv11b_pagepool_default_size, .init_ctx_state = gr_gp10b_init_ctx_state, .alloc_gr_ctx = gr_gp10b_alloc_gr_ctx, - .free_gr_ctx = gr_gk20a_free_gr_ctx, + .free_gr_ctx = gr_tu104_free_gr_ctx, .update_ctxsw_preemption_mode = - gr_gp10b_update_ctxsw_preemption_mode, + gr_gv11b_update_ctxsw_preemption_mode, .dump_gr_regs = gr_gv11b_dump_gr_status_regs, .update_pc_sampling = gr_gm20b_update_pc_sampling, .get_fbp_en_mask = gr_gm20b_get_fbp_en_mask, @@ -458,11 +458,13 @@ static const struct gpu_ops tu104_ops = { .set_czf_bypass = NULL, .pre_process_sm_exception = gr_gv11b_pre_process_sm_exception, .set_preemption_buffer_va = gr_gv11b_set_preemption_buffer_va, - .init_preemption_state = NULL, + .init_preemption_state = gr_gv11b_init_preemption_state, .update_boosted_ctx = gr_gp10b_update_boosted_ctx, 
.set_bes_crop_debug3 = gr_gp10b_set_bes_crop_debug3, .set_bes_crop_debug4 = gr_gp10b_set_bes_crop_debug4, .init_ecc = tu104_ecc_init, + .alloc_gfxp_rtv_cb = gr_tu104_alloc_gfxp_rtv_cb, + .commit_gfxp_rtv_cb = gr_tu104_commit_gfxp_rtv_cb, .set_ctxsw_preemption_mode = gr_gp10b_set_ctxsw_preemption_mode, .is_etpc_addr = gv11b_gr_pri_is_etpc_addr, .egpc_etpc_priv_addr_table = gv11b_gr_egpc_etpc_priv_addr_table, @@ -485,6 +487,12 @@ static const struct gpu_ops tu104_ops = { .handle_tpc_sm_ecc_exception = gr_gv11b_handle_tpc_sm_ecc_exception, .decode_egpc_addr = gv11b_gr_decode_egpc_addr, + .init_ctxsw_hdr_data = gr_gp10b_init_ctxsw_hdr_data, + .init_gfxp_wfi_timeout_count = + gr_gv11b_init_gfxp_wfi_timeout_count, + .get_max_gfxp_wfi_timeout_count = + gr_gv11b_get_max_gfxp_wfi_timeout_count, + .dump_ctxsw_stats = gr_gp10b_dump_ctxsw_stats, .fecs_host_int_enable = gr_gv11b_fecs_host_int_enable, .handle_ssync_hww = gr_gv11b_handle_ssync_hww, .handle_notify_pending = gk20a_gr_handle_notify_pending,