From 7e68e5c83d037fbb0e71eb9156e27c878cf2f922 Mon Sep 17 00:00:00 2001
From: Thomas Fleury
Date: Tue, 20 Nov 2018 16:34:21 -0800
Subject: [PATCH] gpu: nvgpu: userd slab allocator

We had to force allocation of physically contiguous memory for USERD
in the nvlink case, as a channel's USERD address is computed as an
offset from the fifo->userd address, and nvlink bypasses the SMMU.
With 4096 channels, it can become difficult to allocate 2MB of
physically contiguous sysmem for USERD on a busy system.

PBDMA does not require any packing or contiguous USERD allocation,
as each channel has a direct pointer to that channel's 512B USERD
region. When BAR1 is supported, we only need the GPU VAs to be
contiguous in order to set up the BAR1 inst block.

- Add a slab allocator for USERD.
- Slabs are allocated in SYSMEM, using PAGE_SIZE as the slab size.
- Contiguous channels share the same page (16 channels per slab).
- ch->userd_mem points to the related nvgpu_mem descriptor.
- ch->userd_offset is the offset from the beginning of the slab.
- Pre-allocate GPU VAs for the whole USERD region in BAR1.
- Add the g->ops.mm.bar1_map() method.
- gk20a_mm_bar1_map() uses a fixed mapping in the BAR1 region.
- vgpu_mm_bar1_map() passes the offset in TEGRA_VGPU_CMD_MAP_BAR1.
- TEGRA_VGPU_CMD_MAP_BAR1 is called for each slab.

Bug 2422486
Bug 200474793

Change-Id: I202699fe55a454c1fc6d969e7b6196a46256d704
Signed-off-by: Thomas Fleury
Reviewed-on: https://git-master.nvidia.com/r/1959032
Reviewed-by: mobile promotions
Tested-by: mobile promotions
---
 drivers/gpu/nvgpu/common/fifo/channel.c       |  23 ++-
 drivers/gpu/nvgpu/gk20a/fifo_gk20a.c          | 171 ++++++++++++------
 drivers/gpu/nvgpu/gk20a/fifo_gk20a.h          |  10 +-
 drivers/gpu/nvgpu/gk20a/mm_gk20a.c            |  12 ++
 drivers/gpu/nvgpu/gk20a/mm_gk20a.h            |   1 +
 drivers/gpu/nvgpu/gm20b/hal_gm20b.c           |   1 +
 drivers/gpu/nvgpu/gp106/hal_gp106.c           |   1 +
 drivers/gpu/nvgpu/gp10b/fifo_gp10b.c          |   2 +-
 drivers/gpu/nvgpu/gp10b/hal_gp10b.c           |   1 +
 drivers/gpu/nvgpu/gv100/hal_gv100.c           |   1 +
 drivers/gpu/nvgpu/gv11b/fifo_gv11b.c          |  26 +--
 drivers/gpu/nvgpu/gv11b/hal_gv11b.c           |   1 +
 drivers/gpu/nvgpu/include/nvgpu/channel.h     |  14 +-
 drivers/gpu/nvgpu/include/nvgpu/gk20a.h       |   1 +
 .../gpu/nvgpu/include/nvgpu/vgpu/tegra_vgpu.h |   2 +-
 drivers/gpu/nvgpu/include/nvgpu/vgpu/vgpu.h   |   2 +-
 drivers/gpu/nvgpu/os/linux/vgpu/vgpu_linux.c  |   5 +-
 drivers/gpu/nvgpu/tu104/hal_tu104.c           |   1 +
 drivers/gpu/nvgpu/vgpu/fifo_vgpu.c            | 108 +++++------
 drivers/gpu/nvgpu/vgpu/gp10b/vgpu_hal_gp10b.c |   1 +
 drivers/gpu/nvgpu/vgpu/gv11b/vgpu_hal_gv11b.c |   1 +
 drivers/gpu/nvgpu/vgpu/mm_vgpu.c              |  26 ++-
 22 files changed, 264 insertions(+), 147 deletions(-)

diff --git a/drivers/gpu/nvgpu/common/fifo/channel.c b/drivers/gpu/nvgpu/common/fifo/channel.c
index 76e6c43cc..0c99d6cf9 100644
--- a/drivers/gpu/nvgpu/common/fifo/channel.c
+++ b/drivers/gpu/nvgpu/common/fifo/channel.c
@@ -419,8 +419,7 @@ static void gk20a_free_channel(struct channel_gk20a *ch, bool force)
 
 	if (ch->usermode_submit_enabled) {
 		gk20a_channel_free_usermode_buffers(ch);
-		ch->userd_iova = nvgpu_mem_get_addr(g, &f->userd) +
-			U64(ch->chid) * U64(f->userd_entry_size);
+		(void) gk20a_fifo_init_userd(g, ch);
 		ch->usermode_submit_enabled = false;
 	}
 
@@ -709,12 +708,14 @@ struct channel_gk20a *gk20a_open_new_channel(struct gk20a *g,
 	ch->pid = tid;
 	ch->tgid = pid;	/* process granularity for FECS traces */
 
+	if (gk20a_fifo_init_userd(g, ch) != 0) {
+		nvgpu_err(g, "userd init failed");
+		goto clean_up;
+	}
+
 	if (g->ops.fifo.alloc_inst(g, ch) != 0) {
-		ch->g = NULL;
-		free_channel(f, ch);
-		nvgpu_err(g,
-			"failed to open gk20a channel, out of inst mem");
-		return NULL;
+		nvgpu_err(g, "inst allocation failed");
+		goto clean_up;
 	}
 
 	/* now the channel is in a limbo out of the free list but not marked as
@@ -760,6 +761,11 @@ struct channel_gk20a *gk20a_open_new_channel(struct gk20a *g,
 	nvgpu_smp_wmb();
 
 	return ch;
+
+clean_up:
+	ch->g = NULL;
+	free_channel(f, ch);
+	return NULL;
 }
 
 /* allocate private cmd buffer.
@@ -1313,8 +1319,7 @@ clean_up_unmap:
 	nvgpu_dma_unmap_free(ch_vm, &c->gpfifo.mem);
 	if (c->usermode_submit_enabled) {
 		gk20a_channel_free_usermode_buffers(c);
-		c->userd_iova = nvgpu_mem_get_addr(g, &g->fifo.userd) +
-			U64(c->chid) * U64(g->fifo.userd_entry_size);
+		(void) gk20a_fifo_init_userd(g, c);
 		c->usermode_submit_enabled = false;
 	}
 clean_up:
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
index 412f15b30..2e71821b6 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
@@ -47,6 +47,7 @@
 #include
 #include
 #include
+#include <nvgpu/vm_area.h>
 
 #include "mm_gk20a.h"
 
@@ -599,11 +600,9 @@ static void gk20a_remove_fifo_support(struct fifo_gk20a *f)
 	nvgpu_vfree(g, f->channel);
 	nvgpu_vfree(g, f->tsg);
-	if (g->ops.mm.is_bar1_supported(g)) {
-		nvgpu_dma_unmap_free(g->mm.bar1.vm, &f->userd);
-	} else {
-		nvgpu_dma_free(g, &f->userd);
-	}
+	gk20a_fifo_free_userd_slabs(g);
+	(void) nvgpu_vm_area_free(g->mm.bar1.vm, f->userd_gpu_va);
+	f->userd_gpu_va = 0ULL;
 
 	gk20a_fifo_delete_runlist(f);
 
@@ -940,12 +939,93 @@ clean_up:
 	return err;
 }
 
+int gk20a_fifo_init_userd_slabs(struct gk20a *g)
+{
+	struct fifo_gk20a *f = &g->fifo;
+	int err;
+
+	err = nvgpu_mutex_init(&f->userd_mutex);
+	if (err != 0) {
+		nvgpu_err(g, "failed to init userd_mutex");
+		return err;
+	}
+
+	f->num_channels_per_slab = PAGE_SIZE / f->userd_entry_size;
+	f->num_userd_slabs =
+		DIV_ROUND_UP(f->num_channels, f->num_channels_per_slab);
+
+	f->userd_slabs = nvgpu_kcalloc(g, f->num_userd_slabs,
+			sizeof(struct nvgpu_mem));
+	if (f->userd_slabs == NULL) {
+		nvgpu_err(g, "could not allocate userd slabs");
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+int gk20a_fifo_init_userd(struct gk20a *g, struct channel_gk20a *c)
+{
+	struct fifo_gk20a *f = &g->fifo;
+	struct nvgpu_mem *mem;
+	u32 slab = c->chid / f->num_channels_per_slab;
+	int err = 0;
+
+	if (slab >= f->num_userd_slabs) {
+		nvgpu_err(g, "chid %u, slab %u out of range (num_slabs=%u)",
+			c->chid, slab, f->num_userd_slabs);
+		return -EINVAL;
+	}
+
+	mem = &g->fifo.userd_slabs[slab];
+
+	nvgpu_mutex_acquire(&f->userd_mutex);
+	if (!nvgpu_mem_is_valid(mem)) {
+		err = nvgpu_dma_alloc_sys(g, PAGE_SIZE, mem);
+		if (err != 0) {
+			nvgpu_err(g, "userd allocation failed, err=%d", err);
+			goto done;
+		}
+
+		if (g->ops.mm.is_bar1_supported(g)) {
+			mem->gpu_va = g->ops.mm.bar1_map(g, mem,
+					slab * PAGE_SIZE);
+		}
+	}
+	c->userd_mem = mem;
+	c->userd_offset = (c->chid % f->num_channels_per_slab) *
+			f->userd_entry_size;
+	c->userd_iova = gk20a_channel_userd_addr(c);
+
+	nvgpu_log(g, gpu_dbg_info,
+		"chid=%u slab=%u mem=%p offset=%u addr=%llx gpu_va=%llx",
+		c->chid, slab, mem, c->userd_offset,
+		gk20a_channel_userd_addr(c),
+		gk20a_channel_userd_gpu_va(c));
+
+done:
+	nvgpu_mutex_release(&f->userd_mutex);
+	return err;
+}
+
+void gk20a_fifo_free_userd_slabs(struct gk20a *g)
+{
+	struct fifo_gk20a *f = &g->fifo;
+	u32 slab;
+
+	for (slab = 0; slab < f->num_userd_slabs; slab++) {
+		nvgpu_dma_free(g, &f->userd_slabs[slab]);
+	}
+	nvgpu_kfree(g, f->userd_slabs);
+	f->userd_slabs = NULL;
+}
+
 int gk20a_init_fifo_setup_sw(struct gk20a *g)
 {
 	struct fifo_gk20a *f = &g->fifo;
-	unsigned int chid;
-	u64 userd_base;
 	int err = 0;
+	u32 size;
+	u32 num_pages;
 
 	nvgpu_log_fn(g, " ");
 
@@ -960,34 +1040,25 @@ int gk20a_init_fifo_setup_sw(struct gk20a *g)
 		return err;
 	}
 
-	if (g->ops.mm.is_bar1_supported(g)) {
-		err = nvgpu_dma_alloc_map_sys(g->mm.bar1.vm,
-				(size_t)f->userd_entry_size *
-				(size_t)f->num_channels,
-				&f->userd);
-	} else {
-		err = nvgpu_dma_alloc_flags_sys(g,
-				NVGPU_DMA_PHYSICALLY_ADDRESSED,
-				(size_t)f->userd_entry_size *
-				(size_t)f->num_channels, &f->userd);
-	}
+	err = gk20a_fifo_init_userd_slabs(g);
 	if (err != 0) {
-		nvgpu_err(g, "userd memory allocation failed");
-		goto clean_up;
+		nvgpu_err(g, "userd slabs init failed, err=%d", err);
+		return err;
 	}
 
-	nvgpu_log(g, gpu_dbg_map, "userd gpu va = 0x%llx", f->userd.gpu_va);
-	userd_base = nvgpu_mem_get_addr(g, &f->userd);
-	for (chid = 0; chid < f->num_channels; chid++) {
-		f->channel[chid].userd_iova = userd_base +
-			U64(chid) * U64(f->userd_entry_size);
-		f->channel[chid].userd_gpu_va =
-			f->userd.gpu_va + U64(chid) * U64(f->userd_entry_size);
+	size = f->num_channels * f->userd_entry_size;
+	num_pages = DIV_ROUND_UP(size, PAGE_SIZE);
+	err = nvgpu_vm_area_alloc(g->mm.bar1.vm,
+			num_pages, PAGE_SIZE, &f->userd_gpu_va, 0);
+	if (err != 0) {
+		nvgpu_err(g, "userd gpu va allocation failed, err=%d", err);
+		goto clean_slabs;
 	}
 
 	err = nvgpu_channel_worker_init(g);
 	if (err != 0) {
-		goto clean_up;
+		nvgpu_err(g, "worker init failed, err=%d", err);
+		goto clean_vm_area;
 	}
 
 	f->sw_ready = true;
 
 	nvgpu_log_fn(g, "done");
 	return 0;
 
-clean_up:
-	nvgpu_log_fn(g, "fail");
-	if (nvgpu_mem_is_valid(&f->userd)) {
-		if (g->ops.mm.is_bar1_supported(g)) {
-			nvgpu_dma_unmap_free(g->mm.bar1.vm, &f->userd);
-		} else {
-			nvgpu_dma_free(g, &f->userd);
-		}
-	}
+clean_vm_area:
+	(void) nvgpu_vm_area_free(g->mm.bar1.vm, f->userd_gpu_va);
+	f->userd_gpu_va = 0ULL;
+clean_slabs:
+	gk20a_fifo_free_userd_slabs(g);
 
 	return err;
 }
 
@@ -1026,9 +1093,9 @@ int gk20a_init_fifo_setup_hw(struct gk20a *g)
 	nvgpu_log_fn(g, " ");
 
 	/* set the base for the userd region now */
-	shifted_addr = f->userd.gpu_va >> 12;
+	shifted_addr = f->userd_gpu_va >> 12;
 	if ((shifted_addr >> 32) != 0U) {
-		nvgpu_err(g, "GPU VA > 32 bits %016llx\n", f->userd.gpu_va);
+		nvgpu_err(g, "GPU VA > 32 bits %016llx\n", f->userd_gpu_va);
 		return -EFAULT;
 	}
 	gk20a_writel(g, fifo_bar1_base_r(),
@@ -4281,7 +4348,7 @@ static int gk20a_fifo_commit_userd(struct channel_gk20a *c)
 
 	nvgpu_mem_wr32(g, &c->inst_block,
 		ram_in_ramfc_w() + ram_fc_userd_w(),
-		nvgpu_aperture_mask(g, &g->fifo.userd,
+		nvgpu_aperture_mask(g, c->userd_mem,
 			pbdma_userd_target_sys_mem_ncoh_f(),
 			pbdma_userd_target_sys_mem_coh_f(),
 			pbdma_userd_target_vid_mem_f()) |
@@ -4380,20 +4447,11 @@ void gk20a_fifo_setup_ramfc_for_privileged_channel(struct channel_gk20a *c)
 int gk20a_fifo_setup_userd(struct channel_gk20a *c)
 {
 	struct gk20a *g = c->g;
-	struct nvgpu_mem *mem;
-	u32 offset;
+	struct nvgpu_mem *mem = c->userd_mem;
+	u32 offset = c->userd_offset / U32(sizeof(u32));
 
 	nvgpu_log_fn(g, " ");
 
-	if (nvgpu_mem_is_valid(&c->usermode_userd)) {
-		mem = &c->usermode_userd;
-		offset = 0;
-	} else {
-		mem = &g->fifo.userd;
-		offset = U32(c->chid) * g->fifo.userd_entry_size /
-				U32(sizeof(u32));
-	}
-
 	nvgpu_mem_wr32(g, mem, offset + ram_userd_put_w(), 0);
 	nvgpu_mem_wr32(g, mem, offset + ram_userd_get_w(), 0);
 	nvgpu_mem_wr32(g, mem, offset + ram_userd_ref_w(), 0);
@@ -4432,7 +4490,8 @@ void gk20a_fifo_free_inst(struct gk20a *g, struct channel_gk20a *ch)
 
 u32 gk20a_fifo_userd_gp_get(struct gk20a *g,
 		struct channel_gk20a *c)
 {
-	u64 addr = c->userd_gpu_va + sizeof(u32) * ram_userd_gp_get_w();
+	u64 userd_gpu_va = gk20a_channel_userd_gpu_va(c);
+	u64 addr = userd_gpu_va + sizeof(u32) * ram_userd_gp_get_w();
 
 	BUG_ON(u64_hi32(addr) != 0U);
 
@@ -4441,8 +4500,9 @@ u32 gk20a_fifo_userd_gp_get(struct gk20a *g, struct channel_gk20a *c)
 
 u64 gk20a_fifo_userd_pb_get(struct gk20a *g, struct channel_gk20a *c)
 {
-	u64 lo_addr = c->userd_gpu_va + sizeof(u32) * ram_userd_get_w();
-	u64 hi_addr = c->userd_gpu_va + sizeof(u32) * ram_userd_get_hi_w();
+	u64 userd_gpu_va = gk20a_channel_userd_gpu_va(c);
+	u64 lo_addr = userd_gpu_va + sizeof(u32) * ram_userd_get_w();
+	u64 hi_addr = userd_gpu_va + sizeof(u32) * ram_userd_get_hi_w();
 	u32 lo, hi;
 
 	BUG_ON((u64_hi32(lo_addr) != 0U) || (u64_hi32(hi_addr) != 0U));
@@ -4454,7 +4514,8 @@ u64 gk20a_fifo_userd_pb_get(struct gk20a *g, struct channel_gk20a *c)
 
 void gk20a_fifo_userd_gp_put(struct gk20a *g, struct channel_gk20a *c)
 {
-	u64 addr = c->userd_gpu_va + sizeof(u32) * ram_userd_gp_put_w();
+	u64 userd_gpu_va = gk20a_channel_userd_gpu_va(c);
+	u64 addr = userd_gpu_va + sizeof(u32) * ram_userd_gp_put_w();
 
 	BUG_ON(u64_hi32(addr) != 0U);
 	gk20a_bar1_writel(g, (u32)addr, c->gpfifo.put);
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
index 57d487b88..e77055dea 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
@@ -174,8 +174,12 @@ struct fifo_gk20a {
 		struct nvgpu_mutex lock;
 	} profile;
 #endif
-	struct nvgpu_mem userd;
+	struct nvgpu_mutex userd_mutex;
+	struct nvgpu_mem *userd_slabs;
+	u32 num_userd_slabs;
+	u32 num_channels_per_slab;
 	u32 userd_entry_size;
+	u64 userd_gpu_va;
 
 	unsigned int used_channels;
 	struct channel_gk20a *channel;
@@ -490,4 +494,8 @@ void gk20a_fifo_add_sema_cmd(struct gk20a *g,
 	struct nvgpu_semaphore *s, u64 sema_va,
 	struct priv_cmd_entry *cmd, u32 off, bool acquire, bool wfi);
 
+int gk20a_fifo_init_userd_slabs(struct gk20a *g);
+void gk20a_fifo_free_userd_slabs(struct gk20a *g);
+int gk20a_fifo_init_userd(struct gk20a *g, struct channel_gk20a *c);
+
 #endif /* FIFO_GK20A_H */
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
index 8e7c17edd..633c36428 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
@@ -667,3 +667,15 @@ const struct gk20a_mmu_level *gk20a_mm_get_mmu_levels(struct gk20a *g,
 	return (big_page_size == SZ_64K) ?
 		gk20a_mm_levels_64k : gk20a_mm_levels_128k;
 }
+
+u64 gk20a_mm_bar1_map(struct gk20a *g, struct nvgpu_mem *mem, u32 offset)
+{
+	struct fifo_gk20a *f = &g->fifo;
+	u64 gpu_va = f->userd_gpu_va + offset;
+
+	return nvgpu_gmmu_map_fixed(g->mm.bar1.vm, mem, gpu_va,
+			PAGE_SIZE, 0,
+			gk20a_mem_flag_none, false,
+			mem->aperture);
+}
+
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
index e4f4312bf..d078b481a 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
@@ -152,4 +152,5 @@ u32 gk20a_get_pde_pgsz(struct gk20a *g, const struct gk20a_mmu_level *l,
 	struct nvgpu_gmmu_pd *pd, u32 pd_idx);
 u32 gk20a_get_pte_pgsz(struct gk20a *g, const struct gk20a_mmu_level *l,
 	struct nvgpu_gmmu_pd *pd, u32 pd_idx);
+u64 gk20a_mm_bar1_map(struct gk20a *g, struct nvgpu_mem *mem, u32 offset);
 #endif /* MM_GK20A_H */
diff --git a/drivers/gpu/nvgpu/gm20b/hal_gm20b.c b/drivers/gpu/nvgpu/gm20b/hal_gm20b.c
index f3b0fc9f1..a99cfa72c 100644
--- a/drivers/gpu/nvgpu/gm20b/hal_gm20b.c
+++ b/drivers/gpu/nvgpu/gm20b/hal_gm20b.c
@@ -579,6 +579,7 @@ static const struct gpu_ops gm20b_ops = {
 		.mmu_fault_pending = gk20a_fifo_mmu_fault_pending,
 		.get_kind_invalid = gm20b_get_kind_invalid,
 		.get_kind_pitch = gm20b_get_kind_pitch,
+		.bar1_map = gk20a_mm_bar1_map,
 	},
 	.therm = {
 		.init_therm_setup_hw = gm20b_init_therm_setup_hw,
diff --git a/drivers/gpu/nvgpu/gp106/hal_gp106.c b/drivers/gpu/nvgpu/gp106/hal_gp106.c
index 2103582a6..75920a2fb 100644
--- a/drivers/gpu/nvgpu/gp106/hal_gp106.c
+++ b/drivers/gpu/nvgpu/gp106/hal_gp106.c
@@ -687,6 +687,7 @@ static const struct gpu_ops gp106_ops = {
 		.remove_bar2_vm = gp10b_remove_bar2_vm,
 		.get_kind_invalid = gm20b_get_kind_invalid,
 		.get_kind_pitch = gm20b_get_kind_pitch,
+		.bar1_map = gk20a_mm_bar1_map,
 	},
 	.pramin = {
 		.data032_r = pram_data032_r,
diff --git a/drivers/gpu/nvgpu/gp10b/fifo_gp10b.c b/drivers/gpu/nvgpu/gp10b/fifo_gp10b.c
index 74129caf5..48d483eda 100644
--- a/drivers/gpu/nvgpu/gp10b/fifo_gp10b.c
+++ b/drivers/gpu/nvgpu/gp10b/fifo_gp10b.c
@@ -59,7 +59,7 @@ int channel_gp10b_commit_userd(struct channel_gk20a *c)
 
 	nvgpu_mem_wr32(g, &c->inst_block,
 		ram_in_ramfc_w() + ram_fc_userd_w(),
-		nvgpu_aperture_mask(g, &g->fifo.userd,
+		nvgpu_aperture_mask(g, c->userd_mem,
 			pbdma_userd_target_sys_mem_ncoh_f(),
 			pbdma_userd_target_sys_mem_coh_f(),
 			pbdma_userd_target_vid_mem_f()) |
diff --git a/drivers/gpu/nvgpu/gp10b/hal_gp10b.c b/drivers/gpu/nvgpu/gp10b/hal_gp10b.c
index f820c307f..4c3f0c029 100644
--- a/drivers/gpu/nvgpu/gp10b/hal_gp10b.c
+++ b/drivers/gpu/nvgpu/gp10b/hal_gp10b.c
@@ -652,6 +652,7 @@ static const struct gpu_ops gp10b_ops = {
 		.remove_bar2_vm = gp10b_remove_bar2_vm,
 		.get_kind_invalid = gm20b_get_kind_invalid,
 		.get_kind_pitch = gm20b_get_kind_pitch,
+		.bar1_map = gk20a_mm_bar1_map,
 	},
 	.pramin = {
 		.data032_r = pram_data032_r,
diff --git a/drivers/gpu/nvgpu/gv100/hal_gv100.c b/drivers/gpu/nvgpu/gv100/hal_gv100.c
index 6e07aebbb..b2b854f54 100644
--- a/drivers/gpu/nvgpu/gv100/hal_gv100.c
+++ b/drivers/gpu/nvgpu/gv100/hal_gv100.c
@@ -822,6 +822,7 @@ static const struct gpu_ops gv100_ops = {
 		.fault_info_mem_destroy = gv11b_mm_fault_info_mem_destroy,
 		.mmu_fault_disable_hw = gv11b_mm_mmu_fault_disable_hw,
 		.get_flush_retries = gv100_mm_get_flush_retries,
+		.bar1_map = NULL,
 	},
 	.pramin = {
 		.data032_r = pram_data032_r,
diff --git a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
index 599cd331f..3da7feb74 100644
--- a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
@@ -99,7 +99,7 @@ void gv11b_get_ch_runlist_entry(struct channel_gk20a *c, u32 *runlist)
 		ram_rl_entry_chan_runqueue_selector_f(
 			c->runqueue_sel) |
 		ram_rl_entry_chan_userd_target_f(
-			nvgpu_aperture_mask(g, &g->fifo.userd,
+			nvgpu_aperture_mask(g, c->userd_mem,
 				ram_rl_entry_chan_userd_target_sys_mem_ncoh_v(),
 				ram_rl_entry_chan_userd_target_sys_mem_coh_v(),
 				ram_rl_entry_chan_userd_target_vid_mem_v())) |
@@ -245,30 +245,30 @@ void gv11b_ring_channel_doorbell(struct channel_gk20a *c)
 
 u32 gv11b_userd_gp_get(struct gk20a *g, struct channel_gk20a *c)
 {
-	struct nvgpu_mem *userd_mem = &g->fifo.userd;
-	u32 offset = c->chid * (g->fifo.userd_entry_size / sizeof(u32));
+	struct nvgpu_mem *mem = c->userd_mem;
+	u32 offset = c->userd_offset / U32(sizeof(u32));
 
-	return nvgpu_mem_rd32(g, userd_mem,
-			offset + ram_userd_gp_get_w());
+	return nvgpu_mem_rd32(g, mem, offset + ram_userd_gp_get_w());
 }
 
 u64 gv11b_userd_pb_get(struct gk20a *g, struct channel_gk20a *c)
 {
-	struct nvgpu_mem *userd_mem = &g->fifo.userd;
-	u32 offset = c->chid * (g->fifo.userd_entry_size / sizeof(u32));
-	u32 lo = nvgpu_mem_rd32(g, userd_mem, offset + ram_userd_get_w());
-	u32 hi = nvgpu_mem_rd32(g, userd_mem, offset + ram_userd_get_hi_w());
+	struct nvgpu_mem *mem = c->userd_mem;
+	u32 offset = c->userd_offset / U32(sizeof(u32));
+	u32 lo, hi;
+
+	lo = nvgpu_mem_rd32(g, mem, offset + ram_userd_get_w());
+	hi = nvgpu_mem_rd32(g, mem, offset + ram_userd_get_hi_w());
 
 	return ((u64)hi << 32) | lo;
 }
 
 void gv11b_userd_gp_put(struct gk20a *g, struct channel_gk20a *c)
 {
-	struct nvgpu_mem *userd_mem = &g->fifo.userd;
-	u32 offset = c->chid * (g->fifo.userd_entry_size / sizeof(u32));
+	struct nvgpu_mem *mem = c->userd_mem;
+	u32 offset = c->userd_offset / U32(sizeof(u32));
 
-	nvgpu_mem_wr32(g, userd_mem, offset + ram_userd_gp_put_w(),
-			c->gpfifo.put);
+	nvgpu_mem_wr32(g, mem, offset + ram_userd_gp_put_w(), c->gpfifo.put);
+
 	/* Commit everything to GPU. */
 	nvgpu_mb();
diff --git a/drivers/gpu/nvgpu/gv11b/hal_gv11b.c b/drivers/gpu/nvgpu/gv11b/hal_gv11b.c
index c0850acfe..f3a5547b5 100644
--- a/drivers/gpu/nvgpu/gv11b/hal_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/hal_gv11b.c
@@ -784,6 +784,7 @@ static const struct gpu_ops gv11b_ops = {
 		.remove_bar2_vm = gp10b_remove_bar2_vm,
 		.fault_info_mem_destroy = gv11b_mm_fault_info_mem_destroy,
 		.mmu_fault_disable_hw = gv11b_mm_mmu_fault_disable_hw,
+		.bar1_map = NULL,
 	},
 	.therm = {
 		.init_therm_setup_hw = gv11b_init_therm_setup_hw,
diff --git a/drivers/gpu/nvgpu/include/nvgpu/channel.h b/drivers/gpu/nvgpu/include/nvgpu/channel.h
index 9b1ddd2ec..5c7f32e55 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/channel.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/channel.h
@@ -264,7 +264,9 @@ struct channel_gk20a {
 	struct nvgpu_mem inst_block;
 
 	u64 userd_iova;
-	u64 userd_gpu_va;
+
+	struct nvgpu_mem *userd_mem;	/* kernel mode userd */
+	u32 userd_offset;		/* in bytes from start of userd_mem */
 
 	struct priv_cmd_queue priv_cmd_q;
 
@@ -470,4 +472,14 @@ static inline void trace_write_pushbuffers(struct channel_gk20a *c, u32 count)
 void gk20a_channel_set_timedout(struct channel_gk20a *ch);
 bool gk20a_channel_check_timedout(struct channel_gk20a *ch);
 
+static inline u64 gk20a_channel_userd_addr(struct channel_gk20a *c)
+{
+	return nvgpu_mem_get_addr(c->g, c->userd_mem) + c->userd_offset;
+}
+
+static inline u64 gk20a_channel_userd_gpu_va(struct channel_gk20a *c)
+{
+	struct nvgpu_mem *mem = c->userd_mem;
+	return (mem->gpu_va != 0ULL) ?
+		mem->gpu_va + c->userd_offset : 0ULL;
+}
 #endif
diff --git a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h
index 4b702885a..bed617dd4 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h
@@ -1094,6 +1094,7 @@ struct gpu_ops {
 		u32 (*get_kind_pitch)(void);
 		u32 (*get_flush_retries)(struct gk20a *g,
 				enum nvgpu_flush_op op);
+		u64 (*bar1_map)(struct gk20a *g, struct nvgpu_mem *mem, u32 offset);
 	} mm;
 	/*
 	 * This function is called to allocate secure memory (memory
diff --git a/drivers/gpu/nvgpu/include/nvgpu/vgpu/tegra_vgpu.h b/drivers/gpu/nvgpu/include/nvgpu/vgpu/tegra_vgpu.h
index 0131db272..3134268b3 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/vgpu/tegra_vgpu.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/vgpu/tegra_vgpu.h
@@ -170,7 +170,7 @@ struct tegra_vgpu_as_map_params {
 	u8 cacheable;
 	u8 clear_ctags;
 	u8 prot;
-	u32 ctag_offset;
+	u32 offset;
 };
 
 #define TEGRA_VGPU_MAP_CACHEABLE (1 << 0)
diff --git a/drivers/gpu/nvgpu/include/nvgpu/vgpu/vgpu.h b/drivers/gpu/nvgpu/include/nvgpu/vgpu/vgpu.h
index 15ab879e5..2eab0f14f 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/vgpu/vgpu.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/vgpu/vgpu.h
@@ -81,7 +81,7 @@ int vgpu_get_timestamps_zipper(struct gk20a *g,
 	struct nvgpu_cpu_time_correlation_sample *samples);
 int vgpu_init_hal(struct gk20a *g);
 int vgpu_get_constants(struct gk20a *g);
-u64 vgpu_bar1_map(struct gk20a *g, struct nvgpu_mem *mem);
+u64 vgpu_bar1_map(struct gk20a *g, struct nvgpu_mem *mem, u32 offset);
 int vgpu_gr_isr(struct gk20a *g, struct tegra_vgpu_gr_intr_info *info);
 int vgpu_gr_alloc_gr_ctx(struct gk20a *g,
 	struct nvgpu_gr_ctx *gr_ctx,
diff --git a/drivers/gpu/nvgpu/os/linux/vgpu/vgpu_linux.c b/drivers/gpu/nvgpu/os/linux/vgpu/vgpu_linux.c
index f1b4aa7db..114735bcd 100644
--- a/drivers/gpu/nvgpu/os/linux/vgpu/vgpu_linux.c
+++ b/drivers/gpu/nvgpu/os/linux/vgpu/vgpu_linux.c
@@ -507,10 +507,11 @@ int vgpu_remove(struct platform_device *pdev)
 
 bool vgpu_is_reduced_bar1(struct gk20a *g)
 {
-	struct fifo_gk20a *f = &g->fifo;
 	struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
+	struct fifo_gk20a *f = &g->fifo;
+	u32 size = f->num_channels * f->userd_entry_size;
 
-	return resource_size(l->bar1_mem) == (resource_size_t)f->userd.size;
+	return resource_size(l->bar1_mem) == size;
 }
 
 int vgpu_tegra_suspend(struct device *dev)
diff --git a/drivers/gpu/nvgpu/tu104/hal_tu104.c b/drivers/gpu/nvgpu/tu104/hal_tu104.c
index 7b8eb6a62..4268aad88 100644
--- a/drivers/gpu/nvgpu/tu104/hal_tu104.c
+++ b/drivers/gpu/nvgpu/tu104/hal_tu104.c
@@ -851,6 +851,7 @@ static const struct gpu_ops tu104_ops = {
 		.fault_info_mem_destroy = gv11b_mm_fault_info_mem_destroy,
 		.mmu_fault_disable_hw = gv11b_mm_mmu_fault_disable_hw,
 		.get_flush_retries = gv100_mm_get_flush_retries,
+		.bar1_map = NULL,
 	},
 	.pramin = {
 		.data032_r = pram_data032_r,
diff --git a/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c b/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c
index 5d8335cf3..ff0972686 100644
--- a/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c
+++ b/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c
@@ -36,6 +36,7 @@
 #include
 #include
 #include
+#include
 
 #include "fifo_vgpu.h"
 
@@ -294,29 +295,12 @@ static int vgpu_init_fifo_setup_sw(struct gk20a *g)
 
 	f->userd_entry_size = 1 << ram_userd_base_shift_v();
 
-	err = nvgpu_dma_alloc_sys(g, f->userd_entry_size * f->num_channels,
-			&f->userd);
-	if (err) {
-		nvgpu_err(g, "memory allocation failed");
-		goto clean_up;
+	err = gk20a_fifo_init_userd_slabs(g);
+	if (err != 0) {
+		nvgpu_err(g, "userd slab init failed, err=%d", err);
+		return err;
 	}
 
-	/* bar1 va */
-	if (g->ops.mm.is_bar1_supported(g)) {
-		f->userd.gpu_va = vgpu_bar1_map(g, &f->userd);
-		if (!f->userd.gpu_va) {
-			nvgpu_err(g, "gmmu mapping failed");
-			goto clean_up;
-		}
-		/* if reduced BAR1 range is specified, use offset of 0
-		 * (server returns offset assuming full BAR1 range)
-		 */
-		if (vgpu_is_reduced_bar1(g))
-			f->userd.gpu_va = 0;
-	}
-
-	nvgpu_log(g, gpu_dbg_map_v, "userd bar1 va = 0x%llx", f->userd.gpu_va);
-
 	f->channel = nvgpu_vzalloc(g, f->num_channels * sizeof(*f->channel));
 	f->tsg = nvgpu_vzalloc(g, f->num_channels * sizeof(*f->tsg));
 	f->engine_info = nvgpu_kzalloc(g, f->max_engines *
@@ -338,12 +322,6 @@ static int vgpu_init_fifo_setup_sw(struct gk20a *g)
 	nvgpu_mutex_init(&f->free_chs_mutex);
 
 	for (chid = 0; chid < f->num_channels; chid++) {
-		f->channel[chid].userd_iova =
-			nvgpu_mem_get_addr(g, &f->userd) +
-			chid * f->userd_entry_size;
-		f->channel[chid].userd_gpu_va =
-			f->userd.gpu_va + chid * f->userd_entry_size;
-
 		gk20a_init_channel_support(g, chid);
 		gk20a_init_tsg_support(g, chid);
 	}
@@ -366,9 +344,7 @@ static int vgpu_init_fifo_setup_sw(struct gk20a *g)
 clean_up:
 	nvgpu_log_fn(g, "fail");
 	/* FIXME: unmap from bar1 */
-	nvgpu_dma_free(g, &f->userd);
-
-	(void) memset(&f->userd, 0, sizeof(f->userd));
+	gk20a_fifo_free_userd_slabs(g);
 
 	nvgpu_vfree(g, f->channel);
 	f->channel = NULL;
@@ -384,47 +360,59 @@ clean_up:
 
 int vgpu_init_fifo_setup_hw(struct gk20a *g)
 {
+	struct fifo_gk20a *f = &g->fifo;
+	u32 v, v1 = 0x33, v2 = 0x55;
+	struct nvgpu_mem *mem = &f->userd_slabs[0];
+	u32 bar1_vaddr;
+	volatile u32 *cpu_vaddr;
+	int err;
+
 	nvgpu_log_fn(g, " ");
 
+	/* allocate and map first userd slab for bar1 test. */
+	err = nvgpu_dma_alloc_sys(g, PAGE_SIZE, mem);
+	if (err != 0) {
+		nvgpu_err(g, "userd allocation failed, err=%d", err);
+		return err;
+	}
+	mem->gpu_va = g->ops.mm.bar1_map(g, mem, 0);
+	f->userd_gpu_va = mem->gpu_va;
+
 	/* test write, read through bar1 @ userd region before
 	 * turning on the snooping */
-	{
-		struct fifo_gk20a *f = &g->fifo;
-		u32 v, v1 = 0x33, v2 = 0x55;
-		u32 bar1_vaddr = f->userd.gpu_va;
-		volatile u32 *cpu_vaddr = f->userd.cpu_va;
+	cpu_vaddr = mem->cpu_va;
+	bar1_vaddr = mem->gpu_va;
 
-		nvgpu_log_info(g, "test bar1 @ vaddr 0x%x",
-			   bar1_vaddr);
+	nvgpu_log_info(g, "test bar1 @ vaddr 0x%x",
+		   bar1_vaddr);
 
-		v = gk20a_bar1_readl(g, bar1_vaddr);
+	v = gk20a_bar1_readl(g, bar1_vaddr);
 
-		*cpu_vaddr = v1;
-		nvgpu_mb();
+	*cpu_vaddr = v1;
+	nvgpu_mb();
 
-		if (v1 != gk20a_bar1_readl(g, bar1_vaddr)) {
-			nvgpu_err(g, "bar1 broken @ gk20a!");
-			return -EINVAL;
-		}
-
-		gk20a_bar1_writel(g, bar1_vaddr, v2);
-
-		if (v2 != gk20a_bar1_readl(g, bar1_vaddr)) {
-			nvgpu_err(g, "bar1 broken @ gk20a!");
-			return -EINVAL;
-		}
-
-		/* is it visible to the cpu? */
-		if (*cpu_vaddr != v2) {
-			nvgpu_err(g, "cpu didn't see bar1 write @ %p!",
-				cpu_vaddr);
-		}
-
-		/* put it back */
-		gk20a_bar1_writel(g, bar1_vaddr, v);
+	if (v1 != gk20a_bar1_readl(g, bar1_vaddr)) {
+		nvgpu_err(g, "bar1 broken @ gk20a!");
+		return -EINVAL;
 	}
 
+	gk20a_bar1_writel(g, bar1_vaddr, v2);
+
+	if (v2 != gk20a_bar1_readl(g, bar1_vaddr)) {
+		nvgpu_err(g, "bar1 broken @ gk20a!");
+		return -EINVAL;
+	}
+
+	/* is it visible to the cpu? */
+	if (*cpu_vaddr != v2) {
+		nvgpu_err(g, "cpu didn't see bar1 write @ %p!",
+			cpu_vaddr);
+	}
+
+	/* put it back */
+	gk20a_bar1_writel(g, bar1_vaddr, v);
+
 	nvgpu_log_fn(g, "done");
 	return 0;
diff --git a/drivers/gpu/nvgpu/vgpu/gp10b/vgpu_hal_gp10b.c b/drivers/gpu/nvgpu/vgpu/gp10b/vgpu_hal_gp10b.c
index efeb461fb..c20dd6ef4 100644
--- a/drivers/gpu/nvgpu/vgpu/gp10b/vgpu_hal_gp10b.c
+++ b/drivers/gpu/nvgpu/vgpu/gp10b/vgpu_hal_gp10b.c
@@ -481,6 +481,7 @@ static const struct gpu_ops vgpu_gp10b_ops = {
 		.remove_bar2_vm = gp10b_remove_bar2_vm,
 		.get_kind_invalid = gm20b_get_kind_invalid,
 		.get_kind_pitch = gm20b_get_kind_pitch,
+		.bar1_map = vgpu_bar1_map,
 	},
 	.pramin = {
 		.data032_r = NULL,
diff --git a/drivers/gpu/nvgpu/vgpu/gv11b/vgpu_hal_gv11b.c b/drivers/gpu/nvgpu/vgpu/gv11b/vgpu_hal_gv11b.c
index 64b6df67f..e659f2dd0 100644
--- a/drivers/gpu/nvgpu/vgpu/gv11b/vgpu_hal_gv11b.c
+++ b/drivers/gpu/nvgpu/vgpu/gv11b/vgpu_hal_gv11b.c
@@ -559,6 +559,7 @@ static const struct gpu_ops vgpu_gv11b_ops = {
 		.init_bar2_vm = gp10b_init_bar2_vm,
 		.remove_bar2_vm = gp10b_remove_bar2_vm,
 		.fault_info_mem_destroy = gv11b_mm_fault_info_mem_destroy,
+		.bar1_map = vgpu_bar1_map,
 	},
 	.therm = {
 		.init_therm_setup_hw = NULL,
diff --git a/drivers/gpu/nvgpu/vgpu/mm_vgpu.c b/drivers/gpu/nvgpu/vgpu/mm_vgpu.c
index 3aa433310..ad4e504e4 100644
--- a/drivers/gpu/nvgpu/vgpu/mm_vgpu.c
+++ b/drivers/gpu/nvgpu/vgpu/mm_vgpu.c
@@ -155,7 +155,7 @@ void vgpu_vm_remove(struct vm_gk20a *vm)
 	WARN_ON(err || msg.ret);
 }
 
-u64 vgpu_bar1_map(struct gk20a *g, struct nvgpu_mem *mem)
+u64 vgpu_bar1_map(struct gk20a *g, struct nvgpu_mem *mem, u32 offset)
 {
 	u64 addr = nvgpu_mem_get_addr(g, mem);
 	struct tegra_vgpu_cmd_msg msg;
@@ -167,12 +167,32 @@ u64 vgpu_bar1_map(struct gk20a *g, struct nvgpu_mem *mem)
 	p->addr = addr;
 	p->size = mem->size;
 	p->iova = 0;
+	p->offset = offset; /* offset from start of BAR1 */
 
 	err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg));
-	if (err || msg.ret)
+	if (err || msg.ret) {
 		addr = 0;
-	else
+	} else {
 		addr = p->gpu_va;
+		/* The server returns a gpu_va assuming the full BAR1
+		 * range. In a reduced BAR1 configuration, we only map
+		 * the portion of BAR1 reserved for this guest, so we
+		 * must use the offset from the start of that range
+		 * instead of the returned gpu_va.
+		 *
+		 *            offset
+		 *            <---->
+		 * Guest IPA       +========+
+		 *                 :   X    :
+		 * BAR1 PA    +----+========+-----------+
+		 *            <--------->
+		 *               gpu_va
+		 */
+		if (vgpu_is_reduced_bar1(g)) {
+			addr = offset;
+		}
+	}
+
 	return addr;
 }
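
Note: the slab arithmetic introduced by this patch is easy to check in
isolation. The following standalone C sketch is not part of the patch;
the 4 KiB slab size, the 512 B USERD entry size (1 <<
ram_userd_base_shift_v() on gk20a), and the userd_gpu_va base are
assumed values for illustration. It mirrors the index/offset
computation in gk20a_fifo_init_userd() and the fixed BAR1 VA
computation in gk20a_mm_bar1_map():

/* Standalone sketch of the USERD slab arithmetic (not part of the
 * patch). SLAB_SIZE stands in for PAGE_SIZE; a platform with larger
 * pages packs more channels per slab. */
#include <stdio.h>
#include <stdint.h>

#define SLAB_SIZE        4096u  /* assumed PAGE_SIZE */
#define USERD_ENTRY_SIZE 512u   /* assumed 1 << ram_userd_base_shift_v() */

int main(void)
{
	uint32_t channels_per_slab = SLAB_SIZE / USERD_ENTRY_SIZE;
	uint64_t userd_gpu_va = 0x100000000ULL; /* made-up BAR1 VA area base */
	uint32_t chids[] = { 0, 7, 8, 4095 };
	size_t i;

	for (i = 0; i < sizeof(chids) / sizeof(chids[0]); i++) {
		uint32_t chid = chids[i];
		/* slab index and byte offset, as in gk20a_fifo_init_userd() */
		uint32_t slab = chid / channels_per_slab;
		uint32_t offset = (chid % channels_per_slab) * USERD_ENTRY_SIZE;
		/* fixed BAR1 VA of the slab, as in gk20a_mm_bar1_map() */
		uint64_t slab_gpu_va = userd_gpu_va + (uint64_t)slab * SLAB_SIZE;

		printf("chid %4u -> slab %3u offset 0x%03x gpu_va 0x%llx\n",
			chid, slab, offset,
			(unsigned long long)(slab_gpu_va + offset));
	}
	return 0;
}

With these assumed sizes, eight channels share each slab; a larger
PAGE_SIZE raises that count (the commit message cites 16). Since a slab
is only allocated when one of its channels is first initialized, a
lightly loaded system no longer needs the full physically contiguous
USERD region up front.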
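
Similarly, the reduced-BAR1 fix-up in vgpu_bar1_map() can be
illustrated numerically. In this sketch, also not part of the patch,
guest_window_base is a made-up placement of the guest's window inside
the full BAR1 aperture; the point is only that a guest which maps just
its own window must address it relative to the window start, which is
exactly the offset it passed to the server:

/* Sketch of the reduced-BAR1 address fix-up (not part of the patch).
 * guest_window_base is a hypothetical server-side placement. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t guest_window_base = 0x00400000ULL; /* window start in BAR1 */
	uint32_t slab = 3;
	uint32_t offset = slab * 4096u; /* offset passed to the server */

	/* Answer the server would return, relative to full BAR1. */
	uint64_t gpu_va = guest_window_base + offset;

	/* A reduced-BAR1 guest only maps its own window, so its BAR1
	 * accesses are already window-relative: it uses offset itself. */
	printf("server gpu_va = 0x%llx, guest uses 0x%x\n",
		(unsigned long long)gpu_va, offset);
	return 0;
}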