From 7e68e5c83d037fbb0e71eb9156e27c878cf2f922 Mon Sep 17 00:00:00 2001
From: Thomas Fleury
Date: Tue, 20 Nov 2018 16:34:21 -0800
Subject: [PATCH] gpu: nvgpu: userd slab allocator

We had to force allocation of physically contiguous memory for USERD
in the nvlink case, as a channel's USERD address is computed as an
offset from the fifo->userd address, and nvlink bypasses the SMMU.
With 4096 channels, it can become difficult to allocate 2MB of
physically contiguous sysmem for USERD on a busy system.

PBDMA does not require any packing or contiguous USERD allocation,
as each channel has a direct pointer to that channel's 512B USERD
region. When BAR1 is supported, we only need the GPU VAs to be
contiguous in order to set up the BAR1 inst block.

- Add a slab allocator for USERD.
- Slabs are allocated in SYSMEM, using PAGE_SIZE as the slab size.
- Contiguous channels share the same page (16 channels per slab).
- ch->userd_mem points to the related nvgpu_mem descriptor.
- ch->userd_offset is the offset from the beginning of the slab.
- Pre-allocate GPU VAs for the whole USERD region in BAR1.
- Add the g->ops.mm.bar1_map() method.
- gk20a_mm_bar1_map() uses a fixed mapping in the BAR1 region.
- vgpu_mm_bar1_map() passes the offset in TEGRA_VGPU_CMD_MAP_BAR1.
- TEGRA_VGPU_CMD_MAP_BAR1 is called for each slab.

Bug 2422486
Bug 200474793

Change-Id: I202699fe55a454c1fc6d969e7b6196a46256d704
Signed-off-by: Thomas Fleury
Reviewed-on: https://git-master.nvidia.com/r/1959032
Reviewed-by: mobile promotions
Tested-by: mobile promotions
---
 drivers/gpu/nvgpu/common/fifo/channel.c       |  23 ++-
 drivers/gpu/nvgpu/gk20a/fifo_gk20a.c          | 171 ++++++++++++------
 drivers/gpu/nvgpu/gk20a/fifo_gk20a.h          |  10 +-
 drivers/gpu/nvgpu/gk20a/mm_gk20a.c            |  12 ++
 drivers/gpu/nvgpu/gk20a/mm_gk20a.h            |   1 +
 drivers/gpu/nvgpu/gm20b/hal_gm20b.c           |   1 +
 drivers/gpu/nvgpu/gp106/hal_gp106.c           |   1 +
 drivers/gpu/nvgpu/gp10b/fifo_gp10b.c          |   2 +-
 drivers/gpu/nvgpu/gp10b/hal_gp10b.c           |   1 +
 drivers/gpu/nvgpu/gv100/hal_gv100.c           |   1 +
 drivers/gpu/nvgpu/gv11b/fifo_gv11b.c          |  26 +--
 drivers/gpu/nvgpu/gv11b/hal_gv11b.c           |   1 +
 drivers/gpu/nvgpu/include/nvgpu/channel.h     |  14 +-
 drivers/gpu/nvgpu/include/nvgpu/gk20a.h       |   1 +
 .../gpu/nvgpu/include/nvgpu/vgpu/tegra_vgpu.h |   2 +-
 drivers/gpu/nvgpu/include/nvgpu/vgpu/vgpu.h   |   2 +-
 drivers/gpu/nvgpu/os/linux/vgpu/vgpu_linux.c  |   5 +-
 drivers/gpu/nvgpu/tu104/hal_tu104.c           |   1 +
 drivers/gpu/nvgpu/vgpu/fifo_vgpu.c            | 108 +++++------
 drivers/gpu/nvgpu/vgpu/gp10b/vgpu_hal_gp10b.c |   1 +
 drivers/gpu/nvgpu/vgpu/gv11b/vgpu_hal_gv11b.c |   1 +
 drivers/gpu/nvgpu/vgpu/mm_vgpu.c              |  26 ++-
 22 files changed, 264 insertions(+), 147 deletions(-)

diff --git a/drivers/gpu/nvgpu/common/fifo/channel.c b/drivers/gpu/nvgpu/common/fifo/channel.c
index 76e6c43cc..0c99d6cf9 100644
--- a/drivers/gpu/nvgpu/common/fifo/channel.c
+++ b/drivers/gpu/nvgpu/common/fifo/channel.c
@@ -419,8 +419,7 @@ static void gk20a_free_channel(struct channel_gk20a *ch, bool force)
 
 	if (ch->usermode_submit_enabled) {
 		gk20a_channel_free_usermode_buffers(ch);
-		ch->userd_iova = nvgpu_mem_get_addr(g, &f->userd) +
-			U64(ch->chid) * U64(f->userd_entry_size);
+		(void) gk20a_fifo_init_userd(g, ch);
 		ch->usermode_submit_enabled = false;
 	}
 
@@ -709,12 +708,14 @@ struct channel_gk20a *gk20a_open_new_channel(struct gk20a *g,
 	ch->pid = tid;
 	ch->tgid = pid;	/* process granularity for FECS traces */
 
+	if (gk20a_fifo_init_userd(g, ch) != 0) {
+		nvgpu_err(g, "userd init failed");
+		goto clean_up;
+	}
+
 	if (g->ops.fifo.alloc_inst(g, ch) != 0) {
-		ch->g = NULL;
-		free_channel(f, ch);
-		nvgpu_err(g,
-			"failed to open gk20a channel, out of inst mem");
-		return NULL;
+		nvgpu_err(g, "inst allocation failed");
+		goto clean_up;
 	}
 
 	/* now the channel is in a limbo out of the free list but not marked as
@@ -760,6 +761,11 @@ struct channel_gk20a *gk20a_open_new_channel(struct gk20a *g,
 	nvgpu_smp_wmb();
 
 	return ch;
+
+clean_up:
+	ch->g = NULL;
+	free_channel(f, ch);
+	return NULL;
 }
 
 /* allocate private cmd buffer.
@@ -1313,8 +1319,7 @@ clean_up_unmap:
 	nvgpu_dma_unmap_free(ch_vm, &c->gpfifo.mem);
 	if (c->usermode_submit_enabled) {
 		gk20a_channel_free_usermode_buffers(c);
-		c->userd_iova = nvgpu_mem_get_addr(g, &g->fifo.userd) +
-			U64(c->chid) * U64(g->fifo.userd_entry_size);
+		(void) gk20a_fifo_init_userd(g, c);
 		c->usermode_submit_enabled = false;
 	}
 clean_up:
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
index 412f15b30..2e71821b6 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
@@ -47,6 +47,7 @@
 #include
 #include
 #include
+#include <nvgpu/vm_area.h>
 
 #include "mm_gk20a.h"
 
@@ -599,11 +600,9 @@ static void gk20a_remove_fifo_support(struct fifo_gk20a *f)
 	nvgpu_vfree(g, f->channel);
 	nvgpu_vfree(g, f->tsg);
-	if (g->ops.mm.is_bar1_supported(g)) {
-		nvgpu_dma_unmap_free(g->mm.bar1.vm, &f->userd);
-	} else {
-		nvgpu_dma_free(g, &f->userd);
-	}
+	gk20a_fifo_free_userd_slabs(g);
+	(void) nvgpu_vm_area_free(g->mm.bar1.vm, f->userd_gpu_va);
+	f->userd_gpu_va = 0ULL;
 
 	gk20a_fifo_delete_runlist(f);
 
@@ -940,12 +939,93 @@ clean_up:
 	return err;
 }
 
+int gk20a_fifo_init_userd_slabs(struct gk20a *g)
+{
+	struct fifo_gk20a *f = &g->fifo;
+	int err;
+
+	err = nvgpu_mutex_init(&f->userd_mutex);
+	if (err != 0) {
+		nvgpu_err(g, "failed to init userd_mutex");
+		return err;
+	}
+
+	f->num_channels_per_slab = PAGE_SIZE / f->userd_entry_size;
+	f->num_userd_slabs =
+		DIV_ROUND_UP(f->num_channels, f->num_channels_per_slab);
+
+	f->userd_slabs = nvgpu_kcalloc(g, f->num_userd_slabs,
+			sizeof(struct nvgpu_mem));
+	if (f->userd_slabs == NULL) {
+		nvgpu_err(g, "could not allocate userd slabs");
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+int gk20a_fifo_init_userd(struct gk20a *g, struct channel_gk20a *c)
+{
+	struct fifo_gk20a *f = &g->fifo;
+	struct nvgpu_mem *mem;
+	u32 slab = c->chid / f->num_channels_per_slab;
+	int err = 0;
+
+	if (slab >= f->num_userd_slabs) {
+		nvgpu_err(g, "chid %u, slab %u out of range (num_slabs=%u)",
+			c->chid, slab, f->num_userd_slabs);
+		return -EINVAL;
+	}
+
+	mem = &g->fifo.userd_slabs[slab];
+
+	nvgpu_mutex_acquire(&f->userd_mutex);
+	if (!nvgpu_mem_is_valid(mem)) {
+		err = nvgpu_dma_alloc_sys(g, PAGE_SIZE, mem);
+		if (err != 0) {
+			nvgpu_err(g, "userd allocation failed, err=%d", err);
+			goto done;
+		}
+
+		if (g->ops.mm.is_bar1_supported(g)) {
+			mem->gpu_va = g->ops.mm.bar1_map(g, mem,
+					slab * PAGE_SIZE);
+		}
+	}
+	c->userd_mem = mem;
+	c->userd_offset = (c->chid % f->num_channels_per_slab) *
+			f->userd_entry_size;
+	c->userd_iova = gk20a_channel_userd_addr(c);
+
+	nvgpu_log(g, gpu_dbg_info,
+		"chid=%u slab=%u mem=%p offset=%u addr=%llx gpu_va=%llx",
+		c->chid, slab, mem, c->userd_offset,
+		gk20a_channel_userd_addr(c),
+		gk20a_channel_userd_gpu_va(c));
+
+done:
+	nvgpu_mutex_release(&f->userd_mutex);
+	return err;
+}
+
+void gk20a_fifo_free_userd_slabs(struct gk20a *g)
+{
+	struct fifo_gk20a *f = &g->fifo;
+	u32 slab;
+
+	for (slab = 0; slab < f->num_userd_slabs; slab++) {
+		nvgpu_dma_free(g, &f->userd_slabs[slab]);
+	}
+	nvgpu_kfree(g, f->userd_slabs);
+	f->userd_slabs = NULL;
+}
+
 int gk20a_init_fifo_setup_sw(struct gk20a *g)
 {
 	struct fifo_gk20a *f = &g->fifo;
-	unsigned int chid;
-	u64 userd_base;
 	int err = 0;
+	u32 size;
+	u32 num_pages;
 
 	nvgpu_log_fn(g, " ");
 
@@ -960,34 +1040,25 @@ int gk20a_init_fifo_setup_sw(struct gk20a *g)
 		return err;
 	}
 
-	if (g->ops.mm.is_bar1_supported(g)) {
-		err = nvgpu_dma_alloc_map_sys(g->mm.bar1.vm,
-				(size_t)f->userd_entry_size *
-				(size_t)f->num_channels,
-				&f->userd);
-	} else {
-		err = nvgpu_dma_alloc_flags_sys(g,
-				NVGPU_DMA_PHYSICALLY_ADDRESSED,
-				(size_t)f->userd_entry_size *
-				(size_t)f->num_channels, &f->userd);
-	}
+	err = gk20a_fifo_init_userd_slabs(g);
 	if (err != 0) {
-		nvgpu_err(g, "userd memory allocation failed");
-		goto clean_up;
+		nvgpu_err(g, "userd slabs init failed, err=%d", err);
+		return err;
 	}
 
-	nvgpu_log(g, gpu_dbg_map, "userd gpu va = 0x%llx", f->userd.gpu_va);
-	userd_base = nvgpu_mem_get_addr(g, &f->userd);
-	for (chid = 0; chid < f->num_channels; chid++) {
-		f->channel[chid].userd_iova = userd_base +
-			U64(chid) * U64(f->userd_entry_size);
-		f->channel[chid].userd_gpu_va =
-			f->userd.gpu_va + U64(chid) * U64(f->userd_entry_size);
+	size = f->num_channels * f->userd_entry_size;
+	num_pages = DIV_ROUND_UP(size, PAGE_SIZE);
+	err = nvgpu_vm_area_alloc(g->mm.bar1.vm,
+			num_pages, PAGE_SIZE, &f->userd_gpu_va, 0);
+	if (err != 0) {
+		nvgpu_err(g, "userd gpu va allocation failed, err=%d", err);
+		goto clean_slabs;
 	}
 
 	err = nvgpu_channel_worker_init(g);
 	if (err != 0) {
-		goto clean_up;
+		nvgpu_err(g, "worker init failed, err=%d", err);
+		goto clean_vm_area;
 	}
 
 	f->sw_ready = true;
 
 	nvgpu_log_fn(g, "done");
 	return 0;
 
-clean_up:
-	nvgpu_log_fn(g, "fail");
-	if (nvgpu_mem_is_valid(&f->userd)) {
-		if (g->ops.mm.is_bar1_supported(g)) {
-			nvgpu_dma_unmap_free(g->mm.bar1.vm, &f->userd);
-		} else {
-			nvgpu_dma_free(g, &f->userd);
-		}
-	}
+clean_vm_area:
+	(void) nvgpu_vm_area_free(g->mm.bar1.vm, f->userd_gpu_va);
+	f->userd_gpu_va = 0ULL;
+clean_slabs:
+	gk20a_fifo_free_userd_slabs(g);
 
 	return err;
 }
 
@@ -1026,9 +1093,9 @@ int gk20a_init_fifo_setup_hw(struct gk20a *g)
 	nvgpu_log_fn(g, " ");
 
 	/* set the base for the userd region now */
-	shifted_addr = f->userd.gpu_va >> 12;
+	shifted_addr = f->userd_gpu_va >> 12;
 	if ((shifted_addr >> 32) != 0U) {
-		nvgpu_err(g, "GPU VA > 32 bits %016llx\n", f->userd.gpu_va);
+		nvgpu_err(g, "GPU VA > 32 bits %016llx\n", f->userd_gpu_va);
 		return -EFAULT;
 	}
 	gk20a_writel(g, fifo_bar1_base_r(),
@@ -4281,7 +4348,7 @@ static int gk20a_fifo_commit_userd(struct channel_gk20a *c)
 
 	nvgpu_mem_wr32(g, &c->inst_block,
 		ram_in_ramfc_w() + ram_fc_userd_w(),
-		nvgpu_aperture_mask(g, &g->fifo.userd,
+		nvgpu_aperture_mask(g, c->userd_mem,
 			pbdma_userd_target_sys_mem_ncoh_f(),
 			pbdma_userd_target_sys_mem_coh_f(),
 			pbdma_userd_target_vid_mem_f()) |
@@ -4380,20 +4447,11 @@ void gk20a_fifo_setup_ramfc_for_privileged_channel(struct channel_gk20a *c)
 int gk20a_fifo_setup_userd(struct channel_gk20a *c)
 {
 	struct gk20a *g = c->g;
-	struct nvgpu_mem *mem;
-	u32 offset;
+	struct nvgpu_mem *mem = c->userd_mem;
+	u32 offset = c->userd_offset / U32(sizeof(u32));
 
 	nvgpu_log_fn(g, " ");
 
-	if (nvgpu_mem_is_valid(&c->usermode_userd)) {
-		mem = &c->usermode_userd;
-		offset = 0;
-	} else {
-		mem = &g->fifo.userd;
-		offset = U32(c->chid) * g->fifo.userd_entry_size /
-				U32(sizeof(u32));
-	}
-
 	nvgpu_mem_wr32(g, mem, offset + ram_userd_put_w(), 0);
 	nvgpu_mem_wr32(g, mem, offset + ram_userd_get_w(), 0);
 	nvgpu_mem_wr32(g, mem, offset + ram_userd_ref_w(), 0);
@@ -4432,7 +4490,8 @@ void gk20a_fifo_free_inst(struct gk20a *g, struct channel_gk20a *ch)
 
 u32 gk20a_fifo_userd_gp_get(struct gk20a *g,
 		struct channel_gk20a *c)
 {
-	u64 addr = c->userd_gpu_va + sizeof(u32) * ram_userd_gp_get_w();
+	u64 userd_gpu_va = gk20a_channel_userd_gpu_va(c);
+	u64 addr = userd_gpu_va + sizeof(u32) * ram_userd_gp_get_w();
 
 	BUG_ON(u64_hi32(addr) != 0U);
 
@@ -4441,8 +4500,9 @@ u32 gk20a_fifo_userd_gp_get(struct gk20a *g, struct channel_gk20a *c)
 
 u64 gk20a_fifo_userd_pb_get(struct gk20a *g, struct channel_gk20a *c)
 {
-	u64 lo_addr = c->userd_gpu_va + sizeof(u32) * ram_userd_get_w();
-	u64 hi_addr = c->userd_gpu_va + sizeof(u32) * ram_userd_get_hi_w();
+	u64 userd_gpu_va = gk20a_channel_userd_gpu_va(c);
+	u64 lo_addr = userd_gpu_va + sizeof(u32) * ram_userd_get_w();
+	u64 hi_addr = userd_gpu_va + sizeof(u32) * ram_userd_get_hi_w();
 	u32 lo, hi;
 
 	BUG_ON((u64_hi32(lo_addr) != 0U) || (u64_hi32(hi_addr) != 0U));
@@ -4454,7 +4514,8 @@ u64 gk20a_fifo_userd_pb_get(struct gk20a *g, struct channel_gk20a *c)
 
 void gk20a_fifo_userd_gp_put(struct gk20a *g, struct channel_gk20a *c)
 {
-	u64 addr = c->userd_gpu_va + sizeof(u32) * ram_userd_gp_put_w();
+	u64 userd_gpu_va = gk20a_channel_userd_gpu_va(c);
+	u64 addr = userd_gpu_va + sizeof(u32) * ram_userd_gp_put_w();
 
 	BUG_ON(u64_hi32(addr) != 0U);
 	gk20a_bar1_writel(g, (u32)addr, c->gpfifo.put);
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
index 57d487b88..e77055dea 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
@@ -174,8 +174,12 @@ struct fifo_gk20a {
 		struct nvgpu_mutex lock;
 	} profile;
 #endif
-	struct nvgpu_mem userd;
+	struct nvgpu_mutex userd_mutex;
+	struct nvgpu_mem *userd_slabs;
+	u32 num_userd_slabs;
+	u32 num_channels_per_slab;
 	u32 userd_entry_size;
+	u64 userd_gpu_va;
 
 	unsigned int used_channels;
 	struct channel_gk20a *channel;
@@ -490,4 +494,8 @@ void gk20a_fifo_add_sema_cmd(struct gk20a *g,
 	struct nvgpu_semaphore *s, u64 sema_va,
 	struct priv_cmd_entry *cmd, u32 off, bool acquire, bool wfi);
 
+int gk20a_fifo_init_userd_slabs(struct gk20a *g);
+void gk20a_fifo_free_userd_slabs(struct gk20a *g);
+int gk20a_fifo_init_userd(struct gk20a *g, struct channel_gk20a *c);
+
 #endif /* FIFO_GK20A_H */
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
index 8e7c17edd..633c36428 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
@@ -667,3 +667,15 @@ const struct gk20a_mmu_level *gk20a_mm_get_mmu_levels(struct gk20a *g,
 	return (big_page_size == SZ_64K) ?
 		gk20a_mm_levels_64k : gk20a_mm_levels_128k;
 }
+
+u64 gk20a_mm_bar1_map(struct gk20a *g, struct nvgpu_mem *mem, u32 offset)
+{
+	struct fifo_gk20a *f = &g->fifo;
+	u64 gpu_va = f->userd_gpu_va + offset;
+
+	return nvgpu_gmmu_map_fixed(g->mm.bar1.vm, mem, gpu_va,
+			PAGE_SIZE, 0,
+			gk20a_mem_flag_none, false,
+			mem->aperture);
+}
+
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
index e4f4312bf..d078b481a 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
@@ -152,4 +152,5 @@ u32 gk20a_get_pde_pgsz(struct gk20a *g, const struct gk20a_mmu_level *l,
 	struct nvgpu_gmmu_pd *pd, u32 pd_idx);
 u32 gk20a_get_pte_pgsz(struct gk20a *g, const struct gk20a_mmu_level *l,
 	struct nvgpu_gmmu_pd *pd, u32 pd_idx);
+u64 gk20a_mm_bar1_map(struct gk20a *g, struct nvgpu_mem *mem, u32 offset);
 #endif /* MM_GK20A_H */
diff --git a/drivers/gpu/nvgpu/gm20b/hal_gm20b.c b/drivers/gpu/nvgpu/gm20b/hal_gm20b.c
index f3b0fc9f1..a99cfa72c 100644
--- a/drivers/gpu/nvgpu/gm20b/hal_gm20b.c
+++ b/drivers/gpu/nvgpu/gm20b/hal_gm20b.c
@@ -579,6 +579,7 @@ static const struct gpu_ops gm20b_ops = {
 		.mmu_fault_pending = gk20a_fifo_mmu_fault_pending,
 		.get_kind_invalid = gm20b_get_kind_invalid,
 		.get_kind_pitch = gm20b_get_kind_pitch,
+		.bar1_map = gk20a_mm_bar1_map,
 	},
 	.therm = {
 		.init_therm_setup_hw = gm20b_init_therm_setup_hw,
diff --git a/drivers/gpu/nvgpu/gp106/hal_gp106.c b/drivers/gpu/nvgpu/gp106/hal_gp106.c
index 2103582a6..75920a2fb 100644
--- a/drivers/gpu/nvgpu/gp106/hal_gp106.c
+++ b/drivers/gpu/nvgpu/gp106/hal_gp106.c
@@ -687,6 +687,7 @@ static const struct gpu_ops gp106_ops = {
 		.remove_bar2_vm = gp10b_remove_bar2_vm,
 		.get_kind_invalid = gm20b_get_kind_invalid,
 		.get_kind_pitch = gm20b_get_kind_pitch,
+		.bar1_map = gk20a_mm_bar1_map,
 	},
 	.pramin = {
 		.data032_r = pram_data032_r,
diff --git a/drivers/gpu/nvgpu/gp10b/fifo_gp10b.c b/drivers/gpu/nvgpu/gp10b/fifo_gp10b.c
index 74129caf5..48d483eda 100644
--- a/drivers/gpu/nvgpu/gp10b/fifo_gp10b.c
+++ b/drivers/gpu/nvgpu/gp10b/fifo_gp10b.c
@@ -59,7 +59,7 @@ int channel_gp10b_commit_userd(struct channel_gk20a *c)
 
 	nvgpu_mem_wr32(g, &c->inst_block,
 		ram_in_ramfc_w() + ram_fc_userd_w(),
-		nvgpu_aperture_mask(g, &g->fifo.userd,
+		nvgpu_aperture_mask(g, c->userd_mem,
 			pbdma_userd_target_sys_mem_ncoh_f(),
 			pbdma_userd_target_sys_mem_coh_f(),
 			pbdma_userd_target_vid_mem_f()) |
diff --git a/drivers/gpu/nvgpu/gp10b/hal_gp10b.c b/drivers/gpu/nvgpu/gp10b/hal_gp10b.c
index f820c307f..4c3f0c029 100644
--- a/drivers/gpu/nvgpu/gp10b/hal_gp10b.c
+++ b/drivers/gpu/nvgpu/gp10b/hal_gp10b.c
@@ -652,6 +652,7 @@ static const struct gpu_ops gp10b_ops = {
 		.remove_bar2_vm = gp10b_remove_bar2_vm,
 		.get_kind_invalid = gm20b_get_kind_invalid,
 		.get_kind_pitch = gm20b_get_kind_pitch,
+		.bar1_map = gk20a_mm_bar1_map,
 	},
 	.pramin = {
 		.data032_r = pram_data032_r,
diff --git a/drivers/gpu/nvgpu/gv100/hal_gv100.c b/drivers/gpu/nvgpu/gv100/hal_gv100.c
index 6e07aebbb..b2b854f54 100644
--- a/drivers/gpu/nvgpu/gv100/hal_gv100.c
+++ b/drivers/gpu/nvgpu/gv100/hal_gv100.c
@@ -822,6 +822,7 @@ static const struct gpu_ops gv100_ops = {
 		.fault_info_mem_destroy = gv11b_mm_fault_info_mem_destroy,
 		.mmu_fault_disable_hw = gv11b_mm_mmu_fault_disable_hw,
 		.get_flush_retries = gv100_mm_get_flush_retries,
+		.bar1_map = NULL,
 	},
 	.pramin = {
 		.data032_r = pram_data032_r,
diff --git a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
index 599cd331f..3da7feb74 100644
--- a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c
@@ -99,7 +99,7 @@ void gv11b_get_ch_runlist_entry(struct channel_gk20a *c, u32 *runlist)
 		ram_rl_entry_chan_runqueue_selector_f(
 			c->runqueue_sel) |
 		ram_rl_entry_chan_userd_target_f(
-			nvgpu_aperture_mask(g, &g->fifo.userd,
+			nvgpu_aperture_mask(g, c->userd_mem,
 				ram_rl_entry_chan_userd_target_sys_mem_ncoh_v(),
 				ram_rl_entry_chan_userd_target_sys_mem_coh_v(),
 				ram_rl_entry_chan_userd_target_vid_mem_v())) |
@@ -245,30 +245,30 @@ void gv11b_ring_channel_doorbell(struct channel_gk20a *c)
 
 u32 gv11b_userd_gp_get(struct gk20a *g, struct channel_gk20a *c)
 {
-	struct nvgpu_mem *userd_mem = &g->fifo.userd;
-	u32 offset = c->chid * (g->fifo.userd_entry_size / sizeof(u32));
+	struct nvgpu_mem *mem = c->userd_mem;
+	u32 offset = c->userd_offset / U32(sizeof(u32));
 
-	return nvgpu_mem_rd32(g, userd_mem,
-			offset + ram_userd_gp_get_w());
+	return nvgpu_mem_rd32(g, mem, offset + ram_userd_gp_get_w());
 }
 
 u64 gv11b_userd_pb_get(struct gk20a *g, struct channel_gk20a *c)
 {
-	struct nvgpu_mem *userd_mem = &g->fifo.userd;
-	u32 offset = c->chid * (g->fifo.userd_entry_size / sizeof(u32));
-	u32 lo = nvgpu_mem_rd32(g, userd_mem, offset + ram_userd_get_w());
-	u32 hi = nvgpu_mem_rd32(g, userd_mem, offset + ram_userd_get_hi_w());
+	struct nvgpu_mem *mem = c->userd_mem;
+	u32 offset = c->userd_offset / U32(sizeof(u32));
+	u32 lo, hi;
+
+	lo = nvgpu_mem_rd32(g, mem, offset + ram_userd_get_w());
+	hi = nvgpu_mem_rd32(g, mem, offset + ram_userd_get_hi_w());
 
 	return ((u64)hi << 32) | lo;
 }
 
 void gv11b_userd_gp_put(struct gk20a *g, struct channel_gk20a *c)
 {
-	struct nvgpu_mem *userd_mem = &g->fifo.userd;
-	u32 offset = c->chid * (g->fifo.userd_entry_size / sizeof(u32));
+	struct nvgpu_mem *mem = c->userd_mem;
+	u32 offset = c->userd_offset / U32(sizeof(u32));
 
-	nvgpu_mem_wr32(g, userd_mem, offset + ram_userd_gp_put_w(),
-			c->gpfifo.put);
+	nvgpu_mem_wr32(g, mem, offset + ram_userd_gp_put_w(), c->gpfifo.put);
+
 	/* Commit everything to GPU. */
 	nvgpu_mb();
diff --git a/drivers/gpu/nvgpu/gv11b/hal_gv11b.c b/drivers/gpu/nvgpu/gv11b/hal_gv11b.c
index c0850acfe..f3a5547b5 100644
--- a/drivers/gpu/nvgpu/gv11b/hal_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/hal_gv11b.c
@@ -784,6 +784,7 @@ static const struct gpu_ops gv11b_ops = {
 		.remove_bar2_vm = gp10b_remove_bar2_vm,
 		.fault_info_mem_destroy = gv11b_mm_fault_info_mem_destroy,
 		.mmu_fault_disable_hw = gv11b_mm_mmu_fault_disable_hw,
+		.bar1_map = NULL,
 	},
 	.therm = {
 		.init_therm_setup_hw = gv11b_init_therm_setup_hw,
diff --git a/drivers/gpu/nvgpu/include/nvgpu/channel.h b/drivers/gpu/nvgpu/include/nvgpu/channel.h
index 9b1ddd2ec..5c7f32e55 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/channel.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/channel.h
@@ -264,7 +264,9 @@ struct channel_gk20a {
 	struct nvgpu_mem inst_block;
 
 	u64 userd_iova;
-	u64 userd_gpu_va;
+
+	struct nvgpu_mem *userd_mem;	/* kernel mode userd */
+	u32 userd_offset;		/* in bytes from start of userd_mem */
 
 	struct priv_cmd_queue priv_cmd_q;
 
@@ -470,4 +472,14 @@ static inline void trace_write_pushbuffers(struct channel_gk20a *c, u32 count)
 void gk20a_channel_set_timedout(struct channel_gk20a *ch);
 bool gk20a_channel_check_timedout(struct channel_gk20a *ch);
 
+static inline u64 gk20a_channel_userd_addr(struct channel_gk20a *c)
+{
+	return nvgpu_mem_get_addr(c->g, c->userd_mem) + c->userd_offset;
+}
+
+static inline u64 gk20a_channel_userd_gpu_va(struct channel_gk20a *c)
+{
+	struct nvgpu_mem *mem = c->userd_mem;
+	return (mem->gpu_va != 0ULL) ?
+		mem->gpu_va + c->userd_offset : 0ULL;
+}
 #endif
diff --git a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h
index 4b702885a..bed617dd4 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h
@@ -1094,6 +1094,7 @@ struct gpu_ops {
 		u32 (*get_kind_pitch)(void);
 		u32 (*get_flush_retries)(struct gk20a *g,
 				enum nvgpu_flush_op op);
+		u64 (*bar1_map)(struct gk20a *g, struct nvgpu_mem *mem, u32 offset);
 	} mm;
 	/*
 	 * This function is called to allocate secure memory (memory
diff --git a/drivers/gpu/nvgpu/include/nvgpu/vgpu/tegra_vgpu.h b/drivers/gpu/nvgpu/include/nvgpu/vgpu/tegra_vgpu.h
index 0131db272..3134268b3 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/vgpu/tegra_vgpu.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/vgpu/tegra_vgpu.h
@@ -170,7 +170,7 @@ struct tegra_vgpu_as_map_params {
 	u8 cacheable;
 	u8 clear_ctags;
 	u8 prot;
-	u32 ctag_offset;
+	u32 offset;
 };
 
 #define TEGRA_VGPU_MAP_CACHEABLE (1 << 0)
diff --git a/drivers/gpu/nvgpu/include/nvgpu/vgpu/vgpu.h b/drivers/gpu/nvgpu/include/nvgpu/vgpu/vgpu.h
index 15ab879e5..2eab0f14f 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/vgpu/vgpu.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/vgpu/vgpu.h
@@ -81,7 +81,7 @@ int vgpu_get_timestamps_zipper(struct gk20a *g,
 	struct nvgpu_cpu_time_correlation_sample *samples);
 int vgpu_init_hal(struct gk20a *g);
 int vgpu_get_constants(struct gk20a *g);
-u64 vgpu_bar1_map(struct gk20a *g, struct nvgpu_mem *mem);
+u64 vgpu_bar1_map(struct gk20a *g, struct nvgpu_mem *mem, u32 offset);
 int vgpu_gr_isr(struct gk20a *g, struct tegra_vgpu_gr_intr_info *info);
 int vgpu_gr_alloc_gr_ctx(struct gk20a *g,
 	struct nvgpu_gr_ctx *gr_ctx,
diff --git a/drivers/gpu/nvgpu/os/linux/vgpu/vgpu_linux.c b/drivers/gpu/nvgpu/os/linux/vgpu/vgpu_linux.c
index f1b4aa7db..114735bcd 100644
--- a/drivers/gpu/nvgpu/os/linux/vgpu/vgpu_linux.c
+++ b/drivers/gpu/nvgpu/os/linux/vgpu/vgpu_linux.c
@@ -507,10 +507,11 @@ int vgpu_remove(struct platform_device *pdev)
 
 bool vgpu_is_reduced_bar1(struct gk20a *g)
 {
-	struct fifo_gk20a *f = &g->fifo;
 	struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
+	struct fifo_gk20a *f = &g->fifo;
+	u32 size = f->num_channels * f->userd_entry_size;
 
-	return resource_size(l->bar1_mem) == (resource_size_t)f->userd.size;
+	return resource_size(l->bar1_mem) == size;
 }
 
 int vgpu_tegra_suspend(struct device *dev)
diff --git a/drivers/gpu/nvgpu/tu104/hal_tu104.c b/drivers/gpu/nvgpu/tu104/hal_tu104.c
index 7b8eb6a62..4268aad88 100644
--- a/drivers/gpu/nvgpu/tu104/hal_tu104.c
+++ b/drivers/gpu/nvgpu/tu104/hal_tu104.c
@@ -851,6 +851,7 @@ static const struct gpu_ops tu104_ops = {
 		.fault_info_mem_destroy = gv11b_mm_fault_info_mem_destroy,
 		.mmu_fault_disable_hw = gv11b_mm_mmu_fault_disable_hw,
 		.get_flush_retries = gv100_mm_get_flush_retries,
+		.bar1_map = NULL,
 	},
 	.pramin = {
 		.data032_r = pram_data032_r,
diff --git a/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c b/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c
index 5d8335cf3..ff0972686 100644
--- a/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c
+++ b/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c
@@ -36,6 +36,7 @@
 #include
 #include
 #include
+#include
 
 #include "fifo_vgpu.h"
 
@@ -294,29 +295,12 @@ static int vgpu_init_fifo_setup_sw(struct gk20a *g)
 
 	f->userd_entry_size = 1 << ram_userd_base_shift_v();
 
-	err = nvgpu_dma_alloc_sys(g, f->userd_entry_size * f->num_channels,
-			&f->userd);
-	if (err) {
-		nvgpu_err(g, "memory allocation failed");
-		goto clean_up;
+	err = gk20a_fifo_init_userd_slabs(g);
+	if (err != 0) {
+		nvgpu_err(g, "userd slab init failed, err=%d", err);
+		return err;
 	}
 
-	/* bar1 va */
-	if (g->ops.mm.is_bar1_supported(g)) {
-		f->userd.gpu_va = vgpu_bar1_map(g, &f->userd);
-		if (!f->userd.gpu_va) {
-			nvgpu_err(g, "gmmu mapping failed");
-			goto clean_up;
-		}
-		/* if reduced BAR1 range is specified, use offset of 0
-		 * (server returns offset assuming full BAR1 range)
-		 */
-		if (vgpu_is_reduced_bar1(g))
-			f->userd.gpu_va = 0;
-	}
-
-	nvgpu_log(g, gpu_dbg_map_v, "userd bar1 va = 0x%llx", f->userd.gpu_va);
-
 	f->channel = nvgpu_vzalloc(g, f->num_channels * sizeof(*f->channel));
 	f->tsg = nvgpu_vzalloc(g, f->num_channels * sizeof(*f->tsg));
 	f->engine_info = nvgpu_kzalloc(g, f->max_engines *
@@ -338,12 +322,6 @@ static int vgpu_init_fifo_setup_sw(struct gk20a *g)
 	nvgpu_mutex_init(&f->free_chs_mutex);
 
 	for (chid = 0; chid < f->num_channels; chid++) {
-		f->channel[chid].userd_iova =
-			nvgpu_mem_get_addr(g, &f->userd) +
-			chid * f->userd_entry_size;
-		f->channel[chid].userd_gpu_va =
-			f->userd.gpu_va + chid * f->userd_entry_size;
-
 		gk20a_init_channel_support(g, chid);
 		gk20a_init_tsg_support(g, chid);
 	}
@@ -366,9 +344,7 @@ static int vgpu_init_fifo_setup_sw(struct gk20a *g)
 clean_up:
 	nvgpu_log_fn(g, "fail");
 	/* FIXME: unmap from bar1 */
-	nvgpu_dma_free(g, &f->userd);
-
-	(void) memset(&f->userd, 0, sizeof(f->userd));
+	gk20a_fifo_free_userd_slabs(g);
 
 	nvgpu_vfree(g, f->channel);
 	f->channel = NULL;
@@ -384,47 +360,59 @@ clean_up:
 
 int vgpu_init_fifo_setup_hw(struct gk20a *g)
 {
+	struct fifo_gk20a *f = &g->fifo;
+	u32 v, v1 = 0x33, v2 = 0x55;
+	struct nvgpu_mem *mem = &f->userd_slabs[0];
+	u32 bar1_vaddr;
+	volatile u32 *cpu_vaddr;
+	int err;
+
 	nvgpu_log_fn(g, " ");
 
+	/* allocate and map first userd slab for bar1 test. */
+	err = nvgpu_dma_alloc_sys(g, PAGE_SIZE, mem);
+	if (err != 0) {
+		nvgpu_err(g, "userd allocation failed, err=%d", err);
+		return err;
+	}
+	mem->gpu_va = g->ops.mm.bar1_map(g, mem, 0);
+	f->userd_gpu_va = mem->gpu_va;
+
 	/* test write, read through bar1 @ userd region before
 	 * turning on the snooping */
-	{
-		struct fifo_gk20a *f = &g->fifo;
-		u32 v, v1 = 0x33, v2 = 0x55;
-		u32 bar1_vaddr = f->userd.gpu_va;
-		volatile u32 *cpu_vaddr = f->userd.cpu_va;
+	cpu_vaddr = mem->cpu_va;
+	bar1_vaddr = mem->gpu_va;
 
-		nvgpu_log_info(g, "test bar1 @ vaddr 0x%x",
-			   bar1_vaddr);
+	nvgpu_log_info(g, "test bar1 @ vaddr 0x%x",
+		   bar1_vaddr);
 
-		v = gk20a_bar1_readl(g, bar1_vaddr);
+	v = gk20a_bar1_readl(g, bar1_vaddr);
 
-		*cpu_vaddr = v1;
-		nvgpu_mb();
+	*cpu_vaddr = v1;
+	nvgpu_mb();
 
-		if (v1 != gk20a_bar1_readl(g, bar1_vaddr)) {
-			nvgpu_err(g, "bar1 broken @ gk20a!");
-			return -EINVAL;
-		}
-
-		gk20a_bar1_writel(g, bar1_vaddr, v2);
-
-		if (v2 != gk20a_bar1_readl(g, bar1_vaddr)) {
-			nvgpu_err(g, "bar1 broken @ gk20a!");
-			return -EINVAL;
-		}
-
-		/* is it visible to the cpu? */
-		if (*cpu_vaddr != v2) {
-			nvgpu_err(g, "cpu didn't see bar1 write @ %p!",
-				cpu_vaddr);
-		}
-
-		/* put it back */
-		gk20a_bar1_writel(g, bar1_vaddr, v);
+	if (v1 != gk20a_bar1_readl(g, bar1_vaddr)) {
+		nvgpu_err(g, "bar1 broken @ gk20a!");
+		return -EINVAL;
 	}
 
+	gk20a_bar1_writel(g, bar1_vaddr, v2);
+
+	if (v2 != gk20a_bar1_readl(g, bar1_vaddr)) {
+		nvgpu_err(g, "bar1 broken @ gk20a!");
+		return -EINVAL;
+	}
+
+	/* is it visible to the cpu? */
+	if (*cpu_vaddr != v2) {
+		nvgpu_err(g, "cpu didn't see bar1 write @ %p!",
+			cpu_vaddr);
+	}
+
+	/* put it back */
+	gk20a_bar1_writel(g, bar1_vaddr, v);
+
 	nvgpu_log_fn(g, "done");
 	return 0;
diff --git a/drivers/gpu/nvgpu/vgpu/gp10b/vgpu_hal_gp10b.c b/drivers/gpu/nvgpu/vgpu/gp10b/vgpu_hal_gp10b.c
index efeb461fb..c20dd6ef4 100644
--- a/drivers/gpu/nvgpu/vgpu/gp10b/vgpu_hal_gp10b.c
+++ b/drivers/gpu/nvgpu/vgpu/gp10b/vgpu_hal_gp10b.c
@@ -481,6 +481,7 @@ static const struct gpu_ops vgpu_gp10b_ops = {
 		.remove_bar2_vm = gp10b_remove_bar2_vm,
 		.get_kind_invalid = gm20b_get_kind_invalid,
 		.get_kind_pitch = gm20b_get_kind_pitch,
+		.bar1_map = vgpu_bar1_map,
 	},
 	.pramin = {
 		.data032_r = NULL,
diff --git a/drivers/gpu/nvgpu/vgpu/gv11b/vgpu_hal_gv11b.c b/drivers/gpu/nvgpu/vgpu/gv11b/vgpu_hal_gv11b.c
index 64b6df67f..e659f2dd0 100644
--- a/drivers/gpu/nvgpu/vgpu/gv11b/vgpu_hal_gv11b.c
+++ b/drivers/gpu/nvgpu/vgpu/gv11b/vgpu_hal_gv11b.c
@@ -559,6 +559,7 @@ static const struct gpu_ops vgpu_gv11b_ops = {
 		.init_bar2_vm = gp10b_init_bar2_vm,
 		.remove_bar2_vm = gp10b_remove_bar2_vm,
 		.fault_info_mem_destroy = gv11b_mm_fault_info_mem_destroy,
+		.bar1_map = vgpu_bar1_map,
 	},
 	.therm = {
 		.init_therm_setup_hw = NULL,
diff --git a/drivers/gpu/nvgpu/vgpu/mm_vgpu.c b/drivers/gpu/nvgpu/vgpu/mm_vgpu.c
index 3aa433310..ad4e504e4 100644
--- a/drivers/gpu/nvgpu/vgpu/mm_vgpu.c
+++ b/drivers/gpu/nvgpu/vgpu/mm_vgpu.c
@@ -155,7 +155,7 @@ void vgpu_vm_remove(struct vm_gk20a *vm)
 	WARN_ON(err || msg.ret);
 }
 
-u64 vgpu_bar1_map(struct gk20a *g, struct nvgpu_mem *mem)
+u64 vgpu_bar1_map(struct gk20a *g, struct nvgpu_mem *mem, u32 offset)
 {
 	u64 addr = nvgpu_mem_get_addr(g, mem);
 	struct tegra_vgpu_cmd_msg msg;
@@ -167,12 +167,32 @@ u64 vgpu_bar1_map(struct gk20a *g, struct nvgpu_mem *mem)
 	p->addr = addr;
 	p->size = mem->size;
 	p->iova = 0;
+	p->offset = offset; /* offset from start of BAR1 */
 
 	err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg));
-	if (err || msg.ret)
+	if (err || msg.ret) {
 		addr = 0;
-	else
+	} else {
 		addr = p->gpu_va;
+		/* The server returns a gpu_va assuming the full BAR1
+		 * range. In a reduced BAR1 configuration, we only map
+		 * the portion of BAR1 reserved for this guest, so we
+		 * must use the offset from the start of that range
+		 * instead of the returned gpu_va.
+		 *
+		 *            offset
+		 *            <---->
+		 * Guest IPA       +========+
+		 *                 :   X    :
+		 * BAR1 PA    +----+========+-----------+
+		 *            <--------->
+		 *               gpu_va
+		 */
+		if (vgpu_is_reduced_bar1(g)) {
+			addr = offset;
+		}
+	}
+
 	return addr;
 }
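
Note: the slab arithmetic introduced by this patch is easy to check in
isolation. The following standalone C sketch is not part of the patch;
the 4 KiB slab size, the 512 B USERD entry size (1 <<
ram_userd_base_shift_v() on gk20a), and the userd_gpu_va base are
assumed values for illustration. It mirrors the index/offset
computation in gk20a_fifo_init_userd() and the fixed BAR1 VA
computation in gk20a_mm_bar1_map():

/* Standalone sketch of the USERD slab arithmetic (not part of the
 * patch). SLAB_SIZE stands in for PAGE_SIZE; a platform with larger
 * pages packs more channels per slab. */
#include <stdio.h>
#include <stdint.h>

#define SLAB_SIZE        4096u  /* assumed PAGE_SIZE */
#define USERD_ENTRY_SIZE 512u   /* assumed 1 << ram_userd_base_shift_v() */

int main(void)
{
	uint32_t channels_per_slab = SLAB_SIZE / USERD_ENTRY_SIZE;
	uint64_t userd_gpu_va = 0x100000000ULL; /* made-up BAR1 VA area base */
	uint32_t chids[] = { 0, 7, 8, 4095 };
	size_t i;

	for (i = 0; i < sizeof(chids) / sizeof(chids[0]); i++) {
		uint32_t chid = chids[i];
		/* slab index and byte offset, as in gk20a_fifo_init_userd() */
		uint32_t slab = chid / channels_per_slab;
		uint32_t offset = (chid % channels_per_slab) * USERD_ENTRY_SIZE;
		/* fixed BAR1 VA of the slab, as in gk20a_mm_bar1_map() */
		uint64_t slab_gpu_va = userd_gpu_va + (uint64_t)slab * SLAB_SIZE;

		printf("chid %4u -> slab %3u offset 0x%03x gpu_va 0x%llx\n",
			chid, slab, offset,
			(unsigned long long)(slab_gpu_va + offset));
	}
	return 0;
}

With these assumed sizes, eight channels share each slab; a larger
PAGE_SIZE raises that count (the commit message cites 16). Since a slab
is only allocated when one of its channels is first initialized, a
lightly loaded system no longer needs the full physically contiguous
USERD region up front.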
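
Similarly, the reduced-BAR1 fix-up in vgpu_bar1_map() can be
illustrated numerically. In this sketch, also not part of the patch,
guest_window_base is a made-up placement of the guest's window inside
the full BAR1 aperture; the point is only that a guest which maps just
its own window must address it relative to the window start, which is
exactly the offset it passed to the server:

/* Sketch of the reduced-BAR1 address fix-up (not part of the patch).
 * guest_window_base is a hypothetical server-side placement. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t guest_window_base = 0x00400000ULL; /* window start in BAR1 */
	uint32_t slab = 3;
	uint32_t offset = slab * 4096u; /* offset passed to the server */

	/* Answer the server would return, relative to full BAR1. */
	uint64_t gpu_va = guest_window_base + offset;

	/* A reduced-BAR1 guest only maps its own window, so its BAR1
	 * accesses are already window-relative: it uses offset itself. */
	printf("server gpu_va = 0x%llx, guest uses 0x%x\n",
		(unsigned long long)gpu_va, offset);
	return 0;
}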