diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c index 990972e4c..065e8ab10 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c @@ -129,28 +129,25 @@ static int channel_gk20a_commit_userd(struct channel_gk20a *c) { u32 addr_lo; u32 addr_hi; - void *inst_ptr; struct gk20a *g = c->g; gk20a_dbg_fn(""); - inst_ptr = c->inst_block.cpu_va; - if (!inst_ptr) - return -ENOMEM; - addr_lo = u64_lo32(c->userd_iova >> ram_userd_base_shift_v()); addr_hi = u64_hi32(c->userd_iova); gk20a_dbg_info("channel %d : set ramfc userd 0x%16llx", c->hw_chid, (u64)c->userd_iova); - gk20a_mem_wr32(inst_ptr, ram_in_ramfc_w() + ram_fc_userd_w(), + gk20a_mem_wr32(g, &c->inst_block, + ram_in_ramfc_w() + ram_fc_userd_w(), (g->mm.vidmem_is_vidmem ? pbdma_userd_target_sys_mem_ncoh_f() : pbdma_userd_target_vid_mem_f()) | pbdma_userd_addr_f(addr_lo)); - gk20a_mem_wr32(inst_ptr, ram_in_ramfc_w() + ram_fc_userd_hi_w(), + gk20a_mem_wr32(g, &c->inst_block, + ram_in_ramfc_w() + ram_fc_userd_hi_w(), pbdma_userd_hi_addr_f(addr_hi)); return 0; @@ -186,13 +183,8 @@ int gk20a_channel_get_timescale_from_timeslice(struct gk20a *g, static int channel_gk20a_set_schedule_params(struct channel_gk20a *c) { - void *inst_ptr; int shift = 0, value = 0; - inst_ptr = c->inst_block.cpu_va; - if (!inst_ptr) - return -ENOMEM; - gk20a_channel_get_timescale_from_timeslice(c->g, c->timeslice_us, &value, &shift); @@ -203,7 +195,7 @@ static int channel_gk20a_set_schedule_params(struct channel_gk20a *c) WARN_ON(c->g->ops.fifo.preempt_channel(c->g, c->hw_chid)); /* set new timeslice */ - gk20a_mem_wr32(inst_ptr, ram_fc_runlist_timeslice_w(), + gk20a_mem_wr32(c->g, &c->inst_block, ram_fc_runlist_timeslice_w(), value | (shift << 12) | fifo_runlist_timeslice_enable_true_f()); @@ -255,33 +247,30 @@ u32 channel_gk20a_pbdma_acquire_val(struct channel_gk20a *c) int channel_gk20a_setup_ramfc(struct channel_gk20a *c, u64 gpfifo_base, u32 gpfifo_entries, u32 flags) { - void *inst_ptr; + struct gk20a *g = c->g; + struct mem_desc *mem = &c->inst_block; gk20a_dbg_fn(""); - inst_ptr = c->inst_block.cpu_va; - if (!inst_ptr) - return -ENOMEM; + gk20a_memset(g, mem, 0, 0, ram_fc_size_val_v()); - memset(inst_ptr, 0, ram_fc_size_val_v()); - - gk20a_mem_wr32(inst_ptr, ram_fc_gp_base_w(), + gk20a_mem_wr32(g, mem, ram_fc_gp_base_w(), pbdma_gp_base_offset_f( u64_lo32(gpfifo_base >> pbdma_gp_base_rsvd_s()))); - gk20a_mem_wr32(inst_ptr, ram_fc_gp_base_hi_w(), + gk20a_mem_wr32(g, mem, ram_fc_gp_base_hi_w(), pbdma_gp_base_hi_offset_f(u64_hi32(gpfifo_base)) | pbdma_gp_base_hi_limit2_f(ilog2(gpfifo_entries))); - gk20a_mem_wr32(inst_ptr, ram_fc_signature_w(), + gk20a_mem_wr32(g, mem, ram_fc_signature_w(), c->g->ops.fifo.get_pbdma_signature(c->g)); - gk20a_mem_wr32(inst_ptr, ram_fc_formats_w(), + gk20a_mem_wr32(g, mem, ram_fc_formats_w(), pbdma_formats_gp_fermi0_f() | pbdma_formats_pb_fermi1_f() | pbdma_formats_mp_fermi0_f()); - gk20a_mem_wr32(inst_ptr, ram_fc_pb_header_w(), + gk20a_mem_wr32(g, mem, ram_fc_pb_header_w(), pbdma_pb_header_priv_user_f() | pbdma_pb_header_method_zero_f() | pbdma_pb_header_subchannel_zero_f() | @@ -289,47 +278,49 @@ int channel_gk20a_setup_ramfc(struct channel_gk20a *c, pbdma_pb_header_first_true_f() | pbdma_pb_header_type_inc_f()); - gk20a_mem_wr32(inst_ptr, ram_fc_subdevice_w(), + gk20a_mem_wr32(g, mem, ram_fc_subdevice_w(), pbdma_subdevice_id_f(1) | pbdma_subdevice_status_active_f() | pbdma_subdevice_channel_dma_enable_f()); - gk20a_mem_wr32(inst_ptr, 
ram_fc_target_w(), pbdma_target_engine_sw_f()); + gk20a_mem_wr32(g, mem, ram_fc_target_w(), pbdma_target_engine_sw_f()); - gk20a_mem_wr32(inst_ptr, ram_fc_acquire_w(), + gk20a_mem_wr32(g, mem, ram_fc_acquire_w(), channel_gk20a_pbdma_acquire_val(c)); - gk20a_mem_wr32(inst_ptr, ram_fc_runlist_timeslice_w(), + gk20a_mem_wr32(g, mem, ram_fc_runlist_timeslice_w(), fifo_runlist_timeslice_timeout_128_f() | fifo_runlist_timeslice_timescale_3_f() | fifo_runlist_timeslice_enable_true_f()); - gk20a_mem_wr32(inst_ptr, ram_fc_pb_timeslice_w(), + gk20a_mem_wr32(g, mem, ram_fc_pb_timeslice_w(), fifo_pb_timeslice_timeout_16_f() | fifo_pb_timeslice_timescale_0_f() | fifo_pb_timeslice_enable_true_f()); - gk20a_mem_wr32(inst_ptr, ram_fc_chid_w(), ram_fc_chid_id_f(c->hw_chid)); + gk20a_mem_wr32(g, mem, ram_fc_chid_w(), ram_fc_chid_id_f(c->hw_chid)); return channel_gk20a_commit_userd(c); } static int channel_gk20a_setup_userd(struct channel_gk20a *c) { - BUG_ON(!c->userd_cpu_va); + struct gk20a *g = c->g; + struct mem_desc *mem = &g->fifo.userd; + u32 offset = c->hw_chid * g->fifo.userd_entry_size / sizeof(u32); gk20a_dbg_fn(""); - gk20a_mem_wr32(c->userd_cpu_va, ram_userd_put_w(), 0); - gk20a_mem_wr32(c->userd_cpu_va, ram_userd_get_w(), 0); - gk20a_mem_wr32(c->userd_cpu_va, ram_userd_ref_w(), 0); - gk20a_mem_wr32(c->userd_cpu_va, ram_userd_put_hi_w(), 0); - gk20a_mem_wr32(c->userd_cpu_va, ram_userd_ref_threshold_w(), 0); - gk20a_mem_wr32(c->userd_cpu_va, ram_userd_gp_top_level_get_w(), 0); - gk20a_mem_wr32(c->userd_cpu_va, ram_userd_gp_top_level_get_hi_w(), 0); - gk20a_mem_wr32(c->userd_cpu_va, ram_userd_get_hi_w(), 0); - gk20a_mem_wr32(c->userd_cpu_va, ram_userd_gp_get_w(), 0); - gk20a_mem_wr32(c->userd_cpu_va, ram_userd_gp_put_w(), 0); + gk20a_mem_wr32(g, mem, offset + ram_userd_put_w(), 0); + gk20a_mem_wr32(g, mem, offset + ram_userd_get_w(), 0); + gk20a_mem_wr32(g, mem, offset + ram_userd_ref_w(), 0); + gk20a_mem_wr32(g, mem, offset + ram_userd_put_hi_w(), 0); + gk20a_mem_wr32(g, mem, offset + ram_userd_ref_threshold_w(), 0); + gk20a_mem_wr32(g, mem, offset + ram_userd_gp_top_level_get_w(), 0); + gk20a_mem_wr32(g, mem, offset + ram_userd_gp_top_level_get_hi_w(), 0); + gk20a_mem_wr32(g, mem, offset + ram_userd_get_hi_w(), 0); + gk20a_mem_wr32(g, mem, offset + ram_userd_gp_get_w(), 0); + gk20a_mem_wr32(g, mem, offset + ram_userd_gp_put_w(), 0); return 0; } diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h index 8840a3aed..b1355f921 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h @@ -130,7 +130,6 @@ struct channel_gk20a { struct mem_desc inst_block; struct mem_desc_sub ramfc; - void *userd_cpu_va; u64 userd_iova; u64 userd_gpu_va; diff --git a/drivers/gpu/nvgpu/gk20a/debug_gk20a.c b/drivers/gpu/nvgpu/gk20a/debug_gk20a.c index c2285c8af..a3fa2ea53 100644 --- a/drivers/gpu/nvgpu/gk20a/debug_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/debug_gk20a.c @@ -36,7 +36,7 @@ unsigned int gk20a_debug_trace_cmdbuf; struct ch_state { int pid; int refs; - u8 inst_block[0]; + u32 inst_block[0]; }; static const char * const ccsr_chan_status_str[] = { @@ -108,15 +108,15 @@ static void gk20a_debug_show_channel(struct gk20a *g, u32 channel = gk20a_readl(g, ccsr_channel_r(hw_chid)); u32 status = ccsr_channel_status_v(channel); u32 syncpointa, syncpointb; - void *inst_ptr; + u32 *inst_mem; if (!ch_state) return; - inst_ptr = &ch_state->inst_block[0]; + inst_mem = &ch_state->inst_block[0]; - syncpointa = gk20a_mem_rd32(inst_ptr, 
ram_fc_syncpointa_w()); - syncpointb = gk20a_mem_rd32(inst_ptr, ram_fc_syncpointb_w()); + syncpointa = inst_mem[ram_fc_syncpointa_w()]; + syncpointb = inst_mem[ram_fc_syncpointb_w()]; gk20a_debug_output(o, "%d-%s, pid %d, refs: %d: ", hw_chid, dev_name(g->dev), @@ -129,23 +129,22 @@ static void gk20a_debug_show_channel(struct gk20a *g, gk20a_debug_output(o, "TOP: %016llx PUT: %016llx GET: %016llx " "FETCH: %016llx\nHEADER: %08x COUNT: %08x\n" "SYNCPOINT %08x %08x SEMAPHORE %08x %08x %08x %08x\n", - (u64)gk20a_mem_rd32(inst_ptr, ram_fc_pb_top_level_get_w()) + - ((u64)gk20a_mem_rd32(inst_ptr, - ram_fc_pb_top_level_get_hi_w()) << 32ULL), - (u64)gk20a_mem_rd32(inst_ptr, ram_fc_pb_put_w()) + - ((u64)gk20a_mem_rd32(inst_ptr, ram_fc_pb_put_hi_w()) << 32ULL), - (u64)gk20a_mem_rd32(inst_ptr, ram_fc_pb_get_w()) + - ((u64)gk20a_mem_rd32(inst_ptr, ram_fc_pb_get_hi_w()) << 32ULL), - (u64)gk20a_mem_rd32(inst_ptr, ram_fc_pb_fetch_w()) + - ((u64)gk20a_mem_rd32(inst_ptr, ram_fc_pb_fetch_hi_w()) << 32ULL), - gk20a_mem_rd32(inst_ptr, ram_fc_pb_header_w()), - gk20a_mem_rd32(inst_ptr, ram_fc_pb_count_w()), + (u64)inst_mem[ram_fc_pb_top_level_get_w()] + + ((u64)inst_mem[ram_fc_pb_top_level_get_hi_w()] << 32ULL), + (u64)inst_mem[ram_fc_pb_put_w()] + + ((u64)inst_mem[ram_fc_pb_put_hi_w()] << 32ULL), + (u64)inst_mem[ram_fc_pb_get_w()] + + ((u64)inst_mem[ram_fc_pb_get_hi_w()] << 32ULL), + (u64)inst_mem[ram_fc_pb_fetch_w()] + + ((u64)inst_mem[ram_fc_pb_fetch_hi_w()] << 32ULL), + inst_mem[ram_fc_pb_header_w()], + inst_mem[ram_fc_pb_count_w()], syncpointa, syncpointb, - gk20a_mem_rd32(inst_ptr, ram_fc_semaphorea_w()), - gk20a_mem_rd32(inst_ptr, ram_fc_semaphoreb_w()), - gk20a_mem_rd32(inst_ptr, ram_fc_semaphorec_w()), - gk20a_mem_rd32(inst_ptr, ram_fc_semaphored_w())); + inst_mem[ram_fc_semaphorea_w()], + inst_mem[ram_fc_semaphoreb_w()], + inst_mem[ram_fc_semaphorec_w()], + inst_mem[ram_fc_semaphored_w()]); #ifdef CONFIG_TEGRA_GK20A if ((pbdma_syncpointb_op_v(syncpointb) == pbdma_syncpointb_op_wait_v()) @@ -246,17 +245,15 @@ void gk20a_debug_show_dump(struct gk20a *g, struct gk20a_debug_output *o) for (chid = 0; chid < f->num_channels; chid++) { struct channel_gk20a *ch = &f->channel[chid]; - if (ch_state[chid]) { - if (ch->inst_block.cpu_va) { - ch_state[chid]->pid = ch->pid; - ch_state[chid]->refs = - atomic_read(&ch->ref_count); - memcpy(&ch_state[chid]->inst_block[0], - ch->inst_block.cpu_va, - ram_in_alloc_size_v()); - } - gk20a_channel_put(ch); - } + if (!ch_state[chid]) + continue; + + ch_state[chid]->pid = ch->pid; + ch_state[chid]->refs = atomic_read(&ch->ref_count); + gk20a_mem_rd_n(g, &ch->inst_block, 0, + &ch_state[chid]->inst_block[0], + ram_in_alloc_size_v()); + gk20a_channel_put(ch); } for (chid = 0; chid < f->num_channels; chid++) { if (ch_state[chid]) { diff --git a/drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.c b/drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.c index f9cddc416..edddcdc1b 100644 --- a/drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/fecs_trace_gk20a.c @@ -619,7 +619,7 @@ static int gk20a_fecs_trace_bind_channel(struct gk20a *g, phys_addr_t pa; struct channel_ctx_gk20a *ch_ctx = &ch->ch_ctx; struct gk20a_fecs_trace *trace = g->fecs_trace; - void *ctx_ptr; + struct mem_desc *mem = &ch_ctx->gr_ctx->mem; u32 context_ptr = gk20a_fecs_trace_fecs_context_ptr(ch); gk20a_dbg(gpu_dbg_fn|gpu_dbg_ctxsw, @@ -634,10 +634,7 @@ static int gk20a_fecs_trace_bind_channel(struct gk20a *g, if (!pa) return -ENOMEM; - ctx_ptr = vmap(ch_ctx->gr_ctx->mem.pages, - 
PAGE_ALIGN(ch_ctx->gr_ctx->mem.size) >> PAGE_SHIFT, 0, - pgprot_writecombine(PAGE_KERNEL)); - if (!ctx_ptr) + if (gk20a_mem_begin(g, mem)) return -ENOMEM; lo = u64_lo32(pa); @@ -646,18 +643,18 @@ static int gk20a_fecs_trace_bind_channel(struct gk20a *g, gk20a_dbg(gpu_dbg_ctxsw, "addr_hi=%x addr_lo=%x count=%d", hi, lo, GK20A_FECS_TRACE_NUM_RECORDS); - gk20a_mem_wr32(ctx_ptr - + ctxsw_prog_main_image_context_timestamp_buffer_ptr_o(), - 0, lo); - gk20a_mem_wr32(ctx_ptr - + ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_o(), - 0, ctxsw_prog_main_image_context_timestamp_buffer_ptr_v_f(hi)); - gk20a_mem_wr32(ctx_ptr - + ctxsw_prog_main_image_context_timestamp_buffer_control_o(), - 0, ctxsw_prog_main_image_context_timestamp_buffer_control_num_records_f( + gk20a_mem_wr(g, mem, + ctxsw_prog_main_image_context_timestamp_buffer_ptr_o(), + lo); + gk20a_mem_wr(g, mem, + ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_o(), + ctxsw_prog_main_image_context_timestamp_buffer_ptr_v_f(hi)); + gk20a_mem_wr(g, mem, + ctxsw_prog_main_image_context_timestamp_buffer_control_o(), + ctxsw_prog_main_image_context_timestamp_buffer_control_num_records_f( GK20A_FECS_TRACE_NUM_RECORDS)); - vunmap(ctx_ptr); + gk20a_mem_end(g, mem); gk20a_fecs_trace_hash_add(g, context_ptr, ch->pid); return 0; diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c index dc3debf20..714003319 100644 --- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c @@ -520,8 +520,6 @@ static int gk20a_init_fifo_setup_sw(struct gk20a *g) mutex_init(&f->free_chs_mutex); for (chid = 0; chid < f->num_channels; chid++) { - f->channel[chid].userd_cpu_va = - f->userd.cpu_va + chid * f->userd_entry_size; f->channel[chid].userd_iova = g->ops.mm.get_iova_addr(g, f->userd.sgt->sgl, 0) + chid * f->userd_entry_size; diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h index f228cce49..2f85bf96c 100644 --- a/drivers/gpu/nvgpu/gk20a/gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/gk20a.h @@ -201,7 +201,7 @@ struct gpu_ops { struct gr_ctx_desc *gr_ctx); void (*update_ctxsw_preemption_mode)(struct gk20a *g, struct channel_ctx_gk20a *ch_ctx, - void *ctx_ptr); + struct mem_desc *mem); int (*update_smpc_ctxsw_mode)(struct gk20a *g, struct channel_gk20a *c, bool enable); @@ -221,7 +221,8 @@ struct gpu_ops { int (*wait_empty)(struct gk20a *g, unsigned long end_jiffies, u32 expect_delay); void (*init_cyclestats)(struct gk20a *g); - void (*enable_cde_in_fecs)(void *ctx_ptr); + void (*enable_cde_in_fecs)(struct gk20a *g, + struct mem_desc *mem); int (*set_sm_debug_mode)(struct gk20a *g, struct channel_gk20a *ch, u64 sms, bool enable); void (*bpt_reg_info)(struct gk20a *g, @@ -484,7 +485,7 @@ struct gpu_ops { void (*cbc_clean)(struct gk20a *g); void (*tlb_invalidate)(struct vm_gk20a *vm); void (*set_big_page_size)(struct gk20a *g, - void *inst_ptr, int size); + struct mem_desc *mem, int size); u32 (*get_big_page_sizes)(void); u32 (*get_physical_addr_bits)(struct gk20a *g); int (*init_mm_setup_hw)(struct gk20a *g); @@ -493,7 +494,8 @@ struct gpu_ops { void (*remove_bar2_vm)(struct gk20a *g); const struct gk20a_mmu_level * (*get_mmu_levels)(struct gk20a *g, u32 big_page_size); - void (*init_pdb)(struct gk20a *g, void *inst_ptr, u64 pdb_addr); + void (*init_pdb)(struct gk20a *g, struct mem_desc *mem, + u64 pdb_addr); u64 (*get_iova_addr)(struct gk20a *g, struct scatterlist *sgl, u32 flags); int (*bar1_bind)(struct gk20a *g, u64 bar1_iova); @@ -859,53 +861,6 @@ do { \ #define 
gk20a_dbg_info(fmt, arg...) \ gk20a_dbg(gpu_dbg_info, fmt, ##arg) -/* mem access with dbg_mem logging */ -static inline u8 gk20a_mem_rd08(void *ptr, int b) -{ - u8 _b = ((const u8 *)ptr)[b]; -#ifdef CONFIG_TEGRA_SIMULATION_PLATFORM - gk20a_dbg(gpu_dbg_mem, " %p = 0x%x", ptr+sizeof(u8)*b, _b); -#endif - return _b; -} -static inline u16 gk20a_mem_rd16(void *ptr, int s) -{ - u16 _s = ((const u16 *)ptr)[s]; -#ifdef CONFIG_TEGRA_SIMULATION_PLATFORM - gk20a_dbg(gpu_dbg_mem, " %p = 0x%x", ptr+sizeof(u16)*s, _s); -#endif - return _s; -} -static inline u32 gk20a_mem_rd32(void *ptr, int w) -{ - u32 _w = ((const u32 *)ptr)[w]; -#ifdef CONFIG_TEGRA_SIMULATION_PLATFORM - gk20a_dbg(gpu_dbg_mem, " %p = 0x%x", ptr + sizeof(u32)*w, _w); -#endif - return _w; -} -static inline void gk20a_mem_wr08(void *ptr, int b, u8 data) -{ -#ifdef CONFIG_TEGRA_SIMULATION_PLATFORM - gk20a_dbg(gpu_dbg_mem, " %p = 0x%x", ptr+sizeof(u8)*b, data); -#endif - ((u8 *)ptr)[b] = data; -} -static inline void gk20a_mem_wr16(void *ptr, int s, u16 data) -{ -#ifdef CONFIG_TEGRA_SIMULATION_PLATFORM - gk20a_dbg(gpu_dbg_mem, " %p = 0x%x", ptr+sizeof(u16)*s, data); -#endif - ((u16 *)ptr)[s] = data; -} -static inline void gk20a_mem_wr32(void *ptr, int w, u32 data) -{ -#ifdef CONFIG_TEGRA_SIMULATION_PLATFORM - gk20a_dbg(gpu_dbg_mem, " %p = 0x%x", ptr+sizeof(u32)*w, data); -#endif - ((u32 *)ptr)[w] = data; -} - void gk20a_init_clk_ops(struct gpu_ops *gops); /* register accessors */ diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c index 4e7c36ee0..e7e6662a8 100644 --- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c @@ -97,22 +97,18 @@ int gr_gk20a_get_ctx_id(struct gk20a *g, u32 *ctx_id) { struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx; - void *ctx_ptr = NULL; /* Channel gr_ctx buffer is gpu cacheable. Flush and invalidate before cpu update. 
*/ g->ops.mm.l2_flush(g, true); - ctx_ptr = vmap(ch_ctx->gr_ctx->mem.pages, - PAGE_ALIGN(ch_ctx->gr_ctx->mem.size) >> PAGE_SHIFT, - 0, pgprot_writecombine(PAGE_KERNEL)); - if (!ctx_ptr) + if (gk20a_mem_begin(g, &ch_ctx->gr_ctx->mem)) return -ENOMEM; - *ctx_id = gk20a_mem_rd32(ctx_ptr + - ctxsw_prog_main_image_context_id_o(), 0); + *ctx_id = gk20a_mem_rd(g, &ch_ctx->gr_ctx->mem, + ctxsw_prog_main_image_context_id_o()); - vunmap(ctx_ptr); + gk20a_mem_end(g, &ch_ctx->gr_ctx->mem); return 0; } @@ -619,22 +615,17 @@ static int gr_gk20a_commit_inst(struct channel_gk20a *c, u64 gpu_va) { u32 addr_lo; u32 addr_hi; - void *inst_ptr = NULL; gk20a_dbg_fn(""); - inst_ptr = c->inst_block.cpu_va; - if (!inst_ptr) - return -ENOMEM; - addr_lo = u64_lo32(gpu_va) >> 12; addr_hi = u64_hi32(gpu_va); - gk20a_mem_wr32(inst_ptr, ram_in_gr_wfi_target_w(), + gk20a_mem_wr32(c->g, &c->inst_block, ram_in_gr_wfi_target_w(), ram_in_gr_cs_wfi_f() | ram_in_gr_wfi_mode_virtual_f() | ram_in_gr_wfi_ptr_lo_f(addr_lo)); - gk20a_mem_wr32(inst_ptr, ram_in_gr_wfi_ptr_hi_w(), + gk20a_mem_wr32(c->g, &c->inst_block, ram_in_gr_wfi_ptr_hi_w(), ram_in_gr_wfi_ptr_hi_f(addr_hi)); return 0; @@ -658,11 +649,7 @@ int gr_gk20a_ctx_patch_write_begin(struct gk20a *g, return -EBUSY; } - ch_ctx->patch_ctx.mem.cpu_va = vmap(ch_ctx->patch_ctx.mem.pages, - PAGE_ALIGN(ch_ctx->patch_ctx.mem.size) >> PAGE_SHIFT, - 0, pgprot_writecombine(PAGE_KERNEL)); - - if (!ch_ctx->patch_ctx.mem.cpu_va) + if (gk20a_mem_begin(g, &ch_ctx->patch_ctx.mem)) return -ENOMEM; return 0; @@ -677,8 +664,7 @@ int gr_gk20a_ctx_patch_write_end(struct gk20a *g, return -EINVAL; } - vunmap(ch_ctx->patch_ctx.mem.cpu_va); - ch_ctx->patch_ctx.mem.cpu_va = NULL; + gk20a_mem_end(g, &ch_ctx->patch_ctx.mem); return 0; } @@ -687,7 +673,6 @@ int gr_gk20a_ctx_patch_write(struct gk20a *g, u32 addr, u32 data, bool patch) { u32 patch_slot = 0; - void *patch_ptr = NULL; bool mapped_here = false; BUG_ON(patch != 0 && ch_ctx == NULL); @@ -708,11 +693,10 @@ int gr_gk20a_ctx_patch_write(struct gk20a *g, } else mapped_here = false; - patch_ptr = ch_ctx->patch_ctx.mem.cpu_va; patch_slot = ch_ctx->patch_ctx.data_count * 2; - gk20a_mem_wr32(patch_ptr, patch_slot++, addr); - gk20a_mem_wr32(patch_ptr, patch_slot++, data); + gk20a_mem_wr32(g, &ch_ctx->patch_ctx.mem, patch_slot++, addr); + gk20a_mem_wr32(g, &ch_ctx->patch_ctx.mem, patch_slot++, data); ch_ctx->patch_ctx.data_count++; @@ -760,16 +744,13 @@ static int gr_gk20a_fecs_ctx_bind_channel(struct gk20a *g, static int gr_gk20a_ctx_zcull_setup(struct gk20a *g, struct channel_gk20a *c) { struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx; + struct mem_desc *mem = &ch_ctx->gr_ctx->mem; u32 va_lo, va_hi, va; int ret = 0; - void *ctx_ptr = NULL; gk20a_dbg_fn(""); - ctx_ptr = vmap(ch_ctx->gr_ctx->mem.pages, - PAGE_ALIGN(ch_ctx->gr_ctx->mem.size) >> PAGE_SHIFT, - 0, pgprot_writecombine(PAGE_KERNEL)); - if (!ctx_ptr) + if (gk20a_mem_begin(g, mem)) return -ENOMEM; if (ch_ctx->zcull_ctx.gpu_va == 0 && @@ -792,15 +773,17 @@ static int gr_gk20a_ctx_zcull_setup(struct gk20a *g, struct channel_gk20a *c) goto clean_up; } - gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_zcull_o(), 0, + gk20a_mem_wr(g, mem, + ctxsw_prog_main_image_zcull_o(), ch_ctx->zcull_ctx.ctx_sw_mode); - gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_zcull_ptr_o(), 0, va); + gk20a_mem_wr(g, mem, + ctxsw_prog_main_image_zcull_ptr_o(), va); c->g->ops.fifo.enable_channel(c); clean_up: - vunmap(ctx_ptr); + gk20a_mem_end(g, mem); return ret; } @@ -1500,8 +1483,8 @@ static int 
gr_gk20a_init_golden_ctx_image(struct gk20a *g, u32 ctx_header_words; u32 i; u32 data; - void *ctx_ptr = NULL; - void *gold_ptr = NULL; + struct mem_desc *gold_mem = &gr->global_ctx_buffer[GOLDEN_CTX].mem; + struct mem_desc *gr_mem = &ch_ctx->gr_ctx->mem; u32 err = 0; gk20a_dbg_fn(""); @@ -1527,16 +1510,10 @@ static int gr_gk20a_init_golden_ctx_image(struct gk20a *g, if (err) goto clean_up; - gold_ptr = vmap(gr->global_ctx_buffer[GOLDEN_CTX].mem.pages, - PAGE_ALIGN(gr->global_ctx_buffer[GOLDEN_CTX].mem.size) >> - PAGE_SHIFT, 0, pgprot_writecombine(PAGE_KERNEL)); - if (!gold_ptr) + if (gk20a_mem_begin(g, gold_mem)) goto clean_up; - ctx_ptr = vmap(ch_ctx->gr_ctx->mem.pages, - PAGE_ALIGN(ch_ctx->gr_ctx->mem.size) >> PAGE_SHIFT, - 0, pgprot_writecombine(PAGE_KERNEL)); - if (!ctx_ptr) + if (gk20a_mem_begin(g, gr_mem)) goto clean_up; ctx_header_words = roundup(ctx_header_bytes, sizeof(u32)); @@ -1545,14 +1522,14 @@ static int gr_gk20a_init_golden_ctx_image(struct gk20a *g, g->ops.mm.l2_flush(g, true); for (i = 0; i < ctx_header_words; i++) { - data = gk20a_mem_rd32(ctx_ptr, i); - gk20a_mem_wr32(gold_ptr, i, data); + data = gk20a_mem_rd32(g, gr_mem, i); + gk20a_mem_wr32(g, gold_mem, i, data); } - gk20a_mem_wr32(gold_ptr + ctxsw_prog_main_image_zcull_o(), 0, + gk20a_mem_wr(g, gold_mem, ctxsw_prog_main_image_zcull_o(), ctxsw_prog_main_image_zcull_mode_no_ctxsw_v()); - gk20a_mem_wr32(gold_ptr + ctxsw_prog_main_image_zcull_ptr_o(), 0, 0); + gk20a_mem_wr(g, gold_mem, ctxsw_prog_main_image_zcull_ptr_o(), 0); gr_gk20a_commit_inst(c, ch_ctx->global_ctx_buffer_va[GOLDEN_CTX_VA]); @@ -1568,12 +1545,12 @@ static int gr_gk20a_init_golden_ctx_image(struct gk20a *g, goto clean_up; } - for (i = 0; i < gr->ctx_vars.golden_image_size / 4; i++) - gr->ctx_vars.local_golden_image[i] = - gk20a_mem_rd32(gold_ptr, i); + gk20a_mem_rd_n(g, gold_mem, 0, + gr->ctx_vars.local_golden_image, + gr->ctx_vars.golden_image_size); } - gr_gk20a_commit_inst(c, ch_ctx->gr_ctx->mem.gpu_va); + gr_gk20a_commit_inst(c, gr_mem->gpu_va); gr->ctx_vars.golden_image_initialized = true; @@ -1586,10 +1563,8 @@ clean_up: else gk20a_dbg_fn("done"); - if (gold_ptr) - vunmap(gold_ptr); - if (ctx_ptr) - vunmap(ctx_ptr); + gk20a_mem_end(g, gold_mem); + gk20a_mem_end(g, gr_mem); mutex_unlock(&gr->ctx_mutex); return err; @@ -1600,7 +1575,7 @@ int gr_gk20a_update_smpc_ctxsw_mode(struct gk20a *g, bool enable_smpc_ctxsw) { struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx; - void *ctx_ptr = NULL; + struct mem_desc *mem; u32 data; int ret; @@ -1611,46 +1586,39 @@ int gr_gk20a_update_smpc_ctxsw_mode(struct gk20a *g, return -EFAULT; } + mem = &ch_ctx->gr_ctx->mem; + c->g->ops.fifo.disable_channel(c); ret = c->g->ops.fifo.preempt_channel(c->g, c->hw_chid); if (ret) { - c->g->ops.fifo.enable_channel(c); - gk20a_err(dev_from_gk20a(g), - "failed to preempt channel\n"); - return ret; + gk20a_err(dev_from_gk20a(g), "failed to preempt channel"); + goto out; } /* Channel gr_ctx buffer is gpu cacheable. Flush and invalidate before cpu update. 
*/ g->ops.mm.l2_flush(g, true); - if (!ch_ctx->gr_ctx) { - gk20a_err(dev_from_gk20a(g), "no graphics context allocated"); - return -EFAULT; + if (gk20a_mem_begin(g, mem)) { + ret = -ENOMEM; + goto out; } - ctx_ptr = vmap(ch_ctx->gr_ctx->mem.pages, - PAGE_ALIGN(ch_ctx->gr_ctx->mem.size) >> PAGE_SHIFT, - 0, pgprot_writecombine(PAGE_KERNEL)); - if (!ctx_ptr) { - c->g->ops.fifo.enable_channel(c); - return -ENOMEM; - } - - data = gk20a_mem_rd32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0); + data = gk20a_mem_rd(g, mem, + ctxsw_prog_main_image_pm_o()); data = data & ~ctxsw_prog_main_image_pm_smpc_mode_m(); data |= enable_smpc_ctxsw ? ctxsw_prog_main_image_pm_smpc_mode_ctxsw_f() : ctxsw_prog_main_image_pm_smpc_mode_no_ctxsw_f(); - gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0, - data); + gk20a_mem_wr(g, mem, + ctxsw_prog_main_image_pm_o(), + data); - vunmap(ctx_ptr); + gk20a_mem_end(g, mem); - /* enable channel */ +out: c->g->ops.fifo.enable_channel(c); - - return 0; + return ret; } int gr_gk20a_update_hwpm_ctxsw_mode(struct gk20a *g, @@ -1659,8 +1627,7 @@ int gr_gk20a_update_hwpm_ctxsw_mode(struct gk20a *g, { struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx; struct pm_ctx_desc *pm_ctx = &ch_ctx->pm_ctx; - void *ctx_ptr = NULL; - void *pm_ctx_ptr; + struct mem_desc *gr_mem; u32 data, virt_addr; int ret; @@ -1671,6 +1638,8 @@ int gr_gk20a_update_hwpm_ctxsw_mode(struct gk20a *g, return -EFAULT; } + gr_mem = &ch_ctx->gr_ctx->mem; + if (enable_hwpm_ctxsw) { if (pm_ctx->pm_mode == ctxsw_prog_main_image_pm_mode_ctxsw_f()) return 0; @@ -1721,29 +1690,22 @@ int gr_gk20a_update_hwpm_ctxsw_mode(struct gk20a *g, } /* Now clear the buffer */ - pm_ctx_ptr = vmap(pm_ctx->mem.pages, - PAGE_ALIGN(pm_ctx->mem.size) >> PAGE_SHIFT, - 0, pgprot_writecombine(PAGE_KERNEL)); - - if (!pm_ctx_ptr) { + if (gk20a_mem_begin(g, &pm_ctx->mem)) { ret = -ENOMEM; goto cleanup_pm_buf; } - memset(pm_ctx_ptr, 0, pm_ctx->mem.size); + gk20a_memset(g, &pm_ctx->mem, 0, 0, pm_ctx->mem.size); - vunmap(pm_ctx_ptr); + gk20a_mem_end(g, &pm_ctx->mem); } - ctx_ptr = vmap(ch_ctx->gr_ctx->mem.pages, - PAGE_ALIGN(ch_ctx->gr_ctx->mem.size) >> PAGE_SHIFT, - 0, pgprot_writecombine(PAGE_KERNEL)); - if (!ctx_ptr) { + if (gk20a_mem_begin(g, gr_mem)) { ret = -ENOMEM; goto cleanup_pm_buf; } - data = gk20a_mem_rd32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0); + data = gk20a_mem_rd(g, gr_mem, ctxsw_prog_main_image_pm_o()); data = data & ~ctxsw_prog_main_image_pm_mode_m(); if (enable_hwpm_ctxsw) { @@ -1760,10 +1722,10 @@ int gr_gk20a_update_hwpm_ctxsw_mode(struct gk20a *g, data |= pm_ctx->pm_mode; - gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0, data); - gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_ptr_o(), 0, virt_addr); + gk20a_mem_wr(g, gr_mem, ctxsw_prog_main_image_pm_o(), data); + gk20a_mem_wr(g, gr_mem, ctxsw_prog_main_image_pm_ptr_o(), virt_addr); - vunmap(ctx_ptr); + gk20a_mem_end(g, gr_mem); /* enable channel */ c->g->ops.fifo.enable_channel(c); @@ -1788,9 +1750,9 @@ int gr_gk20a_load_golden_ctx_image(struct gk20a *g, u32 virt_addr_lo; u32 virt_addr_hi; u32 virt_addr = 0; - u32 i, v, data; + u32 v, data; int ret = 0; - void *ctx_ptr = NULL; + struct mem_desc *mem = &ch_ctx->gr_ctx->mem; gk20a_dbg_fn(""); @@ -1801,20 +1763,18 @@ int gr_gk20a_load_golden_ctx_image(struct gk20a *g, Flush and invalidate before cpu update. 
*/ g->ops.mm.l2_flush(g, true); - ctx_ptr = vmap(ch_ctx->gr_ctx->mem.pages, - PAGE_ALIGN(ch_ctx->gr_ctx->mem.size) >> PAGE_SHIFT, - 0, pgprot_writecombine(PAGE_KERNEL)); - if (!ctx_ptr) + if (gk20a_mem_begin(g, mem)) return -ENOMEM; - for (i = 0; i < gr->ctx_vars.golden_image_size / 4; i++) - gk20a_mem_wr32(ctx_ptr, i, gr->ctx_vars.local_golden_image[i]); + gk20a_mem_wr_n(g, mem, 0, + gr->ctx_vars.local_golden_image, + gr->ctx_vars.golden_image_size); if (g->ops.gr.enable_cde_in_fecs && c->cde) - g->ops.gr.enable_cde_in_fecs(ctx_ptr); + g->ops.gr.enable_cde_in_fecs(g, mem); - gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_num_save_ops_o(), 0, 0); - gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_num_restore_ops_o(), 0, 0); + gk20a_mem_wr(g, mem, ctxsw_prog_main_image_num_save_ops_o(), 0); + gk20a_mem_wr(g, mem, ctxsw_prog_main_image_num_restore_ops_o(), 0); /* set priv access map */ virt_addr_lo = @@ -1827,29 +1787,29 @@ int gr_gk20a_load_golden_ctx_image(struct gk20a *g, else data = ctxsw_prog_main_image_priv_access_map_config_mode_use_map_f(); - gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_priv_access_map_config_o(), 0, + gk20a_mem_wr(g, mem, ctxsw_prog_main_image_priv_access_map_config_o(), data); - gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_priv_access_map_addr_lo_o(), 0, + gk20a_mem_wr(g, mem, ctxsw_prog_main_image_priv_access_map_addr_lo_o(), virt_addr_lo); - gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_priv_access_map_addr_hi_o(), 0, + gk20a_mem_wr(g, mem, ctxsw_prog_main_image_priv_access_map_addr_hi_o(), virt_addr_hi); /* disable verif features */ - v = gk20a_mem_rd32(ctx_ptr + ctxsw_prog_main_image_misc_options_o(), 0); + v = gk20a_mem_rd(g, mem, ctxsw_prog_main_image_misc_options_o()); v = v & ~(ctxsw_prog_main_image_misc_options_verif_features_m()); v = v | ctxsw_prog_main_image_misc_options_verif_features_disabled_f(); - gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_misc_options_o(), 0, v); + gk20a_mem_wr(g, mem, ctxsw_prog_main_image_misc_options_o(), v); if (g->ops.gr.update_ctxsw_preemption_mode) - g->ops.gr.update_ctxsw_preemption_mode(g, ch_ctx, ctx_ptr); + g->ops.gr.update_ctxsw_preemption_mode(g, ch_ctx, mem); virt_addr_lo = u64_lo32(ch_ctx->patch_ctx.mem.gpu_va); virt_addr_hi = u64_hi32(ch_ctx->patch_ctx.mem.gpu_va); - gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_patch_count_o(), 0, + gk20a_mem_wr(g, mem, ctxsw_prog_main_image_patch_count_o(), ch_ctx->patch_ctx.data_count); - gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_patch_adr_lo_o(), 0, + gk20a_mem_wr(g, mem, ctxsw_prog_main_image_patch_adr_lo_o(), virt_addr_lo); - gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_patch_adr_hi_o(), 0, + gk20a_mem_wr(g, mem, ctxsw_prog_main_image_patch_adr_hi_o(), virt_addr_hi); /* Update main header region of the context buffer with the info needed @@ -1860,7 +1820,7 @@ int gr_gk20a_load_golden_ctx_image(struct gk20a *g, if (ch_ctx->pm_ctx.mem.gpu_va == 0) { gk20a_err(dev_from_gk20a(g), "context switched pm with no pm buffer!"); - vunmap(ctx_ptr); + gk20a_mem_end(g, mem); return -EFAULT; } @@ -1871,14 +1831,14 @@ int gr_gk20a_load_golden_ctx_image(struct gk20a *g, } else virt_addr = 0; - data = gk20a_mem_rd32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0); + data = gk20a_mem_rd(g, mem, ctxsw_prog_main_image_pm_o()); data = data & ~ctxsw_prog_main_image_pm_mode_m(); data |= ch_ctx->pm_ctx.pm_mode; - gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0, data); - gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_ptr_o(), 0, virt_addr); + gk20a_mem_wr(g, mem, 
ctxsw_prog_main_image_pm_o(), data); + gk20a_mem_wr(g, mem, ctxsw_prog_main_image_pm_ptr_o(), virt_addr); - vunmap(ctx_ptr); + gk20a_mem_end(g, mem); if (tegra_platform_is_linsim()) { u32 inst_base_ptr = @@ -1978,16 +1938,20 @@ static void gr_gk20a_init_ctxsw_ucode_segments( } static int gr_gk20a_copy_ctxsw_ucode_segments( - u8 *buf, + struct gk20a *g, + struct mem_desc *dst, struct gk20a_ctxsw_ucode_segments *segments, u32 *bootimage, u32 *code, u32 *data) { int i; - memcpy(buf + segments->boot.offset, bootimage, segments->boot.size); - memcpy(buf + segments->code.offset, code, segments->code.size); - memcpy(buf + segments->data.offset, data, segments->data.size); + gk20a_mem_wr_n(g, dst, segments->boot.offset, bootimage, + segments->boot.size); + gk20a_mem_wr_n(g, dst, segments->code.offset, code, + segments->code.size); + gk20a_mem_wr_n(g, dst, segments->data.offset, data, + segments->data.size); /* compute a "checksum" for the boot binary to detect its version */ segments->boot_signature = 0; @@ -2009,7 +1973,6 @@ int gr_gk20a_init_ctxsw_ucode(struct gk20a *g) u32 *fecs_boot_image; u32 *gpccs_boot_image; struct gk20a_ctxsw_ucode_info *ucode_info = &g->ctxsw_ucode_info; - u8 *buf; u32 ucode_size; int err = 0; @@ -2049,14 +2012,8 @@ int gr_gk20a_init_ctxsw_ucode(struct gk20a *g) if (err) goto clean_up; - buf = (u8 *)ucode_info->surface_desc.cpu_va; - if (!buf) { - gk20a_err(d, "failed to map surface desc buffer"); - err = -ENOMEM; - goto clean_up; - } - - gr_gk20a_copy_ctxsw_ucode_segments(buf, &ucode_info->fecs, + gr_gk20a_copy_ctxsw_ucode_segments(g, &ucode_info->surface_desc, + &ucode_info->fecs, fecs_boot_image, g->gr.ctx_vars.ucode.fecs.inst.l, g->gr.ctx_vars.ucode.fecs.data.l); @@ -2064,7 +2021,8 @@ int gr_gk20a_init_ctxsw_ucode(struct gk20a *g) release_firmware(fecs_fw); fecs_fw = NULL; - gr_gk20a_copy_ctxsw_ucode_segments(buf, &ucode_info->gpccs, + gr_gk20a_copy_ctxsw_ucode_segments(g, &ucode_info->surface_desc, + &ucode_info->gpccs, gpccs_boot_image, g->gr.ctx_vars.ucode.gpccs.inst.l, g->gr.ctx_vars.ucode.gpccs.data.l); @@ -4690,41 +4648,38 @@ out: static int gr_gk20a_init_access_map(struct gk20a *g) { struct gr_gk20a *gr = &g->gr; - void *data; - int err = 0; + struct mem_desc *mem = &gr->global_ctx_buffer[PRIV_ACCESS_MAP].mem; u32 w, nr_pages = DIV_ROUND_UP(gr->ctx_vars.priv_access_map_size, PAGE_SIZE); u32 *whitelist = NULL; int num_entries = 0; - data = vmap(gr->global_ctx_buffer[PRIV_ACCESS_MAP].mem.pages, - PAGE_ALIGN(gr->global_ctx_buffer[PRIV_ACCESS_MAP].mem.size) >> - PAGE_SHIFT, 0, pgprot_writecombine(PAGE_KERNEL)); - if (!data) { + if (gk20a_mem_begin(g, mem)) { gk20a_err(dev_from_gk20a(g), "failed to map priv access map memory"); - err = -ENOMEM; - goto clean_up; + return -ENOMEM; } - memset(data, 0x0, PAGE_SIZE * nr_pages); + gk20a_memset(g, mem, 0, 0, PAGE_SIZE * nr_pages); g->ops.gr.get_access_map(g, &whitelist, &num_entries); for (w = 0; w < num_entries; w++) { - u32 map_bit, map_byte, map_shift; + u32 map_bit, map_byte, map_shift, x; map_bit = whitelist[w] >> 2; map_byte = map_bit >> 3; map_shift = map_bit & 0x7; /* i.e. 
0-7 */ gk20a_dbg_info("access map addr:0x%x byte:0x%x bit:%d", whitelist[w], map_byte, map_shift); - ((u8 *)data)[map_byte] |= 1 << map_shift; + x = gk20a_mem_rd32(g, mem, map_byte / sizeof(u32)); + x |= 1 << ( + (map_byte % sizeof(u32) * BITS_PER_BYTE) + + map_shift); + gk20a_mem_wr32(g, mem, map_byte / sizeof(u32), x); } -clean_up: - if (data) - vunmap(data); + gk20a_mem_end(g, mem); return 0; } @@ -6659,7 +6614,7 @@ static void gr_gk20a_init_sm_dsm_reg_info(void) static int gr_gk20a_ctx_patch_smpc(struct gk20a *g, struct channel_ctx_gk20a *ch_ctx, u32 addr, u32 data, - u8 *context) + struct mem_desc *mem) { u32 num_gpc = g->gr.gpc_count; u32 num_tpc; @@ -6688,8 +6643,8 @@ static int gr_gk20a_ctx_patch_smpc(struct gk20a *g, /* reset the patch count from previous runs,if ucode has already processed it */ - tmp = gk20a_mem_rd32(context + - ctxsw_prog_main_image_patch_count_o(), 0); + tmp = gk20a_mem_rd(g, mem, + ctxsw_prog_main_image_patch_count_o()); if (!tmp) ch_ctx->patch_ctx.data_count = 0; @@ -6700,15 +6655,15 @@ static int gr_gk20a_ctx_patch_smpc(struct gk20a *g, vaddr_lo = u64_lo32(ch_ctx->patch_ctx.mem.gpu_va); vaddr_hi = u64_hi32(ch_ctx->patch_ctx.mem.gpu_va); - gk20a_mem_wr32(context + + gk20a_mem_wr(g, mem, ctxsw_prog_main_image_patch_count_o(), - 0, ch_ctx->patch_ctx.data_count); - gk20a_mem_wr32(context + + ch_ctx->patch_ctx.data_count); + gk20a_mem_wr(g, mem, ctxsw_prog_main_image_patch_adr_lo_o(), - 0, vaddr_lo); - gk20a_mem_wr32(context + + vaddr_lo); + gk20a_mem_wr(g, mem, ctxsw_prog_main_image_patch_adr_hi_o(), - 0, vaddr_hi); + vaddr_hi); /* we're not caching these on cpu side, but later watch for it */ @@ -6760,17 +6715,15 @@ static void gr_gk20a_access_smpc_reg(struct gk20a *g, u32 quad, u32 offset) #define ILLEGAL_ID (~0) -static inline bool check_main_image_header_magic(void *context) +static inline bool check_main_image_header_magic(u8 *context) { - u32 magic = gk20a_mem_rd32(context + - ctxsw_prog_main_image_magic_value_o(), 0); + u32 magic = *(u32 *)(context + ctxsw_prog_main_image_magic_value_o()); gk20a_dbg(gpu_dbg_gpu_dbg, "main image magic=0x%x", magic); return magic == ctxsw_prog_main_image_magic_value_v_value_v(); } -static inline bool check_local_header_magic(void *context) +static inline bool check_local_header_magic(u8 *context) { - u32 magic = gk20a_mem_rd32(context + - ctxsw_prog_local_magic_value_o(), 0); + u32 magic = *(u32 *)(context + ctxsw_prog_local_magic_value_o()); gk20a_dbg(gpu_dbg_gpu_dbg, "local magic=0x%x", magic); return magic == ctxsw_prog_local_magic_value_v_value_v(); @@ -6814,7 +6767,7 @@ static int gr_gk20a_find_priv_offset_in_ext_buffer(struct gk20a *g, u32 num_gpcs, num_tpcs; u32 chk_addr; u32 ext_priv_offset, ext_priv_size; - void *context; + u8 *context; u32 offset_to_segment, offset_to_segment_end; u32 sm_dsm_perf_reg_id = ILLEGAL_ID; u32 sm_dsm_perf_ctrl_reg_id = ILLEGAL_ID; @@ -6856,14 +6809,14 @@ static int gr_gk20a_find_priv_offset_in_ext_buffer(struct gk20a *g, /* note below is in words/num_registers */ marker_size = ctxsw_prog_extended_marker_size_in_bytes_v() >> 2; - context = context_buffer; + context = (u8 *)context_buffer; /* sanity check main header */ if (!check_main_image_header_magic(context)) { gk20a_err(dev_from_gk20a(g), "Invalid main header: magic value"); return -EINVAL; } - num_gpcs = gk20a_mem_rd32(context + ctxsw_prog_main_image_num_gpcs_o(), 0); + num_gpcs = *(u32 *)(context + ctxsw_prog_main_image_num_gpcs_o()); if (gpc_num >= num_gpcs) { gk20a_err(dev_from_gk20a(g), "GPC 0x%08x is greater than total count 
0x%08x!\n", @@ -6871,7 +6824,7 @@ static int gr_gk20a_find_priv_offset_in_ext_buffer(struct gk20a *g, return -EINVAL; } - data32 = gk20a_mem_rd32(context + ctxsw_prog_main_extended_buffer_ctl_o(), 0); + data32 = *(u32 *)(context + ctxsw_prog_main_extended_buffer_ctl_o()); ext_priv_size = ctxsw_prog_main_extended_buffer_ctl_size_v(data32); if (0 == ext_priv_size) { gk20a_dbg_info(" No extended memory in context buffer"); @@ -7149,7 +7102,7 @@ gr_gk20a_process_context_buffer_priv_segment(struct gk20a *g, } static int gr_gk20a_determine_ppc_configuration(struct gk20a *g, - void *context, + u8 *context, u32 *num_ppcs, u32 *ppc_mask, u32 *reg_ppc_count) { @@ -7165,7 +7118,7 @@ static int gr_gk20a_determine_ppc_configuration(struct gk20a *g, (num_pes_per_gpc > 1))) return -EINVAL; - data32 = gk20a_mem_rd32(context + ctxsw_prog_local_image_ppc_info_o(), 0); + data32 = *(u32 *)(context + ctxsw_prog_local_image_ppc_info_o()); *num_ppcs = ctxsw_prog_local_image_ppc_info_num_ppcs_v(data32); *ppc_mask = ctxsw_prog_local_image_ppc_info_ppc_mask_v(data32); @@ -7177,7 +7130,7 @@ static int gr_gk20a_determine_ppc_configuration(struct gk20a *g, /* * This function will return the 32 bit offset for a priv register if it is - * present in the context buffer. + * present in the context buffer. The context buffer is in CPU memory. */ static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g, u32 addr, @@ -7196,7 +7149,7 @@ static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g, u32 offset; u32 sys_priv_offset, gpc_priv_offset; u32 ppc_mask, reg_list_ppc_count; - void *context; + u8 *context; u32 offset_to_segment; gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr); @@ -7207,13 +7160,13 @@ static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g, if (err) return err; - context = context_buffer; + context = (u8 *)context_buffer; if (!check_main_image_header_magic(context)) { gk20a_err(dev_from_gk20a(g), "Invalid main header: magic value"); return -EINVAL; } - num_gpcs = gk20a_mem_rd32(context + ctxsw_prog_main_image_num_gpcs_o(), 0); + num_gpcs = *(u32 *)(context + ctxsw_prog_main_image_num_gpcs_o()); /* Parse the FECS local header. */ context += ctxsw_prog_ucode_header_size_in_bytes(); @@ -7222,7 +7175,7 @@ static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g, "Invalid FECS local header: magic value\n"); return -EINVAL; } - data32 = gk20a_mem_rd32(context + ctxsw_prog_local_priv_register_ctl_o(), 0); + data32 = *(u32 *)(context + ctxsw_prog_local_priv_register_ctl_o()); sys_priv_offset = ctxsw_prog_local_priv_register_ctl_offset_v(data32); /* If found in Ext buffer, ok. 
@@ -7268,7 +7221,7 @@ static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g, return -EINVAL; } - data32 = gk20a_mem_rd32(context + ctxsw_prog_local_priv_register_ctl_o(), 0); + data32 = *(u32 *)(context + ctxsw_prog_local_priv_register_ctl_o()); gpc_priv_offset = ctxsw_prog_local_priv_register_ctl_offset_v(data32); err = gr_gk20a_determine_ppc_configuration(g, context, @@ -7277,7 +7230,7 @@ static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g, if (err) return err; - num_tpcs = gk20a_mem_rd32(context + ctxsw_prog_local_image_num_tpcs_o(), 0); + num_tpcs = *(u32 *)(context + ctxsw_prog_local_image_num_tpcs_o()); if ((i == gpc_num) && ((tpc_num + 1) > num_tpcs)) { gk20a_err(dev_from_gk20a(g), @@ -7689,9 +7642,9 @@ int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch, { struct gk20a *g = ch->g; struct channel_ctx_gk20a *ch_ctx = &ch->ch_ctx; - void *ctx_ptr = NULL; - void *pm_ctx_ptr = NULL; - void *base_ptr = NULL; + bool gr_ctx_ready = false; + bool pm_ctx_ready = false; + struct mem_desc *current_mem = NULL; bool ch_is_curr_ctx, restart_gr_ctxsw = false; u32 i, j, offset, v; struct gr_gk20a *gr = &g->gr; @@ -7821,20 +7774,18 @@ int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch, ctx_ops[i].type == REGOP(TYPE_GR_CTX_QUAD), ctx_ops[i].quad); if (!err) { - if (!ctx_ptr) { + if (!gr_ctx_ready) { /* would have been a variant of * gr_gk20a_apply_instmem_overrides, * recoded in-place instead. */ - ctx_ptr = vmap(ch_ctx->gr_ctx->mem.pages, - PAGE_ALIGN(ch_ctx->gr_ctx->mem.size) >> PAGE_SHIFT, - 0, pgprot_writecombine(PAGE_KERNEL)); - if (!ctx_ptr) { + if (gk20a_mem_begin(g, &ch_ctx->gr_ctx->mem)) { err = -ENOMEM; goto cleanup; } + gr_ctx_ready = true; } - base_ptr = ctx_ptr; + current_mem = &ch_ctx->gr_ctx->mem; } else { err = gr_gk20a_get_pm_ctx_buffer_offsets(g, ctx_ops[i].offset, @@ -7849,7 +7800,7 @@ int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch, NVGPU_DBG_GPU_REG_OP_STATUS_INVALID_OFFSET; continue; } - if (!pm_ctx_ptr) { + if (!pm_ctx_ready) { /* Make sure ctx buffer was initialized */ if (!ch_ctx->pm_ctx.mem.pages) { gk20a_err(dev_from_gk20a(g), @@ -7857,15 +7808,13 @@ int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch, err = -EINVAL; goto cleanup; } - pm_ctx_ptr = vmap(ch_ctx->pm_ctx.mem.pages, - PAGE_ALIGN(ch_ctx->pm_ctx.mem.size) >> PAGE_SHIFT, - 0, pgprot_writecombine(PAGE_KERNEL)); - if (!pm_ctx_ptr) { + if (gk20a_mem_begin(g, &ch_ctx->pm_ctx.mem)) { err = -ENOMEM; goto cleanup; } + pm_ctx_ready = true; } - base_ptr = pm_ctx_ptr; + current_mem = &ch_ctx->pm_ctx.mem; } /* if this is a quad access, setup for special access*/ @@ -7878,24 +7827,24 @@ int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch, /* sanity check gr ctxt offsets, * don't write outside, worst case */ - if ((base_ptr == ctx_ptr) && + if ((current_mem == &ch_ctx->gr_ctx->mem) && (offsets[j] >= g->gr.ctx_vars.golden_image_size)) continue; if (pass == 0) { /* write pass */ - v = gk20a_mem_rd32(base_ptr + offsets[j], 0); + v = gk20a_mem_rd(g, current_mem, offsets[j]); v &= ~ctx_ops[i].and_n_mask_lo; v |= ctx_ops[i].value_lo; - gk20a_mem_wr32(base_ptr + offsets[j], 0, v); + gk20a_mem_wr(g, current_mem, offsets[j], v); gk20a_dbg(gpu_dbg_gpu_dbg, "context wr: offset=0x%x v=0x%x", offsets[j], v); if (ctx_ops[i].op == REGOP(WRITE_64)) { - v = gk20a_mem_rd32(base_ptr + offsets[j] + 4, 0); + v = gk20a_mem_rd(g, current_mem, offsets[j] + 4); v &= ~ctx_ops[i].and_n_mask_hi; v |= ctx_ops[i].value_hi; - gk20a_mem_wr32(base_ptr + offsets[j] + 4, 0, v); + gk20a_mem_wr(g, current_mem, offsets[j] + 4, v); 
gk20a_dbg(gpu_dbg_gpu_dbg, "context wr: offset=0x%x v=0x%x", @@ -7905,18 +7854,18 @@ int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch, /* check to see if we need to add a special WAR for some of the SMPC perf regs */ gr_gk20a_ctx_patch_smpc(g, ch_ctx, offset_addrs[j], - v, base_ptr); + v, current_mem); } else { /* read pass */ ctx_ops[i].value_lo = - gk20a_mem_rd32(base_ptr + offsets[0], 0); + gk20a_mem_rd(g, current_mem, offsets[0]); gk20a_dbg(gpu_dbg_gpu_dbg, "context rd: offset=0x%x v=0x%x", offsets[0], ctx_ops[i].value_lo); if (ctx_ops[i].op == REGOP(READ_64)) { ctx_ops[i].value_hi = - gk20a_mem_rd32(base_ptr + offsets[0] + 4, 0); + gk20a_mem_rd(g, current_mem, offsets[0] + 4); gk20a_dbg(gpu_dbg_gpu_dbg, "context rd: offset=0x%x v=0x%x", @@ -7943,12 +7892,10 @@ int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch, if (ch_ctx->patch_ctx.mem.cpu_va) gr_gk20a_ctx_patch_write_end(g, ch_ctx); - - if (ctx_ptr) - vunmap(ctx_ptr); - - if (pm_ctx_ptr) - vunmap(pm_ctx_ptr); + if (gr_ctx_ready) + gk20a_mem_end(g, &ch_ctx->gr_ctx->mem); + if (pm_ctx_ready) + gk20a_mem_end(g, &ch_ctx->pm_ctx.mem); if (restart_gr_ctxsw) { int tmp_err = gr_gk20a_enable_ctxsw(g); diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c index 6f6734b4d..13382416d 100644 --- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c @@ -44,6 +44,112 @@ #include "kind_gk20a.h" #include "semaphore_gk20a.h" +int gk20a_mem_begin(struct gk20a *g, struct mem_desc *mem) +{ + void *cpu_va; + + if (WARN_ON(mem->cpu_va)) { + gk20a_warn(dev_from_gk20a(g), "nested %s", __func__); + return -EBUSY; + } + + cpu_va = vmap(mem->pages, + PAGE_ALIGN(mem->size) >> PAGE_SHIFT, + 0, pgprot_writecombine(PAGE_KERNEL)); + + if (WARN_ON(!cpu_va)) + return -ENOMEM; + + mem->cpu_va = cpu_va; + return 0; +} + +void gk20a_mem_end(struct gk20a *g, struct mem_desc *mem) +{ + vunmap(mem->cpu_va); + mem->cpu_va = NULL; +} + +u32 gk20a_mem_rd32(struct gk20a *g, struct mem_desc *mem, u32 w) +{ + u32 *ptr = mem->cpu_va; + u32 data; + + WARN_ON(!ptr); + data = ptr[w]; +#ifdef CONFIG_TEGRA_SIMULATION_PLATFORM + gk20a_dbg(gpu_dbg_mem, " %p = 0x%x", ptr + w, data); +#endif + return data; +} + +u32 gk20a_mem_rd(struct gk20a *g, struct mem_desc *mem, u32 offset) +{ + WARN_ON(offset & 3); + return gk20a_mem_rd32(g, mem, offset / sizeof(u32)); +} + +void gk20a_mem_rd_n(struct gk20a *g, struct mem_desc *mem, + u32 offset, void *dest, u32 size) +{ + u32 i; + u32 *dest_u32 = dest; + + WARN_ON(offset & 3); + WARN_ON(size & 3); + offset /= sizeof(u32); + size /= sizeof(u32); + + for (i = 0; i < size; i++) + dest_u32[i] = gk20a_mem_rd32(g, mem, offset + i); +} + +void gk20a_mem_wr32(struct gk20a *g, struct mem_desc *mem, u32 w, u32 data) +{ + u32 *ptr = mem->cpu_va; + + WARN_ON(!ptr); +#ifdef CONFIG_TEGRA_SIMULATION_PLATFORM + gk20a_dbg(gpu_dbg_mem, " %p = 0x%x", ptr + w, data); +#endif + ptr[w] = data; +} + +void gk20a_mem_wr(struct gk20a *g, struct mem_desc *mem, u32 offset, u32 data) +{ + WARN_ON(offset & 3); + gk20a_mem_wr32(g, mem, offset / sizeof(u32), data); +} + +void gk20a_mem_wr_n(struct gk20a *g, struct mem_desc *mem, u32 offset, + void *src, u32 size) +{ + u32 i; + u32 *src_u32 = src; + + WARN_ON(offset & 3); + WARN_ON(size & 3); + offset /= sizeof(u32); + size /= sizeof(u32); + + for (i = 0; i < size; i++) + gk20a_mem_wr32(g, mem, offset + i, src_u32[i]); +} + +void gk20a_memset(struct gk20a *g, struct mem_desc *mem, u32 offset, + u32 value, u32 size) +{ + u32 i; + + WARN_ON(offset & 3); + WARN_ON(size & 3); + 
offset /= sizeof(u32); + size /= sizeof(u32); + + for (i = 0; i < size; i++) + gk20a_mem_wr32(g, mem, offset + i, value); +} + /* * GPU mapping life cycle * ====================== @@ -780,9 +886,14 @@ void pde_range_from_vaddr_range(struct vm_gk20a *vm, *pde_lo, *pde_hi); } -u32 *pde_from_index(struct vm_gk20a *vm, u32 i) +static u32 pde_from_index(u32 i) { - return (u32 *) (((u8 *)vm->pdb.mem.cpu_va) + i*gmmu_pde__size_v()); + return i * gmmu_pde__size_v() / sizeof(u32); +} + +static u32 pte_from_index(u32 i) +{ + return i * gmmu_pte__size_v() / sizeof(u32); } u32 pte_index_from_vaddr(struct vm_gk20a *vm, @@ -2323,7 +2434,7 @@ static int update_gmmu_pde_locked(struct vm_gk20a *vm, u64 pte_addr_small = 0, pte_addr_big = 0; struct gk20a_mm_entry *entry = vm->pdb.entries + i; u32 pde_v[2] = {0, 0}; - u32 *pde; + u32 pde; gk20a_dbg_fn(""); @@ -2348,10 +2459,10 @@ static int update_gmmu_pde_locked(struct vm_gk20a *vm, (big_valid ? (gmmu_pde_vol_big_true_f()) : gmmu_pde_vol_big_false_f()); - pde = pde_from_index(vm, i); + pde = pde_from_index(i); - gk20a_mem_wr32(pde, 0, pde_v[0]); - gk20a_mem_wr32(pde, 1, pde_v[1]); + gk20a_mem_wr32(g, &vm->pdb.mem, pde + 0, pde_v[0]); + gk20a_mem_wr32(g, &vm->pdb.mem, pde + 1, pde_v[1]); gk20a_dbg(gpu_dbg_pte, "pde:%d,sz=%d = 0x%x,0x%08x", i, gmmu_pgsz_idx, pde_v[1], pde_v[0]); @@ -2432,8 +2543,8 @@ static int update_gmmu_pte_locked(struct vm_gk20a *vm, gk20a_dbg(gpu_dbg_pte, "pte_cur=%d [0x0,0x0]", i); } - gk20a_mem_wr32(pte->mem.cpu_va + i*8, 0, pte_w[0]); - gk20a_mem_wr32(pte->mem.cpu_va + i*8, 1, pte_w[1]); + gk20a_mem_wr32(g, &pte->mem, pte_from_index(i) + 0, pte_w[0]); + gk20a_mem_wr32(g, &pte->mem, pte_from_index(i) + 1, pte_w[1]); if (*iova) { *iova += page_size; @@ -3489,19 +3600,19 @@ static int gk20a_init_cde_vm(struct mm_gk20a *mm) false, false, "cde"); } -void gk20a_mm_init_pdb(struct gk20a *g, void *inst_ptr, u64 pdb_addr) +void gk20a_mm_init_pdb(struct gk20a *g, struct mem_desc *mem, u64 pdb_addr) { u32 pdb_addr_lo = u64_lo32(pdb_addr >> ram_in_base_shift_v()); u32 pdb_addr_hi = u64_hi32(pdb_addr); - gk20a_mem_wr32(inst_ptr, ram_in_page_dir_base_lo_w(), + gk20a_mem_wr32(g, mem, ram_in_page_dir_base_lo_w(), (g->mm.vidmem_is_vidmem ? 
ram_in_page_dir_base_target_sys_mem_ncoh_f() : ram_in_page_dir_base_target_vid_mem_f()) | ram_in_page_dir_base_vol_true_f() | ram_in_page_dir_base_lo_f(pdb_addr_lo)); - gk20a_mem_wr32(inst_ptr, ram_in_page_dir_base_hi_w(), + gk20a_mem_wr32(g, mem, ram_in_page_dir_base_hi_w(), ram_in_page_dir_base_hi_f(pdb_addr_hi)); } @@ -3510,23 +3621,22 @@ void gk20a_init_inst_block(struct mem_desc *inst_block, struct vm_gk20a *vm, { struct gk20a *g = gk20a_from_vm(vm); u64 pde_addr = g->ops.mm.get_iova_addr(g, vm->pdb.mem.sgt->sgl, 0); - void *inst_ptr = inst_block->cpu_va; gk20a_dbg_info("inst block phys = 0x%llx, kv = 0x%p", - gk20a_mm_inst_block_addr(g, inst_block), inst_ptr); + gk20a_mm_inst_block_addr(g, inst_block), inst_block->cpu_va); gk20a_dbg_info("pde pa=0x%llx", (u64)pde_addr); - g->ops.mm.init_pdb(g, inst_ptr, pde_addr); + g->ops.mm.init_pdb(g, inst_block, pde_addr); - gk20a_mem_wr32(inst_ptr, ram_in_adr_limit_lo_w(), + gk20a_mem_wr32(g, inst_block, ram_in_adr_limit_lo_w(), u64_lo32(vm->va_limit - 1) & ~0xfff); - gk20a_mem_wr32(inst_ptr, ram_in_adr_limit_hi_w(), + gk20a_mem_wr32(g, inst_block, ram_in_adr_limit_hi_w(), ram_in_adr_limit_hi_f(u64_hi32(vm->va_limit - 1))); if (big_page_size && g->ops.mm.set_big_page_size) - g->ops.mm.set_big_page_size(g, inst_ptr, big_page_size); + g->ops.mm.set_big_page_size(g, inst_block, big_page_size); } int gk20a_mm_fb_flush(struct gk20a *g) diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h index 7fa0b7fbc..e9ac8f184 100644 --- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h @@ -419,6 +419,34 @@ static inline enum gmmu_pgsz_gk20a __get_pte_size(struct vm_gk20a *vm, return gmmu_page_size_small; } +/* + * Buffer accessors - wrap between begin() and end() if there is no permanent + * kernel mapping for this buffer. 
+ */ + +int gk20a_mem_begin(struct gk20a *g, struct mem_desc *mem); +/* nop for null mem, like with free() or vunmap() */ +void gk20a_mem_end(struct gk20a *g, struct mem_desc *mem); + +/* word-indexed offset */ +u32 gk20a_mem_rd32(struct gk20a *g, struct mem_desc *mem, u32 w); +/* byte offset (32b-aligned) */ +u32 gk20a_mem_rd(struct gk20a *g, struct mem_desc *mem, u32 offset); +/* memcpy to cpu, offset and size in bytes (32b-aligned) */ +void gk20a_mem_rd_n(struct gk20a *g, struct mem_desc *mem, u32 offset, + void *dest, u32 size); + +/* word-indexed offset */ +void gk20a_mem_wr32(struct gk20a *g, struct mem_desc *mem, u32 w, u32 data); +/* byte offset (32b-aligned) */ +void gk20a_mem_wr(struct gk20a *g, struct mem_desc *mem, u32 offset, u32 data); +/* memcpy from cpu, offset and size in bytes (32b-aligned) */ +void gk20a_mem_wr_n(struct gk20a *g, struct mem_desc *mem, u32 offset, + void *src, u32 size); +/* size and offset in bytes (32b-aligned), filled with u32s */ +void gk20a_memset(struct gk20a *g, struct mem_desc *mem, u32 offset, + u32 value, u32 size); + #if 0 /*related to addr bits above, concern below TBD on which is accurate */ #define bar1_instance_block_shift_gk20a() (max_physaddr_bits_gk20a() -\ bus_bar1_block_ptr_s()) @@ -673,7 +701,6 @@ void pde_range_from_vaddr_range(struct vm_gk20a *vm, u64 addr_lo, u64 addr_hi, u32 *pde_lo, u32 *pde_hi); int gk20a_mm_pde_coverage_bit_count(struct vm_gk20a *vm); -u32 *pde_from_index(struct vm_gk20a *vm, u32 i); u32 pte_index_from_vaddr(struct vm_gk20a *vm, u64 addr, enum gmmu_pgsz_gk20a pgsz_idx); void free_gmmu_pages(struct vm_gk20a *vm, @@ -685,7 +712,7 @@ struct gpu_ops; void gk20a_init_mm(struct gpu_ops *gops); const struct gk20a_mmu_level *gk20a_mm_get_mmu_levels(struct gk20a *g, u32 big_page_size); -void gk20a_mm_init_pdb(struct gk20a *g, void *inst_ptr, u64 pdb_addr); +void gk20a_mm_init_pdb(struct gk20a *g, struct mem_desc *mem, u64 pdb_addr); void gk20a_remove_vm(struct vm_gk20a *vm, struct mem_desc *inst_block); diff --git a/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c b/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c index 56ad0c2a1..54b2eef4b 100644 --- a/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c @@ -2421,11 +2421,10 @@ static int gk20a_init_pmu_reset_enable_hw(struct gk20a *g) static int gk20a_prepare_ucode(struct gk20a *g) { struct pmu_gk20a *pmu = &g->pmu; - int i, err = 0; + int err = 0; struct device *d = dev_from_gk20a(g); struct mm_gk20a *mm = &g->mm; struct vm_gk20a *vm = &mm->pmu.vm; - void *ucode_ptr; if (g->pmu_fw) { gk20a_init_pmu(pmu); @@ -2449,11 +2448,8 @@ static int gk20a_prepare_ucode(struct gk20a *g) if (err) goto err_release_fw; - ucode_ptr = pmu->ucode.cpu_va; - - for (i = 0; i < (pmu->desc->app_start_offset + - pmu->desc->app_size) >> 2; i++) - gk20a_mem_wr32(ucode_ptr, i, pmu->ucode_image[i]); + gk20a_mem_wr_n(g, &pmu->ucode, 0, pmu->ucode_image, + pmu->desc->app_start_offset + pmu->desc->app_size); gk20a_init_pmu(pmu); diff --git a/drivers/gpu/nvgpu/gm20b/acr_gm20b.c b/drivers/gpu/nvgpu/gm20b/acr_gm20b.c index 0e6e715d2..3ac2cec88 100644 --- a/drivers/gpu/nvgpu/gm20b/acr_gm20b.c +++ b/drivers/gpu/nvgpu/gm20b/acr_gm20b.c @@ -43,8 +43,8 @@ static int lsfm_add_ucode_img(struct gk20a *g, struct ls_flcn_mgr *plsfm, static void lsfm_free_ucode_img_res(struct flcn_ucode_img *p_img); static void lsfm_free_nonpmu_ucode_img_res(struct flcn_ucode_img *p_img); static int lsf_gen_wpr_requirements(struct gk20a *g, struct ls_flcn_mgr *plsfm); -static int lsfm_init_wpr_contents(struct gk20a *g, struct 
ls_flcn_mgr *plsfm, - void *nonwpr_addr); +static void lsfm_init_wpr_contents(struct gk20a *g, struct ls_flcn_mgr *plsfm, + struct mem_desc *nonwpr); static int acr_ucode_patch_sig(struct gk20a *g, unsigned int *p_img, unsigned int *p_prod_sig, @@ -355,7 +355,7 @@ int prepare_ucode_blob(struct gk20a *g) gm20b_dbg_pmu("managed LS falcon %d, WPR size %d bytes.\n", plsfm->managed_flcn_cnt, plsfm->wpr_size); - lsfm_init_wpr_contents(g, plsfm, g->acr.ucode_blob.cpu_va); + lsfm_init_wpr_contents(g, plsfm, &g->acr.ucode_blob); } else { gm20b_dbg_pmu("LSFM is managing no falcons.\n"); } @@ -613,120 +613,91 @@ static int lsfm_fill_flcn_bl_gen_desc(struct gk20a *g, } /* Initialize WPR contents */ -static int lsfm_init_wpr_contents(struct gk20a *g, struct ls_flcn_mgr *plsfm, - void *nonwpr_addr) +static void lsfm_init_wpr_contents(struct gk20a *g, struct ls_flcn_mgr *plsfm, + struct mem_desc *ucode) { + struct lsfm_managed_ucode_img *pnode = plsfm->ucode_img_list; + u32 i; - int status = 0; - union flcn_bl_generic_desc *nonwpr_bl_gen_desc; - if (nonwpr_addr == NULL) { - status = -ENOMEM; - } else { - struct lsfm_managed_ucode_img *pnode = plsfm->ucode_img_list; - struct lsf_wpr_header *wpr_hdr; - struct lsf_lsb_header *lsb_hdr; - void *ucode_off; - u32 i; + /* The WPR array is at the base of the WPR */ + pnode = plsfm->ucode_img_list; + i = 0; - /* The WPR array is at the base of the WPR */ - wpr_hdr = (struct lsf_wpr_header *)nonwpr_addr; - pnode = plsfm->ucode_img_list; - i = 0; + /* + * Walk the managed falcons, flush WPR and LSB headers to FB. + * flush any bl args to the storage area relative to the + * ucode image (appended on the end as a DMEM area). + */ + while (pnode) { + /* Flush WPR header to memory*/ + gk20a_mem_wr_n(g, ucode, i * sizeof(pnode->wpr_header), + &pnode->wpr_header, sizeof(pnode->wpr_header)); - /* - * Walk the managed falcons, flush WPR and LSB headers to FB. - * flush any bl args to the storage area relative to the - * ucode image (appended on the end as a DMEM area). 
- */ - while (pnode) { - /* Flush WPR header to memory*/ - memcpy(&wpr_hdr[i], &pnode->wpr_header, - sizeof(struct lsf_wpr_header)); - gm20b_dbg_pmu("wpr header as in memory and pnode\n"); - gm20b_dbg_pmu("falconid :%d %d\n", - pnode->wpr_header.falcon_id, - wpr_hdr[i].falcon_id); - gm20b_dbg_pmu("lsb_offset :%x %x\n", - pnode->wpr_header.lsb_offset, - wpr_hdr[i].lsb_offset); - gm20b_dbg_pmu("bootstrap_owner :%d %d\n", - pnode->wpr_header.bootstrap_owner, - wpr_hdr[i].bootstrap_owner); - gm20b_dbg_pmu("lazy_bootstrap :%d %d\n", - pnode->wpr_header.lazy_bootstrap, - wpr_hdr[i].lazy_bootstrap); - gm20b_dbg_pmu("status :%d %d\n", - pnode->wpr_header.status, wpr_hdr[i].status); + gm20b_dbg_pmu("wpr header"); + gm20b_dbg_pmu("falconid :%d", + pnode->wpr_header.falcon_id); + gm20b_dbg_pmu("lsb_offset :%x", + pnode->wpr_header.lsb_offset); + gm20b_dbg_pmu("bootstrap_owner :%d", + pnode->wpr_header.bootstrap_owner); + gm20b_dbg_pmu("lazy_bootstrap :%d", + pnode->wpr_header.lazy_bootstrap); + gm20b_dbg_pmu("status :%d", + pnode->wpr_header.status); - /*Flush LSB header to memory*/ - lsb_hdr = (struct lsf_lsb_header *)((u8 *)nonwpr_addr + - pnode->wpr_header.lsb_offset); - memcpy(lsb_hdr, &pnode->lsb_header, - sizeof(struct lsf_lsb_header)); - gm20b_dbg_pmu("lsb header as in memory and pnode\n"); - gm20b_dbg_pmu("ucode_off :%x %x\n", - pnode->lsb_header.ucode_off, - lsb_hdr->ucode_off); - gm20b_dbg_pmu("ucode_size :%x %x\n", - pnode->lsb_header.ucode_size, - lsb_hdr->ucode_size); - gm20b_dbg_pmu("data_size :%x %x\n", - pnode->lsb_header.data_size, - lsb_hdr->data_size); - gm20b_dbg_pmu("bl_code_size :%x %x\n", - pnode->lsb_header.bl_code_size, - lsb_hdr->bl_code_size); - gm20b_dbg_pmu("bl_imem_off :%x %x\n", - pnode->lsb_header.bl_imem_off, - lsb_hdr->bl_imem_off); - gm20b_dbg_pmu("bl_data_off :%x %x\n", - pnode->lsb_header.bl_data_off, - lsb_hdr->bl_data_off); - gm20b_dbg_pmu("bl_data_size :%x %x\n", - pnode->lsb_header.bl_data_size, - lsb_hdr->bl_data_size); - gm20b_dbg_pmu("app_code_off :%x %x\n", - pnode->lsb_header.app_code_off, - lsb_hdr->app_code_off); - gm20b_dbg_pmu("app_code_size :%x %x\n", - pnode->lsb_header.app_code_size, - lsb_hdr->app_code_size); - gm20b_dbg_pmu("app_data_off :%x %x\n", - pnode->lsb_header.app_data_off, - lsb_hdr->app_data_off); - gm20b_dbg_pmu("app_data_size :%x %x\n", - pnode->lsb_header.app_data_size, - lsb_hdr->app_data_size); - gm20b_dbg_pmu("flags :%x %x\n", - pnode->lsb_header.flags, lsb_hdr->flags); + /*Flush LSB header to memory*/ + gk20a_mem_wr_n(g, ucode, pnode->wpr_header.lsb_offset, + &pnode->lsb_header, sizeof(pnode->lsb_header)); - /*If this falcon has a boot loader and related args, - * flush them.*/ - if (!pnode->ucode_img.header) { - nonwpr_bl_gen_desc = - (union flcn_bl_generic_desc *) - ((u8 *)nonwpr_addr + - pnode->lsb_header.bl_data_off); + gm20b_dbg_pmu("lsb header"); + gm20b_dbg_pmu("ucode_off :%x", + pnode->lsb_header.ucode_off); + gm20b_dbg_pmu("ucode_size :%x", + pnode->lsb_header.ucode_size); + gm20b_dbg_pmu("data_size :%x", + pnode->lsb_header.data_size); + gm20b_dbg_pmu("bl_code_size :%x", + pnode->lsb_header.bl_code_size); + gm20b_dbg_pmu("bl_imem_off :%x", + pnode->lsb_header.bl_imem_off); + gm20b_dbg_pmu("bl_data_off :%x", + pnode->lsb_header.bl_data_off); + gm20b_dbg_pmu("bl_data_size :%x", + pnode->lsb_header.bl_data_size); + gm20b_dbg_pmu("app_code_off :%x", + pnode->lsb_header.app_code_off); + gm20b_dbg_pmu("app_code_size :%x", + pnode->lsb_header.app_code_size); + gm20b_dbg_pmu("app_data_off :%x", + 
pnode->lsb_header.app_data_off); + gm20b_dbg_pmu("app_data_size :%x", + pnode->lsb_header.app_data_size); + gm20b_dbg_pmu("flags :%x", + pnode->lsb_header.flags); - /*Populate gen bl and flush to memory*/ - lsfm_fill_flcn_bl_gen_desc(g, pnode); - memcpy(nonwpr_bl_gen_desc, &pnode->bl_gen_desc, + /*If this falcon has a boot loader and related args, + * flush them.*/ + if (!pnode->ucode_img.header) { + /*Populate gen bl and flush to memory*/ + lsfm_fill_flcn_bl_gen_desc(g, pnode); + gk20a_mem_wr_n(g, ucode, + pnode->lsb_header.bl_data_off, + &pnode->bl_gen_desc, pnode->bl_gen_desc_size); - } - ucode_off = (void *)(pnode->lsb_header.ucode_off + - (u8 *)nonwpr_addr); - /*Copying of ucode*/ - memcpy(ucode_off, pnode->ucode_img.data, - pnode->ucode_img.data_size); - pnode = pnode->next; - i++; } - - /* Tag the terminator WPR header with an invalid falcon ID. */ - gk20a_mem_wr32(&wpr_hdr[plsfm->managed_flcn_cnt].falcon_id, - 0, LSF_FALCON_ID_INVALID); + /*Copying of ucode*/ + gk20a_mem_wr_n(g, ucode, pnode->lsb_header.ucode_off, + pnode->ucode_img.data, + pnode->ucode_img.data_size); + pnode = pnode->next; + i++; } - return status; + + /* Tag the terminator WPR header with an invalid falcon ID. */ + gk20a_mem_wr32(g, ucode, + plsfm->managed_flcn_cnt * sizeof(struct lsf_wpr_header) + + offsetof(struct lsf_wpr_header, falcon_id), + LSF_FALCON_ID_INVALID); } /*! @@ -1000,7 +971,7 @@ int gm20b_bootstrap_hs_flcn(struct gk20a *g) { struct mm_gk20a *mm = &g->mm; struct vm_gk20a *vm = &mm->pmu.vm; - int i, err = 0; + int err = 0; u64 *acr_dmem; u32 img_size_in_bytes = 0; u32 status, size; @@ -1066,10 +1037,8 @@ int gm20b_bootstrap_hs_flcn(struct gk20a *g) ((struct flcn_acr_desc *)acr_dmem)->regions.no_regions = 2; ((struct flcn_acr_desc *)acr_dmem)->wpr_offset = 0; - for (i = 0; i < (img_size_in_bytes/4); i++) { - gk20a_mem_wr32(acr->acr_ucode.cpu_va, i, - acr_ucode_data_t210_load[i]); - } + gk20a_mem_wr_n(g, &acr->acr_ucode, 0, + acr_ucode_data_t210_load, img_size_in_bytes); /* * In order to execute this binary, we will be using * a bootloader which will load this image into PMU IMEM/DMEM. 
@@ -1323,7 +1292,7 @@ int pmu_exec_gen_bl(struct gk20a *g, void *desc, u8 b_wait_for_halt) struct mm_gk20a *mm = &g->mm; struct vm_gk20a *vm = &mm->pmu.vm; struct device *d = dev_from_gk20a(g); - int i, err = 0; + int err = 0; u32 bl_sz; struct acr_gm20b *acr = &g->acr; const struct firmware *hsbl_fw = acr->hsbl_fw; @@ -1369,8 +1338,7 @@ int pmu_exec_gen_bl(struct gk20a *g, void *desc, u8 b_wait_for_halt) goto err_free_ucode; } - for (i = 0; i < (bl_sz) >> 2; i++) - gk20a_mem_wr32(acr->hsbl_ucode.cpu_va, i, pmu_bl_gm10x[i]); + gk20a_mem_wr_n(g, &acr->hsbl_ucode, 0, pmu_bl_gm10x, bl_sz); gm20b_dbg_pmu("Copied bl ucode to bl_cpuva\n"); } /* diff --git a/drivers/gpu/nvgpu/gm20b/gr_gm20b.c b/drivers/gpu/nvgpu/gm20b/gr_gm20b.c index b9a1e685f..2197bae56 100644 --- a/drivers/gpu/nvgpu/gm20b/gr_gm20b.c +++ b/drivers/gpu/nvgpu/gm20b/gr_gm20b.c @@ -849,7 +849,7 @@ static int gr_gm20b_alloc_gr_ctx(struct gk20a *g, static void gr_gm20b_update_ctxsw_preemption_mode(struct gk20a *g, struct channel_ctx_gk20a *ch_ctx, - void *ctx_ptr) + struct mem_desc *mem) { struct gr_ctx_desc *gr_ctx = ch_ctx->gr_ctx; u32 cta_preempt_option = @@ -859,7 +859,8 @@ static void gr_gm20b_update_ctxsw_preemption_mode(struct gk20a *g, if (gr_ctx->compute_preempt_mode == NVGPU_COMPUTE_PREEMPTION_MODE_CTA) { gk20a_dbg_info("CTA: %x", cta_preempt_option); - gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_preemption_options_o(), 0, + gk20a_mem_wr(g, mem, + ctxsw_prog_main_image_preemption_options_o(), cta_preempt_option); } @@ -1005,7 +1006,7 @@ static int gr_gm20b_update_pc_sampling(struct channel_gk20a *c, bool enable) { struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx; - void *ctx_ptr = NULL; + struct mem_desc *mem; u32 v; gk20a_dbg_fn(""); @@ -1013,18 +1014,17 @@ static int gr_gm20b_update_pc_sampling(struct channel_gk20a *c, if (!ch_ctx || !ch_ctx->gr_ctx || c->vpr) return -EINVAL; - ctx_ptr = vmap(ch_ctx->gr_ctx->mem.pages, - PAGE_ALIGN(ch_ctx->gr_ctx->mem.size) >> PAGE_SHIFT, - 0, pgprot_writecombine(PAGE_KERNEL)); - if (!ctx_ptr) + mem = &ch_ctx->gr_ctx->mem; + + if (gk20a_mem_begin(c->g, mem)) return -ENOMEM; - v = gk20a_mem_rd32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0); + v = gk20a_mem_rd(c->g, mem, ctxsw_prog_main_image_pm_o()); v &= ~ctxsw_prog_main_image_pm_pc_sampling_m(); v |= ctxsw_prog_main_image_pm_pc_sampling_f(enable); - gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0, v); + gk20a_mem_wr(c->g, mem, ctxsw_prog_main_image_pm_o(), v); - vunmap(ctx_ptr); + gk20a_mem_end(c->g, mem); gk20a_dbg_fn("done"); @@ -1089,13 +1089,13 @@ static void gr_gm20b_init_cyclestats(struct gk20a *g) #endif } -static void gr_gm20b_enable_cde_in_fecs(void *ctx_ptr) +static void gr_gm20b_enable_cde_in_fecs(struct gk20a *g, struct mem_desc *mem) { u32 cde_v; - cde_v = gk20a_mem_rd32(ctx_ptr + ctxsw_prog_main_image_ctl_o(), 0); + cde_v = gk20a_mem_rd(g, mem, ctxsw_prog_main_image_ctl_o()); cde_v |= ctxsw_prog_main_image_ctl_cde_enabled_f(); - gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_ctl_o(), 0, cde_v); + gk20a_mem_wr(g, mem, ctxsw_prog_main_image_ctl_o(), cde_v); } static void gr_gm20b_bpt_reg_info(struct gk20a *g, struct warpstate *w_state) diff --git a/drivers/gpu/nvgpu/gm20b/mm_gm20b.c b/drivers/gpu/nvgpu/gm20b/mm_gm20b.c index ac73b5c8f..726d73ed8 100644 --- a/drivers/gpu/nvgpu/gm20b/mm_gm20b.c +++ b/drivers/gpu/nvgpu/gm20b/mm_gm20b.c @@ -106,14 +106,14 @@ static void gm20b_mm_mmu_set_debug_mode(struct gk20a *g, bool enable) } static void gm20b_mm_set_big_page_size(struct gk20a *g, - void *inst_ptr, int size) + struct 
mem_desc *mem, int size) { u32 val; gk20a_dbg_fn(""); gk20a_dbg_info("big page size %d\n", size); - val = gk20a_mem_rd32(inst_ptr, ram_in_big_page_size_w()); + val = gk20a_mem_rd32(g, mem, ram_in_big_page_size_w()); val &= ~ram_in_big_page_size_m(); if (size == SZ_64K) @@ -121,7 +121,7 @@ static void gm20b_mm_set_big_page_size(struct gk20a *g, else val |= ram_in_big_page_size_128kb_f(); - gk20a_mem_wr32(inst_ptr, ram_in_big_page_size_w(), val); + gk20a_mem_wr32(g, mem, ram_in_big_page_size_w(), val); gk20a_dbg_fn("done"); } diff --git a/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c b/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c index 66b5e4103..d1cba9792 100644 --- a/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c +++ b/drivers/gpu/nvgpu/vgpu/fifo_vgpu.c @@ -285,8 +285,6 @@ static int vgpu_init_fifo_setup_sw(struct gk20a *g) mutex_init(&f->free_chs_mutex); for (chid = 0; chid < f->num_channels; chid++) { - f->channel[chid].userd_cpu_va = - f->userd.cpu_va + chid * f->userd_entry_size; f->channel[chid].userd_iova = g->ops.mm.get_iova_addr(g, f->userd.sgt->sgl, 0) + chid * f->userd_entry_size;
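
Note on the access pattern this series converts to (illustration only, not part of the patch): callers that previously vmap()'d a buffer or dereferenced mem_desc::cpu_va directly now bracket their accesses with gk20a_mem_begin()/gk20a_mem_end() and go through gk20a_mem_rd()/gk20a_mem_wr(), as in the gr_gm20b_update_pc_sampling() hunk above. The sketch below shows only that shape; the helper name, the stand-in declarations, and the return/offset types are assumptions inferred from the call sites in this diff, not definitions taken from the driver headers.

#include <linux/errno.h>
#include <linux/types.h>

/* Opaque types and accessor prototypes as inferred from the call sites
 * above; the real declarations live in the nvgpu headers. */
struct gk20a;
struct mem_desc;

int  gk20a_mem_begin(struct gk20a *g, struct mem_desc *mem);
void gk20a_mem_end(struct gk20a *g, struct mem_desc *mem);
u32  gk20a_mem_rd(struct gk20a *g, struct mem_desc *mem, u32 offset);
void gk20a_mem_wr(struct gk20a *g, struct mem_desc *mem, u32 offset, u32 data);

/* Hypothetical helper: read-modify-write one 32-bit word of a buffer
 * described by a mem_desc, without assuming a permanent CPU mapping.
 * "offset" follows the _o()-style offsets used at the call sites above. */
static int example_rmw_word(struct gk20a *g, struct mem_desc *mem,
                            u32 offset, u32 set_bits)
{
        u32 v;

        /* Map (or otherwise make accessible) the backing storage. */
        if (gk20a_mem_begin(g, mem))
                return -ENOMEM;

        v = gk20a_mem_rd(g, mem, offset);
        v |= set_bits;
        gk20a_mem_wr(g, mem, offset, v);

        /* Drop the temporary mapping. */
        gk20a_mem_end(g, mem);

        return 0;
}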
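
For the bulk flushes (WPR/LSB headers, boot loader descriptors, ucode images, and the ACR/HS boot loader blobs), the series uses gk20a_mem_wr_n(g, mem, dst_offset, src, size) in place of memcpy() against a raw cpu_va or a per-word gk20a_mem_wr32() loop. The fragment below restates the header-flush loop of lsfm_init_wpr_contents() in isolation, under the same caveats as above: the prototype is inferred from its call sites (byte offset and byte count, as the img_size_in_bytes call suggests), and the stand-in structures are hypothetical.

/* Prototype as inferred from the call sites above. */
void gk20a_mem_wr_n(struct gk20a *g, struct mem_desc *mem, u32 offset,
                    void *src, u32 size);

/* Hypothetical stand-ins for the per-falcon bookkeeping used in
 * lsfm_init_wpr_contents(); field names follow the hunk above. */
struct example_wpr_header {
        u32 falcon_id;
        u32 lsb_offset;
};

struct example_node {
        struct example_wpr_header wpr_header;
        struct example_node *next;
};

/* Flush each managed falcon's WPR header into the blob at array index i,
 * mirroring the converted loop above. */
static void example_flush_wpr_headers(struct gk20a *g, struct mem_desc *ucode,
                                      struct example_node *list)
{
        struct example_node *pnode = list;
        u32 i = 0;

        while (pnode) {
                gk20a_mem_wr_n(g, ucode, i * sizeof(pnode->wpr_header),
                               &pnode->wpr_header, sizeof(pnode->wpr_header));
                pnode = pnode->next;
                i++;
        }
}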