diff --git a/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.c b/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.c
index daed29670..5bee34fc4 100644
--- a/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.c
@@ -1,7 +1,7 @@
 /*
  * Tegra GK20A GPU Debugger/Profiler Driver
  *
- * Copyright (c) 2013-2014, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2013-2015, NVIDIA CORPORATION.  All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms and conditions of the GNU General Public License,
@@ -29,6 +29,7 @@
 #include "regops_gk20a.h"
 #include "hw_therm_gk20a.h"
 #include "hw_gr_gk20a.h"
+#include "hw_perf_gk20a.h"
 
 struct dbg_gpu_session_ops dbg_gpu_session_ops_gk20a = {
 	.exec_reg_ops = exec_regops_gk20a,
@@ -370,6 +371,11 @@ static int nvgpu_dbg_gpu_ioctl_suspend_resume_sm(
 		struct dbg_session_gk20a *dbg_s,
 		struct nvgpu_dbg_gpu_suspend_resume_all_sms_args *args);
 
+static int gk20a_perfbuf_map(struct dbg_session_gk20a *dbg_s,
+		struct nvgpu_dbg_gpu_perfbuf_map_args *args);
+
+static int gk20a_perfbuf_unmap(struct dbg_session_gk20a *dbg_s,
+		struct nvgpu_dbg_gpu_perfbuf_unmap_args *args);
 
 long gk20a_dbg_gpu_dev_ioctl(struct file *filp, unsigned int cmd,
 			     unsigned long arg)
@@ -436,6 +442,16 @@ long gk20a_dbg_gpu_dev_ioctl(struct file *filp, unsigned int cmd,
 		       (struct nvgpu_dbg_gpu_suspend_resume_all_sms_args *)buf);
 		break;
 
+	case NVGPU_DBG_GPU_IOCTL_PERFBUF_MAP:
+		err = gk20a_perfbuf_map(dbg_s,
+		       (struct nvgpu_dbg_gpu_perfbuf_map_args *)buf);
+		break;
+
+	case NVGPU_DBG_GPU_IOCTL_PERFBUF_UNMAP:
+		err = gk20a_perfbuf_unmap(dbg_s,
+		       (struct nvgpu_dbg_gpu_perfbuf_unmap_args *)buf);
+		break;
+
 	default:
 		gk20a_err(dev_from_gk20a(g),
 			   "unrecognized dbg gpu ioctl cmd: 0x%x",
@@ -775,3 +791,80 @@ static int nvgpu_dbg_gpu_ioctl_suspend_resume_sm(
 	mutex_unlock(&g->dbg_sessions_lock);
 	return  err;
 }
+
+/* Map a dmabuf into the PMU VM and program it as the perf output buffer.
+ * On success args->offset holds the virtual address of the mapping.
+ * Returns 0, -EACCES without allow_all, -EINVAL when the range cannot be
+ * programmed, or the error from gk20a_vm_map_buffer(). */
+static int gk20a_perfbuf_map(struct dbg_session_gk20a *dbg_s,
+		struct nvgpu_dbg_gpu_perfbuf_map_args *args)
+{
+	struct gk20a *g = dbg_s->g;
+	int err;
+	u32 virt_size, virt_addr_lo, virt_addr_hi;
+	u32 inst_pa_page;
+
+	if (!g->allow_all)
+		return -EACCES;
+
+	err = gk20a_vm_map_buffer(&g->mm.pmu.vm, args->dmabuf_fd,
+			&args->offset, 0, 0, 0, args->mapping_size);
+	if (err)
+		return err;
+
+	/* perf output buffer may not cross a 4GB boundary - with a separate va
+	 * smaller than that, it won't - but check anyway. Test the full 64-bit
+	 * mapping_size: virt_size keeps only the low 32 bits, so a larger size
+	 * would otherwise be checked and programmed truncated. The comparison
+	 * avoids computing offset + size so that it cannot wrap. */
+	if (u64_hi32(args->mapping_size) ||
+	    args->offset > SZ_4G - args->mapping_size) {
+		gk20a_vm_unmap_buffer(&g->mm.pmu.vm, args->offset);
+		return -EINVAL;
+	}
+	virt_size = u64_lo32(args->mapping_size);
+	virt_addr_lo = u64_lo32(args->offset);
+	virt_addr_hi = u64_hi32(args->offset);
+
+	/* address and size are aligned to 32 bytes, the lowest bits read back
+	 * as zeros */
+	gk20a_writel(g, perf_pmasys_outbase_r(), virt_addr_lo);
+	gk20a_writel(g, perf_pmasys_outbaseupper_r(),
+			perf_pmasys_outbaseupper_ptr_f(virt_addr_hi));
+	gk20a_writel(g, perf_pmasys_outsize_r(), virt_size);
+
+	/* this field is aligned to 4K */
+	inst_pa_page = g->mm.hwpm.inst_block.cpu_pa >> 12;
+
+	/* A write to MEM_BLOCK triggers the block bind operation. MEM_BLOCK
+	 * should be written last */
+	gk20a_writel(g, perf_pmasys_mem_block_r(),
+			perf_pmasys_mem_block_base_f(inst_pa_page) |
+			perf_pmasys_mem_block_valid_true_f() |
+			perf_pmasys_mem_block_target_lfb_f());
+
+	return 0;
+}
+
+/* Zero the perf output buffer registers, write MEM_BLOCK as invalid, then
+ * unmap args->offset from the PMU VM. Returns 0, or -EACCES if !allow_all. */
+static int gk20a_perfbuf_unmap(struct dbg_session_gk20a *dbg_s,
+		struct nvgpu_dbg_gpu_perfbuf_unmap_args *args)
+{
+	struct gk20a *g = dbg_s->g;
+	const u32 mem_block_cleared = perf_pmasys_mem_block_base_f(0) |
+			perf_pmasys_mem_block_valid_false_f() |
+			perf_pmasys_mem_block_target_f(0);
+
+	if (!g->allow_all)
+		return -EACCES;
+
+	gk20a_writel(g, perf_pmasys_outbase_r(), 0);
+	gk20a_writel(g, perf_pmasys_outbaseupper_r(),
+			perf_pmasys_outbaseupper_ptr_f(0));
+	gk20a_writel(g, perf_pmasys_outsize_r(), 0);
+	gk20a_writel(g, perf_pmasys_mem_block_r(), mem_block_cleared);
+
+	gk20a_vm_unmap_buffer(&g->mm.pmu.vm, args->offset);
+	return 0;
+}
diff --git a/drivers/gpu/nvgpu/gk20a/hw_perf_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_perf_gk20a.h
new file mode 100644
index 000000000..65d91de6c
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/hw_perf_gk20a.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2015, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+/*
+ * Function naming determines intended use:
+ *
+ *     <x>_r(void) : Returns the offset for register <x>.
+ *
+ *     <x>_o(void) : Returns the offset for element <x>.
+ *
+ *     <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
+ *
+ *     <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
+ *
+ *     <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
+ *         and masked to place it at field <y> of register <x>.  This value
+ *         can be |'d with others to produce a full register value for
+ *         register <x>.
+ *
+ *     <x>_<y>_m(void) : Returns a mask for field <y> of register <x>.  This
+ *         value can be ~'d and then &'d to clear the value of field <y> for
+ *         register <x>.
+ *
+ *     <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
+ *         to place it at field <y> of register <x>.  This value can be |'d
+ *         with others to produce a full register value for <x>.
+ *
+ *     <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
+ *         <x> value 'r' after being shifted to place its LSB at bit 0.
+ *         This value is suitable for direct comparison with other unshifted
+ *         values appropriate for use in field <y> of register <x>.
+ *
+ *     <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
+ *         field <y> of register <x>.  This value is suitable for direct
+ *         comparison with unshifted values appropriate for use in field <y>
+ *         of register <x>.
+ */
+#ifndef _hw_perf_gk20a_h_
+#define _hw_perf_gk20a_h_
+
+static inline u32 perf_pmasys_mem_block_r(void)
+{
+	return 0x001b4070; /* offset of the pmasys mem_block register */
+}
+static inline u32 perf_pmasys_mem_block_base_f(u32 v)
+{
+	return (v & 0xfffffff) << 0; /* base field: bits 27:0 */
+}
+static inline u32 perf_pmasys_mem_block_target_f(u32 v)
+{
+	return (v & 0x3) << 28; /* target field: bits 29:28 */
+}
+static inline u32 perf_pmasys_mem_block_target_v(u32 r)
+{
+	return (r >> 28) & 0x3; /* extract target (bits 29:28) */
+}
+static inline u32 perf_pmasys_mem_block_target_lfb_v(void)
+{
+	return 0x00000000; /* unshifted "lfb" target value */
+}
+static inline u32 perf_pmasys_mem_block_target_lfb_f(void)
+{
+	return 0x0; /* "lfb" target value (0) placed at bits 29:28 */
+}
+static inline u32 perf_pmasys_mem_block_valid_f(u32 v)
+{
+	return (v & 0x1) << 31; /* valid field: bit 31 */
+}
+static inline u32 perf_pmasys_mem_block_valid_v(u32 r)
+{
+	return (r >> 31) & 0x1; /* extract valid (bit 31) */
+}
+static inline u32 perf_pmasys_mem_block_valid_true_v(void)
+{
+	return 0x00000001; /* unshifted "true" value of valid */
+}
+static inline u32 perf_pmasys_mem_block_valid_true_f(void)
+{
+	return 0x80000000; /* "true" (1) placed at bit 31 */
+}
+static inline u32 perf_pmasys_mem_block_valid_false_v(void)
+{
+	return 0x00000000; /* unshifted "false" value of valid */
+}
+static inline u32 perf_pmasys_mem_block_valid_false_f(void)
+{
+	return 0x0; /* "false" (0) placed at bit 31 */
+}
+static inline u32 perf_pmasys_outbase_r(void)
+{
+	return 0x001b4074; /* offset of the pmasys outbase register */
+}
+static inline u32 perf_pmasys_outbaseupper_r(void)
+{
+	return 0x001b4078; /* offset of the pmasys outbaseupper register */
+}
+static inline u32 perf_pmasys_outbaseupper_ptr_f(u32 v)
+{
+	return (v & 0xff) << 0; /* ptr field: bits 7:0 */
+}
+static inline u32 perf_pmasys_outsize_r(void)
+{
+	return 0x001b407c; /* offset of the pmasys outsize register */
+}
+#endif
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
index 8d9488fd5..80c766b6f 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
@@ -101,6 +101,7 @@ static int update_gmmu_ptes_locked(struct vm_gk20a *vm,
 				   int rw_flag);
 static int __must_check gk20a_init_system_vm(struct mm_gk20a *mm);
 static int __must_check gk20a_init_bar1_vm(struct mm_gk20a *mm);
+static int __must_check gk20a_init_hwpm(struct mm_gk20a *mm);
 
 
 struct gk20a_dmabuf_priv {
@@ -280,6 +281,7 @@ static void gk20a_remove_mm_support(struct mm_gk20a *mm)
 {
 	gk20a_remove_vm(&mm->bar1.vm, &mm->bar1.inst_block);
 	gk20a_remove_vm(&mm->pmu.vm, &mm->pmu.inst_block);
+	gk20a_free_inst_block(gk20a_from_mm(mm), &mm->hwpm.inst_block);
 }
 
 int gk20a_init_mm_setup_sw(struct gk20a *g)
@@ -315,6 +317,10 @@ int gk20a_init_mm_setup_sw(struct gk20a *g)
 	if (err)
 		return err;
 
+	err = gk20a_init_hwpm(mm);
+	if (err)
+		return err;
+
 	/* set vm_alloc_share op here as gk20a_as_alloc_share needs it */
 	g->ops.mm.vm_alloc_share = gk20a_vm_alloc_share;
 	mm->remove_support = gk20a_remove_mm_support;
@@ -2720,6 +2726,21 @@ clean_up_va:
 	return err;
 }
 
+/* Allocate the HWPM instance block and initialize it with the PMU VM
+ * (big_page_size passed as 0). Returns 0 or the allocation error. */
+static int gk20a_init_hwpm(struct mm_gk20a *mm)
+{
+	struct inst_desc *hwpm_inst = &mm->hwpm.inst_block;
+	int ret = gk20a_alloc_inst_block(gk20a_from_mm(mm), hwpm_inst);
+
+	if (ret)
+		return ret;
+
+	/* no dedicated VM for HWPM yet: reuse the PMU one */
+	gk20a_init_inst_block(hwpm_inst, &mm->pmu.vm, 0);
+	return 0;
+}
+
 void gk20a_init_inst_block(struct inst_desc *inst_block, struct vm_gk20a *vm,
 		u32 big_page_size)
 {
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
index 40e9488d6..7b3554368 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
@@ -342,6 +342,12 @@ struct mm_gk20a {
 		struct inst_desc inst_block;
 	} pmu;
 
+	struct {
+		/* using pmu vm currently */
+		struct inst_desc inst_block;
+	} hwpm;
+
+
 	struct mutex l2_op_lock;
 
 	void (*remove_support)(struct mm_gk20a *mm);
diff --git a/include/uapi/linux/nvgpu.h b/include/uapi/linux/nvgpu.h
index 1e4387754..ebeacf9bc 100644
--- a/include/uapi/linux/nvgpu.h
+++ b/include/uapi/linux/nvgpu.h
@@ -432,10 +432,26 @@ struct nvgpu_dbg_gpu_suspend_resume_all_sms_args {
 #define NVGPU_DBG_GPU_IOCTL_SUSPEND_RESUME_ALL_SMS			\
 	_IOWR(NVGPU_DBG_GPU_IOCTL_MAGIC, 6, struct nvgpu_dbg_gpu_suspend_resume_all_sms_args)
 
+struct nvgpu_dbg_gpu_perfbuf_map_args {
+	__u32 dmabuf_fd;	/* in, fd of the dmabuf to map */
+	__u32 reserved;		/* keeps the __u64 fields below 8-byte aligned */
+	__u64 mapping_size;	/* in, size of mapped buffer region */
+	__u64 offset;		/* out, virtual address of the mapping */
+};
+
+struct nvgpu_dbg_gpu_perfbuf_unmap_args {
+	__u64 offset;	/* in, virtual address of the mapping to unmap */
+};
+
+#define NVGPU_DBG_GPU_IOCTL_PERFBUF_MAP \
+	_IOWR(NVGPU_DBG_GPU_IOCTL_MAGIC, 7, struct nvgpu_dbg_gpu_perfbuf_map_args)
+#define NVGPU_DBG_GPU_IOCTL_PERFBUF_UNMAP \
+	_IOWR(NVGPU_DBG_GPU_IOCTL_MAGIC, 8, struct nvgpu_dbg_gpu_perfbuf_unmap_args)
+
 #define NVGPU_DBG_GPU_IOCTL_LAST		\
-	_IOC_NR(NVGPU_DBG_GPU_IOCTL_SUSPEND_RESUME_ALL_SMS)
+	_IOC_NR(NVGPU_DBG_GPU_IOCTL_PERFBUF_UNMAP)
 #define NVGPU_DBG_GPU_IOCTL_MAX_ARG_SIZE		\
-	sizeof(struct nvgpu_dbg_gpu_exec_reg_ops_args)
+	sizeof(struct nvgpu_dbg_gpu_perfbuf_map_args)
 
 /*
  * /dev/nvhost-gpu device