diff --git a/drivers/gpu/nvgpu/Makefile b/drivers/gpu/nvgpu/Makefile
index 6b249a676..93e30e3dc 100644
--- a/drivers/gpu/nvgpu/Makefile
+++ b/drivers/gpu/nvgpu/Makefile
@@ -94,7 +94,7 @@ nvgpu-y += \
 	common/pmu/pmu_fw.o \
 	common/pmu/pg/pmu_pg.o \
 	common/pmu/pg/pmu_aelpg.o \
-	common/pmu/pmu_perfmon.o \
+	common/pmu/perfmon/pmu_perfmon.o \
 	common/pmu/pmu_debug.o \
 	common/pmu/pmu_gk20a.o \
 	common/pmu/pmu_gm20b.o \
diff --git a/drivers/gpu/nvgpu/Makefile.sources b/drivers/gpu/nvgpu/Makefile.sources
index bbe94dd1f..f8ded795c 100644
--- a/drivers/gpu/nvgpu/Makefile.sources
+++ b/drivers/gpu/nvgpu/Makefile.sources
@@ -133,7 +133,7 @@ srcs += common/sim.c \
 	common/pmu/pmu_fw.c \
 	common/pmu/pg/pmu_pg.c \
 	common/pmu/pg/pmu_aelpg.c \
-	common/pmu/pmu_perfmon.c \
+	common/pmu/perfmon/pmu_perfmon.c \
 	common/pmu/pmu_debug.c \
 	common/pmu/pmu_gk20a.c \
 	common/pmu/pmu_gm20b.c \
diff --git a/drivers/gpu/nvgpu/common/pmu/ipc/pmu_msg.c b/drivers/gpu/nvgpu/common/pmu/ipc/pmu_msg.c
index a2b5a7f62..b0278256c 100644
--- a/drivers/gpu/nvgpu/common/pmu/ipc/pmu_msg.c
+++ b/drivers/gpu/nvgpu/common/pmu/ipc/pmu_msg.c
@@ -30,6 +30,7 @@
 #include <nvgpu/pmu/therm.h>
 #include <nvgpu/pmu/lsfm.h>
 #include <nvgpu/pmu/super_surface.h>
+#include <nvgpu/pmu/pmu_perfmon.h>
 
 static int pmu_payload_extract(struct nvgpu_pmu *pmu, struct pmu_sequence *seq)
 {
diff --git a/drivers/gpu/nvgpu/common/pmu/pmu_perfmon.c b/drivers/gpu/nvgpu/common/pmu/perfmon/pmu_perfmon.c
similarity index 82%
rename from drivers/gpu/nvgpu/common/pmu/pmu_perfmon.c
rename to drivers/gpu/nvgpu/common/pmu/perfmon/pmu_perfmon.c
index a9627c793..3ca7e56b5 100644
--- a/drivers/gpu/nvgpu/common/pmu/pmu_perfmon.c
+++ b/drivers/gpu/nvgpu/common/pmu/perfmon/pmu_perfmon.c
@@ -20,17 +20,21 @@
  * DEALINGS IN THE SOFTWARE.
  */
 
+#include <nvgpu/pmu/pmu_perfmon.h>
+#include <nvgpu/gk20a.h>
+#include <nvgpu/falcon.h>
+#include <nvgpu/bug.h>
 #include <nvgpu/enabled.h>
 #include <nvgpu/pmu.h>
 #include <nvgpu/pmu/cmd.h>
 #include <nvgpu/log.h>
 #include <nvgpu/bug.h>
 #include <nvgpu/pmuif/nvgpu_gpmu_cmdif.h>
-#include <nvgpu/gk20a.h>
+#include <nvgpu/kmem.h>
 
 static u8 get_perfmon_id(struct nvgpu_pmu *pmu)
 {
-	struct gk20a *g = gk20a_from_pmu(pmu);
+	struct gk20a *g = pmu->g;
 	u32 ver = g->params.gpu_arch + g->params.gpu_impl;
 	u8 unit_id;
 
@@ -64,7 +68,7 @@ void nvgpu_pmu_perfmon_rpc_handler(struct gk20a *g, struct nvgpu_pmu *pmu,
 	case NV_PMU_RPC_ID_PERFMON_T18X_INIT:
 		nvgpu_pmu_dbg(g,
 			"reply NV_PMU_RPC_ID_PERFMON_INIT");
-		pmu->perfmon_ready = true;
+		pmu->pmu_perfmon->perfmon_ready = true;
 		break;
 	case NV_PMU_RPC_ID_PERFMON_T18X_START:
 		nvgpu_pmu_dbg(g,
@@ -79,8 +83,8 @@ void nvgpu_pmu_perfmon_rpc_handler(struct gk20a *g, struct nvgpu_pmu *pmu,
 			"reply NV_PMU_RPC_ID_PERFMON_QUERY");
 		rpc_param = (struct nv_pmu_rpc_struct_perfmon_query *)
 			rpc_payload->rpc_buff;
-		pmu->load = rpc_param->sample_buffer[0];
-		pmu->perfmon_query = 1;
+		pmu->pmu_perfmon->load = rpc_param->sample_buffer[0];
+		pmu->pmu_perfmon->perfmon_query = 1;
 		/* set perfmon_query to 1 after load is copied */
 		break;
 	default:
@@ -89,9 +93,40 @@ void nvgpu_pmu_perfmon_rpc_handler(struct gk20a *g, struct nvgpu_pmu *pmu,
 	}
 }
 
+/*
+ * Allocate pmu->pmu_perfmon on first use. An existing allocation is kept
+ * as-is so the same memory is reused after railgating.
+ * Returns 0 on success, -ENOMEM when the allocation fails.
+ */
+int nvgpu_pmu_initialize_perfmon(struct gk20a *g, struct nvgpu_pmu *pmu)
+{
+	struct nvgpu_pmu_perfmon *perfmon = pmu->pmu_perfmon;
+
+	if (perfmon == NULL) {
+		/* One-time memory allocation for pmu_perfmon */
+		perfmon = nvgpu_kzalloc(g, sizeof(*perfmon));
+		pmu->pmu_perfmon = perfmon;
+		if (perfmon == NULL) {
+			nvgpu_err(g, "failed to initialize  perfmon");
+			return -ENOMEM;
+		}
+	}
+
+	return 0;
+}
+
+/* Free perfmon state; NULL the pointer so a later init cannot reuse it. */
+void nvgpu_pmu_deinitialize_perfmon(struct gk20a *g, struct nvgpu_pmu *pmu)
+{
+	if (pmu->pmu_perfmon != NULL) {
+		nvgpu_kfree(g, pmu->pmu_perfmon);
+		pmu->pmu_perfmon = NULL;
+	}
+}
+
 int nvgpu_pmu_init_perfmon(struct nvgpu_pmu *pmu)
 {
-	struct gk20a *g = gk20a_from_pmu(pmu);
+	struct gk20a *g = pmu->g;
 	struct pmu_v *pv = &g->ops.pmu_ver;
 	struct pmu_cmd cmd;
 	struct pmu_payload payload;
@@ -104,16 +139,16 @@ int nvgpu_pmu_init_perfmon(struct nvgpu_pmu *pmu)
 
 	nvgpu_log_fn(g, " ");
 
-	pmu->perfmon_ready = false;
+	pmu->pmu_perfmon->perfmon_ready = false;
 
 	g->ops.pmu.pmu_init_perfmon_counter(g);
 
-	if (pmu->sample_buffer == 0U) {
+	if (pmu->pmu_perfmon->sample_buffer == 0U) {
 		tmp_addr = nvgpu_alloc(&pmu->dmem, 2U * sizeof(u16));
 		nvgpu_assert(tmp_addr <= U32_MAX);
-		pmu->sample_buffer = (u32)tmp_addr;
+		pmu->pmu_perfmon->sample_buffer = (u32)tmp_addr;
 	}
-	if (pmu->sample_buffer == 0U) {
+	if (pmu->pmu_perfmon->sample_buffer == 0U) {
 		nvgpu_err(g, "failed to allocate perfmon sample buffer");
 		return -ENOMEM;
 	}
@@ -133,10 +168,9 @@ int nvgpu_pmu_init_perfmon(struct nvgpu_pmu *pmu)
 	cmd.cmd.perfmon.cmd_type = PMU_PERFMON_CMD_ID_INIT;
 	/* buffer to save counter values for pmu perfmon */
 	pv->perfmon_cmd_init_set_sample_buffer(&cmd.cmd.perfmon,
-	(u16)pmu->sample_buffer);
+	(u16)pmu->pmu_perfmon->sample_buffer);
 	/* number of sample periods below lower threshold
 	 * before pmu triggers perfmon decrease event
-	 * TBD: = 15
 	 */
 	pv->perfmon_cmd_init_set_dec_cnt(&cmd.cmd.perfmon, 15);
 	/* index of base counter, aka. always ticking counter */
@@ -171,7 +205,7 @@ int nvgpu_pmu_init_perfmon(struct nvgpu_pmu *pmu)
 
 int nvgpu_pmu_perfmon_start_sampling(struct nvgpu_pmu *pmu)
 {
-	struct gk20a *g = gk20a_from_pmu(pmu);
+	struct gk20a *g = pmu->g;
 	struct pmu_v *pv = &g->ops.pmu_ver;
 	struct pmu_cmd cmd;
 	struct pmu_payload payload;
@@ -197,7 +231,7 @@ int nvgpu_pmu_perfmon_start_sampling(struct nvgpu_pmu *pmu)
 	pv->perfmon_start_set_group_id(&cmd.cmd.perfmon,
 		PMU_DOMAIN_GROUP_PSTATE);
 	pv->perfmon_start_set_state_id(&cmd.cmd.perfmon,
-		pmu->perfmon_state_id[PMU_DOMAIN_GROUP_PSTATE]);
+		pmu->pmu_perfmon->perfmon_state_id[PMU_DOMAIN_GROUP_PSTATE]);
 
 	pv->perfmon_start_set_flags(&cmd.cmd.perfmon,
 		PMU_PERFMON_FLAG_ENABLE_INCREASE |
@@ -230,7 +264,7 @@ int nvgpu_pmu_perfmon_start_sampling(struct nvgpu_pmu *pmu)
 
 int nvgpu_pmu_perfmon_stop_sampling(struct nvgpu_pmu *pmu)
 {
-	struct gk20a *g = gk20a_from_pmu(pmu);
+	struct gk20a *g = pmu->g;
 	struct pmu_cmd cmd;
 	u64 tmp_size;
 
@@ -258,7 +292,7 @@ int nvgpu_pmu_perfmon_stop_sampling(struct nvgpu_pmu *pmu)
 
 int nvgpu_pmu_load_norm(struct gk20a *g, u32 *load)
 {
-	*load = g->pmu.load_shadow;
+	*load = g->pmu.pmu_perfmon->load_shadow;
 	return 0;
 }
 
@@ -267,28 +301,27 @@ int nvgpu_pmu_load_update(struct gk20a *g)
 	struct nvgpu_pmu *pmu = &g->pmu;
 	u32 load = 0;
 	int err = 0;
-
-	if (!pmu->perfmon_ready) {
-		pmu->load_shadow = 0;
-		pmu->load = 0;
+	if (!pmu->pmu_perfmon->perfmon_ready) {
+		pmu->pmu_perfmon->load_shadow = 0;
+		pmu->pmu_perfmon->load = 0;
 		return 0;
 	}
 
 	if (g->ops.pmu.pmu_perfmon_get_samples_rpc != NULL) {
 		nvgpu_pmu_perfmon_get_samples_rpc(pmu);
-		load = pmu->load;
+		load = pmu->pmu_perfmon->load;
 	} else {
 		err = nvgpu_falcon_copy_from_dmem(&pmu->flcn,
-						  pmu->sample_buffer,
-						  (u8 *)&load, 2 * 1, 0);
+			pmu->pmu_perfmon->sample_buffer, (u8 *)&load, 2 * 1, 0);
 		if (err != 0) {
 			nvgpu_err(g, "PMU falcon DMEM copy failed");
 			return err;
 		}
 	}
 
-	pmu->load_shadow = load / 10U;
-	pmu->load_avg = (((9U*pmu->load_avg) + pmu->load_shadow) / 10U);
+	pmu->pmu_perfmon->load_shadow = load / 10U;
+	pmu->pmu_perfmon->load_avg = (((9U*pmu->pmu_perfmon->load_avg) +
+		pmu->pmu_perfmon->load_shadow) / 10U);
 
 	return err;
 }
@@ -365,7 +398,7 @@ void nvgpu_pmu_reset_load_counters(struct gk20a *g)
 int nvgpu_pmu_handle_perfmon_event(struct nvgpu_pmu *pmu,
 			struct pmu_perfmon_msg *msg)
 {
-	struct gk20a *g = gk20a_from_pmu(pmu);
+	struct gk20a *g = pmu->g;
 
 	nvgpu_log_fn(g, " ");
 
@@ -374,16 +407,16 @@ int nvgpu_pmu_handle_perfmon_event(struct nvgpu_pmu *pmu,
 		nvgpu_pmu_dbg(g, "perfmon increase event: ");
 		nvgpu_pmu_dbg(g, "state_id %d, ground_id %d, pct %d",
 			msg->gen.state_id, msg->gen.group_id, msg->gen.data);
-		(pmu->perfmon_events_cnt)++;
+		(pmu->pmu_perfmon->perfmon_events_cnt)++;
 		break;
 	case PMU_PERFMON_MSG_ID_DECREASE_EVENT:
 		nvgpu_pmu_dbg(g, "perfmon decrease event: ");
 		nvgpu_pmu_dbg(g, "state_id %d, ground_id %d, pct %d",
 			msg->gen.state_id, msg->gen.group_id, msg->gen.data);
-		(pmu->perfmon_events_cnt)++;
+		(pmu->pmu_perfmon->perfmon_events_cnt)++;
 		break;
 	case PMU_PERFMON_MSG_ID_INIT_EVENT:
-		pmu->perfmon_ready = true;
+		pmu->pmu_perfmon->perfmon_ready = true;
 		nvgpu_pmu_dbg(g, "perfmon init event");
 		break;
 	default:
@@ -393,7 +426,7 @@ int nvgpu_pmu_handle_perfmon_event(struct nvgpu_pmu *pmu,
 	}
 
 	/* restart sampling */
-	if (pmu->perfmon_sampling_enabled) {
+	if (pmu->pmu_perfmon->perfmon_sampling_enabled) {
 		return g->ops.pmu.pmu_perfmon_start_sampling(&(g->pmu));
 	}
 
@@ -403,7 +436,7 @@ int nvgpu_pmu_handle_perfmon_event(struct nvgpu_pmu *pmu,
 /* Perfmon RPC */
 int nvgpu_pmu_init_perfmon_rpc(struct nvgpu_pmu *pmu)
 {
-	struct gk20a *g = gk20a_from_pmu(pmu);
+	struct gk20a *g = pmu->g;
 	struct nv_pmu_rpc_struct_perfmon_init rpc;
 	int status = 0;
 
@@ -414,7 +447,7 @@ int nvgpu_pmu_init_perfmon_rpc(struct nvgpu_pmu *pmu)
 	nvgpu_log_fn(g, " ");
 
 	(void) memset(&rpc, 0, sizeof(struct nv_pmu_rpc_struct_perfmon_init));
-	pmu->perfmon_ready = false;
+	pmu->pmu_perfmon->perfmon_ready = false;
 
 	g->ops.pmu.pmu_init_perfmon_counter(g);
 
@@ -451,7 +484,7 @@ exit:
 
 int nvgpu_pmu_perfmon_start_sampling_rpc(struct nvgpu_pmu *pmu)
 {
-	struct gk20a *g = gk20a_from_pmu(pmu);
+	struct gk20a *g = pmu->g;
 	struct nv_pmu_rpc_struct_perfmon_start rpc;
 	int status = 0;
 
@@ -463,7 +496,7 @@ int nvgpu_pmu_perfmon_start_sampling_rpc(struct nvgpu_pmu *pmu)
 
 	(void) memset(&rpc, 0, sizeof(struct nv_pmu_rpc_struct_perfmon_start));
 	rpc.group_id = PMU_DOMAIN_GROUP_PSTATE;
-	rpc.state_id = pmu->perfmon_state_id[PMU_DOMAIN_GROUP_PSTATE];
+	rpc.state_id = pmu->pmu_perfmon->perfmon_state_id[PMU_DOMAIN_GROUP_PSTATE];
 	rpc.flags = PMU_PERFMON_FLAG_ENABLE_INCREASE |
 				PMU_PERFMON_FLAG_ENABLE_DECREASE |
 				PMU_PERFMON_FLAG_CLEAR_PREV;
@@ -482,7 +515,7 @@ int nvgpu_pmu_perfmon_start_sampling_rpc(struct nvgpu_pmu *pmu)
 
 int nvgpu_pmu_perfmon_stop_sampling_rpc(struct nvgpu_pmu *pmu)
 {
-	struct gk20a *g = gk20a_from_pmu(pmu);
+	struct gk20a *g = pmu->g;
 	struct nv_pmu_rpc_struct_perfmon_stop rpc;
 	int status = 0;
 
@@ -505,7 +538,7 @@ int nvgpu_pmu_perfmon_stop_sampling_rpc(struct nvgpu_pmu *pmu)
 
 int nvgpu_pmu_perfmon_get_samples_rpc(struct nvgpu_pmu *pmu)
 {
-	struct gk20a *g = gk20a_from_pmu(pmu);
+	struct gk20a *g = pmu->g;
 	struct nv_pmu_rpc_struct_perfmon_query rpc;
 	int status = 0;
 
@@ -514,7 +547,7 @@ int nvgpu_pmu_perfmon_get_samples_rpc(struct nvgpu_pmu *pmu)
 	}
 
 	nvgpu_log_fn(g, " ");
-	pmu->perfmon_query = 0;
+	pmu->pmu_perfmon->perfmon_query = 0;
 	(void) memset(&rpc, 0, sizeof(struct nv_pmu_rpc_struct_perfmon_query));
 	/* PERFMON QUERY */
 	nvgpu_pmu_dbg(g, "RPC post NV_PMU_RPC_ID_PERFMON_QUERY\n");
@@ -524,7 +557,55 @@ int nvgpu_pmu_perfmon_get_samples_rpc(struct nvgpu_pmu *pmu)
 	}
 
 	pmu_wait_message_cond(pmu, nvgpu_get_poll_timeout(g),
-				      &pmu->perfmon_query, 1);
+				      &pmu->pmu_perfmon->perfmon_query, 1);
 
 	return status;
 }
+
+/*
+ * Report whether perfmon sampling is enabled (non-zero) or not (0).
+ * Reads as disabled while the perfmon state is not allocated.
+ */
+int nvgpu_pmu_perfmon_get_sampling_enable_status(struct nvgpu_pmu *pmu)
+{
+	if (pmu->pmu_perfmon == NULL) {
+		return 0;
+	}
+
+	return pmu->pmu_perfmon->perfmon_sampling_enabled;
+}
+
+/*
+ * Record whether perfmon sampling should be enabled.
+ * NOTE(review): the request is dropped while the perfmon state is not
+ * allocated; confirm no caller relies on setting it that early.
+ */
+void nvgpu_pmu_perfmon_set_sampling_enable_status(struct nvgpu_pmu *pmu,
+							bool status)
+{
+	if (pmu->pmu_perfmon == NULL) {
+		return;
+	}
+
+	pmu->pmu_perfmon->perfmon_sampling_enabled = status;
+}
+
+/* Number of perfmon events counted so far; 0 without perfmon state. */
+u64 nvgpu_pmu_perfmon_get_events_count(struct nvgpu_pmu *pmu)
+{
+	if (pmu->pmu_perfmon == NULL) {
+		return 0ULL;
+	}
+
+	return pmu->pmu_perfmon->perfmon_events_cnt;
+}
+
+/* Current load average; 0 without perfmon state. */
+u32 nvgpu_pmu_perfmon_get_load_avg(struct nvgpu_pmu *pmu)
+{
+	if (pmu->pmu_perfmon == NULL) {
+		return 0U;
+	}
+
+	return pmu->pmu_perfmon->load_avg;
+}
diff --git a/drivers/gpu/nvgpu/common/pmu/pmu.c b/drivers/gpu/nvgpu/common/pmu/pmu.c
index 9b2e521e6..ee4e47b9b 100644
--- a/drivers/gpu/nvgpu/common/pmu/pmu.c
+++ b/drivers/gpu/nvgpu/common/pmu/pmu.c
@@ -37,6 +37,7 @@
 #include <nvgpu/nvgpu_err.h>
 #include <nvgpu/pmu/lsfm.h>
 #include <nvgpu/pmu/super_surface.h>
+#include <nvgpu/pmu/pmu_perfmon.h>
 
 static void pmu_report_error(struct gk20a *g, u32 err_type,
 		u32 status, u32 pmu_err_type)
@@ -161,10 +162,6 @@ static int nvgpu_init_pmu_setup_sw(struct gk20a *g)
 		goto skip_init;
 	}
 
-	/* no infoRom script from vbios? */
-
-	/* TBD: sysmon subtask */
-
 	err = nvgpu_pmu_mutexes_alloc(g, &pmu->mutexes);
 	if (err != 0) {
 		goto err;
@@ -367,7 +364,7 @@ int nvgpu_pmu_destroy(struct gk20a *g)
 
 	nvgpu_pmu_state_change(g, PMU_STATE_OFF, false);
 	pmu->pmu_ready = false;
-	pmu->perfmon_ready = false;
+	pmu->pmu_perfmon->perfmon_ready = false;
 	pmu->pmu_pg.zbc_ready = false;
 	nvgpu_set_enabled(g, NVGPU_PMU_FECS_BOOTSTRAP_DONE, false);
 
@@ -461,7 +458,7 @@ void nvgpu_pmu_report_bar0_pri_err_status(struct gk20a *g, u32 bar0_status,
 }
 
 int nvgpu_pmu_lock_acquire(struct gk20a *g, struct nvgpu_pmu *pmu,
-			   u32 id, u32 *token)
+			u32 id, u32 *token)
 {
 	if (!g->support_ls_pmu) {
 		return 0;
@@ -475,7 +472,7 @@ int nvgpu_pmu_lock_acquire(struct gk20a *g, struct nvgpu_pmu *pmu,
 }
 
 int nvgpu_pmu_lock_release(struct gk20a *g, struct nvgpu_pmu *pmu,
-			   u32 id, u32 *token)
+			u32 id, u32 *token)
 {
 	if (!g->support_ls_pmu) {
 		return 0;
diff --git a/drivers/gpu/nvgpu/common/pmu/pmu_fw.c b/drivers/gpu/nvgpu/common/pmu/pmu_fw.c
index 5fcc53743..008eb5076 100644
--- a/drivers/gpu/nvgpu/common/pmu/pmu_fw.c
+++ b/drivers/gpu/nvgpu/common/pmu/pmu_fw.c
@@ -34,6 +34,7 @@
 #include <nvgpu/pmu/pstate.h>
 #include <nvgpu/pmu/volt.h>
 #include <nvgpu/pmu/clk/clk.h>
+#include <nvgpu/pmu/pmu_perfmon.h>
 #include <nvgpu/pmu/allocator.h>
 #include <nvgpu/pmu/lsfm.h>
 #include <nvgpu/pmu/super_surface.h>
@@ -61,32 +62,32 @@ static u32 pmu_perfmon_cntr_sz_v2(struct nvgpu_pmu *pmu)
 
 static void *get_perfmon_cntr_ptr_v2(struct nvgpu_pmu *pmu)
 {
-	return (void *)(&pmu->perfmon_counter_v2);
+	return (void *)(&pmu->pmu_perfmon->perfmon_counter_v2);
 }
 
 static void set_perfmon_cntr_ut_v2(struct nvgpu_pmu *pmu, u16 ut)
 {
-	pmu->perfmon_counter_v2.upper_threshold = ut;
+	pmu->pmu_perfmon->perfmon_counter_v2.upper_threshold = ut;
 }
 
 static void set_perfmon_cntr_lt_v2(struct nvgpu_pmu *pmu, u16 lt)
 {
-	pmu->perfmon_counter_v2.lower_threshold = lt;
+	pmu->pmu_perfmon->perfmon_counter_v2.lower_threshold = lt;
 }
 
 static void set_perfmon_cntr_valid_v2(struct nvgpu_pmu *pmu, u8 valid)
 {
-	pmu->perfmon_counter_v2.valid = valid;
+	pmu->pmu_perfmon->perfmon_counter_v2.valid = valid;
 }
 
 static void set_perfmon_cntr_index_v2(struct nvgpu_pmu *pmu, u8 index)
 {
-	pmu->perfmon_counter_v2.index = index;
+	pmu->pmu_perfmon->perfmon_counter_v2.index = index;
 }
 
 static void set_perfmon_cntr_group_id_v2(struct nvgpu_pmu *pmu, u8 gid)
 {
-	pmu->perfmon_counter_v2.group_id = gid;
+	pmu->pmu_perfmon->perfmon_counter_v2.group_id = gid;
 }
 
 static void set_pmu_cmdline_args_falctracedmabase_v4(struct nvgpu_pmu *pmu)
@@ -1659,6 +1660,8 @@ static void nvgpu_remove_pmu_support(struct nvgpu_pmu *pmu)
 
 	nvgpu_pmu_lsfm_deinit(g, pmu, pmu->lsfm);
 
+	/* de-allocate memory space of pmu_perfmon */
+	nvgpu_pmu_deinitialize_perfmon(g, pmu);
 	nvgpu_mutex_destroy(&pmu->pmu_pg.elpg_mutex);
 	nvgpu_mutex_destroy(&pmu->pmu_pg.pg_mutex);
 	nvgpu_mutex_destroy(&pmu->isr_mutex);
@@ -1692,7 +1695,8 @@ static int init_pmu_ucode(struct nvgpu_pmu *pmu)
 	} else {
 		/* secure boot ucodes's */
 		nvgpu_pmu_dbg(g, "requesting PMU ucode image");
-		pmu->fw_image = nvgpu_request_firmware(g, NVGPU_PMU_UCODE_IMAGE, 0);
+		pmu->fw_image = nvgpu_request_firmware(g, NVGPU_PMU_UCODE_IMAGE,
+							0);
 		if (pmu->fw_image == NULL) {
 			nvgpu_err(g, "failed to load pmu ucode!!");
 			err = -ENOENT;
@@ -1700,7 +1704,8 @@ static int init_pmu_ucode(struct nvgpu_pmu *pmu)
 		}
 
 		nvgpu_pmu_dbg(g, "requesting PMU ucode desc");
-		pmu->fw_desc = nvgpu_request_firmware(g, NVGPU_PMU_UCODE_DESC, 0);
+		pmu->fw_desc = nvgpu_request_firmware(g, NVGPU_PMU_UCODE_DESC,
+							0);
 		if (pmu->fw_desc == NULL) {
 			nvgpu_err(g, "failed to load pmu ucode desc!!");
 			err = -ENOENT;
@@ -1779,6 +1784,12 @@ int nvgpu_early_init_pmu_sw(struct gk20a *g, struct nvgpu_pmu *pmu)
 		goto init_failed;
 	}
 
+	/* Allocate memory for pmu_perfmon */
+	err = nvgpu_pmu_initialize_perfmon(g, pmu);
+	if (err != 0) {
+		goto exit;
+	}
+
 	err = init_pmu_ucode(pmu);
 	if (err != 0) {
 		goto init_failed;
diff --git a/drivers/gpu/nvgpu/gm20b/hal_gm20b.c b/drivers/gpu/nvgpu/gm20b/hal_gm20b.c
index f36f0e7ed..71adfb14d 100644
--- a/drivers/gpu/nvgpu/gm20b/hal_gm20b.c
+++ b/drivers/gpu/nvgpu/gm20b/hal_gm20b.c
@@ -42,7 +42,7 @@
 #include <nvgpu/gr/gr_intr.h>
 #include <nvgpu/gr/gr_falcon.h>
 #include <nvgpu/gr/setup.h>
-#include <nvgpu/gr/gr.h>
+#include <nvgpu/pmu/pmu_perfmon.h>
 
 #include "hal/mc/mc_gm20b.h"
 #include "hal/bus/bus_gm20b.h"
diff --git a/drivers/gpu/nvgpu/gp10b/hal_gp10b.c b/drivers/gpu/nvgpu/gp10b/hal_gp10b.c
index 94d428627..658b412c0 100644
--- a/drivers/gpu/nvgpu/gp10b/hal_gp10b.c
+++ b/drivers/gpu/nvgpu/gp10b/hal_gp10b.c
@@ -43,6 +43,7 @@
 #include <nvgpu/gr/fecs_trace.h>
 #include <nvgpu/gr/gr.h>
 #include <nvgpu/gr/gr_intr.h>
+#include <nvgpu/pmu/pmu_perfmon.h>
 
 #include "hal/mc/mc_gm20b.h"
 #include "hal/mc/mc_gp10b.h"
diff --git a/drivers/gpu/nvgpu/gv100/hal_gv100.c b/drivers/gpu/nvgpu/gv100/hal_gv100.c
index aabdda9a5..62828bcce 100644
--- a/drivers/gpu/nvgpu/gv100/hal_gv100.c
+++ b/drivers/gpu/nvgpu/gv100/hal_gv100.c
@@ -180,6 +180,7 @@
 #include <nvgpu/gr/setup.h>
 #include <nvgpu/gr/fecs_trace.h>
 #include <nvgpu/gr/gr.h>
+#include <nvgpu/pmu/pmu_perfmon.h>
 #include <nvgpu/gr/gr_intr.h>
 
 #include <nvgpu/hw/gv100/hw_proj_gv100.h>
diff --git a/drivers/gpu/nvgpu/gv11b/hal_gv11b.c b/drivers/gpu/nvgpu/gv11b/hal_gv11b.c
index 649491901..15dd03d99 100644
--- a/drivers/gpu/nvgpu/gv11b/hal_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/hal_gv11b.c
@@ -26,6 +26,7 @@
 #include <nvgpu/regops.h>
 #include <nvgpu/gr/gr_falcon.h>
 #include <nvgpu/gr/gr.h>
+#include <nvgpu/pmu/pmu_perfmon.h>
 
 #include "hal/mc/mc_gm20b.h"
 #include "hal/mc/mc_gp10b.h"
diff --git a/drivers/gpu/nvgpu/include/nvgpu/pmu.h b/drivers/gpu/nvgpu/include/nvgpu/pmu.h
index 41cb790a6..a2fdb0b92 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/pmu.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/pmu.h
@@ -214,7 +214,6 @@ struct nvgpu_pmu {
 
 	struct nvgpu_pmu_lsfm *lsfm;
 
-	/* TBD: remove this if ZBC seq is fixed */
 	struct nvgpu_mem seq_buf;
 	struct nvgpu_mem trace_buf;
 
@@ -233,27 +232,16 @@ struct nvgpu_pmu {
 
 	bool pmu_ready;
 
-	u32 perfmon_query;
-
 	u32 mscg_stat;
 	u32 mscg_transition_state;
 
 	u32 pmu_state;
 
 	struct nvgpu_pmu_pg pmu_pg;
-	union {
-		struct pmu_perfmon_counter_v2 perfmon_counter_v2;
-	};
-	u8 perfmon_state_id[PMU_DOMAIN_GROUP_NUM];
+	struct nvgpu_pmu_perfmon *pmu_perfmon;
 
 	void (*remove_support)(struct nvgpu_pmu *pmu);
 	bool sw_ready;
-	bool perfmon_ready;
-
-	u32 sample_buffer;
-	u32 load_shadow;
-	u32 load_avg;
-	u32 load;
 
 	struct nvgpu_mutex isr_mutex;
 	bool isr_enabled;
@@ -264,8 +252,7 @@ struct nvgpu_pmu {
 		struct pmu_cmdline_args_v5 args_v5;
 		struct pmu_cmdline_args_v6 args_v6;
 	};
-	unsigned long perfmon_events_cnt;
-	bool perfmon_sampling_enabled;
+
 	u32 override_done;
 };
 
@@ -289,26 +276,6 @@ int nvgpu_pmu_lock_acquire(struct gk20a *g, struct nvgpu_pmu *pmu,
 int nvgpu_pmu_lock_release(struct gk20a *g, struct nvgpu_pmu *pmu,
 			   u32 id, u32 *token);
 
-/* perfmon */
-void nvgpu_pmu_perfmon_rpc_handler(struct gk20a *g, struct nvgpu_pmu *pmu,
-				   struct nv_pmu_rpc_header *rpc,
-				   struct rpc_handler_payload *rpc_payload);
-int nvgpu_pmu_init_perfmon(struct nvgpu_pmu *pmu);
-int nvgpu_pmu_perfmon_start_sampling(struct nvgpu_pmu *pmu);
-int nvgpu_pmu_perfmon_stop_sampling(struct nvgpu_pmu *pmu);
-int nvgpu_pmu_perfmon_start_sampling_rpc(struct nvgpu_pmu *pmu);
-int nvgpu_pmu_perfmon_stop_sampling_rpc(struct nvgpu_pmu *pmu);
-int nvgpu_pmu_perfmon_get_samples_rpc(struct nvgpu_pmu *pmu);
-int nvgpu_pmu_handle_perfmon_event(struct nvgpu_pmu *pmu,
-	struct pmu_perfmon_msg *msg);
-int nvgpu_pmu_init_perfmon_rpc(struct nvgpu_pmu *pmu);
-int nvgpu_pmu_load_norm(struct gk20a *g, u32 *load);
-int nvgpu_pmu_load_update(struct gk20a *g);
-int nvgpu_pmu_busy_cycles_norm(struct gk20a *g, u32 *norm);
-void nvgpu_pmu_reset_load_counters(struct gk20a *g);
-void nvgpu_pmu_get_load_counters(struct gk20a *g, u32 *busy_cycles,
-		u32 *total_cycles);
-
 int nvgpu_pmu_handle_therm_event(struct nvgpu_pmu *pmu,
 			struct nv_pmu_therm_msg *msg);
 
diff --git a/drivers/gpu/nvgpu/include/nvgpu/pmu/pmu_perfmon.h b/drivers/gpu/nvgpu/include/nvgpu/pmu/pmu_perfmon.h
new file mode 100644
index 000000000..e8c187e7d
--- /dev/null
+++ b/drivers/gpu/nvgpu/include/nvgpu/pmu/pmu_perfmon.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef NVGPU_PMU_PERFMON_H
+#define NVGPU_PMU_PERFMON_H
+
+//#include <nvgpu/enabled.h>
+#include <nvgpu/pmuif/nvgpu_gpmu_cmdif.h>
+#include <nvgpu/pmuif/gpmuif_perfmon.h>
+
+struct gk20a;
+struct nvgpu_pmu;
+struct pmu_perfmon_msg;
+struct rpc_handler_payload;
+struct nv_pmu_rpc_header;
+
+struct nvgpu_pmu_perfmon {	/* perfmon state referenced from struct nvgpu_pmu */
+	struct pmu_perfmon_counter_v2 perfmon_counter_v2; /* thresholds, index, group id, valid */
+	u64 perfmon_events_cnt;	/* bumped on each increase/decrease event */
+	u32 perfmon_query;	/* 0 before a QUERY RPC is posted, 1 once load is copied */
+	u8 perfmon_state_id[PMU_DOMAIN_GROUP_NUM]; /* state id sent per domain group */
+	u32 sample_buffer;	/* allocated from pmu->dmem; 0 = not allocated yet */
+	u32 load_shadow;	/* most recent load sample divided by 10 */
+	u32 load_avg;		/* (9 * load_avg + load_shadow) / 10 */
+	u32 load;		/* sample_buffer[0] of the last QUERY RPC reply */
+	bool perfmon_ready;	/* true after the perfmon INIT event/reply */
+	bool perfmon_sampling_enabled; /* when set, event handler restarts sampling */
+};
+
+/* perfmon */
+void nvgpu_pmu_perfmon_rpc_handler(struct gk20a *g, struct nvgpu_pmu *pmu,
+		struct nv_pmu_rpc_header *rpc,
+		struct rpc_handler_payload *rpc_payload);
+int nvgpu_pmu_initialize_perfmon(struct gk20a *g, struct nvgpu_pmu *pmu);
+void nvgpu_pmu_deinitialize_perfmon(struct gk20a *g, struct nvgpu_pmu *pmu);
+int nvgpu_pmu_init_perfmon(struct nvgpu_pmu *pmu);
+int nvgpu_pmu_perfmon_start_sampling(struct nvgpu_pmu *pmu);
+int nvgpu_pmu_perfmon_stop_sampling(struct nvgpu_pmu *pmu);
+int nvgpu_pmu_perfmon_start_sampling_rpc(struct nvgpu_pmu *pmu);
+int nvgpu_pmu_perfmon_stop_sampling_rpc(struct nvgpu_pmu *pmu);
+int nvgpu_pmu_perfmon_get_samples_rpc(struct nvgpu_pmu *pmu);
+int nvgpu_pmu_handle_perfmon_event(struct nvgpu_pmu *pmu,
+		struct pmu_perfmon_msg *msg);
+int nvgpu_pmu_init_perfmon_rpc(struct nvgpu_pmu *pmu);
+int nvgpu_pmu_load_norm(struct gk20a *g, u32 *load);
+int nvgpu_pmu_load_update(struct gk20a *g);
+int nvgpu_pmu_busy_cycles_norm(struct gk20a *g, u32 *norm);
+void nvgpu_pmu_reset_load_counters(struct gk20a *g);
+void nvgpu_pmu_get_load_counters(struct gk20a *g, u32 *busy_cycles,
+		u32 *total_cycles);
+int nvgpu_pmu_perfmon_get_sampling_enable_status(struct nvgpu_pmu *pmu);
+void nvgpu_pmu_perfmon_set_sampling_enable_status(struct nvgpu_pmu *pmu,
+		bool status);
+u64 nvgpu_pmu_perfmon_get_events_count(struct nvgpu_pmu *pmu);
+u32 nvgpu_pmu_perfmon_get_load_avg(struct nvgpu_pmu *pmu);
+
+#endif /* NVGPU_PMU_PERFMON_H */
diff --git a/drivers/gpu/nvgpu/os/linux/debug_pmu.c b/drivers/gpu/nvgpu/os/linux/debug_pmu.c
index 6028e533d..d81241144 100644
--- a/drivers/gpu/nvgpu/os/linux/debug_pmu.c
+++ b/drivers/gpu/nvgpu/os/linux/debug_pmu.c
@@ -13,6 +13,8 @@
  */
 
 #include <nvgpu/enabled.h>
+#include <nvgpu/pmu/pmu_perfmon.h>
+
 #include "debug_pmu.h"
 #include "os_linux.h"
 
@@ -314,7 +316,8 @@ static int perfmon_events_enable_show(struct seq_file *s, void *data)
 {
 	struct gk20a *g = s->private;
 
-	seq_printf(s, "%u\n", g->pmu.perfmon_sampling_enabled ? 1 : 0);
+	seq_printf(s, "%u\n",
+		nvgpu_pmu_perfmon_get_sampling_enable_status(&(g->pmu)) ? 1 : 0);
 	return 0;
 
 }
@@ -333,6 +336,7 @@ static ssize_t perfmon_events_enable_write(struct file *file,
 	char buf[40];
 	int buf_size;
 	int err;
+	bool status;
 
 	(void) memset(buf, 0, sizeof(buf));
 	buf_size = min(count, (sizeof(buf)-1));
@@ -349,18 +353,22 @@ static ssize_t perfmon_events_enable_write(struct file *file,
 		if (err)
 			return err;
 
-		if (val && !g->pmu.perfmon_sampling_enabled &&
-				nvgpu_is_enabled(g, NVGPU_PMU_PERFMON)) {
-			g->pmu.perfmon_sampling_enabled = true;
+		if (val && !nvgpu_pmu_perfmon_get_sampling_enable_status(&(g->pmu))
+			&& nvgpu_is_enabled(g, NVGPU_PMU_PERFMON)) {
+			nvgpu_pmu_perfmon_set_sampling_enable_status(&(g->pmu),
+									true);
 			g->ops.pmu.pmu_perfmon_start_sampling(&(g->pmu));
-		} else if (!val && g->pmu.perfmon_sampling_enabled &&
-				nvgpu_is_enabled(g, NVGPU_PMU_PERFMON)) {
-			g->pmu.perfmon_sampling_enabled = false;
+		} else if (!val
+			&& nvgpu_pmu_perfmon_get_sampling_enable_status(&(g->pmu))
+			&& nvgpu_is_enabled(g, NVGPU_PMU_PERFMON)) {
+			nvgpu_pmu_perfmon_set_sampling_enable_status(&(g->pmu),
+									false);
 			g->ops.pmu.pmu_perfmon_stop_sampling(&(g->pmu));
 		}
 		gk20a_idle(g);
 	} else {
-		g->pmu.perfmon_sampling_enabled = val ? true : false;
+		status = val ? true : false;
+		nvgpu_pmu_perfmon_set_sampling_enable_status(&(g->pmu), status);
 	}
 
 	return count;
@@ -378,7 +386,7 @@ static int perfmon_events_count_show(struct seq_file *s, void *data)
 {
 	struct gk20a *g = s->private;
 
-	seq_printf(s, "%lu\n", g->pmu.perfmon_events_cnt);
+	seq_printf(s, "%llu\n", nvgpu_pmu_perfmon_get_events_count(&(g->pmu)));
 	return 0;
 
 }
diff --git a/drivers/gpu/nvgpu/os/linux/platform_gk20a_tegra.c b/drivers/gpu/nvgpu/os/linux/platform_gk20a_tegra.c
index bcc13c463..390a1a71e 100644
--- a/drivers/gpu/nvgpu/os/linux/platform_gk20a_tegra.c
+++ b/drivers/gpu/nvgpu/os/linux/platform_gk20a_tegra.c
@@ -44,7 +44,7 @@
 #include <nvgpu/gk20a.h>
 #include <nvgpu/gr/global_ctx.h>
 #include <nvgpu/nvhost.h>
-
+#include <nvgpu/pmu/pmu_perfmon.h>
 #include <nvgpu/linux/dma.h>
 
 #include "gm20b/clk_gm20b.h"
@@ -172,7 +172,8 @@ static unsigned long gk20a_tegra_get_emc_rate(struct gk20a *g,
 	/* When scaling emc, account for the gpu load when the
 	 * gpu frequency is less than or equal to fmax@vmin. */
 	if (gpu_freq <= gpu_fmax_at_vmin)
-		emc_scale = min(g->pmu.load_avg, g->emc3d_ratio);
+		emc_scale = min(nvgpu_pmu_perfmon_get_load_avg(&(g->pmu)),
+					g->emc3d_ratio);
 	else
 		emc_scale = g->emc3d_ratio;
 
diff --git a/drivers/gpu/nvgpu/os/linux/platform_gp10b_tegra.c b/drivers/gpu/nvgpu/os/linux/platform_gp10b_tegra.c
index 126ab8a16..6992fa5ff 100644
--- a/drivers/gpu/nvgpu/os/linux/platform_gp10b_tegra.c
+++ b/drivers/gpu/nvgpu/os/linux/platform_gp10b_tegra.c
@@ -34,6 +34,7 @@
 #include <nvgpu/enabled.h>
 #include <nvgpu/gk20a.h>
 #include <nvgpu/nvhost.h>
+#include <nvgpu/pmu/pmu_perfmon.h>
 
 #include "os_linux.h"
 
diff --git a/drivers/gpu/nvgpu/os/linux/scale.c b/drivers/gpu/nvgpu/os/linux/scale.c
index e6bb5c505..c6a784a26 100644
--- a/drivers/gpu/nvgpu/os/linux/scale.c
+++ b/drivers/gpu/nvgpu/os/linux/scale.c
@@ -1,7 +1,7 @@
 /*
  * gk20a clock scaling profile
  *
- * Copyright (c) 2013-2018, NVIDIA Corporation. All rights reserved.
+ * Copyright (c) 2013-2019, NVIDIA Corporation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms and conditions of the GNU General Public License,
@@ -27,6 +27,7 @@
 #include <nvgpu/log.h>
 #include <nvgpu/gk20a.h>
 #include <nvgpu/clk_arb.h>
+#include <nvgpu/pmu/pmu_perfmon.h>
 
 #include "platform_gk20a.h"
 #include "scale.h"
diff --git a/drivers/gpu/nvgpu/os/linux/sysfs.c b/drivers/gpu/nvgpu/os/linux/sysfs.c
index 8d7ad31ab..f3cb7878e 100644
--- a/drivers/gpu/nvgpu/os/linux/sysfs.c
+++ b/drivers/gpu/nvgpu/os/linux/sysfs.c
@@ -27,6 +27,7 @@
 #include <nvgpu/gr/obj_ctx.h>
 #include <nvgpu/power_features/cg.h>
 #include <nvgpu/power_features/pg.h>
+#include <nvgpu/pmu/pmu_perfmon.h>
 
 #include "os_linux.h"
 #include "sysfs.h"
diff --git a/drivers/gpu/nvgpu/tu104/hal_tu104.c b/drivers/gpu/nvgpu/tu104/hal_tu104.c
index f13dfcd59..9ddf4b1cf 100644
--- a/drivers/gpu/nvgpu/tu104/hal_tu104.c
+++ b/drivers/gpu/nvgpu/tu104/hal_tu104.c
@@ -199,6 +199,7 @@
 #include <nvgpu/gr/gr_falcon.h>
 #include <nvgpu/gr/gr.h>
 #include <nvgpu/gr/gr_intr.h>
+#include <nvgpu/pmu/pmu_perfmon.h>
 
 #include <nvgpu/hw/tu104/hw_proj_tu104.h>
 #include <nvgpu/hw/tu104/hw_top_tu104.h>