gpu: nvgpu: add separate unit for gr/ctxsw_prog

Add separate new unit gr/ctxsw_prog that provides interface to access
h/w header files hw_ctxsw_prog_*.h

Add the following chip-specific files that access the above h/w unit and
provide an interface through the g->ops.gr.ctxsw_prog.*() HALs for the rest
of the units:

common/gr/ctxsw_prog/ctxsw_prog_gm20b.c
common/gr/ctxsw_prog/ctxsw_prog_gp10b.c
common/gr/ctxsw_prog/ctxsw_prog_gv11b.c

Remove all the h/w header includes from rest of the units and code.
Remove direct calls to h/w headers ctxsw_prog_*() and use HALs
g->ops.gr.ctxsw_prog.*() instead

In gr_gk20a_find_priv_offset_in_ext_buffer(), the h/w header accessor
ctxsw_prog_extended_num_smpc_quadrants_v() is only defined on gk20a.
Since gk20a is no longer supported, remove the corresponding code.

Add missing h/w header ctxsw_prog_main_image_pm_mode_ctxsw_f() for
some chips
Add new h/w header ctxsw_prog_gpccs_header_stride_v()

Jira NVGPU-1526

Change-Id: I170f5c0da26ada833f94f5479ff299c0db56a732
Signed-off-by: Deepak Nibade <dnibade@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1966111
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
Deepak Nibade
2018-11-30 17:19:50 +05:30
committed by mobile promotions
parent 8ef20036c7
commit 6777bd5ed2
41 changed files with 1748 additions and 556 deletions

View File

@@ -68,6 +68,9 @@ nvgpu-y += common/bus/bus_gk20a.o \
common/ltc/ltc_gp10b.o \
common/ltc/ltc_gv11b.o \
common/ltc/ltc_tu104.o \
common/gr/ctxsw_prog/ctxsw_prog_gm20b.o \
common/gr/ctxsw_prog/ctxsw_prog_gp10b.o \
common/gr/ctxsw_prog/ctxsw_prog_gv11b.o \
common/netlist/netlist.o \
common/netlist/netlist_sim.o \
common/netlist/netlist_gm20b.o \

View File

@@ -106,6 +106,9 @@ srcs := os/posix/nvgpu.c \
common/falcon/falcon_gp106.c \
common/falcon/falcon_gv100.c \
common/falcon/falcon_tu104.c \
common/gr/ctxsw_prog/ctxsw_prog_gm20b.c \
common/gr/ctxsw_prog/ctxsw_prog_gp10b.c \
common/gr/ctxsw_prog/ctxsw_prog_gv11b.c \
common/netlist/netlist.c \
common/netlist/netlist_sim.c \
common/netlist/netlist_gm20b.c \

View File

@@ -0,0 +1,345 @@
/*
* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <nvgpu/gk20a.h>
#include <nvgpu/utils.h>
#include <nvgpu/nvgpu_mem.h>
#include "ctxsw_prog_gm20b.h"
#include <nvgpu/hw/gm20b/hw_ctxsw_prog_gm20b.h>
/* HW-defined size of the FECS ctxsw header, from the gm20b hw headers. */
u32 gm20b_ctxsw_prog_hw_get_fecs_header_size(void)
{
	return ctxsw_prog_fecs_header_v();
}
/* HW-defined per-GPC stride of the GPCCS ctxsw header. */
u32 gm20b_ctxsw_prog_hw_get_gpccs_header_size(void)
{
	return ctxsw_prog_gpccs_header_stride_v();
}
/* Segment size (bytes) used when sizing the extended ctxsw buffer. */
u32 gm20b_ctxsw_prog_hw_get_extended_buffer_segments_size_in_bytes(void)
{
	return ctxsw_prog_extended_buffer_segments_size_in_bytes_v();
}
/* Size (bytes) of one marker record in the extended ctxsw buffer. */
u32 gm20b_ctxsw_prog_hw_extended_marker_size_in_bytes(void)
{
	return ctxsw_prog_extended_marker_size_in_bytes_v();
}
/* Stride between SM DSM perf-counter control registers in the extended buffer. */
u32 gm20b_ctxsw_prog_hw_get_perf_counter_control_register_stride(void)
{
	return ctxsw_prog_extended_sm_dsm_perf_counter_control_register_stride_v();
}
/* Read the context id field out of the main ctxsw image in @ctx_mem. */
u32 gm20b_ctxsw_prog_get_main_image_ctx_id(struct gk20a *g,
	struct nvgpu_mem *ctx_mem)
{
	return nvgpu_mem_rd(g, ctx_mem, ctxsw_prog_main_image_context_id_o());
}
/* Read the patch-buffer entry count from the main ctxsw image. */
u32 gm20b_ctxsw_prog_get_patch_count(struct gk20a *g, struct nvgpu_mem *ctx_mem)
{
	return nvgpu_mem_rd(g, ctx_mem, ctxsw_prog_main_image_patch_count_o());
}
/* Write the patch-buffer entry count into the main ctxsw image. */
void gm20b_ctxsw_prog_set_patch_count(struct gk20a *g,
	struct nvgpu_mem *ctx_mem, u32 count)
{
	nvgpu_mem_wr(g, ctx_mem, ctxsw_prog_main_image_patch_count_o(), count);
}
/*
 * Program the 64-bit patch-buffer GPU VA into the main ctxsw image as a
 * lo/hi register pair.
 */
void gm20b_ctxsw_prog_set_patch_addr(struct gk20a *g,
	struct nvgpu_mem *ctx_mem, u64 addr)
{
	u32 addr_lo = u64_lo32(addr);
	u32 addr_hi = u64_hi32(addr);

	nvgpu_mem_wr(g, ctx_mem, ctxsw_prog_main_image_patch_adr_lo_o(),
		addr_lo);
	nvgpu_mem_wr(g, ctx_mem, ctxsw_prog_main_image_patch_adr_hi_o(),
		addr_hi);
}
/*
 * Program the zcull buffer pointer (address stored as addr >> 8, i.e.
 * 256-byte units). Only the low 32 bits are written on gm20b; gv11b adds
 * a hi word.
 */
void gm20b_ctxsw_prog_set_zcull_ptr(struct gk20a *g, struct nvgpu_mem *ctx_mem,
	u64 addr)
{
	addr = addr >> 8;
	nvgpu_mem_wr(g, ctx_mem, ctxsw_prog_main_image_zcull_ptr_o(),
		u64_lo32(addr));
}
/* Write the raw zcull ctxsw mode word into the main ctxsw image. */
void gm20b_ctxsw_prog_set_zcull(struct gk20a *g, struct nvgpu_mem *ctx_mem,
	u32 mode)
{
	nvgpu_mem_wr(g, ctx_mem, ctxsw_prog_main_image_zcull_o(), mode);
}
/* Disable zcull context switching for this context. */
void gm20b_ctxsw_prog_set_zcull_mode_no_ctxsw(struct gk20a *g,
	struct nvgpu_mem *ctx_mem)
{
	nvgpu_mem_wr(g, ctx_mem, ctxsw_prog_main_image_zcull_o(),
		ctxsw_prog_main_image_zcull_mode_no_ctxsw_v());
}
/* True iff @mode selects the separate-buffer zcull ctxsw mode. */
bool gm20b_ctxsw_prog_is_zcull_mode_separate_buffer(u32 mode)
{
	if (mode == ctxsw_prog_main_image_zcull_mode_separate_buffer_v()) {
		return true;
	}
	return false;
}
/*
 * Program the perfmon buffer pointer (stored as addr >> 8, 256-byte units).
 * Only the low 32 bits are written on gm20b; gv11b adds a hi word.
 */
void gm20b_ctxsw_prog_set_pm_ptr(struct gk20a *g, struct nvgpu_mem *ctx_mem,
	u64 addr)
{
	addr = addr >> 8;
	nvgpu_mem_wr(g, ctx_mem, ctxsw_prog_main_image_pm_ptr_o(),
		u64_lo32(addr));
}
/*
 * Read-modify-write the PM mode field of the main ctxsw image: clear the
 * mode bits and OR in @mode (expected to be a pm_mode_*_f() value).
 */
void gm20b_ctxsw_prog_set_pm_mode(struct gk20a *g,
	struct nvgpu_mem *ctx_mem, u32 mode)
{
	u32 val = nvgpu_mem_rd(g, ctx_mem, ctxsw_prog_main_image_pm_o());

	val = (val & ~ctxsw_prog_main_image_pm_mode_m()) | mode;
	nvgpu_mem_wr(g, ctx_mem, ctxsw_prog_main_image_pm_o(), val);
}
/* Enable or disable SMPC context switching in the main ctxsw image. */
void gm20b_ctxsw_prog_set_pm_smpc_mode(struct gk20a *g,
	struct nvgpu_mem *ctx_mem, bool enable)
{
	u32 smpc_f;
	u32 val = nvgpu_mem_rd(g, ctx_mem, ctxsw_prog_main_image_pm_o());

	if (enable) {
		smpc_f = ctxsw_prog_main_image_pm_smpc_mode_ctxsw_f();
	} else {
		smpc_f = ctxsw_prog_main_image_pm_smpc_mode_no_ctxsw_f();
	}
	val = (val & ~ctxsw_prog_main_image_pm_smpc_mode_m()) | smpc_f;
	nvgpu_mem_wr(g, ctx_mem, ctxsw_prog_main_image_pm_o(), val);
}
/* Set PM mode to no_ctxsw; returns the mode value that was programmed. */
u32 gm20b_ctxsw_prog_set_pm_mode_no_ctxsw(struct gk20a *g,
	struct nvgpu_mem *ctx_mem)
{
	u32 pm_mode = ctxsw_prog_main_image_pm_mode_no_ctxsw_f();

	gm20b_ctxsw_prog_set_pm_mode(g, ctx_mem, pm_mode);
	return pm_mode;
}
/* Set PM mode to ctxsw; returns the mode value that was programmed. */
u32 gm20b_ctxsw_prog_set_pm_mode_ctxsw(struct gk20a *g,
	struct nvgpu_mem *ctx_mem)
{
	u32 pm_mode = ctxsw_prog_main_image_pm_mode_ctxsw_f();

	gm20b_ctxsw_prog_set_pm_mode(g, ctx_mem, pm_mode);
	return pm_mode;
}
/* HW field value for PM mode "no ctxsw". */
u32 gm20b_ctxsw_prog_hw_get_pm_mode_no_ctxsw(void)
{
	return ctxsw_prog_main_image_pm_mode_no_ctxsw_f();
}
/* HW field value for PM mode "ctxsw". */
u32 gm20b_ctxsw_prog_hw_get_pm_mode_ctxsw(void)
{
	return ctxsw_prog_main_image_pm_mode_ctxsw_f();
}
/* Zero the save/restore operation counters in the main ctxsw header. */
void gm20b_ctxsw_prog_init_ctxsw_hdr_data(struct gk20a *g,
	struct nvgpu_mem *ctx_mem)
{
	nvgpu_mem_wr(g, ctx_mem,
		ctxsw_prog_main_image_num_save_ops_o(), 0);
	nvgpu_mem_wr(g, ctx_mem,
		ctxsw_prog_main_image_num_restore_ops_o(), 0);
}
/* Select CTA compute preemption via the preemption_options word (gm20b layout). */
void gm20b_ctxsw_prog_set_compute_preemption_mode_cta(struct gk20a *g,
	struct nvgpu_mem *ctx_mem)
{
	nvgpu_mem_wr(g, ctx_mem,
		ctxsw_prog_main_image_preemption_options_o(),
		ctxsw_prog_main_image_preemption_options_control_cta_enabled_f());
}
/* Set the CDE-enabled bit in the main ctxsw image ctl word (RMW). */
void gm20b_ctxsw_prog_set_cde_enabled(struct gk20a *g,
	struct nvgpu_mem *ctx_mem)
{
	u32 ctl = nvgpu_mem_rd(g, ctx_mem, ctxsw_prog_main_image_ctl_o());

	ctl |= ctxsw_prog_main_image_ctl_cde_enabled_f();
	nvgpu_mem_wr(g, ctx_mem, ctxsw_prog_main_image_ctl_o(), ctl);
}
/* Enable/disable PC sampling by rewriting the pm word's pc_sampling field. */
void gm20b_ctxsw_prog_set_pc_sampling(struct gk20a *g,
	struct nvgpu_mem *ctx_mem, bool enable)
{
	u32 pm = nvgpu_mem_rd(g, ctx_mem, ctxsw_prog_main_image_pm_o());

	pm = (pm & ~ctxsw_prog_main_image_pm_pc_sampling_m()) |
		ctxsw_prog_main_image_pm_pc_sampling_f(enable);
	nvgpu_mem_wr(g, ctx_mem, ctxsw_prog_main_image_pm_o(), pm);
}
/*
 * Configure the priv access map: either allow all priv accesses or enforce
 * the access map programmed via the map buffer.
 */
void gm20b_ctxsw_prog_set_priv_access_map_config_mode(struct gk20a *g,
	struct nvgpu_mem *ctx_mem, bool allow_all)
{
	u32 cfg = allow_all ?
		ctxsw_prog_main_image_priv_access_map_config_mode_allow_all_f() :
		ctxsw_prog_main_image_priv_access_map_config_mode_use_map_f();

	nvgpu_mem_wr(g, ctx_mem,
		ctxsw_prog_main_image_priv_access_map_config_o(), cfg);
}
/* Program the 64-bit priv-access-map buffer address as a lo/hi pair. */
void gm20b_ctxsw_prog_set_priv_access_map_addr(struct gk20a *g,
	struct nvgpu_mem *ctx_mem, u64 addr)
{
	u32 addr_lo = u64_lo32(addr);
	u32 addr_hi = u64_hi32(addr);

	nvgpu_mem_wr(g, ctx_mem,
		ctxsw_prog_main_image_priv_access_map_addr_lo_o(), addr_lo);
	nvgpu_mem_wr(g, ctx_mem,
		ctxsw_prog_main_image_priv_access_map_addr_hi_o(), addr_hi);
}
/* Clear the verification-features field in misc_options (RMW). */
void gm20b_ctxsw_prog_disable_verif_features(struct gk20a *g,
	struct nvgpu_mem *ctx_mem)
{
	u32 opts = nvgpu_mem_rd(g, ctx_mem,
			ctxsw_prog_main_image_misc_options_o());

	opts &= ~ctxsw_prog_main_image_misc_options_verif_features_m();
	opts |= ctxsw_prog_main_image_misc_options_verif_features_disabled_f();
	nvgpu_mem_wr(g, ctx_mem, ctxsw_prog_main_image_misc_options_o(), opts);
}
/*
 * Check the magic value at the start of a CPU-mapped main ctxsw image.
 * NOTE(review): reads via a u32 cast; assumes @context is 4-byte aligned
 * (hw offsets are word offsets) — confirm at callers.
 */
bool gm20b_ctxsw_prog_check_main_image_header_magic(u8 *context)
{
	u32 magic = *(u32 *)(context + ctxsw_prog_main_image_magic_value_o());

	return magic == ctxsw_prog_main_image_magic_value_v_value_v();
}
/* Check the magic value of a CPU-mapped local (per-engine) ctxsw header. */
bool gm20b_ctxsw_prog_check_local_header_magic(u8 *context)
{
	u32 magic = *(u32 *)(context + ctxsw_prog_local_magic_value_o());

	return magic == ctxsw_prog_local_magic_value_v_value_v();
}
/* Number of GPCs recorded in a CPU-mapped main ctxsw image. */
u32 gm20b_ctxsw_prog_get_num_gpcs(u8 *context)
{
	return *(u32 *)(context + ctxsw_prog_main_image_num_gpcs_o());
}
/* Number of TPCs recorded in a CPU-mapped local ctxsw header. */
u32 gm20b_ctxsw_prog_get_num_tpcs(u8 *context)
{
	return *(u32 *)(context + ctxsw_prog_local_image_num_tpcs_o());
}
/* Decode extended-buffer size and offset fields from the main image ctl word. */
void gm20b_ctxsw_prog_get_extended_buffer_size_offset(u8 *context,
	u32 *size, u32 *offset)
{
	u32 data = *(u32 *)(context + ctxsw_prog_main_extended_buffer_ctl_o());

	*size = ctxsw_prog_main_extended_buffer_ctl_size_v(data);
	*offset = ctxsw_prog_main_extended_buffer_ctl_offset_v(data);
}
/* Decode PPC count and mask from the local ctxsw header's ppc_info word. */
void gm20b_ctxsw_prog_get_ppc_info(u8 *context, u32 *num_ppcs, u32 *ppc_mask)
{
	u32 data = *(u32 *)(context + ctxsw_prog_local_image_ppc_info_o());

	*num_ppcs = ctxsw_prog_local_image_ppc_info_num_ppcs_v(data);
	*ppc_mask = ctxsw_prog_local_image_ppc_info_ppc_mask_v(data);
}
/* Offset field of the local priv-register ctl word in a CPU-mapped header. */
u32 gm20b_ctxsw_prog_get_local_priv_register_ctl_offset(u8 *context)
{
	u32 data = *(u32 *)(context + ctxsw_prog_local_priv_register_ctl_o());

	return ctxsw_prog_local_priv_register_ctl_offset_v(data);
}
/* Tag value marking an invalid FECS trace timestamp record. */
u32 gm20b_ctxsw_prog_hw_get_ts_tag_invalid_timestamp(void)
{
	return ctxsw_prog_record_timestamp_timestamp_hi_tag_invalid_timestamp_v();
}
/* Extract the tag field from the high 32 bits of a 64-bit timestamp record. */
u32 gm20b_ctxsw_prog_hw_get_ts_tag(u64 ts)
{
	return ctxsw_prog_record_timestamp_timestamp_hi_tag_v((u32) (ts >> 32));
}
/* Strip the tag bits from a 64-bit record, leaving the raw timestamp. */
u64 gm20b_ctxsw_prog_hw_record_ts_timestamp(u64 ts)
{
	return ts &
		~(((u64)ctxsw_prog_record_timestamp_timestamp_hi_tag_m()) << 32);
}
/* Size (bytes) of one FECS trace timestamp record. */
u32 gm20b_ctxsw_prog_hw_get_ts_record_size_in_bytes(void)
{
	return ctxsw_prog_record_timestamp_record_size_in_bytes_v();
}
/*
 * Non-zero iff @magic_hi matches the expected trace-record magic.
 * Returns u32 (not bool) to match the HAL signature.
 */
u32 gm20b_ctxsw_prog_is_ts_valid_record(u32 magic_hi)
{
	return magic_hi ==
		ctxsw_prog_record_timestamp_magic_value_hi_v_value_v();
}
/*
 * Aperture bits (sysmem coherent/non-coherent or vidmem) for the FECS
 * trace timestamp buffer, chosen from @ctx_mem's aperture.
 */
u32 gm20b_ctxsw_prog_get_ts_buffer_aperture_mask(struct gk20a *g,
	struct nvgpu_mem *ctx_mem)
{
	return nvgpu_aperture_mask(g, ctx_mem,
		ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_target_sys_mem_noncoherent_f(),
		ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_target_sys_mem_coherent_f(),
		ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_target_vid_mem_f());
}
/* Program the number of records in the FECS trace timestamp buffer. */
void gm20b_ctxsw_prog_set_ts_num_records(struct gk20a *g,
	struct nvgpu_mem *ctx_mem, u32 num)
{
	nvgpu_mem_wr(g, ctx_mem,
		ctxsw_prog_main_image_context_timestamp_buffer_control_o(),
		ctxsw_prog_main_image_context_timestamp_buffer_control_num_records_f(num));
}
/*
 * Program the FECS trace timestamp buffer pointer: low word first, then
 * the high word combined with the aperture target bits.
 */
void gm20b_ctxsw_prog_set_ts_buffer_ptr(struct gk20a *g,
	struct nvgpu_mem *ctx_mem, u64 addr, u32 aperture_mask)
{
	nvgpu_mem_wr(g, ctx_mem,
		ctxsw_prog_main_image_context_timestamp_buffer_ptr_o(),
		u64_lo32(addr));
	nvgpu_mem_wr(g, ctx_mem,
		ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_o(),
		ctxsw_prog_main_image_context_timestamp_buffer_ptr_v_f(u64_hi32(addr)) |
		aperture_mask);
}

View File

@@ -0,0 +1,95 @@
/*
* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#ifndef NVGPU_CTXSW_PROG_GM20B_H
#define NVGPU_CTXSW_PROG_GM20B_H

/*
 * gm20b implementations of the g->ops.gr.ctxsw_prog HAL: accessors for the
 * ctxsw_prog image/header layout defined by the gm20b hw headers.
 */

#include <nvgpu/types.h>

struct gk20a;
struct nvgpu_mem;

u32 gm20b_ctxsw_prog_hw_get_fecs_header_size(void);
u32 gm20b_ctxsw_prog_hw_get_gpccs_header_size(void);
u32 gm20b_ctxsw_prog_hw_get_extended_buffer_segments_size_in_bytes(void);
u32 gm20b_ctxsw_prog_hw_extended_marker_size_in_bytes(void);
u32 gm20b_ctxsw_prog_hw_get_perf_counter_control_register_stride(void);
u32 gm20b_ctxsw_prog_get_main_image_ctx_id(struct gk20a *g, struct nvgpu_mem *ctx_mem);
u32 gm20b_ctxsw_prog_get_patch_count(struct gk20a *g, struct nvgpu_mem *ctx_mem);
void gm20b_ctxsw_prog_set_patch_count(struct gk20a *g,
	struct nvgpu_mem *ctx_mem, u32 count);
void gm20b_ctxsw_prog_set_patch_addr(struct gk20a *g,
	struct nvgpu_mem *ctx_mem, u64 addr);
void gm20b_ctxsw_prog_set_zcull_ptr(struct gk20a *g, struct nvgpu_mem *ctx_mem,
	u64 addr);
void gm20b_ctxsw_prog_set_zcull(struct gk20a *g, struct nvgpu_mem *ctx_mem,
	u32 mode);
void gm20b_ctxsw_prog_set_zcull_mode_no_ctxsw(struct gk20a *g,
	struct nvgpu_mem *ctx_mem);
bool gm20b_ctxsw_prog_is_zcull_mode_separate_buffer(u32 mode);
void gm20b_ctxsw_prog_set_pm_ptr(struct gk20a *g, struct nvgpu_mem *ctx_mem,
	u64 addr);
void gm20b_ctxsw_prog_set_pm_mode(struct gk20a *g,
	struct nvgpu_mem *ctx_mem, u32 mode);
void gm20b_ctxsw_prog_set_pm_smpc_mode(struct gk20a *g,
	struct nvgpu_mem *ctx_mem, bool enable);
u32 gm20b_ctxsw_prog_set_pm_mode_no_ctxsw(struct gk20a *g,
	struct nvgpu_mem *ctx_mem);
u32 gm20b_ctxsw_prog_set_pm_mode_ctxsw(struct gk20a *g,
	struct nvgpu_mem *ctx_mem);
u32 gm20b_ctxsw_prog_hw_get_pm_mode_no_ctxsw(void);
u32 gm20b_ctxsw_prog_hw_get_pm_mode_ctxsw(void);
void gm20b_ctxsw_prog_init_ctxsw_hdr_data(struct gk20a *g,
	struct nvgpu_mem *ctx_mem);
void gm20b_ctxsw_prog_set_compute_preemption_mode_cta(struct gk20a *g,
	struct nvgpu_mem *ctx_mem);
void gm20b_ctxsw_prog_set_cde_enabled(struct gk20a *g,
	struct nvgpu_mem *ctx_mem);
void gm20b_ctxsw_prog_set_pc_sampling(struct gk20a *g,
	struct nvgpu_mem *ctx_mem, bool enable);
void gm20b_ctxsw_prog_set_priv_access_map_config_mode(struct gk20a *g,
	struct nvgpu_mem *ctx_mem, bool allow_all);
void gm20b_ctxsw_prog_set_priv_access_map_addr(struct gk20a *g,
	struct nvgpu_mem *ctx_mem, u64 addr);
void gm20b_ctxsw_prog_disable_verif_features(struct gk20a *g,
	struct nvgpu_mem *ctx_mem);
/* The u8 *context accessors below operate on CPU-mapped ctxsw images. */
bool gm20b_ctxsw_prog_check_main_image_header_magic(u8 *context);
bool gm20b_ctxsw_prog_check_local_header_magic(u8 *context);
u32 gm20b_ctxsw_prog_get_num_gpcs(u8 *context);
u32 gm20b_ctxsw_prog_get_num_tpcs(u8 *context);
void gm20b_ctxsw_prog_get_extended_buffer_size_offset(u8 *context,
	u32 *size, u32 *offset);
void gm20b_ctxsw_prog_get_ppc_info(u8 *context, u32 *num_ppcs, u32 *ppc_mask);
u32 gm20b_ctxsw_prog_get_local_priv_register_ctl_offset(u8 *context);
/* FECS trace timestamp record helpers. */
u32 gm20b_ctxsw_prog_hw_get_ts_tag_invalid_timestamp(void);
u32 gm20b_ctxsw_prog_hw_get_ts_tag(u64 ts);
u64 gm20b_ctxsw_prog_hw_record_ts_timestamp(u64 ts);
u32 gm20b_ctxsw_prog_hw_get_ts_record_size_in_bytes(void);
u32 gm20b_ctxsw_prog_is_ts_valid_record(u32 magic_hi);
u32 gm20b_ctxsw_prog_get_ts_buffer_aperture_mask(struct gk20a *g,
	struct nvgpu_mem *ctx_mem);
void gm20b_ctxsw_prog_set_ts_num_records(struct gk20a *g,
	struct nvgpu_mem *ctx_mem, u32 num);
void gm20b_ctxsw_prog_set_ts_buffer_ptr(struct gk20a *g,
	struct nvgpu_mem *ctx_mem, u64 addr, u32 aperture_mask);

#endif /* NVGPU_CTXSW_PROG_GM20B_H */

View File

@@ -0,0 +1,129 @@
/*
* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <nvgpu/gk20a.h>
#include <nvgpu/utils.h>
#include <nvgpu/nvgpu_mem.h>
#include "ctxsw_prog_gm20b.h"
#include "ctxsw_prog_gp10b.h"
#include <nvgpu/hw/gp10b/hw_ctxsw_prog_gp10b.h>
/* Select GfxP graphics preemption in the gp10b preemption_options word. */
void gp10b_ctxsw_prog_set_graphics_preemption_mode_gfxp(struct gk20a *g,
	struct nvgpu_mem *ctx_mem)
{
	nvgpu_mem_wr(g, ctx_mem,
		ctxsw_prog_main_image_graphics_preemption_options_o(),
		ctxsw_prog_main_image_graphics_preemption_options_control_gfxp_f());
}
/* Select CTA compute preemption (gp10b uses a separate compute options word). */
void gp10b_ctxsw_prog_set_compute_preemption_mode_cta(struct gk20a *g,
	struct nvgpu_mem *ctx_mem)
{
	nvgpu_mem_wr(g, ctx_mem,
		ctxsw_prog_main_image_compute_preemption_options_o(),
		ctxsw_prog_main_image_compute_preemption_options_control_cta_f());
}
/* Select CILP compute preemption in the compute options word. */
void gp10b_ctxsw_prog_set_compute_preemption_mode_cilp(struct gk20a *g,
	struct nvgpu_mem *ctx_mem)
{
	nvgpu_mem_wr(g, ctx_mem,
		ctxsw_prog_main_image_compute_preemption_options_o(),
		ctxsw_prog_main_image_compute_preemption_options_control_cilp_f());
}
/* Write the boosted-context flag into the pmu_options word. */
void gp10b_ctxsw_prog_set_pmu_options_boost_clock_frequencies(struct gk20a *g,
	struct nvgpu_mem *ctx_mem, u32 boosted_ctx)
{
	u32 data = ctxsw_prog_main_image_pmu_options_boost_clock_frequencies_f(boosted_ctx);

	nvgpu_mem_wr(g, ctx_mem, ctxsw_prog_main_image_pmu_options_o(), data);
}
/*
 * Program the GfxP full-preemption buffer pointer (stored as addr >> 8).
 * Only the low 32 bits are written on gp10b; gv11b adds a hi word.
 */
void gp10b_ctxsw_prog_set_full_preemption_ptr(struct gk20a *g,
	struct nvgpu_mem *ctx_mem, u64 addr)
{
	addr = addr >> 8;
	nvgpu_mem_wr(g, ctx_mem,
		ctxsw_prog_main_image_full_preemption_ptr_o(), u64_lo32(addr));
}
/*
 * Zero the gp10b-specific save-op counters (wfi/cta/gfxp/cilp), then chain
 * to the gm20b init for the common save/restore counters.
 */
void gp10b_ctxsw_prog_init_ctxsw_hdr_data(struct gk20a *g,
	struct nvgpu_mem *ctx_mem)
{
	nvgpu_mem_wr(g, ctx_mem,
		ctxsw_prog_main_image_num_wfi_save_ops_o(), 0);
	nvgpu_mem_wr(g, ctx_mem,
		ctxsw_prog_main_image_num_cta_save_ops_o(), 0);
	nvgpu_mem_wr(g, ctx_mem,
		ctxsw_prog_main_image_num_gfxp_save_ops_o(), 0);
	nvgpu_mem_wr(g, ctx_mem,
		ctxsw_prog_main_image_num_cilp_save_ops_o(), 0);
	gm20b_ctxsw_prog_init_ctxsw_hdr_data(g, ctx_mem);
}
/*
 * Debug dump: log the key ctxsw image header words (magic, trace buffer
 * pointers/control, save-op counters, preemption options) via nvgpu_err.
 * Read-only; does not modify the context image.
 */
void gp10b_ctxsw_prog_dump_ctxsw_stats(struct gk20a *g,
	struct nvgpu_mem *ctx_mem)
{
	nvgpu_err(g, "ctxsw_prog_main_image_magic_value_o : %x (expect %x)",
		nvgpu_mem_rd(g, ctx_mem,
			ctxsw_prog_main_image_magic_value_o()),
		ctxsw_prog_main_image_magic_value_v_value_v());
	nvgpu_err(g, "ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi : %x",
		nvgpu_mem_rd(g, ctx_mem,
			ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_o()));
	nvgpu_err(g, "ctxsw_prog_main_image_context_timestamp_buffer_ptr : %x",
		nvgpu_mem_rd(g, ctx_mem,
			ctxsw_prog_main_image_context_timestamp_buffer_ptr_o()));
	nvgpu_err(g, "ctxsw_prog_main_image_context_timestamp_buffer_control : %x",
		nvgpu_mem_rd(g, ctx_mem,
			ctxsw_prog_main_image_context_timestamp_buffer_control_o()));
	nvgpu_err(g, "NUM_SAVE_OPERATIONS : %d",
		nvgpu_mem_rd(g, ctx_mem,
			ctxsw_prog_main_image_num_save_ops_o()));
	nvgpu_err(g, "WFI_SAVE_OPERATIONS : %d",
		nvgpu_mem_rd(g, ctx_mem,
			ctxsw_prog_main_image_num_wfi_save_ops_o()));
	nvgpu_err(g, "CTA_SAVE_OPERATIONS : %d",
		nvgpu_mem_rd(g, ctx_mem,
			ctxsw_prog_main_image_num_cta_save_ops_o()));
	nvgpu_err(g, "GFXP_SAVE_OPERATIONS : %d",
		nvgpu_mem_rd(g, ctx_mem,
			ctxsw_prog_main_image_num_gfxp_save_ops_o()));
	nvgpu_err(g, "CILP_SAVE_OPERATIONS : %d",
		nvgpu_mem_rd(g, ctx_mem,
			ctxsw_prog_main_image_num_cilp_save_ops_o()));
	nvgpu_err(g,
		"image gfx preemption option (GFXP is 1) %x",
		nvgpu_mem_rd(g, ctx_mem,
			ctxsw_prog_main_image_graphics_preemption_options_o()));
	nvgpu_err(g,
		"image compute preemption option (CTA is 1) %x",
		nvgpu_mem_rd(g, ctx_mem,
			ctxsw_prog_main_image_compute_preemption_options_o()));
}

View File

@@ -0,0 +1,46 @@
/*
* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#ifndef NVGPU_CTXSW_PROG_GP10B_H
#define NVGPU_CTXSW_PROG_GP10B_H

/*
 * gp10b additions to the gr.ctxsw_prog HAL: preemption-mode programming,
 * PMU boost options, full-preemption pointer and ctxsw stats dump.
 */

#include <nvgpu/types.h>

struct gk20a;
struct nvgpu_mem;

void gp10b_ctxsw_prog_set_graphics_preemption_mode_gfxp(struct gk20a *g,
	struct nvgpu_mem *ctx_mem);
void gp10b_ctxsw_prog_set_compute_preemption_mode_cta(struct gk20a *g,
	struct nvgpu_mem *ctx_mem);
void gp10b_ctxsw_prog_set_compute_preemption_mode_cilp(struct gk20a *g,
	struct nvgpu_mem *ctx_mem);
void gp10b_ctxsw_prog_set_pmu_options_boost_clock_frequencies(struct gk20a *g,
	struct nvgpu_mem *ctx_mem, u32 boosted_ctx);
void gp10b_ctxsw_prog_set_full_preemption_ptr(struct gk20a *g,
	struct nvgpu_mem *ctx_mem, u64 addr);
void gp10b_ctxsw_prog_init_ctxsw_hdr_data(struct gk20a *g,
	struct nvgpu_mem *ctx_mem);
void gp10b_ctxsw_prog_dump_ctxsw_stats(struct gk20a *g,
	struct nvgpu_mem *ctx_mem);

#endif /* NVGPU_CTXSW_PROG_GP10B_H */

View File

@@ -0,0 +1,111 @@
/*
* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <nvgpu/gk20a.h>
#include <nvgpu/utils.h>
#include <nvgpu/nvgpu_mem.h>
#include "ctxsw_prog_gm20b.h"
#include "ctxsw_prog_gv11b.h"
#include <nvgpu/hw/gv11b/hw_ctxsw_prog_gv11b.h>
/*
 * Program the zcull buffer pointer (addr >> 8). Unlike gm20b, gv11b also
 * writes the high 32 bits.
 */
void gv11b_ctxsw_prog_set_zcull_ptr(struct gk20a *g, struct nvgpu_mem *ctx_mem,
	u64 addr)
{
	addr = addr >> 8;
	nvgpu_mem_wr(g, ctx_mem, ctxsw_prog_main_image_zcull_ptr_o(),
		u64_lo32(addr));
	nvgpu_mem_wr(g, ctx_mem, ctxsw_prog_main_image_zcull_ptr_hi_o(),
		u64_hi32(addr));
}
/* Program the perfmon buffer pointer (addr >> 8), low and high words. */
void gv11b_ctxsw_prog_set_pm_ptr(struct gk20a *g, struct nvgpu_mem *ctx_mem,
	u64 addr)
{
	addr = addr >> 8;
	nvgpu_mem_wr(g, ctx_mem, ctxsw_prog_main_image_pm_ptr_o(),
		u64_lo32(addr));
	nvgpu_mem_wr(g, ctx_mem, ctxsw_prog_main_image_pm_ptr_hi_o(),
		u64_hi32(addr));
}
/* HW field value for PM mode "stream out ctxsw" (gv11b only). */
u32 gv11b_ctxsw_prog_hw_get_pm_mode_stream_out_ctxsw(void)
{
	return ctxsw_prog_main_image_pm_mode_stream_out_ctxsw_f();
}
/*
 * Set PM mode to stream-out ctxsw via the gm20b RMW helper; returns the
 * mode value that was programmed.
 */
u32 gv11b_ctxsw_prog_set_pm_mode_stream_out_ctxsw(struct gk20a *g,
	struct nvgpu_mem *ctx_mem)
{
	u32 pm_mode = ctxsw_prog_main_image_pm_mode_stream_out_ctxsw_f();

	gm20b_ctxsw_prog_set_pm_mode(g, ctx_mem, pm_mode);
	return pm_mode;
}
/*
 * Program the full-preemption buffer pointer (addr >> 8); gv11b writes
 * both low and high words.
 */
void gv11b_ctxsw_prog_set_full_preemption_ptr(struct gk20a *g,
	struct nvgpu_mem *ctx_mem, u64 addr)
{
	addr = addr >> 8;
	nvgpu_mem_wr(g, ctx_mem,
		ctxsw_prog_main_image_full_preemption_ptr_o(),
		u64_lo32(addr));
	nvgpu_mem_wr(g, ctx_mem,
		ctxsw_prog_main_image_full_preemption_ptr_hi_o(),
		u64_hi32(addr));
}
/* Program the VEID0 full-preemption buffer pointer (addr >> 8), lo and hi. */
void gv11b_ctxsw_prog_set_full_preemption_ptr_veid0(struct gk20a *g,
	struct nvgpu_mem *ctx_mem, u64 addr)
{
	addr = addr >> 8;
	nvgpu_mem_wr(g, ctx_mem,
		ctxsw_prog_main_image_full_preemption_ptr_veid0_o(),
		u64_lo32(addr));
	nvgpu_mem_wr(g, ctx_mem,
		ctxsw_prog_main_image_full_preemption_ptr_veid0_hi_o(),
		u64_hi32(addr));
}
/* Stride between SM DSM perf-counter registers in the extended buffer. */
u32 gv11b_ctxsw_prog_hw_get_perf_counter_register_stride(void)
{
	return ctxsw_prog_extended_sm_dsm_perf_counter_register_stride_v();
}
/*
 * Program the context buffer pointer. Note: the high word is written
 * before the low word here (reverse of the other lo/hi pairs).
 */
void gv11b_ctxsw_prog_set_context_buffer_ptr(struct gk20a *g,
	struct nvgpu_mem *ctx_mem, u64 addr)
{
	nvgpu_mem_wr(g, ctx_mem,
		ctxsw_prog_main_image_context_buffer_ptr_hi_o(),
		u64_hi32(addr));
	nvgpu_mem_wr(g, ctx_mem,
		ctxsw_prog_main_image_context_buffer_ptr_o(),
		u64_lo32(addr));
}
/*
 * Mark the context image as using per-VEID headers (subcontexts).
 * NOTE(review): writes the _v() value directly rather than an _f() field
 * macro — confirm this matches the ctl word's field layout.
 */
void gv11b_ctxsw_prog_set_type_per_veid_header(struct gk20a *g,
	struct nvgpu_mem *ctx_mem)
{
	nvgpu_mem_wr(g, ctx_mem,
		ctxsw_prog_main_image_ctl_o(),
		ctxsw_prog_main_image_ctl_type_per_veid_header_v());
}

View File

@@ -0,0 +1,45 @@
/*
* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#ifndef NVGPU_CTXSW_PROG_GV11B_H
#define NVGPU_CTXSW_PROG_GV11B_H

/*
 * gv11b additions to the gr.ctxsw_prog HAL: 64-bit (lo+hi) pointer
 * programming, stream-out PM mode and per-VEID (subcontext) header setup.
 */

#include <nvgpu/types.h>

void gv11b_ctxsw_prog_set_zcull_ptr(struct gk20a *g, struct nvgpu_mem *ctx_mem,
	u64 addr);
void gv11b_ctxsw_prog_set_pm_ptr(struct gk20a *g, struct nvgpu_mem *ctx_mem,
	u64 addr);
u32 gv11b_ctxsw_prog_hw_get_pm_mode_stream_out_ctxsw(void);
u32 gv11b_ctxsw_prog_set_pm_mode_stream_out_ctxsw(struct gk20a *g,
	struct nvgpu_mem *ctx_mem);
void gv11b_ctxsw_prog_set_full_preemption_ptr(struct gk20a *g,
	struct nvgpu_mem *ctx_mem, u64 addr);
void gv11b_ctxsw_prog_set_full_preemption_ptr_veid0(struct gk20a *g,
	struct nvgpu_mem *ctx_mem, u64 addr);
u32 gv11b_ctxsw_prog_hw_get_perf_counter_register_stride(void);
void gv11b_ctxsw_prog_set_context_buffer_ptr(struct gk20a *g,
	struct nvgpu_mem *ctx_mem, u64 addr);
void gv11b_ctxsw_prog_set_type_per_veid_header(struct gk20a *g,
	struct nvgpu_mem *ctx_mem);

#endif /* NVGPU_CTXSW_PROG_GV11B_H */

View File

@@ -43,7 +43,6 @@
#include <nvgpu/log.h>
#include <nvgpu/fecs_trace.h>
#include <nvgpu/hw/gk20a/hw_ctxsw_prog_gk20a.h>
#include <nvgpu/hw/gk20a/hw_gr_gk20a.h>
struct gk20a_fecs_trace_hash_ent {
@@ -62,29 +61,14 @@ struct gk20a_fecs_trace {
};
#ifdef CONFIG_GK20A_CTXSW_TRACE
u32 gk20a_fecs_trace_record_ts_tag_invalid_ts_v(void)
{
return ctxsw_prog_record_timestamp_timestamp_hi_tag_invalid_timestamp_v();
}
u32 gk20a_fecs_trace_record_ts_tag_v(u64 ts)
{
return ctxsw_prog_record_timestamp_timestamp_hi_tag_v((u32) (ts >> 32));
}
u64 gk20a_fecs_trace_record_ts_timestamp_v(u64 ts)
{
return ts & ~(((u64)ctxsw_prog_record_timestamp_timestamp_hi_tag_m()) << 32);
}
static u32 gk20a_fecs_trace_fecs_context_ptr(struct gk20a *g, struct channel_gk20a *ch)
{
return (u32) (nvgpu_inst_block_addr(g, &ch->inst_block) >> 12LL);
}
int gk20a_fecs_trace_num_ts(void)
int gk20a_fecs_trace_num_ts(struct gk20a *g)
{
return (ctxsw_prog_record_timestamp_record_size_in_bytes_v()
return (g->ops.gr.ctxsw_prog.hw_get_ts_record_size_in_bytes()
- sizeof(struct gk20a_fecs_trace_record)) / sizeof(u64);
}
@@ -94,18 +78,18 @@ struct gk20a_fecs_trace_record *gk20a_fecs_trace_get_record(
struct nvgpu_mem *mem = &g->gr.global_ctx_buffer[FECS_TRACE_BUFFER].mem;
return (struct gk20a_fecs_trace_record *)
((u8 *) mem->cpu_va
+ (idx * ctxsw_prog_record_timestamp_record_size_in_bytes_v()));
((u8 *) mem->cpu_va +
(idx * g->ops.gr.ctxsw_prog.hw_get_ts_record_size_in_bytes()));
}
bool gk20a_fecs_trace_is_valid_record(struct gk20a_fecs_trace_record *r)
bool gk20a_fecs_trace_is_valid_record(struct gk20a *g,
struct gk20a_fecs_trace_record *r)
{
/*
* testing magic_hi should suffice. magic_lo is sometimes used
* as a sequence number in experimental ucode.
*/
return (r->magic_hi
== ctxsw_prog_record_timestamp_magic_value_hi_v_value_v());
return g->ops.gr.ctxsw_prog.is_ts_valid_record(r->magic_hi);
}
int gk20a_fecs_trace_get_read_index(struct gk20a *g)
@@ -254,7 +238,7 @@ static int gk20a_fecs_trace_ring_read(struct gk20a *g, int index)
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_ctxsw,
"consuming record trace=%p read=%d record=%p", trace, index, r);
if (unlikely(!gk20a_fecs_trace_is_valid_record(r))) {
if (unlikely(!gk20a_fecs_trace_is_valid_record(g, r))) {
nvgpu_warn(g,
"trace=%p read=%d record=%p magic_lo=%08x magic_hi=%08x (invalid)",
trace, index, r, r->magic_lo, r->magic_hi);
@@ -278,10 +262,11 @@ static int gk20a_fecs_trace_ring_read(struct gk20a *g, int index)
entry.vmid = vmid;
/* break out FECS record into trace events */
for (i = 0; i < gk20a_fecs_trace_num_ts(); i++) {
for (i = 0; i < gk20a_fecs_trace_num_ts(g); i++) {
entry.tag = gk20a_fecs_trace_record_ts_tag_v(r->ts[i]);
entry.timestamp = gk20a_fecs_trace_record_ts_timestamp_v(r->ts[i]);
entry.tag = g->ops.gr.ctxsw_prog.hw_get_ts_tag(r->ts[i]);
entry.timestamp =
g->ops.gr.ctxsw_prog.hw_record_ts_timestamp(r->ts[i]);
entry.timestamp <<= GK20A_FECS_TRACE_PTIMER_SHIFT;
nvgpu_log(g, gpu_dbg_ctxsw,
@@ -402,7 +387,7 @@ static int gk20a_fecs_trace_periodic_polling(void *arg)
size_t gk20a_fecs_trace_buffer_size(struct gk20a *g)
{
return GK20A_FECS_TRACE_NUM_RECORDS
* ctxsw_prog_record_timestamp_record_size_in_bytes_v();
* g->ops.gr.ctxsw_prog.hw_get_ts_record_size_in_bytes();
}
int gk20a_fecs_trace_init(struct gk20a *g)
@@ -449,8 +434,6 @@ int gk20a_fecs_trace_bind_channel(struct gk20a *g,
* in the context header.
*/
u32 lo;
u32 hi;
u64 addr;
struct gk20a_fecs_trace *trace = g->fecs_trace;
struct nvgpu_mem *mem;
@@ -475,37 +458,24 @@ int gk20a_fecs_trace_bind_channel(struct gk20a *g,
} else {
addr = nvgpu_inst_block_addr(g, mem);
nvgpu_log(g, gpu_dbg_ctxsw, "pa=%llx", addr);
aperture_mask = nvgpu_aperture_mask(g, mem,
ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_target_sys_mem_noncoherent_f(),
ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_target_sys_mem_coherent_f(),
ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_target_vid_mem_f());
aperture_mask =
g->ops.gr.ctxsw_prog.get_ts_buffer_aperture_mask(g, mem);
}
if (!addr)
return -ENOMEM;
lo = u64_lo32(addr);
hi = u64_hi32(addr);
mem = &gr_ctx->mem;
nvgpu_log(g, gpu_dbg_ctxsw, "addr_hi=%x addr_lo=%x count=%d", hi,
lo, GK20A_FECS_TRACE_NUM_RECORDS);
nvgpu_log(g, gpu_dbg_ctxsw, "addr=%llx count=%d", addr,
GK20A_FECS_TRACE_NUM_RECORDS);
nvgpu_mem_wr(g, mem,
ctxsw_prog_main_image_context_timestamp_buffer_control_o(),
ctxsw_prog_main_image_context_timestamp_buffer_control_num_records_f(
GK20A_FECS_TRACE_NUM_RECORDS));
g->ops.gr.ctxsw_prog.set_ts_num_records(g, mem,
GK20A_FECS_TRACE_NUM_RECORDS);
if (nvgpu_is_enabled(g, NVGPU_FECS_TRACE_VA))
mem = &ch->ctx_header;
nvgpu_mem_wr(g, mem,
ctxsw_prog_main_image_context_timestamp_buffer_ptr_o(),
lo);
nvgpu_mem_wr(g, mem,
ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_o(),
ctxsw_prog_main_image_context_timestamp_buffer_ptr_v_f(hi) |
aperture_mask);
g->ops.gr.ctxsw_prog.set_ts_buffer_ptr(g, mem, addr, aperture_mask);
/* pid (process identifier) in user space, corresponds to tgid (thread
* group id) in kernel space.
@@ -573,7 +543,7 @@ int gk20a_gr_max_entries(struct gk20a *g,
int tag;
/* Compute number of entries per record, with given filter */
for (n = 0, tag = 0; tag < gk20a_fecs_trace_num_ts(); tag++)
for (n = 0, tag = 0; tag < gk20a_fecs_trace_num_ts(g); tag++)
n += (NVGPU_GPU_CTXSW_FILTER_ISSET(tag, filter) != 0);
/* Return max number of entries generated for the whole ring */

View File

@@ -53,7 +53,6 @@
#include "gr_pri_gk20a.h"
#include "regops_gk20a.h"
#include <nvgpu/hw/gk20a/hw_ctxsw_prog_gk20a.h>
#include <nvgpu/hw/gk20a/hw_fifo_gk20a.h>
#include <nvgpu/hw/gk20a/hw_gr_gk20a.h>
#include <nvgpu/hw/gk20a/hw_ram_gk20a.h>
@@ -93,7 +92,7 @@ u32 gr_gk20a_get_ctx_id(struct gk20a *g, struct nvgpu_mem *ctx_mem)
Flush and invalidate before cpu update. */
g->ops.mm.l2_flush(g, true);
ctx_id = nvgpu_mem_rd(g, ctx_mem, ctxsw_prog_main_image_context_id_o());
ctx_id = g->ops.gr.ctxsw_prog.get_main_image_ctx_id(g, ctx_mem);
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr, "ctx_id: 0x%x", ctx_id);
return ctx_id;
}
@@ -619,9 +618,8 @@ int gr_gk20a_ctx_patch_write_begin(struct gk20a *g,
{
if (update_patch_count) {
/* reset patch count if ucode has already processed it */
gr_ctx->patch_ctx.data_count = nvgpu_mem_rd(g,
&gr_ctx->mem,
ctxsw_prog_main_image_patch_count_o());
gr_ctx->patch_ctx.data_count =
g->ops.gr.ctxsw_prog.get_patch_count(g, &gr_ctx->mem);
nvgpu_log(g, gpu_dbg_info, "patch count reset to %d",
gr_ctx->patch_ctx.data_count);
}
@@ -634,8 +632,7 @@ void gr_gk20a_ctx_patch_write_end(struct gk20a *g,
{
/* Write context count to context image if it is mapped */
if (update_patch_count) {
nvgpu_mem_wr(g, &gr_ctx->mem,
ctxsw_prog_main_image_patch_count_o(),
g->ops.gr.ctxsw_prog.set_patch_count(g, &gr_ctx->mem,
gr_ctx->patch_ctx.data_count);
nvgpu_log(g, gpu_dbg_info, "write patch count %d",
gr_ctx->patch_ctx.data_count);
@@ -710,24 +707,6 @@ int gr_gk20a_fecs_ctx_bind_channel(struct gk20a *g,
return ret;
}
void gr_gk20a_write_zcull_ptr(struct gk20a *g,
struct nvgpu_mem *mem, u64 gpu_va)
{
u32 va = u64_lo32(gpu_va >> 8);
nvgpu_mem_wr(g, mem,
ctxsw_prog_main_image_zcull_ptr_o(), va);
}
void gr_gk20a_write_pm_ptr(struct gk20a *g,
struct nvgpu_mem *mem, u64 gpu_va)
{
u32 va = u64_lo32(gpu_va >> 8);
nvgpu_mem_wr(g, mem,
ctxsw_prog_main_image_pm_ptr_o(), va);
}
static int gr_gk20a_ctx_zcull_setup(struct gk20a *g, struct channel_gk20a *c,
struct nvgpu_gr_ctx *gr_ctx)
{
@@ -740,8 +719,8 @@ static int gr_gk20a_ctx_zcull_setup(struct gk20a *g, struct channel_gk20a *c,
mem = &gr_ctx->mem;
if (gr_ctx->zcull_ctx.gpu_va == 0ULL &&
gr_ctx->zcull_ctx.ctx_sw_mode ==
ctxsw_prog_main_image_zcull_mode_separate_buffer_v()) {
g->ops.gr.ctxsw_prog.is_zcull_mode_separate_buffer(
gr_ctx->zcull_ctx.ctx_sw_mode)) {
return -EINVAL;
}
@@ -757,15 +736,14 @@ static int gr_gk20a_ctx_zcull_setup(struct gk20a *g, struct channel_gk20a *c,
return ret;
}
nvgpu_mem_wr(g, mem,
ctxsw_prog_main_image_zcull_o(),
gr_ctx->zcull_ctx.ctx_sw_mode);
g->ops.gr.ctxsw_prog.set_zcull(g, mem, gr_ctx->zcull_ctx.ctx_sw_mode);
if (ctxheader->gpu_va != 0ULL) {
g->ops.gr.write_zcull_ptr(g, ctxheader,
gr_ctx->zcull_ctx.gpu_va);
g->ops.gr.ctxsw_prog.set_zcull_ptr(g, ctxheader,
gr_ctx->zcull_ctx.gpu_va);
} else {
g->ops.gr.write_zcull_ptr(g, mem, gr_ctx->zcull_ctx.gpu_va);
g->ops.gr.ctxsw_prog.set_zcull_ptr(g, mem,
gr_ctx->zcull_ctx.gpu_va);
}
gk20a_enable_channel_tsg(g, c);
@@ -1302,7 +1280,7 @@ static int gr_gk20a_init_golden_ctx_image(struct gk20a *g,
struct nvgpu_gr_ctx *gr_ctx)
{
struct gr_gk20a *gr = &g->gr;
u32 ctx_header_bytes = ctxsw_prog_fecs_header_v();
u32 ctx_header_bytes = g->ops.gr.ctxsw_prog.hw_get_fecs_header_size();
u32 ctx_header_words;
u32 i;
u32 data;
@@ -1497,10 +1475,9 @@ restore_fe_go_idle:
data = nvgpu_mem_rd32(g, gr_mem, i);
nvgpu_mem_wr32(g, gold_mem, i, data);
}
nvgpu_mem_wr(g, gold_mem, ctxsw_prog_main_image_zcull_o(),
ctxsw_prog_main_image_zcull_mode_no_ctxsw_v());
g->ops.gr.ctxsw_prog.set_zcull_mode_no_ctxsw(g, gold_mem);
g->ops.gr.write_zcull_ptr(g, gold_mem, 0);
g->ops.gr.ctxsw_prog.set_zcull_ptr(g, gold_mem, 0);
err = g->ops.gr.commit_inst(c, gr_ctx->global_ctx_buffer_va[GOLDEN_CTX_VA]);
if (err != 0) {
@@ -1554,7 +1531,6 @@ int gr_gk20a_update_smpc_ctxsw_mode(struct gk20a *g,
struct tsg_gk20a *tsg;
struct nvgpu_gr_ctx *gr_ctx = NULL;
struct nvgpu_mem *mem = NULL;
u32 data;
int ret;
nvgpu_log_fn(g, " ");
@@ -1587,16 +1563,7 @@ int gr_gk20a_update_smpc_ctxsw_mode(struct gk20a *g,
Flush and invalidate before cpu update. */
g->ops.mm.l2_flush(g, true);
data = nvgpu_mem_rd(g, mem,
ctxsw_prog_main_image_pm_o());
data = data & ~ctxsw_prog_main_image_pm_smpc_mode_m();
data |= enable_smpc_ctxsw ?
ctxsw_prog_main_image_pm_smpc_mode_ctxsw_f() :
ctxsw_prog_main_image_pm_smpc_mode_no_ctxsw_f();
nvgpu_mem_wr(g, mem,
ctxsw_prog_main_image_pm_o(), data);
g->ops.gr.ctxsw_prog.set_pm_smpc_mode(g, mem, enable_smpc_ctxsw);
out:
gk20a_enable_channel_tsg(g, c);
@@ -1612,7 +1579,6 @@ int gr_gk20a_update_hwpm_ctxsw_mode(struct gk20a *g,
struct nvgpu_mem *gr_mem = NULL;
struct nvgpu_gr_ctx *gr_ctx;
struct pm_ctx_desc *pm_ctx;
u32 data;
u64 virt_addr = 0;
struct nvgpu_mem *ctxheader = &c->ctx_header;
int ret;
@@ -1633,24 +1599,29 @@ int gr_gk20a_update_hwpm_ctxsw_mode(struct gk20a *g,
}
if ((mode == NVGPU_DBG_HWPM_CTXSW_MODE_STREAM_OUT_CTXSW) &&
(g->ops.gr.get_hw_accessor_stream_out_mode == NULL)) {
nvgpu_err(g, "Mode-E hwpm context switch mode is not supported");
(g->ops.gr.ctxsw_prog.hw_get_pm_mode_stream_out_ctxsw ==
NULL)) {
nvgpu_err(g,
"Mode-E hwpm context switch mode is not supported");
return -EINVAL;
}
switch (mode) {
case NVGPU_DBG_HWPM_CTXSW_MODE_CTXSW:
if (pm_ctx->pm_mode == ctxsw_prog_main_image_pm_mode_ctxsw_f()) {
if (pm_ctx->pm_mode ==
g->ops.gr.ctxsw_prog.hw_get_pm_mode_ctxsw()) {
return 0;
}
break;
case NVGPU_DBG_HWPM_CTXSW_MODE_NO_CTXSW:
if (pm_ctx->pm_mode == ctxsw_prog_main_image_pm_mode_no_ctxsw_f()) {
if (pm_ctx->pm_mode ==
g->ops.gr.ctxsw_prog.hw_get_pm_mode_no_ctxsw()) {
return 0;
}
break;
case NVGPU_DBG_HWPM_CTXSW_MODE_STREAM_OUT_CTXSW:
if (pm_ctx->pm_mode == g->ops.gr.get_hw_accessor_stream_out_mode()) {
if (pm_ctx->pm_mode ==
g->ops.gr.ctxsw_prog.hw_get_pm_mode_stream_out_ctxsw()) {
return 0;
}
break;
@@ -1711,37 +1682,34 @@ int gr_gk20a_update_hwpm_ctxsw_mode(struct gk20a *g,
}
}
data = nvgpu_mem_rd(g, gr_mem, ctxsw_prog_main_image_pm_o());
data = data & ~ctxsw_prog_main_image_pm_mode_m();
switch (mode) {
case NVGPU_DBG_HWPM_CTXSW_MODE_CTXSW:
pm_ctx->pm_mode = ctxsw_prog_main_image_pm_mode_ctxsw_f();
pm_ctx->pm_mode =
g->ops.gr.ctxsw_prog.set_pm_mode_ctxsw(g, gr_mem);
virt_addr = pm_ctx->mem.gpu_va;
break;
case NVGPU_DBG_HWPM_CTXSW_MODE_STREAM_OUT_CTXSW:
pm_ctx->pm_mode = g->ops.gr.get_hw_accessor_stream_out_mode();
pm_ctx->pm_mode =
g->ops.gr.ctxsw_prog.set_pm_mode_stream_out_ctxsw(g, gr_mem);
virt_addr = pm_ctx->mem.gpu_va;
break;
case NVGPU_DBG_HWPM_CTXSW_MODE_NO_CTXSW:
pm_ctx->pm_mode = ctxsw_prog_main_image_pm_mode_no_ctxsw_f();
pm_ctx->pm_mode =
g->ops.gr.ctxsw_prog.set_pm_mode_no_ctxsw(g, gr_mem);
virt_addr = 0;
}
data |= pm_ctx->pm_mode;
nvgpu_mem_wr(g, gr_mem, ctxsw_prog_main_image_pm_o(), data);
if (ctxheader->gpu_va != 0ULL) {
struct channel_gk20a *ch;
nvgpu_rwsem_down_read(&tsg->ch_list_lock);
nvgpu_list_for_each_entry(ch, &tsg->ch_list, channel_gk20a, ch_entry) {
g->ops.gr.write_pm_ptr(g, &ch->ctx_header, virt_addr);
g->ops.gr.ctxsw_prog.set_pm_ptr(g, &ch->ctx_header,
virt_addr);
}
nvgpu_rwsem_up_read(&tsg->ch_list_lock);
} else {
g->ops.gr.write_pm_ptr(g, gr_mem, virt_addr);
g->ops.gr.ctxsw_prog.set_pm_ptr(g, gr_mem, virt_addr);
}
/* enable channel */
@@ -1750,26 +1718,13 @@ int gr_gk20a_update_hwpm_ctxsw_mode(struct gk20a *g,
return 0;
}
void gk20a_gr_init_ctxsw_hdr_data(struct gk20a *g,
struct nvgpu_mem *mem)
{
nvgpu_mem_wr(g, mem,
ctxsw_prog_main_image_num_save_ops_o(), 0);
nvgpu_mem_wr(g, mem,
ctxsw_prog_main_image_num_restore_ops_o(), 0);
}
/* load saved fresh copy of gloden image into channel gr_ctx */
int gr_gk20a_load_golden_ctx_image(struct gk20a *g,
struct channel_gk20a *c,
struct nvgpu_gr_ctx *gr_ctx)
{
struct gr_gk20a *gr = &g->gr;
u32 virt_addr_lo;
u32 virt_addr_hi;
u64 virt_addr = 0;
u32 v, data;
int ret = 0;
struct nvgpu_mem *mem;
nvgpu_log_fn(g, " ");
@@ -1787,8 +1742,8 @@ int gr_gk20a_load_golden_ctx_image(struct gk20a *g,
gr->ctx_vars.local_golden_image,
gr->ctx_vars.golden_image_size);
if (g->ops.gr.init_ctxsw_hdr_data != NULL) {
g->ops.gr.init_ctxsw_hdr_data(g, mem);
if (g->ops.gr.ctxsw_prog.init_ctxsw_hdr_data != NULL) {
g->ops.gr.ctxsw_prog.init_ctxsw_hdr_data(g, mem);
}
if ((g->ops.gr.enable_cde_in_fecs != NULL) && c->cde) {
@@ -1796,32 +1751,13 @@ int gr_gk20a_load_golden_ctx_image(struct gk20a *g,
}
/* set priv access map */
virt_addr_lo =
u64_lo32(gr_ctx->global_ctx_buffer_va[PRIV_ACCESS_MAP_VA]);
virt_addr_hi =
u64_hi32(gr_ctx->global_ctx_buffer_va[PRIV_ACCESS_MAP_VA]);
if (g->allow_all) {
data = ctxsw_prog_main_image_priv_access_map_config_mode_allow_all_f();
} else {
data = ctxsw_prog_main_image_priv_access_map_config_mode_use_map_f();
}
nvgpu_mem_wr(g, mem, ctxsw_prog_main_image_priv_access_map_config_o(),
data);
nvgpu_mem_wr(g, mem,
ctxsw_prog_main_image_priv_access_map_addr_lo_o(),
virt_addr_lo);
nvgpu_mem_wr(g, mem,
ctxsw_prog_main_image_priv_access_map_addr_hi_o(),
virt_addr_hi);
g->ops.gr.ctxsw_prog.set_priv_access_map_config_mode(g, mem,
g->allow_all);
g->ops.gr.ctxsw_prog.set_priv_access_map_addr(g, mem,
gr_ctx->global_ctx_buffer_va[PRIV_ACCESS_MAP_VA]);
/* disable verif features */
v = nvgpu_mem_rd(g, mem, ctxsw_prog_main_image_misc_options_o());
v = v & ~(ctxsw_prog_main_image_misc_options_verif_features_m());
v = v | ctxsw_prog_main_image_misc_options_verif_features_disabled_f();
nvgpu_mem_wr(g, mem, ctxsw_prog_main_image_misc_options_o(), v);
g->ops.gr.ctxsw_prog.disable_verif_features(g, mem);
if (g->ops.gr.update_ctxsw_preemption_mode != NULL) {
g->ops.gr.update_ctxsw_preemption_mode(g, gr_ctx, &c->ctx_header);
@@ -1831,26 +1767,19 @@ int gr_gk20a_load_golden_ctx_image(struct gk20a *g,
g->ops.gr.update_boosted_ctx(g, mem, gr_ctx);
}
virt_addr_lo = u64_lo32(gr_ctx->patch_ctx.mem.gpu_va);
virt_addr_hi = u64_hi32(gr_ctx->patch_ctx.mem.gpu_va);
nvgpu_log(g, gpu_dbg_info, "write patch count = %d",
gr_ctx->patch_ctx.data_count);
nvgpu_mem_wr(g, mem, ctxsw_prog_main_image_patch_count_o(),
gr_ctx->patch_ctx.data_count);
nvgpu_mem_wr(g, mem,
ctxsw_prog_main_image_patch_adr_lo_o(),
virt_addr_lo);
nvgpu_mem_wr(g, mem,
ctxsw_prog_main_image_patch_adr_hi_o(),
virt_addr_hi);
g->ops.gr.ctxsw_prog.set_patch_count(g, mem,
gr_ctx->patch_ctx.data_count);
g->ops.gr.ctxsw_prog.set_patch_addr(g, mem,
gr_ctx->patch_ctx.mem.gpu_va);
/* Update main header region of the context buffer with the info needed
* for PM context switching, including mode and possibly a pointer to
* the PM backing store.
*/
if (gr_ctx->pm_ctx.pm_mode != ctxsw_prog_main_image_pm_mode_no_ctxsw_f()) {
if (gr_ctx->pm_ctx.pm_mode !=
g->ops.gr.ctxsw_prog.hw_get_pm_mode_no_ctxsw()) {
if (gr_ctx->pm_ctx.mem.gpu_va == 0ULL) {
nvgpu_err(g,
"context switched pm with no pm buffer!");
@@ -1862,15 +1791,10 @@ int gr_gk20a_load_golden_ctx_image(struct gk20a *g,
virt_addr = 0;
}
data = nvgpu_mem_rd(g, mem, ctxsw_prog_main_image_pm_o());
data = data & ~ctxsw_prog_main_image_pm_mode_m();
data |= gr_ctx->pm_ctx.pm_mode;
g->ops.gr.ctxsw_prog.set_pm_mode(g, mem, gr_ctx->pm_ctx.pm_mode);
g->ops.gr.ctxsw_prog.set_pm_ptr(g, mem, virt_addr);
nvgpu_mem_wr(g, mem, ctxsw_prog_main_image_pm_o(), data);
g->ops.gr.write_pm_ptr(g, mem, virt_addr);
return ret;
return 0;
}
static void gr_gk20a_start_falcon_ucode(struct gk20a *g)
@@ -2959,7 +2883,8 @@ int gk20a_alloc_obj_ctx(struct channel_gk20a *c, u32 class_num, u32 flags)
}
/* PM ctxt switch is off by default */
gr_ctx->pm_ctx.pm_mode = ctxsw_prog_main_image_pm_mode_no_ctxsw_f();
gr_ctx->pm_ctx.pm_mode =
g->ops.gr.ctxsw_prog.hw_get_pm_mode_no_ctxsw();
} else {
/* commit gr ctx buffer */
err = g->ops.gr.commit_inst(c, gr_ctx->mem.gpu_va);
@@ -6654,8 +6579,6 @@ static int gr_gk20a_ctx_patch_smpc(struct gk20a *g,
u32 num_tpc;
u32 tpc, gpc, reg;
u32 chk_addr;
u32 vaddr_lo;
u32 vaddr_hi;
u32 tmp;
u32 num_ovr_perf_regs = 0;
u32 *ovr_perf_regs = NULL;
@@ -6682,8 +6605,8 @@ static int gr_gk20a_ctx_patch_smpc(struct gk20a *g,
/* reset the patch count from previous
runs,if ucode has already processed
it */
tmp = nvgpu_mem_rd(g, mem,
ctxsw_prog_main_image_patch_count_o());
tmp = g->ops.gr.ctxsw_prog.get_patch_count(g,
mem);
if (tmp == 0U) {
gr_ctx->patch_ctx.data_count = 0;
@@ -6692,26 +6615,17 @@ static int gr_gk20a_ctx_patch_smpc(struct gk20a *g,
gr_gk20a_ctx_patch_write(g, gr_ctx,
addr, data, true);
vaddr_lo = u64_lo32(gr_ctx->patch_ctx.mem.gpu_va);
vaddr_hi = u64_hi32(gr_ctx->patch_ctx.mem.gpu_va);
g->ops.gr.ctxsw_prog.set_patch_count(g, mem,
gr_ctx->patch_ctx.data_count);
nvgpu_mem_wr(g, mem,
ctxsw_prog_main_image_patch_count_o(),
gr_ctx->patch_ctx.data_count);
if (ctxheader->gpu_va != 0ULL) {
nvgpu_mem_wr(g, ctxheader,
ctxsw_prog_main_image_patch_adr_lo_o(),
vaddr_lo);
nvgpu_mem_wr(g, ctxheader,
ctxsw_prog_main_image_patch_adr_hi_o(),
vaddr_hi);
g->ops.gr.ctxsw_prog.set_patch_addr(g,
ctxheader,
gr_ctx->patch_ctx.mem.gpu_va);
} else {
nvgpu_mem_wr(g, mem,
ctxsw_prog_main_image_patch_adr_lo_o(),
vaddr_lo);
nvgpu_mem_wr(g, mem,
ctxsw_prog_main_image_patch_adr_hi_o(),
vaddr_hi);
g->ops.gr.ctxsw_prog.set_patch_addr(g,
mem,
gr_ctx->patch_ctx.mem.gpu_va);
}
/* we're not caching these on cpu side,
@@ -6726,24 +6640,6 @@ static int gr_gk20a_ctx_patch_smpc(struct gk20a *g,
#define ILLEGAL_ID ((u32)~0)
static inline bool check_main_image_header_magic(u8 *context)
{
u32 magic = *(u32 *)(context + ctxsw_prog_main_image_magic_value_o());
return magic == ctxsw_prog_main_image_magic_value_v_value_v();
}
static inline bool check_local_header_magic(u8 *context)
{
u32 magic = *(u32 *)(context + ctxsw_prog_local_magic_value_o());
return magic == ctxsw_prog_local_magic_value_v_value_v();
}
/* most likely dupe of ctxsw_gpccs_header__size_1_v() */
static inline u32 ctxsw_prog_ucode_header_size_in_bytes(void)
{
return 256U;
}
void gk20a_gr_get_ovr_perf_regs(struct gk20a *g, u32 *num_ovr_perf_regs,
u32 **ovr_perf_regs)
{
@@ -6758,9 +6654,9 @@ static int gr_gk20a_find_priv_offset_in_ext_buffer(struct gk20a *g,
u32 context_buffer_size,
u32 *priv_offset)
{
u32 i, data32;
u32 i;
u32 gpc_num, tpc_num;
u32 num_gpcs, num_tpcs;
u32 num_gpcs;
u32 chk_addr;
u32 ext_priv_offset, ext_priv_size;
u8 *context;
@@ -6809,18 +6705,18 @@ static int gr_gk20a_find_priv_offset_in_ext_buffer(struct gk20a *g,
return -EINVAL;
}
buffer_segments_size = ctxsw_prog_extended_buffer_segments_size_in_bytes_v();
buffer_segments_size = g->ops.gr.ctxsw_prog.hw_get_extended_buffer_segments_size_in_bytes();
/* note below is in words/num_registers */
marker_size = ctxsw_prog_extended_marker_size_in_bytes_v() >> 2;
marker_size = g->ops.gr.ctxsw_prog.hw_extended_marker_size_in_bytes() >> 2;
context = (u8 *)context_buffer;
/* sanity check main header */
if (!check_main_image_header_magic(context)) {
if (!g->ops.gr.ctxsw_prog.check_main_image_header_magic(context)) {
nvgpu_err(g,
"Invalid main header: magic value");
return -EINVAL;
}
num_gpcs = *(u32 *)(context + ctxsw_prog_main_image_num_gpcs_o());
num_gpcs = g->ops.gr.ctxsw_prog.get_num_gpcs(context);
if (gpc_num >= num_gpcs) {
nvgpu_err(g,
"GPC 0x%08x is greater than total count 0x%08x!",
@@ -6828,21 +6724,20 @@ static int gr_gk20a_find_priv_offset_in_ext_buffer(struct gk20a *g,
return -EINVAL;
}
data32 = *(u32 *)(context + ctxsw_prog_main_extended_buffer_ctl_o());
ext_priv_size = ctxsw_prog_main_extended_buffer_ctl_size_v(data32);
g->ops.gr.ctxsw_prog.get_extended_buffer_size_offset(context,
&ext_priv_size, &ext_priv_offset);
if (0U == ext_priv_size) {
nvgpu_log_info(g, " No extended memory in context buffer");
return -EINVAL;
}
ext_priv_offset = ctxsw_prog_main_extended_buffer_ctl_offset_v(data32);
offset_to_segment = ext_priv_offset * ctxsw_prog_ucode_header_size_in_bytes();
offset_to_segment = ext_priv_offset * 256U;
offset_to_segment_end = offset_to_segment +
(ext_priv_size * buffer_segments_size);
/* check local header magic */
context += ctxsw_prog_ucode_header_size_in_bytes();
if (!check_local_header_magic(context)) {
context += g->ops.gr.ctxsw_prog.hw_get_fecs_header_size();
if (!g->ops.gr.ctxsw_prog.check_local_header_magic(context)) {
nvgpu_err(g,
"Invalid local header: magic value");
return -EINVAL;
@@ -6937,8 +6832,6 @@ static int gr_gk20a_find_priv_offset_in_ext_buffer(struct gk20a *g,
offset_to_segment += (num_ext_gpccs_ext_buffer_segments *
buffer_segments_size * gpc_num);
num_tpcs = g->gr.gpc_tpc_count[gpc_num];
/* skip the head marker to start with */
inter_seg_offset = marker_size;
@@ -6949,23 +6842,7 @@ static int gr_gk20a_find_priv_offset_in_ext_buffer(struct gk20a *g,
(tpc_num * control_register_stride) +
sm_dsm_perf_ctrl_reg_id;
} else {
/* skip all the control registers */
inter_seg_offset = inter_seg_offset +
(num_tpcs * control_register_stride);
/* skip the marker between control and counter segments */
inter_seg_offset += marker_size;
/* skip over counter regs of TPCs before the one we want */
inter_seg_offset = inter_seg_offset +
(tpc_num * perf_register_stride) *
ctxsw_prog_extended_num_smpc_quadrants_v();
/* skip over the register for the quadrants we do not want.
* then skip to the register in this tpc */
inter_seg_offset = inter_seg_offset +
(perf_register_stride * quad) +
sm_dsm_perf_reg_id;
return -EINVAL;
}
/* set the offset to the segment offset plus the inter segment offset to
@@ -7146,7 +7023,6 @@ static int gr_gk20a_determine_ppc_configuration(struct gk20a *g,
u32 *num_ppcs, u32 *ppc_mask,
u32 *reg_ppc_count)
{
u32 data32;
u32 num_pes_per_gpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_PES_PER_GPC);
/*
@@ -7159,11 +7035,7 @@ static int gr_gk20a_determine_ppc_configuration(struct gk20a *g,
return -EINVAL;
}
data32 = *(u32 *)(context + ctxsw_prog_local_image_ppc_info_o());
*num_ppcs = ctxsw_prog_local_image_ppc_info_num_ppcs_v(data32);
*ppc_mask = ctxsw_prog_local_image_ppc_info_ppc_mask_v(data32);
g->ops.gr.ctxsw_prog.get_ppc_info(context, num_ppcs, ppc_mask);
*reg_ppc_count = g->netlist_vars->ctxsw_regs.ppc.count;
return 0;
@@ -7242,7 +7114,7 @@ static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g,
u32 context_buffer_size,
u32 *priv_offset)
{
u32 i, data32;
u32 i;
int err;
enum ctxsw_addr_type addr_type;
u32 broadcast_flags;
@@ -7267,22 +7139,23 @@ static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g,
}
context = (u8 *)context_buffer;
if (!check_main_image_header_magic(context)) {
if (!g->ops.gr.ctxsw_prog.check_main_image_header_magic(context)) {
nvgpu_err(g,
"Invalid main header: magic value");
return -EINVAL;
}
num_gpcs = *(u32 *)(context + ctxsw_prog_main_image_num_gpcs_o());
num_gpcs = g->ops.gr.ctxsw_prog.get_num_gpcs(context);
/* Parse the FECS local header. */
context += ctxsw_prog_ucode_header_size_in_bytes();
if (!check_local_header_magic(context)) {
context += g->ops.gr.ctxsw_prog.hw_get_fecs_header_size();
if (!g->ops.gr.ctxsw_prog.check_local_header_magic(context)) {
nvgpu_err(g,
"Invalid FECS local header: magic value");
return -EINVAL;
}
data32 = *(u32 *)(context + ctxsw_prog_local_priv_register_ctl_o());
sys_priv_offset = ctxsw_prog_local_priv_register_ctl_offset_v(data32);
sys_priv_offset =
g->ops.gr.ctxsw_prog.get_local_priv_register_ctl_offset(context);
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "sys_priv_offset=0x%x", sys_priv_offset);
/* If found in Ext buffer, ok.
@@ -7302,8 +7175,7 @@ static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g,
if ((addr_type == CTXSW_ADDR_TYPE_SYS) ||
(addr_type == CTXSW_ADDR_TYPE_BE)) {
/* Find the offset in the FECS segment. */
offset_to_segment = sys_priv_offset *
ctxsw_prog_ucode_header_size_in_bytes();
offset_to_segment = sys_priv_offset * 256U;
err = gr_gk20a_process_context_buffer_priv_segment(g,
addr_type, addr,
@@ -7326,15 +7198,14 @@ static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g,
/* Parse the GPCCS local header(s).*/
for (i = 0; i < num_gpcs; i++) {
context += ctxsw_prog_ucode_header_size_in_bytes();
if (!check_local_header_magic(context)) {
context += g->ops.gr.ctxsw_prog.hw_get_gpccs_header_size();
if (!g->ops.gr.ctxsw_prog.check_local_header_magic(context)) {
nvgpu_err(g,
"Invalid GPCCS local header: magic value");
return -EINVAL;
}
data32 = *(u32 *)(context + ctxsw_prog_local_priv_register_ctl_o());
gpc_priv_offset = ctxsw_prog_local_priv_register_ctl_offset_v(data32);
gpc_priv_offset = g->ops.gr.ctxsw_prog.get_local_priv_register_ctl_offset(context);
err = gr_gk20a_determine_ppc_configuration(g, context,
&num_ppcs, &ppc_mask,
@@ -7345,7 +7216,7 @@ static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g,
}
num_tpcs = *(u32 *)(context + ctxsw_prog_local_image_num_tpcs_o());
num_tpcs = g->ops.gr.ctxsw_prog.get_num_tpcs(context);
if ((i == gpc_num) && ((tpc_num + 1U) > num_tpcs)) {
nvgpu_err(g,
@@ -7359,8 +7230,7 @@ static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g,
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg,
"gpc_priv_offset 0x%#08x",
gpc_priv_offset);
offset_to_segment = gpc_priv_offset *
ctxsw_prog_ucode_header_size_in_bytes();
offset_to_segment = gpc_priv_offset * 256U;
err = g->ops.gr.get_offset_in_gpccs_segment(g,
addr_type,

View File

@@ -738,12 +738,6 @@ int gr_gk20a_init_sm_id_table(struct gk20a *g);
int gr_gk20a_commit_inst(struct channel_gk20a *c, u64 gpu_va);
void gr_gk20a_write_zcull_ptr(struct gk20a *g,
struct nvgpu_mem *mem, u64 gpu_va);
void gr_gk20a_write_pm_ptr(struct gk20a *g,
struct nvgpu_mem *mem, u64 gpu_va);
u32 gk20a_gr_gpc_offset(struct gk20a *g, u32 gpc);
u32 gk20a_gr_tpc_offset(struct gk20a *g, u32 tpc);
void gk20a_gr_get_esr_sm_sel(struct gk20a *g, u32 gpc, u32 tpc,
@@ -751,8 +745,6 @@ void gk20a_gr_get_esr_sm_sel(struct gk20a *g, u32 gpc, u32 tpc,
void gk20a_gr_init_ovr_sm_dsm_perf(void);
void gk20a_gr_get_ovr_perf_regs(struct gk20a *g, u32 *num_ovr_perf_regs,
u32 **ovr_perf_regs);
void gk20a_gr_init_ctxsw_hdr_data(struct gk20a *g,
struct nvgpu_mem *mem);
u32 gr_gk20a_get_patch_slots(struct gk20a *g);
int gk20a_gr_handle_notify_pending(struct gk20a *g,
struct gr_gk20a_isr_data *isr_data);

View File

@@ -39,7 +39,6 @@
#include <nvgpu/hw/gm20b/hw_gr_gm20b.h>
#include <nvgpu/hw/gm20b/hw_fifo_gm20b.h>
#include <nvgpu/hw/gm20b/hw_top_gm20b.h>
#include <nvgpu/hw/gm20b/hw_ctxsw_prog_gm20b.h>
#include <nvgpu/hw/gm20b/hw_perf_gm20b.h>
void gr_gm20b_init_gpc_mmu(struct gk20a *g)
@@ -537,7 +536,7 @@ void gr_gm20b_get_sm_dsm_perf_ctrl_regs(struct gk20a *g,
*sm_dsm_perf_ctrl_regs = _sm_dsm_perf_ctrl_regs;
*ctrl_register_stride =
ctxsw_prog_extended_sm_dsm_perf_counter_control_register_stride_v();
g->ops.gr.ctxsw_prog.hw_get_perf_counter_control_register_stride();
}
u32 gr_gm20b_get_gpc_mask(struct gk20a *g)
@@ -908,16 +907,11 @@ int gr_gm20b_alloc_gr_ctx(struct gk20a *g,
void gr_gm20b_update_ctxsw_preemption_mode(struct gk20a *g,
struct nvgpu_gr_ctx *gr_ctx, struct nvgpu_mem *ctxheader)
{
u32 cta_preempt_option =
ctxsw_prog_main_image_preemption_options_control_cta_enabled_f();
nvgpu_log_fn(g, " ");
if (gr_ctx->compute_preempt_mode == NVGPU_PREEMPTION_MODE_COMPUTE_CTA) {
nvgpu_log_info(g, "CTA: %x", cta_preempt_option);
nvgpu_mem_wr(g, &gr_ctx->mem,
ctxsw_prog_main_image_preemption_options_o(),
cta_preempt_option);
g->ops.gr.ctxsw_prog.set_compute_preemption_mode_cta(g,
&gr_ctx->mem);
}
nvgpu_log_fn(g, "done");
@@ -1069,7 +1063,6 @@ int gr_gm20b_update_pc_sampling(struct channel_gk20a *c,
struct tsg_gk20a *tsg;
struct nvgpu_gr_ctx *gr_ctx;
struct nvgpu_mem *mem;
u32 v;
nvgpu_log_fn(c->g, " ");
@@ -1084,11 +1077,7 @@ int gr_gm20b_update_pc_sampling(struct channel_gk20a *c,
return -EINVAL;
}
v = nvgpu_mem_rd(c->g, mem, ctxsw_prog_main_image_pm_o());
v &= ~ctxsw_prog_main_image_pm_pc_sampling_m();
v |= ctxsw_prog_main_image_pm_pc_sampling_f(enable);
nvgpu_mem_wr(c->g, mem, ctxsw_prog_main_image_pm_o(), v);
c->g->ops.gr.ctxsw_prog.set_pc_sampling(c->g, mem, enable);
nvgpu_log_fn(c->g, "done");
@@ -1176,11 +1165,7 @@ void gr_gm20b_init_cyclestats(struct gk20a *g)
void gr_gm20b_enable_cde_in_fecs(struct gk20a *g, struct nvgpu_mem *mem)
{
u32 cde_v;
cde_v = nvgpu_mem_rd(g, mem, ctxsw_prog_main_image_ctl_o());
cde_v |= ctxsw_prog_main_image_ctl_cde_enabled_f();
nvgpu_mem_wr(g, mem, ctxsw_prog_main_image_ctl_o(), cde_v);
g->ops.gr.ctxsw_prog.set_cde_enabled(g, mem);
}
void gr_gm20b_bpt_reg_info(struct gk20a *g, struct nvgpu_warpstate *w_state)

View File

@@ -41,6 +41,7 @@
#include "common/ptimer/ptimer_gk20a.h"
#include "common/fb/fb_gm20b.h"
#include "common/netlist/netlist_gm20b.h"
#include "common/gr/ctxsw_prog/ctxsw_prog_gm20b.h"
#include "common/therm/therm_gm20b.h"
#include "common/ltc/ltc_gm20b.h"
#include "common/fuse/fuse_gm20b.h"
@@ -288,8 +289,6 @@ static const struct gpu_ops gm20b_ops = {
.program_zcull_mapping = gr_gk20a_program_zcull_mapping,
.commit_global_timeslice = gr_gk20a_commit_global_timeslice,
.commit_inst = gr_gk20a_commit_inst,
.write_zcull_ptr = gr_gk20a_write_zcull_ptr,
.write_pm_ptr = gr_gk20a_write_pm_ptr,
.load_tpc_mask = gr_gm20b_load_tpc_mask,
.trigger_suspend = gr_gk20a_trigger_suspend,
.wait_for_pause = gr_gk20a_wait_for_pause,
@@ -312,7 +311,6 @@ static const struct gpu_ops gm20b_ops = {
.init_ovr_sm_dsm_perf = gk20a_gr_init_ovr_sm_dsm_perf,
.get_ovr_perf_regs = gk20a_gr_get_ovr_perf_regs,
.disable_rd_coalesce = gm20a_gr_disable_rd_coalesce,
.init_ctxsw_hdr_data = gk20a_gr_init_ctxsw_hdr_data,
.fecs_host_int_enable = gr_gk20a_fecs_host_int_enable,
.handle_notify_pending = gk20a_gr_handle_notify_pending,
.handle_semaphore_pending = gk20a_gr_handle_semaphore_pending,
@@ -335,6 +333,72 @@ static const struct gpu_ops gm20b_ops = {
gk20a_gr_get_fecs_ctx_state_store_major_rev_id,
.alloc_gfxp_rtv_cb = NULL,
.commit_gfxp_rtv_cb = NULL,
.ctxsw_prog = {
.hw_get_fecs_header_size =
gm20b_ctxsw_prog_hw_get_fecs_header_size,
.hw_get_gpccs_header_size =
gm20b_ctxsw_prog_hw_get_gpccs_header_size,
.hw_get_extended_buffer_segments_size_in_bytes =
gm20b_ctxsw_prog_hw_get_extended_buffer_segments_size_in_bytes,
.hw_extended_marker_size_in_bytes =
gm20b_ctxsw_prog_hw_extended_marker_size_in_bytes,
.hw_get_perf_counter_control_register_stride =
gm20b_ctxsw_prog_hw_get_perf_counter_control_register_stride,
.get_main_image_ctx_id =
gm20b_ctxsw_prog_get_main_image_ctx_id,
.get_patch_count = gm20b_ctxsw_prog_get_patch_count,
.set_patch_count = gm20b_ctxsw_prog_set_patch_count,
.set_patch_addr = gm20b_ctxsw_prog_set_patch_addr,
.set_zcull_ptr = gm20b_ctxsw_prog_set_zcull_ptr,
.set_zcull = gm20b_ctxsw_prog_set_zcull,
.set_zcull_mode_no_ctxsw =
gm20b_ctxsw_prog_set_zcull_mode_no_ctxsw,
.is_zcull_mode_separate_buffer =
gm20b_ctxsw_prog_is_zcull_mode_separate_buffer,
.set_pm_ptr = gm20b_ctxsw_prog_set_pm_ptr,
.set_pm_mode = gm20b_ctxsw_prog_set_pm_mode,
.set_pm_smpc_mode = gm20b_ctxsw_prog_set_pm_smpc_mode,
.set_pm_mode_no_ctxsw =
gm20b_ctxsw_prog_set_pm_mode_no_ctxsw,
.set_pm_mode_ctxsw = gm20b_ctxsw_prog_set_pm_mode_ctxsw,
.hw_get_pm_mode_no_ctxsw =
gm20b_ctxsw_prog_hw_get_pm_mode_no_ctxsw,
.hw_get_pm_mode_ctxsw = gm20b_ctxsw_prog_hw_get_pm_mode_ctxsw,
.init_ctxsw_hdr_data = gm20b_ctxsw_prog_init_ctxsw_hdr_data,
.set_compute_preemption_mode_cta =
gm20b_ctxsw_prog_set_compute_preemption_mode_cta,
.set_cde_enabled = gm20b_ctxsw_prog_set_cde_enabled,
.set_pc_sampling = gm20b_ctxsw_prog_set_pc_sampling,
.set_priv_access_map_config_mode =
gm20b_ctxsw_prog_set_priv_access_map_config_mode,
.set_priv_access_map_addr =
gm20b_ctxsw_prog_set_priv_access_map_addr,
.disable_verif_features =
gm20b_ctxsw_prog_disable_verif_features,
.check_main_image_header_magic =
gm20b_ctxsw_prog_check_main_image_header_magic,
.check_local_header_magic =
gm20b_ctxsw_prog_check_local_header_magic,
.get_num_gpcs = gm20b_ctxsw_prog_get_num_gpcs,
.get_num_tpcs = gm20b_ctxsw_prog_get_num_tpcs,
.get_extended_buffer_size_offset =
gm20b_ctxsw_prog_get_extended_buffer_size_offset,
.get_ppc_info = gm20b_ctxsw_prog_get_ppc_info,
.get_local_priv_register_ctl_offset =
gm20b_ctxsw_prog_get_local_priv_register_ctl_offset,
.hw_get_ts_tag_invalid_timestamp =
gm20b_ctxsw_prog_hw_get_ts_tag_invalid_timestamp,
.hw_get_ts_tag = gm20b_ctxsw_prog_hw_get_ts_tag,
.hw_record_ts_timestamp =
gm20b_ctxsw_prog_hw_record_ts_timestamp,
.hw_get_ts_record_size_in_bytes =
gm20b_ctxsw_prog_hw_get_ts_record_size_in_bytes,
.is_ts_valid_record = gm20b_ctxsw_prog_is_ts_valid_record,
.get_ts_buffer_aperture_mask =
gm20b_ctxsw_prog_get_ts_buffer_aperture_mask,
.set_ts_num_records = gm20b_ctxsw_prog_set_ts_num_records,
.set_ts_buffer_ptr = gm20b_ctxsw_prog_set_ts_buffer_ptr,
}
},
.fb = {
.init_hw = gm20b_fb_init_hw,
@@ -695,6 +759,7 @@ int gm20b_init_hal(struct gk20a *g)
gops->ltc = gm20b_ops.ltc;
gops->ce2 = gm20b_ops.ce2;
gops->gr = gm20b_ops.gr;
gops->gr.ctxsw_prog = gm20b_ops.gr.ctxsw_prog;
gops->fb = gm20b_ops.fb;
gops->clock_gating = gm20b_ops.clock_gating;
gops->fifo = gm20b_ops.fifo;

View File

@@ -33,6 +33,8 @@
#include "common/fb/fb_gm20b.h"
#include "common/fb/fb_gp106.h"
#include "common/netlist/netlist_gp106.h"
#include "common/gr/ctxsw_prog/ctxsw_prog_gm20b.h"
#include "common/gr/ctxsw_prog/ctxsw_prog_gp10b.h"
#include "common/xve/xve_gp106.h"
#include "common/therm/therm_gm20b.h"
#include "common/therm/therm_gp106.h"
@@ -354,8 +356,6 @@ static const struct gpu_ops gp106_ops = {
.program_zcull_mapping = gr_gk20a_program_zcull_mapping,
.commit_global_timeslice = gr_gk20a_commit_global_timeslice,
.commit_inst = gr_gk20a_commit_inst,
.write_zcull_ptr = gr_gk20a_write_zcull_ptr,
.write_pm_ptr = gr_gk20a_write_pm_ptr,
.load_tpc_mask = gr_gm20b_load_tpc_mask,
.trigger_suspend = gr_gk20a_trigger_suspend,
.wait_for_pause = gr_gk20a_wait_for_pause,
@@ -412,6 +412,81 @@ static const struct gpu_ops gp106_ops = {
gk20a_gr_get_fecs_ctx_state_store_major_rev_id,
.alloc_gfxp_rtv_cb = NULL,
.commit_gfxp_rtv_cb = NULL,
.ctxsw_prog = {
.hw_get_fecs_header_size =
gm20b_ctxsw_prog_hw_get_fecs_header_size,
.hw_get_gpccs_header_size =
gm20b_ctxsw_prog_hw_get_gpccs_header_size,
.hw_get_extended_buffer_segments_size_in_bytes =
gm20b_ctxsw_prog_hw_get_extended_buffer_segments_size_in_bytes,
.hw_extended_marker_size_in_bytes =
gm20b_ctxsw_prog_hw_extended_marker_size_in_bytes,
.hw_get_perf_counter_control_register_stride =
gm20b_ctxsw_prog_hw_get_perf_counter_control_register_stride,
.get_main_image_ctx_id =
gm20b_ctxsw_prog_get_main_image_ctx_id,
.get_patch_count = gm20b_ctxsw_prog_get_patch_count,
.set_patch_count = gm20b_ctxsw_prog_set_patch_count,
.set_patch_addr = gm20b_ctxsw_prog_set_patch_addr,
.set_zcull_ptr = gm20b_ctxsw_prog_set_zcull_ptr,
.set_zcull = gm20b_ctxsw_prog_set_zcull,
.set_zcull_mode_no_ctxsw =
gm20b_ctxsw_prog_set_zcull_mode_no_ctxsw,
.is_zcull_mode_separate_buffer =
gm20b_ctxsw_prog_is_zcull_mode_separate_buffer,
.set_pm_ptr = gm20b_ctxsw_prog_set_pm_ptr,
.set_pm_mode = gm20b_ctxsw_prog_set_pm_mode,
.set_pm_smpc_mode = gm20b_ctxsw_prog_set_pm_smpc_mode,
.set_pm_mode_no_ctxsw =
gm20b_ctxsw_prog_set_pm_mode_no_ctxsw,
.set_pm_mode_ctxsw = gm20b_ctxsw_prog_set_pm_mode_ctxsw,
.hw_get_pm_mode_no_ctxsw =
gm20b_ctxsw_prog_hw_get_pm_mode_no_ctxsw,
.hw_get_pm_mode_ctxsw = gm20b_ctxsw_prog_hw_get_pm_mode_ctxsw,
.init_ctxsw_hdr_data = gp10b_ctxsw_prog_init_ctxsw_hdr_data,
.set_compute_preemption_mode_cta =
gp10b_ctxsw_prog_set_compute_preemption_mode_cta,
.set_compute_preemption_mode_cilp =
gp10b_ctxsw_prog_set_compute_preemption_mode_cilp,
.set_graphics_preemption_mode_gfxp =
gp10b_ctxsw_prog_set_graphics_preemption_mode_gfxp,
.set_cde_enabled = gm20b_ctxsw_prog_set_cde_enabled,
.set_pc_sampling = gm20b_ctxsw_prog_set_pc_sampling,
.set_priv_access_map_config_mode =
gm20b_ctxsw_prog_set_priv_access_map_config_mode,
.set_priv_access_map_addr =
gm20b_ctxsw_prog_set_priv_access_map_addr,
.disable_verif_features =
gm20b_ctxsw_prog_disable_verif_features,
.check_main_image_header_magic =
gm20b_ctxsw_prog_check_main_image_header_magic,
.check_local_header_magic =
gm20b_ctxsw_prog_check_local_header_magic,
.get_num_gpcs = gm20b_ctxsw_prog_get_num_gpcs,
.get_num_tpcs = gm20b_ctxsw_prog_get_num_tpcs,
.get_extended_buffer_size_offset =
gm20b_ctxsw_prog_get_extended_buffer_size_offset,
.get_ppc_info = gm20b_ctxsw_prog_get_ppc_info,
.get_local_priv_register_ctl_offset =
gm20b_ctxsw_prog_get_local_priv_register_ctl_offset,
.hw_get_ts_tag_invalid_timestamp =
gm20b_ctxsw_prog_hw_get_ts_tag_invalid_timestamp,
.hw_get_ts_tag = gm20b_ctxsw_prog_hw_get_ts_tag,
.hw_record_ts_timestamp =
gm20b_ctxsw_prog_hw_record_ts_timestamp,
.hw_get_ts_record_size_in_bytes =
gm20b_ctxsw_prog_hw_get_ts_record_size_in_bytes,
.is_ts_valid_record = gm20b_ctxsw_prog_is_ts_valid_record,
.get_ts_buffer_aperture_mask =
gm20b_ctxsw_prog_get_ts_buffer_aperture_mask,
.set_ts_num_records = gm20b_ctxsw_prog_set_ts_num_records,
.set_ts_buffer_ptr = gm20b_ctxsw_prog_set_ts_buffer_ptr,
.set_pmu_options_boost_clock_frequencies =
gp10b_ctxsw_prog_set_pmu_options_boost_clock_frequencies,
.set_full_preemption_ptr =
gp10b_ctxsw_prog_set_full_preemption_ptr,
.dump_ctxsw_stats = gp10b_ctxsw_prog_dump_ctxsw_stats,
}
},
.fb = {
.init_hw = gm20b_fb_init_hw,
@@ -848,6 +923,7 @@ int gp106_init_hal(struct gk20a *g)
gops->ltc = gp106_ops.ltc;
gops->ce2 = gp106_ops.ce2;
gops->gr = gp106_ops.gr;
gops->gr.ctxsw_prog = gp106_ops.gr.ctxsw_prog;
gops->fb = gp106_ops.fb;
gops->clock_gating = gp106_ops.clock_gating;
gops->fifo = gp106_ops.fifo;

View File

@@ -28,7 +28,6 @@
#include "fecs_trace_gp10b.h"
#include <nvgpu/hw/gp10b/hw_ctxsw_prog_gp10b.h>
#include <nvgpu/hw/gp10b/hw_gr_gp10b.h>
#ifdef CONFIG_GK20A_CTXSW_TRACE

View File

@@ -44,7 +44,6 @@
#include <nvgpu/hw/gp10b/hw_gr_gp10b.h>
#include <nvgpu/hw/gp10b/hw_fifo_gp10b.h>
#include <nvgpu/hw/gp10b/hw_ctxsw_prog_gp10b.h>
#define GFXP_WFI_TIMEOUT_COUNT_DEFAULT 100000U
@@ -1134,83 +1133,27 @@ fail_free_gk20a_ctx:
void gr_gp10b_dump_ctxsw_stats(struct gk20a *g, struct vm_gk20a *vm,
struct nvgpu_gr_ctx *gr_ctx)
{
struct nvgpu_mem *mem = &gr_ctx->mem;
nvgpu_err(g, "ctxsw_prog_main_image_magic_value_o : %x (expect %x)",
nvgpu_mem_rd(g, mem,
ctxsw_prog_main_image_magic_value_o()),
ctxsw_prog_main_image_magic_value_v_value_v());
nvgpu_err(g, "ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi : %x",
nvgpu_mem_rd(g, mem,
ctxsw_prog_main_image_context_timestamp_buffer_ptr_hi_o()));
nvgpu_err(g, "ctxsw_prog_main_image_context_timestamp_buffer_ptr : %x",
nvgpu_mem_rd(g, mem,
ctxsw_prog_main_image_context_timestamp_buffer_ptr_o()));
nvgpu_err(g, "ctxsw_prog_main_image_context_timestamp_buffer_control : %x",
nvgpu_mem_rd(g, mem,
ctxsw_prog_main_image_context_timestamp_buffer_control_o()));
nvgpu_err(g, "NUM_SAVE_OPERATIONS : %d",
nvgpu_mem_rd(g, mem,
ctxsw_prog_main_image_num_save_ops_o()));
nvgpu_err(g, "WFI_SAVE_OPERATIONS : %d",
nvgpu_mem_rd(g, mem,
ctxsw_prog_main_image_num_wfi_save_ops_o()));
nvgpu_err(g, "CTA_SAVE_OPERATIONS : %d",
nvgpu_mem_rd(g, mem,
ctxsw_prog_main_image_num_cta_save_ops_o()));
nvgpu_err(g, "GFXP_SAVE_OPERATIONS : %d",
nvgpu_mem_rd(g, mem,
ctxsw_prog_main_image_num_gfxp_save_ops_o()));
nvgpu_err(g, "CILP_SAVE_OPERATIONS : %d",
nvgpu_mem_rd(g, mem,
ctxsw_prog_main_image_num_cilp_save_ops_o()));
nvgpu_err(g,
"image gfx preemption option (GFXP is 1) %x",
nvgpu_mem_rd(g, mem,
ctxsw_prog_main_image_graphics_preemption_options_o()));
nvgpu_err(g,
"image compute preemption option (CTA is 1) %x",
nvgpu_mem_rd(g, mem,
ctxsw_prog_main_image_compute_preemption_options_o()));
g->ops.gr.ctxsw_prog.dump_ctxsw_stats(g, &gr_ctx->mem);
}
void gr_gp10b_update_ctxsw_preemption_mode(struct gk20a *g,
struct nvgpu_gr_ctx *gr_ctx, struct nvgpu_mem *ctxheader)
{
struct nvgpu_mem *mem = &gr_ctx->mem;
u32 gfxp_preempt_option =
ctxsw_prog_main_image_graphics_preemption_options_control_gfxp_f();
u32 cilp_preempt_option =
ctxsw_prog_main_image_compute_preemption_options_control_cilp_f();
u32 cta_preempt_option =
ctxsw_prog_main_image_compute_preemption_options_control_cta_f();
int err;
nvgpu_log_fn(g, " ");
if (gr_ctx->graphics_preempt_mode == NVGPU_PREEMPTION_MODE_GRAPHICS_GFXP) {
nvgpu_log_info(g, "GfxP: %x", gfxp_preempt_option);
nvgpu_mem_wr(g, mem,
ctxsw_prog_main_image_graphics_preemption_options_o(),
gfxp_preempt_option);
g->ops.gr.ctxsw_prog.set_graphics_preemption_mode_gfxp(g, mem);
}
if (gr_ctx->compute_preempt_mode == NVGPU_PREEMPTION_MODE_COMPUTE_CILP) {
nvgpu_log_info(g, "CILP: %x", cilp_preempt_option);
nvgpu_mem_wr(g, mem,
ctxsw_prog_main_image_compute_preemption_options_o(),
cilp_preempt_option);
g->ops.gr.ctxsw_prog.set_compute_preemption_mode_cilp(g, mem);
}
if (gr_ctx->compute_preempt_mode == NVGPU_PREEMPTION_MODE_COMPUTE_CTA) {
nvgpu_log_info(g, "CTA: %x", cta_preempt_option);
nvgpu_mem_wr(g, mem,
ctxsw_prog_main_image_compute_preemption_options_o(),
cta_preempt_option);
g->ops.gr.ctxsw_prog.set_compute_preemption_mode_cta(g, mem);
}
if (gr_ctx->preempt_ctxsw_buffer.gpu_va != 0ULL) {
@@ -2226,12 +2169,10 @@ enable_ch:
}
void gr_gp10b_update_boosted_ctx(struct gk20a *g, struct nvgpu_mem *mem,
struct nvgpu_gr_ctx *gr_ctx) {
u32 v;
v = ctxsw_prog_main_image_pmu_options_boost_clock_frequencies_f(
struct nvgpu_gr_ctx *gr_ctx)
{
g->ops.gr.ctxsw_prog.set_pmu_options_boost_clock_frequencies(g, mem,
gr_ctx->boosted_ctx);
nvgpu_mem_wr(g, mem, ctxsw_prog_main_image_pmu_options_o(), v);
}
int gr_gp10b_set_preemption_mode(struct channel_gk20a *ch,
@@ -2357,11 +2298,7 @@ int gr_gp10b_init_preemption_state(struct gk20a *g)
void gr_gp10b_set_preemption_buffer_va(struct gk20a *g,
struct nvgpu_mem *mem, u64 gpu_va)
{
u32 va = u64_lo32(gpu_va >> 8);
nvgpu_mem_wr(g, mem,
ctxsw_prog_main_image_full_preemption_ptr_o(), va);
g->ops.gr.ctxsw_prog.set_full_preemption_ptr(g, mem, gpu_va);
}
void gr_gp10b_init_czf_bypass(struct gk20a *g)
@@ -2386,20 +2323,6 @@ int gr_gp10b_set_czf_bypass(struct gk20a *g, struct channel_gk20a *ch)
return __gr_gk20a_exec_ctx_ops(ch, &ops, 1, 1, 0, false);
}
void gr_gp10b_init_ctxsw_hdr_data(struct gk20a *g, struct nvgpu_mem *mem)
{
gk20a_gr_init_ctxsw_hdr_data(g, mem);
nvgpu_mem_wr(g, mem,
ctxsw_prog_main_image_num_wfi_save_ops_o(), 0);
nvgpu_mem_wr(g, mem,
ctxsw_prog_main_image_num_cta_save_ops_o(), 0);
nvgpu_mem_wr(g, mem,
ctxsw_prog_main_image_num_gfxp_save_ops_o(), 0);
nvgpu_mem_wr(g, mem,
ctxsw_prog_main_image_num_cilp_save_ops_o(), 0);
}
void gr_gp10b_init_gfxp_wfi_timeout_count(struct gk20a *g)
{
struct gr_gk20a *gr = &g->gr;

View File

@@ -147,7 +147,6 @@ void gr_gp10b_set_preemption_buffer_va(struct gk20a *g,
struct nvgpu_mem *mem, u64 gpu_va);
int gr_gp10b_set_czf_bypass(struct gk20a *g, struct channel_gk20a *ch);
void gr_gp10b_init_czf_bypass(struct gk20a *g);
void gr_gp10b_init_ctxsw_hdr_data(struct gk20a *g, struct nvgpu_mem *mem);
void gr_gp10b_init_gfxp_wfi_timeout_count(struct gk20a *g);
unsigned long gr_gp10b_get_max_gfxp_wfi_timeout_count(struct gk20a *g);
bool gr_gp10b_suspend_context(struct channel_gk20a *ch,

View File

@@ -45,6 +45,8 @@
#include "common/fb/fb_gm20b.h"
#include "common/fb/fb_gp10b.h"
#include "common/netlist/netlist_gp10b.h"
#include "common/gr/ctxsw_prog/ctxsw_prog_gm20b.h"
#include "common/gr/ctxsw_prog/ctxsw_prog_gp10b.h"
#include "common/therm/therm_gm20b.h"
#include "common/therm/therm_gp10b.h"
#include "common/ltc/ltc_gm20b.h"
@@ -310,8 +312,6 @@ static const struct gpu_ops gp10b_ops = {
.program_zcull_mapping = gr_gk20a_program_zcull_mapping,
.commit_global_timeslice = gr_gk20a_commit_global_timeslice,
.commit_inst = gr_gk20a_commit_inst,
.write_zcull_ptr = gr_gk20a_write_zcull_ptr,
.write_pm_ptr = gr_gk20a_write_pm_ptr,
.load_tpc_mask = gr_gm20b_load_tpc_mask,
.trigger_suspend = gr_gk20a_trigger_suspend,
.wait_for_pause = gr_gk20a_wait_for_pause,
@@ -345,7 +345,6 @@ static const struct gpu_ops gp10b_ops = {
.set_bes_crop_debug3 = gr_gp10b_set_bes_crop_debug3,
.set_ctxsw_preemption_mode = gr_gp10b_set_ctxsw_preemption_mode,
.init_ecc = gp10b_ecc_init,
.init_ctxsw_hdr_data = gr_gp10b_init_ctxsw_hdr_data,
.init_gfxp_wfi_timeout_count =
gr_gp10b_init_gfxp_wfi_timeout_count,
.get_max_gfxp_wfi_timeout_count =
@@ -373,6 +372,81 @@ static const struct gpu_ops gp10b_ops = {
gk20a_gr_get_fecs_ctx_state_store_major_rev_id,
.alloc_gfxp_rtv_cb = NULL,
.commit_gfxp_rtv_cb = NULL,
.ctxsw_prog = {
.hw_get_fecs_header_size =
gm20b_ctxsw_prog_hw_get_fecs_header_size,
.hw_get_gpccs_header_size =
gm20b_ctxsw_prog_hw_get_gpccs_header_size,
.hw_get_extended_buffer_segments_size_in_bytes =
gm20b_ctxsw_prog_hw_get_extended_buffer_segments_size_in_bytes,
.hw_extended_marker_size_in_bytes =
gm20b_ctxsw_prog_hw_extended_marker_size_in_bytes,
.hw_get_perf_counter_control_register_stride =
gm20b_ctxsw_prog_hw_get_perf_counter_control_register_stride,
.get_main_image_ctx_id =
gm20b_ctxsw_prog_get_main_image_ctx_id,
.get_patch_count = gm20b_ctxsw_prog_get_patch_count,
.set_patch_count = gm20b_ctxsw_prog_set_patch_count,
.set_patch_addr = gm20b_ctxsw_prog_set_patch_addr,
.set_zcull_ptr = gm20b_ctxsw_prog_set_zcull_ptr,
.set_zcull = gm20b_ctxsw_prog_set_zcull,
.set_zcull_mode_no_ctxsw =
gm20b_ctxsw_prog_set_zcull_mode_no_ctxsw,
.is_zcull_mode_separate_buffer =
gm20b_ctxsw_prog_is_zcull_mode_separate_buffer,
.set_pm_ptr = gm20b_ctxsw_prog_set_pm_ptr,
.set_pm_mode = gm20b_ctxsw_prog_set_pm_mode,
.set_pm_smpc_mode = gm20b_ctxsw_prog_set_pm_smpc_mode,
.set_pm_mode_no_ctxsw =
gm20b_ctxsw_prog_set_pm_mode_no_ctxsw,
.set_pm_mode_ctxsw = gm20b_ctxsw_prog_set_pm_mode_ctxsw,
.hw_get_pm_mode_no_ctxsw =
gm20b_ctxsw_prog_hw_get_pm_mode_no_ctxsw,
.hw_get_pm_mode_ctxsw = gm20b_ctxsw_prog_hw_get_pm_mode_ctxsw,
.init_ctxsw_hdr_data = gp10b_ctxsw_prog_init_ctxsw_hdr_data,
.set_compute_preemption_mode_cta =
gp10b_ctxsw_prog_set_compute_preemption_mode_cta,
.set_compute_preemption_mode_cilp =
gp10b_ctxsw_prog_set_compute_preemption_mode_cilp,
.set_graphics_preemption_mode_gfxp =
gp10b_ctxsw_prog_set_graphics_preemption_mode_gfxp,
.set_cde_enabled = gm20b_ctxsw_prog_set_cde_enabled,
.set_pc_sampling = gm20b_ctxsw_prog_set_pc_sampling,
.set_priv_access_map_config_mode =
gm20b_ctxsw_prog_set_priv_access_map_config_mode,
.set_priv_access_map_addr =
gm20b_ctxsw_prog_set_priv_access_map_addr,
.disable_verif_features =
gm20b_ctxsw_prog_disable_verif_features,
.check_main_image_header_magic =
gm20b_ctxsw_prog_check_main_image_header_magic,
.check_local_header_magic =
gm20b_ctxsw_prog_check_local_header_magic,
.get_num_gpcs = gm20b_ctxsw_prog_get_num_gpcs,
.get_num_tpcs = gm20b_ctxsw_prog_get_num_tpcs,
.get_extended_buffer_size_offset =
gm20b_ctxsw_prog_get_extended_buffer_size_offset,
.get_ppc_info = gm20b_ctxsw_prog_get_ppc_info,
.get_local_priv_register_ctl_offset =
gm20b_ctxsw_prog_get_local_priv_register_ctl_offset,
.hw_get_ts_tag_invalid_timestamp =
gm20b_ctxsw_prog_hw_get_ts_tag_invalid_timestamp,
.hw_get_ts_tag = gm20b_ctxsw_prog_hw_get_ts_tag,
.hw_record_ts_timestamp =
gm20b_ctxsw_prog_hw_record_ts_timestamp,
.hw_get_ts_record_size_in_bytes =
gm20b_ctxsw_prog_hw_get_ts_record_size_in_bytes,
.is_ts_valid_record = gm20b_ctxsw_prog_is_ts_valid_record,
.get_ts_buffer_aperture_mask =
gm20b_ctxsw_prog_get_ts_buffer_aperture_mask,
.set_ts_num_records = gm20b_ctxsw_prog_set_ts_num_records,
.set_ts_buffer_ptr = gm20b_ctxsw_prog_set_ts_buffer_ptr,
.set_pmu_options_boost_clock_frequencies =
gp10b_ctxsw_prog_set_pmu_options_boost_clock_frequencies,
.set_full_preemption_ptr =
gp10b_ctxsw_prog_set_full_preemption_ptr,
.dump_ctxsw_stats = gp10b_ctxsw_prog_dump_ctxsw_stats,
}
},
.fb = {
.init_hw = gm20b_fb_init_hw,
@@ -763,6 +837,7 @@ int gp10b_init_hal(struct gk20a *g)
gops->ltc = gp10b_ops.ltc;
gops->ce2 = gp10b_ops.ce2;
gops->gr = gp10b_ops.gr;
gops->gr.ctxsw_prog = gp10b_ops.gr.ctxsw_prog;
gops->fb = gp10b_ops.fb;
gops->clock_gating = gp10b_ops.clock_gating;
gops->fifo = gp10b_ops.fifo;

View File

@@ -37,7 +37,6 @@
#include <nvgpu/hw/gv100/hw_gr_gv100.h>
#include <nvgpu/hw/gv100/hw_proj_gv100.h>
#include <nvgpu/hw/gv100/hw_top_gv100.h>
#include <nvgpu/hw/gv100/hw_ctxsw_prog_gv100.h>
#include <nvgpu/hw/gv100/hw_perf_gv100.h>
@@ -429,11 +428,6 @@ void gr_gv100_split_fbpa_broadcast_addr(struct gk20a *g, u32 addr,
}
}
u32 gr_gv100_get_hw_accessor_stream_out_mode(void)
{
return ctxsw_prog_main_image_pm_mode_stream_out_ctxsw_f();
}
void gr_gv100_set_pmm_register(struct gk20a *g, u32 offset, u32 val,
u32 num_chiplets, u32 num_perfmons)
{

View File

@@ -46,7 +46,6 @@ int gr_gv100_add_ctxsw_reg_perf_pma(struct ctxsw_buf_offset_map_entry *map,
void gr_gv100_split_fbpa_broadcast_addr(struct gk20a *g, u32 addr,
u32 num_fbpas,
u32 *priv_addr_table, u32 *t);
u32 gr_gv100_get_hw_accessor_stream_out_mode(void);
void gr_gv100_init_hwpm_pmm_register(struct gk20a *g);
void gr_gv100_set_pmm_register(struct gk20a *g, u32 offset, u32 val,
u32 num_chiplets, u32 num_perfmons);

View File

@@ -36,6 +36,9 @@
#include "common/fb/fb_gv100.h"
#include "common/xve/xve_gp106.h"
#include "common/netlist/netlist_gv100.h"
#include "common/gr/ctxsw_prog/ctxsw_prog_gm20b.h"
#include "common/gr/ctxsw_prog/ctxsw_prog_gp10b.h"
#include "common/gr/ctxsw_prog/ctxsw_prog_gv11b.h"
#include "common/therm/therm_gm20b.h"
#include "common/therm/therm_gp106.h"
#include "common/therm/therm_gp10b.h"
@@ -398,8 +401,6 @@ static const struct gpu_ops gv100_ops = {
.enable_exceptions = gr_gv11b_enable_exceptions,
.get_lrf_tex_ltc_dram_override = get_ecc_override_val,
.update_smpc_ctxsw_mode = gr_gk20a_update_smpc_ctxsw_mode,
.get_hw_accessor_stream_out_mode =
gr_gv100_get_hw_accessor_stream_out_mode,
.get_num_hwpm_perfmon = gr_gv100_get_num_hwpm_perfmon,
.set_pmm_register = gr_gv100_set_pmm_register,
.update_hwpm_ctxsw_mode = gr_gk20a_update_hwpm_ctxsw_mode,
@@ -416,8 +417,6 @@ static const struct gpu_ops gv100_ops = {
.program_zcull_mapping = gr_gv11b_program_zcull_mapping,
.commit_global_timeslice = gr_gv11b_commit_global_timeslice,
.commit_inst = gr_gv11b_commit_inst,
.write_zcull_ptr = gr_gv11b_write_zcull_ptr,
.write_pm_ptr = gr_gv11b_write_pm_ptr,
.load_tpc_mask = gr_gv11b_load_tpc_mask,
.trigger_suspend = gv11b_gr_sm_trigger_suspend,
.wait_for_pause = gr_gk20a_wait_for_pause,
@@ -493,6 +492,93 @@ static const struct gpu_ops gv100_ops = {
gk20a_gr_get_fecs_ctx_state_store_major_rev_id,
.alloc_gfxp_rtv_cb = NULL,
.commit_gfxp_rtv_cb = NULL,
.ctxsw_prog = {
.hw_get_fecs_header_size =
gm20b_ctxsw_prog_hw_get_fecs_header_size,
.hw_get_gpccs_header_size =
gm20b_ctxsw_prog_hw_get_gpccs_header_size,
.hw_get_extended_buffer_segments_size_in_bytes =
gm20b_ctxsw_prog_hw_get_extended_buffer_segments_size_in_bytes,
.hw_extended_marker_size_in_bytes =
gm20b_ctxsw_prog_hw_extended_marker_size_in_bytes,
.hw_get_perf_counter_control_register_stride =
gm20b_ctxsw_prog_hw_get_perf_counter_control_register_stride,
.get_main_image_ctx_id =
gm20b_ctxsw_prog_get_main_image_ctx_id,
.get_patch_count = gm20b_ctxsw_prog_get_patch_count,
.set_patch_count = gm20b_ctxsw_prog_set_patch_count,
.set_patch_addr = gm20b_ctxsw_prog_set_patch_addr,
.set_zcull_ptr = gv11b_ctxsw_prog_set_zcull_ptr,
.set_zcull = gm20b_ctxsw_prog_set_zcull,
.set_zcull_mode_no_ctxsw =
gm20b_ctxsw_prog_set_zcull_mode_no_ctxsw,
.is_zcull_mode_separate_buffer =
gm20b_ctxsw_prog_is_zcull_mode_separate_buffer,
.set_pm_ptr = gv11b_ctxsw_prog_set_pm_ptr,
.set_pm_mode = gm20b_ctxsw_prog_set_pm_mode,
.set_pm_smpc_mode = gm20b_ctxsw_prog_set_pm_smpc_mode,
.set_pm_mode_no_ctxsw =
gm20b_ctxsw_prog_set_pm_mode_no_ctxsw,
.set_pm_mode_ctxsw = gm20b_ctxsw_prog_set_pm_mode_ctxsw,
.hw_get_pm_mode_no_ctxsw =
gm20b_ctxsw_prog_hw_get_pm_mode_no_ctxsw,
.hw_get_pm_mode_ctxsw = gm20b_ctxsw_prog_hw_get_pm_mode_ctxsw,
.hw_get_pm_mode_stream_out_ctxsw =
gv11b_ctxsw_prog_hw_get_pm_mode_stream_out_ctxsw,
.set_pm_mode_stream_out_ctxsw =
gv11b_ctxsw_prog_set_pm_mode_stream_out_ctxsw,
.init_ctxsw_hdr_data = gp10b_ctxsw_prog_init_ctxsw_hdr_data,
.set_compute_preemption_mode_cta =
gp10b_ctxsw_prog_set_compute_preemption_mode_cta,
.set_compute_preemption_mode_cilp =
gp10b_ctxsw_prog_set_compute_preemption_mode_cilp,
.set_graphics_preemption_mode_gfxp =
gp10b_ctxsw_prog_set_graphics_preemption_mode_gfxp,
.set_cde_enabled = gm20b_ctxsw_prog_set_cde_enabled,
.set_pc_sampling = gm20b_ctxsw_prog_set_pc_sampling,
.set_priv_access_map_config_mode =
gm20b_ctxsw_prog_set_priv_access_map_config_mode,
.set_priv_access_map_addr =
gm20b_ctxsw_prog_set_priv_access_map_addr,
.disable_verif_features =
gm20b_ctxsw_prog_disable_verif_features,
.check_main_image_header_magic =
gm20b_ctxsw_prog_check_main_image_header_magic,
.check_local_header_magic =
gm20b_ctxsw_prog_check_local_header_magic,
.get_num_gpcs = gm20b_ctxsw_prog_get_num_gpcs,
.get_num_tpcs = gm20b_ctxsw_prog_get_num_tpcs,
.get_extended_buffer_size_offset =
gm20b_ctxsw_prog_get_extended_buffer_size_offset,
.get_ppc_info = gm20b_ctxsw_prog_get_ppc_info,
.get_local_priv_register_ctl_offset =
gm20b_ctxsw_prog_get_local_priv_register_ctl_offset,
.hw_get_ts_tag_invalid_timestamp =
gm20b_ctxsw_prog_hw_get_ts_tag_invalid_timestamp,
.hw_get_ts_tag = gm20b_ctxsw_prog_hw_get_ts_tag,
.hw_record_ts_timestamp =
gm20b_ctxsw_prog_hw_record_ts_timestamp,
.hw_get_ts_record_size_in_bytes =
gm20b_ctxsw_prog_hw_get_ts_record_size_in_bytes,
.is_ts_valid_record = gm20b_ctxsw_prog_is_ts_valid_record,
.get_ts_buffer_aperture_mask =
gm20b_ctxsw_prog_get_ts_buffer_aperture_mask,
.set_ts_num_records = gm20b_ctxsw_prog_set_ts_num_records,
.set_ts_buffer_ptr = gm20b_ctxsw_prog_set_ts_buffer_ptr,
.set_pmu_options_boost_clock_frequencies =
gp10b_ctxsw_prog_set_pmu_options_boost_clock_frequencies,
.set_full_preemption_ptr =
gv11b_ctxsw_prog_set_full_preemption_ptr,
.set_full_preemption_ptr_veid0 =
gv11b_ctxsw_prog_set_full_preemption_ptr_veid0,
.hw_get_perf_counter_register_stride =
gv11b_ctxsw_prog_hw_get_perf_counter_register_stride,
.set_context_buffer_ptr =
gv11b_ctxsw_prog_set_context_buffer_ptr,
.set_type_per_veid_header =
gv11b_ctxsw_prog_set_type_per_veid_header,
.dump_ctxsw_stats = gp10b_ctxsw_prog_dump_ctxsw_stats,
}
},
.fb = {
.init_hw = gv11b_fb_init_hw,
@@ -1001,6 +1087,7 @@ int gv100_init_hal(struct gk20a *g)
gops->ltc = gv100_ops.ltc;
gops->ce2 = gv100_ops.ce2;
gops->gr = gv100_ops.gr;
gops->gr.ctxsw_prog = gv100_ops.gr.ctxsw_prog;
gops->fb = gv100_ops.fb;
gops->clock_gating = gv100_ops.clock_gating;
gops->fifo = gv100_ops.fifo;

View File

@@ -56,7 +56,6 @@
#include <nvgpu/hw/gv11b/hw_gr_gv11b.h>
#include <nvgpu/hw/gv11b/hw_fifo_gv11b.h>
#include <nvgpu/hw/gv11b/hw_proj_gv11b.h>
#include <nvgpu/hw/gv11b/hw_ctxsw_prog_gv11b.h>
#include <nvgpu/hw/gv11b/hw_ram_gv11b.h>
#include <nvgpu/hw/gv11b/hw_perf_gv11b.h>
@@ -1663,38 +1662,23 @@ void gr_gv11b_update_ctxsw_preemption_mode(struct gk20a *g,
struct nvgpu_gr_ctx *gr_ctx, struct nvgpu_mem *ctxheader)
{
struct nvgpu_mem *mem = &gr_ctx->mem;
u32 gfxp_preempt_option =
ctxsw_prog_main_image_graphics_preemption_options_control_gfxp_f();
u32 cilp_preempt_option =
ctxsw_prog_main_image_compute_preemption_options_control_cilp_f();
u32 cta_preempt_option =
ctxsw_prog_main_image_compute_preemption_options_control_cta_f();
int err;
nvgpu_log_fn(g, " ");
if (gr_ctx->graphics_preempt_mode ==
NVGPU_PREEMPTION_MODE_GRAPHICS_GFXP) {
nvgpu_log_info(g, "GfxP: %x", gfxp_preempt_option);
nvgpu_mem_wr(g, mem,
ctxsw_prog_main_image_graphics_preemption_options_o(),
gfxp_preempt_option);
g->ops.gr.ctxsw_prog.set_graphics_preemption_mode_gfxp(g, mem);
}
if (gr_ctx->compute_preempt_mode ==
NVGPU_PREEMPTION_MODE_COMPUTE_CILP) {
nvgpu_log_info(g, "CILP: %x", cilp_preempt_option);
nvgpu_mem_wr(g, mem,
ctxsw_prog_main_image_compute_preemption_options_o(),
cilp_preempt_option);
g->ops.gr.ctxsw_prog.set_compute_preemption_mode_cilp(g, mem);
}
if (gr_ctx->compute_preempt_mode ==
NVGPU_PREEMPTION_MODE_COMPUTE_CTA) {
nvgpu_log_info(g, "CTA: %x", cta_preempt_option);
nvgpu_mem_wr(g, mem,
ctxsw_prog_main_image_compute_preemption_options_o(),
cta_preempt_option);
g->ops.gr.ctxsw_prog.set_compute_preemption_mode_cta(g, mem);
}
if (gr_ctx->preempt_ctxsw_buffer.gpu_va != 0ULL) {
@@ -2947,35 +2931,6 @@ int gr_gv11b_commit_global_timeslice(struct gk20a *g, struct channel_gk20a *c)
return 0;
}
void gr_gv11b_write_zcull_ptr(struct gk20a *g,
struct nvgpu_mem *mem, u64 gpu_va)
{
u32 va_lo, va_hi;
gpu_va = gpu_va >> 8;
va_lo = u64_lo32(gpu_va);
va_hi = u64_hi32(gpu_va);
nvgpu_mem_wr(g, mem,
ctxsw_prog_main_image_zcull_ptr_o(), va_lo);
nvgpu_mem_wr(g, mem,
ctxsw_prog_main_image_zcull_ptr_hi_o(), va_hi);
}
void gr_gv11b_write_pm_ptr(struct gk20a *g,
struct nvgpu_mem *mem, u64 gpu_va)
{
u32 va_lo, va_hi;
gpu_va = gpu_va >> 8;
va_lo = u64_lo32(gpu_va);
va_hi = u64_hi32(gpu_va);
nvgpu_mem_wr(g, mem,
ctxsw_prog_main_image_pm_ptr_o(), va_lo);
nvgpu_mem_wr(g, mem,
ctxsw_prog_main_image_pm_ptr_hi_o(), va_hi);
}
void gr_gv11b_load_tpc_mask(struct gk20a *g)
{
u32 pes_tpc_mask = 0, fuse_tpc_mask;
@@ -3009,25 +2964,9 @@ void gr_gv11b_load_tpc_mask(struct gk20a *g)
void gr_gv11b_set_preemption_buffer_va(struct gk20a *g,
struct nvgpu_mem *mem, u64 gpu_va)
{
u32 addr_lo, addr_hi;
/* gpu va still needs to be 8 bit aligned */
gpu_va = gpu_va >> 8;
addr_lo = u64_lo32(gpu_va);
addr_hi = u64_hi32(gpu_va);
nvgpu_mem_wr(g, mem,
ctxsw_prog_main_image_full_preemption_ptr_o(), addr_lo);
nvgpu_mem_wr(g, mem,
ctxsw_prog_main_image_full_preemption_ptr_hi_o(), addr_hi);
nvgpu_mem_wr(g, mem,
ctxsw_prog_main_image_full_preemption_ptr_veid0_o(), addr_lo);
nvgpu_mem_wr(g, mem,
ctxsw_prog_main_image_full_preemption_ptr_veid0_hi_o(),
addr_hi);
g->ops.gr.ctxsw_prog.set_full_preemption_ptr(g, mem, gpu_va);
g->ops.gr.ctxsw_prog.set_full_preemption_ptr_veid0(g, mem, gpu_va);
}
int gr_gv11b_init_fs_state(struct gk20a *g)
@@ -3936,7 +3875,7 @@ void gv11b_gr_get_sm_dsm_perf_regs(struct gk20a *g,
*num_sm_dsm_perf_regs = _num_sm_dsm_perf_regs;
*sm_dsm_perf_regs = _sm_dsm_perf_regs;
*perf_register_stride =
ctxsw_prog_extended_sm_dsm_perf_counter_register_stride_v();
g->ops.gr.ctxsw_prog.hw_get_perf_counter_register_stride();
}
void gv11b_gr_get_sm_dsm_perf_ctrl_regs(struct gk20a *g,
@@ -3947,7 +3886,7 @@ void gv11b_gr_get_sm_dsm_perf_ctrl_regs(struct gk20a *g,
*num_sm_dsm_perf_ctrl_regs = _num_sm_dsm_perf_ctrl_regs;
*sm_dsm_perf_ctrl_regs = _sm_dsm_perf_ctrl_regs;
*ctrl_register_stride =
ctxsw_prog_extended_sm_dsm_perf_counter_control_register_stride_v();
g->ops.gr.ctxsw_prog.hw_get_perf_counter_control_register_stride();
}
void gv11b_gr_get_ovr_perf_regs(struct gk20a *g, u32 *num_ovr_perf_regs,

View File

@@ -152,10 +152,6 @@ void gr_gv11b_program_sm_id_numbering(struct gk20a *g,
int gr_gv11b_load_smid_config(struct gk20a *g);
int gr_gv11b_commit_inst(struct channel_gk20a *c, u64 gpu_va);
int gr_gv11b_commit_global_timeslice(struct gk20a *g, struct channel_gk20a *c);
void gr_gv11b_write_zcull_ptr(struct gk20a *g,
struct nvgpu_mem *mem, u64 gpu_va);
void gr_gv11b_write_pm_ptr(struct gk20a *g,
struct nvgpu_mem *mem, u64 gpu_va);
void gr_gv11b_load_tpc_mask(struct gk20a *g);
void gr_gv11b_set_preemption_buffer_va(struct gk20a *g,
struct nvgpu_mem *mem, u64 gpu_va);

View File

@@ -35,6 +35,9 @@
#include "common/fb/fb_gp10b.h"
#include "common/fb/fb_gv11b.h"
#include "common/netlist/netlist_gv11b.h"
#include "common/gr/ctxsw_prog/ctxsw_prog_gm20b.h"
#include "common/gr/ctxsw_prog/ctxsw_prog_gp10b.h"
#include "common/gr/ctxsw_prog/ctxsw_prog_gv11b.h"
#include "common/therm/therm_gm20b.h"
#include "common/therm/therm_gp10b.h"
#include "common/therm/therm_gv11b.h"
@@ -350,8 +353,6 @@ static const struct gpu_ops gv11b_ops = {
.enable_exceptions = gr_gv11b_enable_exceptions,
.get_lrf_tex_ltc_dram_override = get_ecc_override_val,
.update_smpc_ctxsw_mode = gr_gk20a_update_smpc_ctxsw_mode,
.get_hw_accessor_stream_out_mode =
gr_gv100_get_hw_accessor_stream_out_mode,
.get_num_hwpm_perfmon = gr_gv100_get_num_hwpm_perfmon,
.set_pmm_register = gr_gv100_set_pmm_register,
.update_hwpm_ctxsw_mode = gr_gk20a_update_hwpm_ctxsw_mode,
@@ -368,8 +369,6 @@ static const struct gpu_ops gv11b_ops = {
.program_zcull_mapping = gr_gv11b_program_zcull_mapping,
.commit_global_timeslice = gr_gv11b_commit_global_timeslice,
.commit_inst = gr_gv11b_commit_inst,
.write_zcull_ptr = gr_gv11b_write_zcull_ptr,
.write_pm_ptr = gr_gv11b_write_pm_ptr,
.load_tpc_mask = gr_gv11b_load_tpc_mask,
.trigger_suspend = gv11b_gr_sm_trigger_suspend,
.wait_for_pause = gr_gk20a_wait_for_pause,
@@ -424,7 +423,6 @@ static const struct gpu_ops gv11b_ops = {
.handle_tpc_sm_ecc_exception =
gr_gv11b_handle_tpc_sm_ecc_exception,
.decode_egpc_addr = gv11b_gr_decode_egpc_addr,
.init_ctxsw_hdr_data = gr_gp10b_init_ctxsw_hdr_data,
.init_gfxp_wfi_timeout_count =
gr_gv11b_init_gfxp_wfi_timeout_count,
.get_max_gfxp_wfi_timeout_count =
@@ -455,6 +453,93 @@ static const struct gpu_ops gv11b_ops = {
gk20a_gr_get_fecs_ctx_state_store_major_rev_id,
.alloc_gfxp_rtv_cb = NULL,
.commit_gfxp_rtv_cb = NULL,
.ctxsw_prog = {
.hw_get_fecs_header_size =
gm20b_ctxsw_prog_hw_get_fecs_header_size,
.hw_get_gpccs_header_size =
gm20b_ctxsw_prog_hw_get_gpccs_header_size,
.hw_get_extended_buffer_segments_size_in_bytes =
gm20b_ctxsw_prog_hw_get_extended_buffer_segments_size_in_bytes,
.hw_extended_marker_size_in_bytes =
gm20b_ctxsw_prog_hw_extended_marker_size_in_bytes,
.hw_get_perf_counter_control_register_stride =
gm20b_ctxsw_prog_hw_get_perf_counter_control_register_stride,
.get_main_image_ctx_id =
gm20b_ctxsw_prog_get_main_image_ctx_id,
.get_patch_count = gm20b_ctxsw_prog_get_patch_count,
.set_patch_count = gm20b_ctxsw_prog_set_patch_count,
.set_patch_addr = gm20b_ctxsw_prog_set_patch_addr,
.set_zcull_ptr = gv11b_ctxsw_prog_set_zcull_ptr,
.set_zcull = gm20b_ctxsw_prog_set_zcull,
.set_zcull_mode_no_ctxsw =
gm20b_ctxsw_prog_set_zcull_mode_no_ctxsw,
.is_zcull_mode_separate_buffer =
gm20b_ctxsw_prog_is_zcull_mode_separate_buffer,
.set_pm_ptr = gv11b_ctxsw_prog_set_pm_ptr,
.set_pm_mode = gm20b_ctxsw_prog_set_pm_mode,
.set_pm_smpc_mode = gm20b_ctxsw_prog_set_pm_smpc_mode,
.set_pm_mode_no_ctxsw =
gm20b_ctxsw_prog_set_pm_mode_no_ctxsw,
.set_pm_mode_ctxsw = gm20b_ctxsw_prog_set_pm_mode_ctxsw,
.hw_get_pm_mode_no_ctxsw =
gm20b_ctxsw_prog_hw_get_pm_mode_no_ctxsw,
.hw_get_pm_mode_ctxsw = gm20b_ctxsw_prog_hw_get_pm_mode_ctxsw,
.hw_get_pm_mode_stream_out_ctxsw =
gv11b_ctxsw_prog_hw_get_pm_mode_stream_out_ctxsw,
.set_pm_mode_stream_out_ctxsw =
gv11b_ctxsw_prog_set_pm_mode_stream_out_ctxsw,
.init_ctxsw_hdr_data = gp10b_ctxsw_prog_init_ctxsw_hdr_data,
.set_compute_preemption_mode_cta =
gp10b_ctxsw_prog_set_compute_preemption_mode_cta,
.set_compute_preemption_mode_cilp =
gp10b_ctxsw_prog_set_compute_preemption_mode_cilp,
.set_graphics_preemption_mode_gfxp =
gp10b_ctxsw_prog_set_graphics_preemption_mode_gfxp,
.set_cde_enabled = gm20b_ctxsw_prog_set_cde_enabled,
.set_pc_sampling = gm20b_ctxsw_prog_set_pc_sampling,
.set_priv_access_map_config_mode =
gm20b_ctxsw_prog_set_priv_access_map_config_mode,
.set_priv_access_map_addr =
gm20b_ctxsw_prog_set_priv_access_map_addr,
.disable_verif_features =
gm20b_ctxsw_prog_disable_verif_features,
.check_main_image_header_magic =
gm20b_ctxsw_prog_check_main_image_header_magic,
.check_local_header_magic =
gm20b_ctxsw_prog_check_local_header_magic,
.get_num_gpcs = gm20b_ctxsw_prog_get_num_gpcs,
.get_num_tpcs = gm20b_ctxsw_prog_get_num_tpcs,
.get_extended_buffer_size_offset =
gm20b_ctxsw_prog_get_extended_buffer_size_offset,
.get_ppc_info = gm20b_ctxsw_prog_get_ppc_info,
.get_local_priv_register_ctl_offset =
gm20b_ctxsw_prog_get_local_priv_register_ctl_offset,
.hw_get_ts_tag_invalid_timestamp =
gm20b_ctxsw_prog_hw_get_ts_tag_invalid_timestamp,
.hw_get_ts_tag = gm20b_ctxsw_prog_hw_get_ts_tag,
.hw_record_ts_timestamp =
gm20b_ctxsw_prog_hw_record_ts_timestamp,
.hw_get_ts_record_size_in_bytes =
gm20b_ctxsw_prog_hw_get_ts_record_size_in_bytes,
.is_ts_valid_record = gm20b_ctxsw_prog_is_ts_valid_record,
.get_ts_buffer_aperture_mask =
gm20b_ctxsw_prog_get_ts_buffer_aperture_mask,
.set_ts_num_records = gm20b_ctxsw_prog_set_ts_num_records,
.set_ts_buffer_ptr = gm20b_ctxsw_prog_set_ts_buffer_ptr,
.set_pmu_options_boost_clock_frequencies =
gp10b_ctxsw_prog_set_pmu_options_boost_clock_frequencies,
.set_full_preemption_ptr =
gv11b_ctxsw_prog_set_full_preemption_ptr,
.set_full_preemption_ptr_veid0 =
gv11b_ctxsw_prog_set_full_preemption_ptr_veid0,
.hw_get_perf_counter_register_stride =
gv11b_ctxsw_prog_hw_get_perf_counter_register_stride,
.set_context_buffer_ptr =
gv11b_ctxsw_prog_set_context_buffer_ptr,
.set_type_per_veid_header =
gv11b_ctxsw_prog_set_type_per_veid_header,
.dump_ctxsw_stats = gp10b_ctxsw_prog_dump_ctxsw_stats,
}
},
.fb = {
.init_hw = gv11b_fb_init_hw,
@@ -889,6 +974,7 @@ int gv11b_init_hal(struct gk20a *g)
gops->ltc = gv11b_ops.ltc;
gops->ce2 = gv11b_ops.ce2;
gops->gr = gv11b_ops.gr;
gops->gr.ctxsw_prog = gv11b_ops.gr.ctxsw_prog;
gops->fb = gv11b_ops.fb;
gops->clock_gating = gv11b_ops.clock_gating;
gops->fifo = gv11b_ops.fifo;

View File

@@ -31,7 +31,6 @@
#include <nvgpu/channel.h>
#include <nvgpu/hw/gv11b/hw_ram_gv11b.h>
#include <nvgpu/hw/gv11b/hw_ctxsw_prog_gv11b.h>
#include <nvgpu/hw/gv11b/hw_gr_gv11b.h>
#include "gv11b/subctx_gv11b.h"
@@ -65,7 +64,8 @@ int gv11b_alloc_subctx_header(struct channel_gk20a *c)
nvgpu_log(g, gpu_dbg_fn, "gv11b_alloc_subctx_header");
if (!nvgpu_mem_is_valid(ctxheader)) {
ret = nvgpu_dma_alloc_sys(g, ctxsw_prog_fecs_header_v(),
ret = nvgpu_dma_alloc_sys(g,
g->ops.gr.ctxsw_prog.hw_get_fecs_header_size(),
ctxheader);
if (ret != 0) {
nvgpu_err(g, "failed to allocate sub ctx header");
@@ -100,7 +100,6 @@ int gv11b_update_subctx_header(struct channel_gk20a *c, u64 gpu_va)
struct nvgpu_mem *ctxheader = &c->ctx_header;
struct gk20a *g = c->g;
int ret = 0;
u32 addr_lo, addr_hi;
struct tsg_gk20a *tsg;
struct nvgpu_gr_ctx *gr_ctx;
@@ -114,38 +113,20 @@ int gv11b_update_subctx_header(struct channel_gk20a *c, u64 gpu_va)
g->ops.mm.l2_flush(g, true);
/* set priv access map */
addr_lo = u64_lo32(gr_ctx->global_ctx_buffer_va[PRIV_ACCESS_MAP_VA]);
addr_hi = u64_hi32(gr_ctx->global_ctx_buffer_va[PRIV_ACCESS_MAP_VA]);
nvgpu_mem_wr(g, ctxheader,
ctxsw_prog_main_image_priv_access_map_addr_lo_o(),
addr_lo);
nvgpu_mem_wr(g, ctxheader,
ctxsw_prog_main_image_priv_access_map_addr_hi_o(),
addr_hi);
g->ops.gr.ctxsw_prog.set_priv_access_map_addr(g, ctxheader,
gr_ctx->global_ctx_buffer_va[PRIV_ACCESS_MAP_VA]);
addr_lo = u64_lo32(gr_ctx->patch_ctx.mem.gpu_va);
addr_hi = u64_hi32(gr_ctx->patch_ctx.mem.gpu_va);
nvgpu_mem_wr(g, ctxheader,
ctxsw_prog_main_image_patch_adr_lo_o(),
addr_lo);
nvgpu_mem_wr(g, ctxheader,
ctxsw_prog_main_image_patch_adr_hi_o(),
addr_hi);
g->ops.gr.ctxsw_prog.set_patch_addr(g, ctxheader,
gr_ctx->patch_ctx.mem.gpu_va);
g->ops.gr.write_pm_ptr(g, ctxheader, gr_ctx->pm_ctx.mem.gpu_va);
g->ops.gr.write_zcull_ptr(g, ctxheader, gr_ctx->zcull_ctx.gpu_va);
g->ops.gr.ctxsw_prog.set_pm_ptr(g, ctxheader,
gr_ctx->pm_ctx.mem.gpu_va);
g->ops.gr.ctxsw_prog.set_zcull_ptr(g, ctxheader,
gr_ctx->zcull_ctx.gpu_va);
addr_lo = u64_lo32(gpu_va);
addr_hi = u64_hi32(gpu_va);
g->ops.gr.ctxsw_prog.set_context_buffer_ptr(g, ctxheader, gpu_va);
nvgpu_mem_wr(g, ctxheader,
ctxsw_prog_main_image_context_buffer_ptr_hi_o(), addr_hi);
nvgpu_mem_wr(g, ctxheader,
ctxsw_prog_main_image_context_buffer_ptr_o(), addr_lo);
nvgpu_mem_wr(g, ctxheader,
ctxsw_prog_main_image_ctl_o(),
ctxsw_prog_main_image_ctl_type_per_veid_header_v());
g->ops.gr.ctxsw_prog.set_type_per_veid_header(g, ctxheader);
return ret;
}

View File

@@ -45,13 +45,11 @@ struct gk20a_fecs_trace_record {
};
#ifdef CONFIG_GK20A_CTXSW_TRACE
u32 gk20a_fecs_trace_record_ts_tag_invalid_ts_v(void);
u32 gk20a_fecs_trace_record_ts_tag_v(u64 ts);
u64 gk20a_fecs_trace_record_ts_timestamp_v(u64 ts);
int gk20a_fecs_trace_num_ts(void);
int gk20a_fecs_trace_num_ts(struct gk20a *g);
struct gk20a_fecs_trace_record *gk20a_fecs_trace_get_record(struct gk20a *g,
int idx);
bool gk20a_fecs_trace_is_valid_record(struct gk20a_fecs_trace_record *r);
bool gk20a_fecs_trace_is_valid_record(struct gk20a *g,
struct gk20a_fecs_trace_record *r);
int gk20a_fecs_trace_get_read_index(struct gk20a *g);
int gk20a_fecs_trace_get_write_index(struct gk20a *g);

View File

@@ -342,7 +342,6 @@ struct gpu_ops {
int (*update_smpc_ctxsw_mode)(struct gk20a *g,
struct channel_gk20a *c,
bool enable);
u32 (*get_hw_accessor_stream_out_mode)(void);
int (*update_hwpm_ctxsw_mode)(struct gk20a *g,
struct channel_gk20a *c,
u64 gpu_va,
@@ -454,10 +453,6 @@ struct gpu_ops {
int (*commit_global_timeslice)(struct gk20a *g,
struct channel_gk20a *c);
int (*commit_inst)(struct channel_gk20a *c, u64 gpu_va);
void (*write_zcull_ptr)(struct gk20a *g,
struct nvgpu_mem *mem, u64 gpu_va);
void (*write_pm_ptr)(struct gk20a *g,
struct nvgpu_mem *mem, u64 gpu_va);
void (*set_preemption_buffer_va)(struct gk20a *g,
struct nvgpu_mem *mem, u64 gpu_va);
void (*load_tpc_mask)(struct gk20a *g);
@@ -479,8 +474,6 @@ struct gpu_ops {
u32 gpc, u32 tpc, u32 sm);
void (*resume_all_sms)(struct gk20a *g);
void (*disable_rd_coalesce)(struct gk20a *g);
void (*init_ctxsw_hdr_data)(struct gk20a *g,
struct nvgpu_mem *mem);
void (*init_gfxp_wfi_timeout_count)(struct gk20a *g);
unsigned long (*get_max_gfxp_wfi_timeout_count)
(struct gk20a *g);
@@ -539,6 +532,96 @@ struct gpu_ops {
struct nvgpu_gr_ctx *gr_ctx, struct vm_gk20a *vm);
void (*commit_gfxp_rtv_cb)(struct gk20a *g,
struct nvgpu_gr_ctx *gr_ctx, bool patch);
struct {
u32 (*hw_get_fecs_header_size)(void);
u32 (*hw_get_gpccs_header_size)(void);
u32 (*hw_get_extended_buffer_segments_size_in_bytes)(void);
u32 (*hw_extended_marker_size_in_bytes)(void);
u32 (*hw_get_perf_counter_control_register_stride)(void);
u32 (*hw_get_perf_counter_register_stride)(void);
u32 (*get_main_image_ctx_id)(struct gk20a *g,
struct nvgpu_mem *ctx_mem);
u32 (*get_patch_count)(struct gk20a *g,
struct nvgpu_mem *ctx_mem);
void (*set_patch_count)(struct gk20a *g,
struct nvgpu_mem *ctx_mem, u32 count);
void (*set_patch_addr)(struct gk20a *g,
struct nvgpu_mem *ctx_mem, u64 addr);
void (*set_zcull_ptr)(struct gk20a *g,
struct nvgpu_mem *ctx_mem, u64 addr);
void (*set_zcull)(struct gk20a *g,
struct nvgpu_mem *ctx_mem, u32 mode);
void (*set_zcull_mode_no_ctxsw)(struct gk20a *g,
struct nvgpu_mem *ctx_mem);
bool (*is_zcull_mode_separate_buffer)(u32 mode);
void (*set_pm_ptr)(struct gk20a *g,
struct nvgpu_mem *ctx_mem, u64 addr);
void (*set_pm_mode)(struct gk20a *g,
struct nvgpu_mem *ctx_mem, u32 mode);
void (*set_pm_smpc_mode)(struct gk20a *g,
struct nvgpu_mem *ctx_mem, bool enable);
u32 (*set_pm_mode_no_ctxsw)(struct gk20a *g,
struct nvgpu_mem *ctx_mem);
u32 (*set_pm_mode_ctxsw)(struct gk20a *g,
struct nvgpu_mem *ctx_mem);
u32 (*set_pm_mode_stream_out_ctxsw)(struct gk20a *g,
struct nvgpu_mem *ctx_mem);
u32 (*hw_get_pm_mode_no_ctxsw)(void);
u32 (*hw_get_pm_mode_ctxsw)(void);
u32 (*hw_get_pm_mode_stream_out_ctxsw)(void);
void (*init_ctxsw_hdr_data)(struct gk20a *g,
struct nvgpu_mem *ctx_mem);
void (*set_compute_preemption_mode_cta)(struct gk20a *g,
struct nvgpu_mem *ctx_mem);
void (*set_compute_preemption_mode_cilp)(struct gk20a *g,
struct nvgpu_mem *ctx_mem);
void (*set_graphics_preemption_mode_gfxp)(struct gk20a *g,
struct nvgpu_mem *ctx_mem);
void (*set_cde_enabled)(struct gk20a *g,
struct nvgpu_mem *ctx_mem);
void (*set_pc_sampling)(struct gk20a *g,
struct nvgpu_mem *ctx_mem, bool enable);
void (*set_priv_access_map_config_mode)(struct gk20a *g,
struct nvgpu_mem *ctx_mem, bool allow_all);
void (*set_priv_access_map_addr)(struct gk20a *g,
struct nvgpu_mem *ctx_mem, u64 addr);
void (*disable_verif_features)(struct gk20a *g,
struct nvgpu_mem *ctx_mem);
bool (*check_main_image_header_magic)(u8 *context);
bool (*check_local_header_magic)(u8 *context);
u32 (*get_num_gpcs)(u8 *context);
u32 (*get_num_tpcs)(u8 *context);
void (*get_extended_buffer_size_offset)(u8 *context,
u32 *size, u32 *offset);
void (*get_ppc_info)(u8 *context,
u32 *num_ppcs, u32 *ppc_mask);
u32 (*get_local_priv_register_ctl_offset)(u8 *context);
u32 (*hw_get_ts_tag_invalid_timestamp)(void);
u32 (*hw_get_ts_tag)(u64 ts);
u64 (*hw_record_ts_timestamp)(u64 ts);
u32 (*hw_get_ts_record_size_in_bytes)(void);
u32 (*is_ts_valid_record)(u32 magic_hi);
u32 (*get_ts_buffer_aperture_mask)(struct gk20a *g,
struct nvgpu_mem *ctx_mem);
void (*set_ts_num_records)(struct gk20a *g,
struct nvgpu_mem *ctx_mem, u32 num);
void (*set_ts_buffer_ptr)(struct gk20a *g,
struct nvgpu_mem *ctx_mem, u64 addr,
u32 aperture_mask);
void (*set_pmu_options_boost_clock_frequencies)(
struct gk20a *g,
struct nvgpu_mem *ctx_mem, u32 boosted_ctx);
void (*set_context_buffer_ptr)(struct gk20a *g,
struct nvgpu_mem *ctx_mem, u64 addr);
void (*set_full_preemption_ptr)(struct gk20a *g,
struct nvgpu_mem *ctx_mem, u64 addr);
void (*set_full_preemption_ptr_veid0)(struct gk20a *g,
struct nvgpu_mem *ctx_mem, u64 addr);
void (*set_type_per_veid_header)(struct gk20a *g,
struct nvgpu_mem *ctx_mem);
void (*dump_ctxsw_stats)(struct gk20a *g,
struct nvgpu_mem *ctx_mem);
} ctxsw_prog;
} gr;
struct {
void (*init_hw)(struct gk20a *g);

View File

@@ -62,6 +62,10 @@ static inline u32 ctxsw_prog_fecs_header_v(void)
{
return 0x00000100U;
}
static inline u32 ctxsw_prog_gpccs_header_stride_v(void)
{
return 0x00000100U;
}
static inline u32 ctxsw_prog_main_image_num_gpcs_o(void)
{
return 0x00000008U;
@@ -118,6 +122,10 @@ static inline u32 ctxsw_prog_main_image_pm_mode_m(void)
{
return U32(0x7U) << 0U;
}
static inline u32 ctxsw_prog_main_image_pm_mode_ctxsw_f(void)
{
return 0x1U;
}
static inline u32 ctxsw_prog_main_image_pm_mode_no_ctxsw_f(void)
{
return 0x0U;

View File

@@ -62,6 +62,10 @@ static inline u32 ctxsw_prog_fecs_header_v(void)
{
return 0x00000100U;
}
static inline u32 ctxsw_prog_gpccs_header_stride_v(void)
{
return 0x00000100U;
}
static inline u32 ctxsw_prog_main_image_num_gpcs_o(void)
{
return 0x00000008U;
@@ -102,6 +106,10 @@ static inline u32 ctxsw_prog_main_image_pm_mode_m(void)
{
return U32(0x7U) << 0U;
}
static inline u32 ctxsw_prog_main_image_pm_mode_ctxsw_f(void)
{
return 0x1U;
}
static inline u32 ctxsw_prog_main_image_pm_mode_no_ctxsw_f(void)
{
return 0x0U;

View File

@@ -62,6 +62,10 @@ static inline u32 ctxsw_prog_fecs_header_v(void)
{
return 0x00000100U;
}
static inline u32 ctxsw_prog_gpccs_header_stride_v(void)
{
return 0x00000100U;
}
static inline u32 ctxsw_prog_main_image_num_gpcs_o(void)
{
return 0x00000008U;
@@ -106,6 +110,10 @@ static inline u32 ctxsw_prog_main_image_pm_mode_m(void)
{
return U32(0x7U) << 0U;
}
static inline u32 ctxsw_prog_main_image_pm_mode_ctxsw_f(void)
{
return 0x1U;
}
static inline u32 ctxsw_prog_main_image_pm_mode_no_ctxsw_f(void)
{
return 0x0U;

View File

@@ -62,6 +62,10 @@ static inline u32 ctxsw_prog_fecs_header_v(void)
{
return 0x00000100U;
}
static inline u32 ctxsw_prog_gpccs_header_stride_v(void)
{
return 0x00000100U;
}
static inline u32 ctxsw_prog_main_image_num_gpcs_o(void)
{
return 0x00000008U;
@@ -142,6 +146,10 @@ static inline u32 ctxsw_prog_main_image_pm_mode_m(void)
{
return U32(0x7U) << 0U;
}
static inline u32 ctxsw_prog_main_image_pm_mode_ctxsw_f(void)
{
return 0x1U;
}
static inline u32 ctxsw_prog_main_image_pm_mode_no_ctxsw_f(void)
{
return 0x0U;

View File

@@ -62,6 +62,10 @@ static inline u32 ctxsw_prog_fecs_header_v(void)
{
return 0x00000100U;
}
static inline u32 ctxsw_prog_gpccs_header_stride_v(void)
{
return 0x00000100U;
}
static inline u32 ctxsw_prog_main_image_num_gpcs_o(void)
{
return 0x00000008U;
@@ -142,6 +146,10 @@ static inline u32 ctxsw_prog_main_image_pm_mode_m(void)
{
return U32(0x7U) << 0U;
}
static inline u32 ctxsw_prog_main_image_pm_mode_ctxsw_f(void)
{
return 0x1U;
}
static inline u32 ctxsw_prog_main_image_pm_mode_no_ctxsw_f(void)
{
return 0x0U;

View File

@@ -62,6 +62,10 @@ static inline u32 ctxsw_prog_fecs_header_v(void)
{
return 0x00000100U;
}
static inline u32 ctxsw_prog_gpccs_header_stride_v(void)
{
return 0x00000100U;
}
static inline u32 ctxsw_prog_main_image_num_gpcs_o(void)
{
return 0x00000008U;
@@ -142,6 +146,10 @@ static inline u32 ctxsw_prog_main_image_pm_mode_m(void)
{
return U32(0x7U) << 0U;
}
static inline u32 ctxsw_prog_main_image_pm_mode_ctxsw_f(void)
{
return 0x1U;
}
static inline u32 ctxsw_prog_main_image_pm_mode_no_ctxsw_f(void)
{
return 0x0U;

View File

@@ -36,7 +36,6 @@
#include "os_linux.h"
#include "ctxsw_trace.h"
#include <nvgpu/hw/gk20a/hw_ctxsw_prog_gk20a.h>
#include <nvgpu/hw/gk20a/hw_gr_gk20a.h>
#define GK20A_CTXSW_TRACE_MAX_VM_RING_SIZE (128*PAGE_SIZE)

View File

@@ -55,23 +55,24 @@ static int gk20a_fecs_trace_debugfs_ring_seq_show(
struct gk20a_fecs_trace_record *r =
gk20a_fecs_trace_get_record(g, *pos);
int i;
const u32 invalid_tag = gk20a_fecs_trace_record_ts_tag_invalid_ts_v();
const u32 invalid_tag =
g->ops.gr.ctxsw_prog.hw_get_ts_tag_invalid_timestamp();
u32 tag;
u64 timestamp;
seq_printf(s, "record #%lld (%p)\n", *pos, r);
seq_printf(s, "\tmagic_lo=%08x\n", r->magic_lo);
seq_printf(s, "\tmagic_hi=%08x\n", r->magic_hi);
if (gk20a_fecs_trace_is_valid_record(r)) {
if (gk20a_fecs_trace_is_valid_record(g, r)) {
seq_printf(s, "\tcontext_ptr=%08x\n", r->context_ptr);
seq_printf(s, "\tcontext_id=%08x\n", r->context_id);
seq_printf(s, "\tnew_context_ptr=%08x\n", r->new_context_ptr);
seq_printf(s, "\tnew_context_id=%08x\n", r->new_context_id);
for (i = 0; i < gk20a_fecs_trace_num_ts(); i++) {
tag = gk20a_fecs_trace_record_ts_tag_v(r->ts[i]);
for (i = 0; i < gk20a_fecs_trace_num_ts(g); i++) {
tag = g->ops.gr.ctxsw_prog.hw_get_ts_tag(r->ts[i]);
if (tag == invalid_tag)
continue;
timestamp = gk20a_fecs_trace_record_ts_timestamp_v(r->ts[i]);
timestamp = g->ops.gr.ctxsw_prog.hw_record_ts_timestamp(r->ts[i]);
timestamp <<= GK20A_FECS_TRACE_PTIMER_SHIFT;
seq_printf(s, "\ttag=%02x timestamp=%012llx\n", tag, timestamp);
}

View File

@@ -31,7 +31,6 @@
#include "os_linux.h"
#include "ioctl_tsg.h"
#include <nvgpu/hw/gk20a/hw_ctxsw_prog_gk20a.h>
#include <nvgpu/hw/gk20a/hw_gr_gk20a.h>
ssize_t gk20a_sched_dev_read(struct file *filp, char __user *buf,

View File

@@ -38,6 +38,9 @@
#include "common/fb/fb_tu104.h"
#include "common/xve/xve_gp106.h"
#include "common/netlist/netlist_tu104.h"
#include "common/gr/ctxsw_prog/ctxsw_prog_gm20b.h"
#include "common/gr/ctxsw_prog/ctxsw_prog_gp10b.h"
#include "common/gr/ctxsw_prog/ctxsw_prog_gv11b.h"
#include "common/therm/therm_gm20b.h"
#include "common/therm/therm_gp10b.h"
#include "common/therm/therm_gp106.h"
@@ -412,8 +415,6 @@ static const struct gpu_ops tu104_ops = {
.enable_exceptions = gr_gv11b_enable_exceptions,
.get_lrf_tex_ltc_dram_override = get_ecc_override_val,
.update_smpc_ctxsw_mode = gr_gk20a_update_smpc_ctxsw_mode,
.get_hw_accessor_stream_out_mode =
gr_gv100_get_hw_accessor_stream_out_mode,
.get_num_hwpm_perfmon = gr_gv100_get_num_hwpm_perfmon,
.set_pmm_register = gr_gv100_set_pmm_register,
.update_hwpm_ctxsw_mode = gr_gk20a_update_hwpm_ctxsw_mode,
@@ -430,8 +431,6 @@ static const struct gpu_ops tu104_ops = {
.program_zcull_mapping = gr_gv11b_program_zcull_mapping,
.commit_global_timeslice = gr_gv11b_commit_global_timeslice,
.commit_inst = gr_gv11b_commit_inst,
.write_zcull_ptr = gr_gv11b_write_zcull_ptr,
.write_pm_ptr = gr_gv11b_write_pm_ptr,
.load_tpc_mask = gr_gv11b_load_tpc_mask,
.trigger_suspend = gv11b_gr_sm_trigger_suspend,
.wait_for_pause = gr_gk20a_wait_for_pause,
@@ -488,7 +487,6 @@ static const struct gpu_ops tu104_ops = {
.handle_tpc_sm_ecc_exception =
gr_gv11b_handle_tpc_sm_ecc_exception,
.decode_egpc_addr = gv11b_gr_decode_egpc_addr,
.init_ctxsw_hdr_data = gr_gp10b_init_ctxsw_hdr_data,
.init_gfxp_wfi_timeout_count =
gr_gv11b_init_gfxp_wfi_timeout_count,
.get_max_gfxp_wfi_timeout_count =
@@ -517,6 +515,93 @@ static const struct gpu_ops tu104_ops = {
.dump_gr_falcon_stats = gk20a_fecs_dump_falcon_stats,
.get_fecs_ctx_state_store_major_rev_id =
gk20a_gr_get_fecs_ctx_state_store_major_rev_id,
.ctxsw_prog = {
.hw_get_fecs_header_size =
gm20b_ctxsw_prog_hw_get_fecs_header_size,
.hw_get_gpccs_header_size =
gm20b_ctxsw_prog_hw_get_gpccs_header_size,
.hw_get_extended_buffer_segments_size_in_bytes =
gm20b_ctxsw_prog_hw_get_extended_buffer_segments_size_in_bytes,
.hw_extended_marker_size_in_bytes =
gm20b_ctxsw_prog_hw_extended_marker_size_in_bytes,
.hw_get_perf_counter_control_register_stride =
gm20b_ctxsw_prog_hw_get_perf_counter_control_register_stride,
.get_main_image_ctx_id =
gm20b_ctxsw_prog_get_main_image_ctx_id,
.get_patch_count = gm20b_ctxsw_prog_get_patch_count,
.set_patch_count = gm20b_ctxsw_prog_set_patch_count,
.set_patch_addr = gm20b_ctxsw_prog_set_patch_addr,
.set_zcull_ptr = gv11b_ctxsw_prog_set_zcull_ptr,
.set_zcull = gm20b_ctxsw_prog_set_zcull,
.set_zcull_mode_no_ctxsw =
gm20b_ctxsw_prog_set_zcull_mode_no_ctxsw,
.is_zcull_mode_separate_buffer =
gm20b_ctxsw_prog_is_zcull_mode_separate_buffer,
.set_pm_ptr = gv11b_ctxsw_prog_set_pm_ptr,
.set_pm_mode = gm20b_ctxsw_prog_set_pm_mode,
.set_pm_smpc_mode = gm20b_ctxsw_prog_set_pm_smpc_mode,
.set_pm_mode_no_ctxsw =
gm20b_ctxsw_prog_set_pm_mode_no_ctxsw,
.set_pm_mode_ctxsw = gm20b_ctxsw_prog_set_pm_mode_ctxsw,
.hw_get_pm_mode_no_ctxsw =
gm20b_ctxsw_prog_hw_get_pm_mode_no_ctxsw,
.hw_get_pm_mode_ctxsw = gm20b_ctxsw_prog_hw_get_pm_mode_ctxsw,
.hw_get_pm_mode_stream_out_ctxsw =
gv11b_ctxsw_prog_hw_get_pm_mode_stream_out_ctxsw,
.set_pm_mode_stream_out_ctxsw =
gv11b_ctxsw_prog_set_pm_mode_stream_out_ctxsw,
.init_ctxsw_hdr_data = gp10b_ctxsw_prog_init_ctxsw_hdr_data,
.set_compute_preemption_mode_cta =
gm20b_ctxsw_prog_set_compute_preemption_mode_cta,
.set_compute_preemption_mode_cilp =
gp10b_ctxsw_prog_set_compute_preemption_mode_cilp,
.set_graphics_preemption_mode_gfxp =
gp10b_ctxsw_prog_set_graphics_preemption_mode_gfxp,
.set_cde_enabled = gm20b_ctxsw_prog_set_cde_enabled,
.set_pc_sampling = gm20b_ctxsw_prog_set_pc_sampling,
.set_priv_access_map_config_mode =
gm20b_ctxsw_prog_set_priv_access_map_config_mode,
.set_priv_access_map_addr =
gm20b_ctxsw_prog_set_priv_access_map_addr,
.disable_verif_features =
gm20b_ctxsw_prog_disable_verif_features,
.check_main_image_header_magic =
gm20b_ctxsw_prog_check_main_image_header_magic,
.check_local_header_magic =
gm20b_ctxsw_prog_check_local_header_magic,
.get_num_gpcs = gm20b_ctxsw_prog_get_num_gpcs,
.get_num_tpcs = gm20b_ctxsw_prog_get_num_tpcs,
.get_extended_buffer_size_offset =
gm20b_ctxsw_prog_get_extended_buffer_size_offset,
.get_ppc_info = gm20b_ctxsw_prog_get_ppc_info,
.get_local_priv_register_ctl_offset =
gm20b_ctxsw_prog_get_local_priv_register_ctl_offset,
.hw_get_ts_tag_invalid_timestamp =
gm20b_ctxsw_prog_hw_get_ts_tag_invalid_timestamp,
.hw_get_ts_tag = gm20b_ctxsw_prog_hw_get_ts_tag,
.hw_record_ts_timestamp =
gm20b_ctxsw_prog_hw_record_ts_timestamp,
.hw_get_ts_record_size_in_bytes =
gm20b_ctxsw_prog_hw_get_ts_record_size_in_bytes,
.is_ts_valid_record = gm20b_ctxsw_prog_is_ts_valid_record,
.get_ts_buffer_aperture_mask =
gm20b_ctxsw_prog_get_ts_buffer_aperture_mask,
.set_ts_num_records = gm20b_ctxsw_prog_set_ts_num_records,
.set_ts_buffer_ptr = gm20b_ctxsw_prog_set_ts_buffer_ptr,
.set_pmu_options_boost_clock_frequencies =
gp10b_ctxsw_prog_set_pmu_options_boost_clock_frequencies,
.set_full_preemption_ptr =
gv11b_ctxsw_prog_set_full_preemption_ptr,
.set_full_preemption_ptr_veid0 =
gv11b_ctxsw_prog_set_full_preemption_ptr_veid0,
.hw_get_perf_counter_register_stride =
gv11b_ctxsw_prog_hw_get_perf_counter_register_stride,
.set_context_buffer_ptr =
gv11b_ctxsw_prog_set_context_buffer_ptr,
.set_type_per_veid_header =
gv11b_ctxsw_prog_set_type_per_veid_header,
.dump_ctxsw_stats = gp10b_ctxsw_prog_dump_ctxsw_stats,
}
},
.fb = {
.init_hw = gv11b_fb_init_hw,
@@ -1027,6 +1112,7 @@ int tu104_init_hal(struct gk20a *g)
gops->ltc = tu104_ops.ltc;
gops->ce2 = tu104_ops.ce2;
gops->gr = tu104_ops.gr;
gops->gr.ctxsw_prog = tu104_ops.gr.ctxsw_prog;
gops->fb = tu104_ops.fb;
gops->clock_gating = tu104_ops.clock_gating;
gops->fifo = tu104_ops.fifo;

View File

@@ -28,6 +28,8 @@
#include "common/fb/fb_gm20b.h"
#include "common/fb/fb_gp10b.h"
#include "common/netlist/netlist_gp10b.h"
#include "common/gr/ctxsw_prog/ctxsw_prog_gm20b.h"
#include "common/gr/ctxsw_prog/ctxsw_prog_gp10b.h"
#include "common/therm/therm_gm20b.h"
#include "common/therm/therm_gp10b.h"
#include "common/ltc/ltc_gm20b.h"
@@ -176,8 +178,6 @@ static const struct gpu_ops vgpu_gp10b_ops = {
.program_zcull_mapping = NULL,
.commit_global_timeslice = NULL,
.commit_inst = vgpu_gr_commit_inst,
.write_zcull_ptr = gr_gk20a_write_zcull_ptr,
.write_pm_ptr = gr_gk20a_write_pm_ptr,
.load_tpc_mask = NULL,
.trigger_suspend = NULL,
.wait_for_pause = gr_gk20a_wait_for_pause,
@@ -212,7 +212,6 @@ static const struct gpu_ops vgpu_gp10b_ops = {
.set_bes_crop_debug4 = NULL,
.set_ctxsw_preemption_mode =
vgpu_gr_gp10b_set_ctxsw_preemption_mode,
.init_ctxsw_hdr_data = gr_gp10b_init_ctxsw_hdr_data,
.init_gfxp_wfi_timeout_count =
gr_gp10b_init_gfxp_wfi_timeout_count,
.get_max_gfxp_wfi_timeout_count =
@@ -230,6 +229,81 @@ static const struct gpu_ops vgpu_gp10b_ops = {
.get_offset_in_gpccs_segment =
gr_gk20a_get_offset_in_gpccs_segment,
.set_debug_mode = gm20b_gr_set_debug_mode,
.ctxsw_prog = {
.hw_get_fecs_header_size =
gm20b_ctxsw_prog_hw_get_fecs_header_size,
.hw_get_gpccs_header_size =
gm20b_ctxsw_prog_hw_get_gpccs_header_size,
.hw_get_extended_buffer_segments_size_in_bytes =
gm20b_ctxsw_prog_hw_get_extended_buffer_segments_size_in_bytes,
.hw_extended_marker_size_in_bytes =
gm20b_ctxsw_prog_hw_extended_marker_size_in_bytes,
.hw_get_perf_counter_control_register_stride =
gm20b_ctxsw_prog_hw_get_perf_counter_control_register_stride,
.get_main_image_ctx_id =
gm20b_ctxsw_prog_get_main_image_ctx_id,
.get_patch_count = gm20b_ctxsw_prog_get_patch_count,
.set_patch_count = gm20b_ctxsw_prog_set_patch_count,
.set_patch_addr = gm20b_ctxsw_prog_set_patch_addr,
.set_zcull_ptr = gm20b_ctxsw_prog_set_zcull_ptr,
.set_zcull = gm20b_ctxsw_prog_set_zcull,
.set_zcull_mode_no_ctxsw =
gm20b_ctxsw_prog_set_zcull_mode_no_ctxsw,
.is_zcull_mode_separate_buffer =
gm20b_ctxsw_prog_is_zcull_mode_separate_buffer,
.set_pm_ptr = gm20b_ctxsw_prog_set_pm_ptr,
.set_pm_mode = gm20b_ctxsw_prog_set_pm_mode,
.set_pm_smpc_mode = gm20b_ctxsw_prog_set_pm_smpc_mode,
.set_pm_mode_no_ctxsw =
gm20b_ctxsw_prog_set_pm_mode_no_ctxsw,
.set_pm_mode_ctxsw = gm20b_ctxsw_prog_set_pm_mode_ctxsw,
.hw_get_pm_mode_no_ctxsw =
gm20b_ctxsw_prog_hw_get_pm_mode_no_ctxsw,
.hw_get_pm_mode_ctxsw = gm20b_ctxsw_prog_hw_get_pm_mode_ctxsw,
.init_ctxsw_hdr_data = gp10b_ctxsw_prog_init_ctxsw_hdr_data,
.set_compute_preemption_mode_cta =
gp10b_ctxsw_prog_set_compute_preemption_mode_cta,
.set_compute_preemption_mode_cilp =
gp10b_ctxsw_prog_set_compute_preemption_mode_cilp,
.set_graphics_preemption_mode_gfxp =
gp10b_ctxsw_prog_set_graphics_preemption_mode_gfxp,
.set_cde_enabled = gm20b_ctxsw_prog_set_cde_enabled,
.set_pc_sampling = gm20b_ctxsw_prog_set_pc_sampling,
.set_priv_access_map_config_mode =
gm20b_ctxsw_prog_set_priv_access_map_config_mode,
.set_priv_access_map_addr =
gm20b_ctxsw_prog_set_priv_access_map_addr,
.disable_verif_features =
gm20b_ctxsw_prog_disable_verif_features,
.check_main_image_header_magic =
gm20b_ctxsw_prog_check_main_image_header_magic,
.check_local_header_magic =
gm20b_ctxsw_prog_check_local_header_magic,
.get_num_gpcs = gm20b_ctxsw_prog_get_num_gpcs,
.get_num_tpcs = gm20b_ctxsw_prog_get_num_tpcs,
.get_extended_buffer_size_offset =
gm20b_ctxsw_prog_get_extended_buffer_size_offset,
.get_ppc_info = gm20b_ctxsw_prog_get_ppc_info,
.get_local_priv_register_ctl_offset =
gm20b_ctxsw_prog_get_local_priv_register_ctl_offset,
.hw_get_ts_tag_invalid_timestamp =
gm20b_ctxsw_prog_hw_get_ts_tag_invalid_timestamp,
.hw_get_ts_tag = gm20b_ctxsw_prog_hw_get_ts_tag,
.hw_record_ts_timestamp =
gm20b_ctxsw_prog_hw_record_ts_timestamp,
.hw_get_ts_record_size_in_bytes =
gm20b_ctxsw_prog_hw_get_ts_record_size_in_bytes,
.is_ts_valid_record = gm20b_ctxsw_prog_is_ts_valid_record,
.get_ts_buffer_aperture_mask =
gm20b_ctxsw_prog_get_ts_buffer_aperture_mask,
.set_ts_num_records = gm20b_ctxsw_prog_set_ts_num_records,
.set_ts_buffer_ptr = gm20b_ctxsw_prog_set_ts_buffer_ptr,
.set_pmu_options_boost_clock_frequencies =
gp10b_ctxsw_prog_set_pmu_options_boost_clock_frequencies,
.set_full_preemption_ptr =
gp10b_ctxsw_prog_set_full_preemption_ptr,
.dump_ctxsw_stats = gp10b_ctxsw_prog_dump_ctxsw_stats,
}
},
.fb = {
.init_hw = NULL,
@@ -575,6 +649,7 @@ int vgpu_gp10b_init_hal(struct gk20a *g)
gops->ltc = vgpu_gp10b_ops.ltc;
gops->ce2 = vgpu_gp10b_ops.ce2;
gops->gr = vgpu_gp10b_ops.gr;
gops->gr.ctxsw_prog = vgpu_gp10b_ops.gr.ctxsw_prog;
gops->fb = vgpu_gp10b_ops.fb;
gops->clock_gating = vgpu_gp10b_ops.clock_gating;
gops->fifo = vgpu_gp10b_ops.fifo;

View File

@@ -39,7 +39,6 @@
#include "gk20a/fecs_trace_gk20a.h"
#include <nvgpu/hw/gk20a/hw_gr_gk20a.h>
#include <nvgpu/hw/gk20a/hw_ctxsw_prog_gk20a.h>
void vgpu_gr_detect_sm_arch(struct gk20a *g)
{
@@ -614,7 +613,7 @@ int vgpu_gr_alloc_obj_ctx(struct channel_gk20a *c, u32 class_num, u32 flags)
}
/* PM ctxt switch is off by default */
gr_ctx->pm_ctx.pm_mode = ctxsw_prog_main_image_pm_mode_no_ctxsw_f();
gr_ctx->pm_ctx.pm_mode = g->ops.gr.ctxsw_prog.hw_get_pm_mode_no_ctxsw();
nvgpu_log_fn(g, "done");
return 0;
@@ -1087,18 +1086,21 @@ int vgpu_gr_update_hwpm_ctxsw_mode(struct gk20a *g,
* will return an error due to using the same GPU VA twice.
*/
if (pm_ctx->pm_mode == ctxsw_prog_main_image_pm_mode_ctxsw_f()) {
if (pm_ctx->pm_mode ==
g->ops.gr.ctxsw_prog.hw_get_pm_mode_ctxsw()) {
return 0;
}
p->mode = TEGRA_VGPU_CTXSW_MODE_CTXSW;
} else if (mode == NVGPU_DBG_HWPM_CTXSW_MODE_NO_CTXSW) {
if (pm_ctx->pm_mode == ctxsw_prog_main_image_pm_mode_no_ctxsw_f()) {
if (pm_ctx->pm_mode ==
g->ops.gr.ctxsw_prog.hw_get_pm_mode_no_ctxsw()) {
return 0;
}
p->mode = TEGRA_VGPU_CTXSW_MODE_NO_CTXSW;
} else if ((mode == NVGPU_DBG_HWPM_CTXSW_MODE_STREAM_OUT_CTXSW) &&
(g->ops.gr.get_hw_accessor_stream_out_mode)){
if (pm_ctx->pm_mode == g->ops.gr.get_hw_accessor_stream_out_mode()) {
g->ops.gr.ctxsw_prog.hw_get_pm_mode_stream_out_ctxsw()) {
if (pm_ctx->pm_mode ==
g->ops.gr.ctxsw_prog.hw_get_pm_mode_stream_out_ctxsw()) {
return 0;
}
p->mode = TEGRA_VGPU_CTXSW_MODE_STREAM_OUT_CTXSW;
@@ -1130,11 +1132,14 @@ int vgpu_gr_update_hwpm_ctxsw_mode(struct gk20a *g,
err = err ? err : msg.ret;
if (!err) {
if (mode == NVGPU_DBG_HWPM_CTXSW_MODE_CTXSW) {
pm_ctx->pm_mode = ctxsw_prog_main_image_pm_mode_ctxsw_f();
pm_ctx->pm_mode =
g->ops.gr.ctxsw_prog.hw_get_pm_mode_ctxsw();
} else if (mode == NVGPU_DBG_HWPM_CTXSW_MODE_NO_CTXSW) {
pm_ctx->pm_mode = ctxsw_prog_main_image_pm_mode_no_ctxsw_f();
pm_ctx->pm_mode =
g->ops.gr.ctxsw_prog.hw_get_pm_mode_no_ctxsw();
} else {
pm_ctx->pm_mode = g->ops.gr.get_hw_accessor_stream_out_mode();
pm_ctx->pm_mode =
g->ops.gr.ctxsw_prog.hw_get_pm_mode_stream_out_ctxsw();
}
}

View File

@@ -29,6 +29,9 @@
#include "common/fb/fb_gp10b.h"
#include "common/fb/fb_gv11b.h"
#include "common/netlist/netlist_gv11b.h"
#include "common/gr/ctxsw_prog/ctxsw_prog_gm20b.h"
#include "common/gr/ctxsw_prog/ctxsw_prog_gp10b.h"
#include "common/gr/ctxsw_prog/ctxsw_prog_gv11b.h"
#include "common/therm/therm_gm20b.h"
#include "common/therm/therm_gp10b.h"
#include "common/therm/therm_gv11b.h"
@@ -177,8 +180,6 @@ static const struct gpu_ops vgpu_gv11b_ops = {
.enable_exceptions = NULL,
.get_lrf_tex_ltc_dram_override = NULL,
.update_smpc_ctxsw_mode = vgpu_gr_update_smpc_ctxsw_mode,
.get_hw_accessor_stream_out_mode =
gr_gv100_get_hw_accessor_stream_out_mode,
.update_hwpm_ctxsw_mode = vgpu_gr_update_hwpm_ctxsw_mode,
.record_sm_error_state = gv11b_gr_record_sm_error_state,
.clear_sm_error_state = vgpu_gr_clear_sm_error_state,
@@ -192,8 +193,6 @@ static const struct gpu_ops vgpu_gv11b_ops = {
.program_zcull_mapping = NULL,
.commit_global_timeslice = NULL,
.commit_inst = vgpu_gr_gv11b_commit_inst,
.write_zcull_ptr = gr_gv11b_write_zcull_ptr,
.write_pm_ptr = gr_gv11b_write_pm_ptr,
.load_tpc_mask = NULL,
.trigger_suspend = NULL,
.wait_for_pause = gr_gk20a_wait_for_pause,
@@ -247,7 +246,6 @@ static const struct gpu_ops vgpu_gv11b_ops = {
.handle_tpc_sm_ecc_exception =
gr_gv11b_handle_tpc_sm_ecc_exception,
.decode_egpc_addr = gv11b_gr_decode_egpc_addr,
.init_ctxsw_hdr_data = gr_gp10b_init_ctxsw_hdr_data,
.init_gfxp_wfi_timeout_count =
gr_gv11b_init_gfxp_wfi_timeout_count,
.get_max_gfxp_wfi_timeout_count =
@@ -265,6 +263,93 @@ static const struct gpu_ops vgpu_gv11b_ops = {
.get_offset_in_gpccs_segment =
gr_gk20a_get_offset_in_gpccs_segment,
.set_debug_mode = gm20b_gr_set_debug_mode,
.ctxsw_prog = {
.hw_get_fecs_header_size =
gm20b_ctxsw_prog_hw_get_fecs_header_size,
.hw_get_gpccs_header_size =
gm20b_ctxsw_prog_hw_get_gpccs_header_size,
.hw_get_extended_buffer_segments_size_in_bytes =
gm20b_ctxsw_prog_hw_get_extended_buffer_segments_size_in_bytes,
.hw_extended_marker_size_in_bytes =
gm20b_ctxsw_prog_hw_extended_marker_size_in_bytes,
.hw_get_perf_counter_control_register_stride =
gm20b_ctxsw_prog_hw_get_perf_counter_control_register_stride,
.get_main_image_ctx_id =
gm20b_ctxsw_prog_get_main_image_ctx_id,
.get_patch_count = gm20b_ctxsw_prog_get_patch_count,
.set_patch_count = gm20b_ctxsw_prog_set_patch_count,
.set_patch_addr = gm20b_ctxsw_prog_set_patch_addr,
.set_zcull_ptr = gv11b_ctxsw_prog_set_zcull_ptr,
.set_zcull = gm20b_ctxsw_prog_set_zcull,
.set_zcull_mode_no_ctxsw =
gm20b_ctxsw_prog_set_zcull_mode_no_ctxsw,
.is_zcull_mode_separate_buffer =
gm20b_ctxsw_prog_is_zcull_mode_separate_buffer,
.set_pm_ptr = gv11b_ctxsw_prog_set_pm_ptr,
.set_pm_mode = gm20b_ctxsw_prog_set_pm_mode,
.set_pm_smpc_mode = gm20b_ctxsw_prog_set_pm_smpc_mode,
.set_pm_mode_no_ctxsw =
gm20b_ctxsw_prog_set_pm_mode_no_ctxsw,
.set_pm_mode_ctxsw = gm20b_ctxsw_prog_set_pm_mode_ctxsw,
.hw_get_pm_mode_no_ctxsw =
gm20b_ctxsw_prog_hw_get_pm_mode_no_ctxsw,
.hw_get_pm_mode_ctxsw = gm20b_ctxsw_prog_hw_get_pm_mode_ctxsw,
.hw_get_pm_mode_stream_out_ctxsw =
gv11b_ctxsw_prog_hw_get_pm_mode_stream_out_ctxsw,
.set_pm_mode_stream_out_ctxsw =
gv11b_ctxsw_prog_set_pm_mode_stream_out_ctxsw,
.init_ctxsw_hdr_data = gp10b_ctxsw_prog_init_ctxsw_hdr_data,
.set_compute_preemption_mode_cta =
gp10b_ctxsw_prog_set_compute_preemption_mode_cta,
.set_compute_preemption_mode_cilp =
gp10b_ctxsw_prog_set_compute_preemption_mode_cilp,
.set_graphics_preemption_mode_gfxp =
gp10b_ctxsw_prog_set_graphics_preemption_mode_gfxp,
.set_cde_enabled = gm20b_ctxsw_prog_set_cde_enabled,
.set_pc_sampling = gm20b_ctxsw_prog_set_pc_sampling,
.set_priv_access_map_config_mode =
gm20b_ctxsw_prog_set_priv_access_map_config_mode,
.set_priv_access_map_addr =
gm20b_ctxsw_prog_set_priv_access_map_addr,
.disable_verif_features =
gm20b_ctxsw_prog_disable_verif_features,
.check_main_image_header_magic =
gm20b_ctxsw_prog_check_main_image_header_magic,
.check_local_header_magic =
gm20b_ctxsw_prog_check_local_header_magic,
.get_num_gpcs = gm20b_ctxsw_prog_get_num_gpcs,
.get_num_tpcs = gm20b_ctxsw_prog_get_num_tpcs,
.get_extended_buffer_size_offset =
gm20b_ctxsw_prog_get_extended_buffer_size_offset,
.get_ppc_info = gm20b_ctxsw_prog_get_ppc_info,
.get_local_priv_register_ctl_offset =
gm20b_ctxsw_prog_get_local_priv_register_ctl_offset,
.hw_get_ts_tag_invalid_timestamp =
gm20b_ctxsw_prog_hw_get_ts_tag_invalid_timestamp,
.hw_get_ts_tag = gm20b_ctxsw_prog_hw_get_ts_tag,
.hw_record_ts_timestamp =
gm20b_ctxsw_prog_hw_record_ts_timestamp,
.hw_get_ts_record_size_in_bytes =
gm20b_ctxsw_prog_hw_get_ts_record_size_in_bytes,
.is_ts_valid_record = gm20b_ctxsw_prog_is_ts_valid_record,
.get_ts_buffer_aperture_mask =
gm20b_ctxsw_prog_get_ts_buffer_aperture_mask,
.set_ts_num_records = gm20b_ctxsw_prog_set_ts_num_records,
.set_ts_buffer_ptr = gm20b_ctxsw_prog_set_ts_buffer_ptr,
.set_pmu_options_boost_clock_frequencies =
gp10b_ctxsw_prog_set_pmu_options_boost_clock_frequencies,
.set_full_preemption_ptr =
gv11b_ctxsw_prog_set_full_preemption_ptr,
.set_full_preemption_ptr_veid0 =
gv11b_ctxsw_prog_set_full_preemption_ptr_veid0,
.hw_get_perf_counter_register_stride =
gv11b_ctxsw_prog_hw_get_perf_counter_register_stride,
.set_context_buffer_ptr =
gv11b_ctxsw_prog_set_context_buffer_ptr,
.set_type_per_veid_header =
gv11b_ctxsw_prog_set_type_per_veid_header,
.dump_ctxsw_stats = gp10b_ctxsw_prog_dump_ctxsw_stats,
}
},
.fb = {
.init_hw = NULL,
@@ -640,6 +725,7 @@ int vgpu_gv11b_init_hal(struct gk20a *g)
gops->ltc = vgpu_gv11b_ops.ltc;
gops->ce2 = vgpu_gv11b_ops.ce2;
gops->gr = vgpu_gv11b_ops.gr;
gops->gr.ctxsw_prog = vgpu_gv11b_ops.gr.ctxsw_prog;
gops->fb = vgpu_gv11b_ops.fb;
gops->clock_gating = vgpu_gv11b_ops.clock_gating;
gops->fifo = vgpu_gv11b_ops.fifo;

View File

@@ -27,7 +27,6 @@
#include <nvgpu/gk20a.h>
#include <nvgpu/channel.h>
#include <nvgpu/hw/gv11b/hw_ctxsw_prog_gv11b.h>
int vgpu_gv11b_alloc_subctx_header(struct channel_gk20a *c)
{
@@ -41,8 +40,8 @@ int vgpu_gv11b_alloc_subctx_header(struct channel_gk20a *c)
msg.handle = vgpu_get_handle(c->g);
p->ch_handle = c->virt_ctx;
p->ctx_header_va = __nvgpu_vm_alloc_va(c->vm,
ctxsw_prog_fecs_header_v(),
GMMU_PAGE_SIZE_KERNEL);
c->g->ops.gr.ctxsw_prog.hw_get_fecs_header_size(),
GMMU_PAGE_SIZE_KERNEL);
if (!p->ctx_header_va) {
nvgpu_err(c->g, "alloc va failed for ctx_header");
return -ENOMEM;