diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.h b/drivers/gpu/nvgpu/gk20a/gr_gk20a.h
index 98fe3301e..5f4fb6791 100644
--- a/drivers/gpu/nvgpu/gk20a/gr_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.h
@@ -81,7 +81,7 @@ enum /* global_ctx_buffer */ {
 	ATTRIBUTE_VPR		= 5,
 	GOLDEN_CTX		= 6,
 	PRIV_ACCESS_MAP		= 7,
-	/* #8 is reserved */
+	RTV_CIRCULAR_BUFFER	= 8,
 	FECS_TRACE_BUFFER	= 9,
 	NR_GLOBAL_CTX_BUF	= 10
 };
@@ -93,7 +93,7 @@ enum  /*global_ctx_buffer_va */ {
 	ATTRIBUTE_VA		= 2,
 	GOLDEN_CTX_VA		= 3,
 	PRIV_ACCESS_MAP_VA	= 4,
-	/* #5 is reserved */
+	RTV_CIRCULAR_BUFFER_VA	= 5,
 	FECS_TRACE_BUFFER_VA	= 6,
 	NR_GLOBAL_CTX_BUF_VA	= 7
 };
@@ -403,6 +403,7 @@ struct nvgpu_gr_ctx {
 	struct nvgpu_mem spill_ctxsw_buffer;
 	struct nvgpu_mem betacb_ctxsw_buffer;
 	struct nvgpu_mem pagepool_ctxsw_buffer;
+	struct nvgpu_mem gfxp_rtvcb_ctxsw_buffer;
 	u32 ctx_id;
 	bool ctx_id_valid;
 	bool cilp_preempt_pending;
diff --git a/drivers/gpu/nvgpu/gm20b/hal_gm20b.c b/drivers/gpu/nvgpu/gm20b/hal_gm20b.c
index 9fb2ab46d..3ff567165 100644
--- a/drivers/gpu/nvgpu/gm20b/hal_gm20b.c
+++ b/drivers/gpu/nvgpu/gm20b/hal_gm20b.c
@@ -333,6 +333,8 @@ static const struct gpu_ops gm20b_ops = {
 		.dump_gr_falcon_stats = gk20a_fecs_dump_falcon_stats,
 		.get_fecs_ctx_state_store_major_rev_id =
 			gk20a_gr_get_fecs_ctx_state_store_major_rev_id,
+		.alloc_gfxp_rtv_cb = NULL,
+		.commit_gfxp_rtv_cb = NULL,
 	},
 	.fb = {
 		.init_hw = gm20b_fb_init_hw,
diff --git a/drivers/gpu/nvgpu/gp106/hal_gp106.c b/drivers/gpu/nvgpu/gp106/hal_gp106.c
index c414f0fdb..412649f08 100644
--- a/drivers/gpu/nvgpu/gp106/hal_gp106.c
+++ b/drivers/gpu/nvgpu/gp106/hal_gp106.c
@@ -410,6 +410,8 @@ static const struct gpu_ops gp106_ops = {
 		.dump_gr_falcon_stats = gk20a_fecs_dump_falcon_stats,
 		.get_fecs_ctx_state_store_major_rev_id =
 			gk20a_gr_get_fecs_ctx_state_store_major_rev_id,
+		.alloc_gfxp_rtv_cb = NULL,
+		.commit_gfxp_rtv_cb = NULL,
 	},
 	.fb = {
 		.init_hw = gm20b_fb_init_hw,
diff --git a/drivers/gpu/nvgpu/gp10b/gr_gp10b.c b/drivers/gpu/nvgpu/gp10b/gr_gp10b.c
index fb6513d43..ffefb32b7 100644
--- a/drivers/gpu/nvgpu/gp10b/gr_gp10b.c
+++ b/drivers/gpu/nvgpu/gp10b/gr_gp10b.c
@@ -1036,6 +1036,14 @@ int gr_gp10b_set_ctxsw_preemption_mode(struct gk20a *g,
 			goto fail_free_betacb;
 		}
 
+		if (g->ops.gr.alloc_gfxp_rtv_cb != NULL) {
+			err = g->ops.gr.alloc_gfxp_rtv_cb(g, gr_ctx, vm);
+			if (err != 0) {
+				nvgpu_err(g, "cannot allocate gfxp rtv_cb");
+				goto fail_free_pagepool;
+			}
+		}
+
 		gr_ctx->graphics_preempt_mode = graphics_preempt_mode;
 		break;
 		}
@@ -1063,6 +1071,8 @@ int gr_gp10b_set_ctxsw_preemption_mode(struct gk20a *g,
 
 	return 0;
 
+fail_free_pagepool:
+	nvgpu_dma_unmap_free(vm, &gr_ctx->pagepool_ctxsw_buffer);
 fail_free_betacb:
 	nvgpu_dma_unmap_free(vm, &gr_ctx->betacb_ctxsw_buffer);
 fail_free_spill:
diff --git a/drivers/gpu/nvgpu/gp10b/hal_gp10b.c b/drivers/gpu/nvgpu/gp10b/hal_gp10b.c
index f2e0ed5d4..3f0ad1659 100644
--- a/drivers/gpu/nvgpu/gp10b/hal_gp10b.c
+++ b/drivers/gpu/nvgpu/gp10b/hal_gp10b.c
@@ -371,6 +371,8 @@ static const struct gpu_ops gp10b_ops = {
 		.dump_gr_falcon_stats = gk20a_fecs_dump_falcon_stats,
 		.get_fecs_ctx_state_store_major_rev_id =
 			gk20a_gr_get_fecs_ctx_state_store_major_rev_id,
+		.alloc_gfxp_rtv_cb = NULL,
+		.commit_gfxp_rtv_cb = NULL,
 	},
 	.fb = {
 		.init_hw = gm20b_fb_init_hw,
diff --git a/drivers/gpu/nvgpu/gv100/hal_gv100.c b/drivers/gpu/nvgpu/gv100/hal_gv100.c
index 60da8dd6b..52ab46328 100644
--- a/drivers/gpu/nvgpu/gv100/hal_gv100.c
+++ b/drivers/gpu/nvgpu/gv100/hal_gv100.c
@@ -491,6 +491,8 @@ static const struct gpu_ops gv100_ops = {
 		.dump_gr_falcon_stats = gk20a_fecs_dump_falcon_stats,
 		.get_fecs_ctx_state_store_major_rev_id =
 			gk20a_gr_get_fecs_ctx_state_store_major_rev_id,
+		.alloc_gfxp_rtv_cb = NULL,
+		.commit_gfxp_rtv_cb = NULL,
 	},
 	.fb = {
 		.init_hw = gv11b_fb_init_hw,
diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
index a0b373867..f6cc2117b 100644
--- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c
@@ -1771,6 +1771,10 @@ void gr_gv11b_update_ctxsw_preemption_mode(struct gk20a *g,
 				g->gr.gfxp_wfi_timeout_count,
 				true);
 
+		if (g->ops.gr.commit_gfxp_rtv_cb != NULL) {
+			g->ops.gr.commit_gfxp_rtv_cb(g, gr_ctx, true);
+		}
+
 		gr_gk20a_ctx_patch_write_end(g, gr_ctx, true);
 	}
 
diff --git a/drivers/gpu/nvgpu/gv11b/hal_gv11b.c b/drivers/gpu/nvgpu/gv11b/hal_gv11b.c
index 2e5e4f9a7..8c80f59f2 100644
--- a/drivers/gpu/nvgpu/gv11b/hal_gv11b.c
+++ b/drivers/gpu/nvgpu/gv11b/hal_gv11b.c
@@ -453,6 +453,8 @@ static const struct gpu_ops gv11b_ops = {
 		.dump_gr_falcon_stats = gk20a_fecs_dump_falcon_stats,
 		.get_fecs_ctx_state_store_major_rev_id =
 			gk20a_gr_get_fecs_ctx_state_store_major_rev_id,
+		.alloc_gfxp_rtv_cb = NULL,
+		.commit_gfxp_rtv_cb = NULL,
 	},
 	.fb = {
 		.init_hw = gv11b_fb_init_hw,
diff --git a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h
index 1c2651d97..c9d0839d4 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h
@@ -535,6 +535,10 @@ struct gpu_ops {
 		void (*set_debug_mode)(struct gk20a *g, bool enable);
 		void (*dump_gr_falcon_stats)(struct gk20a *g);
 		u32 (*get_fecs_ctx_state_store_major_rev_id)(struct gk20a *g);
+		int (*alloc_gfxp_rtv_cb)(struct gk20a *g,
+			  struct nvgpu_gr_ctx *gr_ctx, struct vm_gk20a *vm);
+		void (*commit_gfxp_rtv_cb)(struct gk20a *g,
+			  struct nvgpu_gr_ctx *gr_ctx, bool patch);
 	} gr;
 	struct {
 		void (*init_hw)(struct gk20a *g);
diff --git a/drivers/gpu/nvgpu/include/nvgpu/hw/tu104/hw_gr_tu104.h b/drivers/gpu/nvgpu/include/nvgpu/hw/tu104/hw_gr_tu104.h
index 9fb9b7b93..83fa99889 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/hw/tu104/hw_gr_tu104.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/hw/tu104/hw_gr_tu104.h
@@ -2302,6 +2302,14 @@ static inline u32 gr_scc_rm_rtv_cb_size_div_256b_f(u32 v)
 {
 	return (v & 0x7fffU) << 0U;
 }
+static inline u32 gr_scc_rm_rtv_cb_size_div_256b_byte_granularity_v(void)
+{
+	return 0x00000100U; /* 256: bytes per size_div_256b unit */
+}
+static inline u32 gr_scc_rm_rtv_cb_size_div_256b_init_f(void)
+{
+	return 0x0U; /* presumably the field's init value - TODO confirm */
+}
 static inline u32 gr_scc_rm_rtv_cb_size_div_256b_default_f(void)
 {
 	return 0x800U;
@@ -2310,6 +2318,10 @@ static inline u32 gr_scc_rm_rtv_cb_size_div_256b_db_adder_f(void)
 {
 	return 0x0U;
 }
+static inline u32 gr_scc_rm_rtv_cb_size_div_256b_gfxp_adder_f(void)
+{
+	return 0x20U; /* 32 units added to the RTV CB size for GFXP */
+}
 static inline u32 gr_gpcs_gcc_rm_rtv_cb_base_r(void)
 {
 	return 0x00419034U;
diff --git a/drivers/gpu/nvgpu/tu104/gr_tu104.c b/drivers/gpu/nvgpu/tu104/gr_tu104.c
index ca2adafcc..59354d00f 100644
--- a/drivers/gpu/nvgpu/tu104/gr_tu104.c
+++ b/drivers/gpu/nvgpu/tu104/gr_tu104.c
@@ -200,7 +200,7 @@ clean_up:
 
 static void gr_tu104_commit_rtv_circular_buffer(struct gk20a *g,
 	struct nvgpu_gr_ctx *gr_ctx,
-	u64 addr, u32 size, bool patch)
+	u64 addr, u32 size, u32 gfxpAddSize, bool patch)
 {
 	gr_gk20a_ctx_patch_write(g, gr_ctx, gr_scc_rm_rtv_cb_base_r(),
 		gr_scc_rm_rtv_cb_base_addr_39_8_f(addr), patch);
@@ -209,7 +209,8 @@ static void gr_tu104_commit_rtv_circular_buffer(struct gk20a *g,
 	gr_gk20a_ctx_patch_write(g, gr_ctx, gr_gpcs_gcc_rm_rtv_cb_base_r(),
 		gr_gpcs_gcc_rm_rtv_cb_base_addr_39_8_f(addr), patch);
 	gr_gk20a_ctx_patch_write(g, gr_ctx, gr_scc_rm_gfxp_reserve_r(),
-		gr_scc_rm_gfxp_reserve_rtv_cb_size_div_256b_f(0), patch);
+		gr_scc_rm_gfxp_reserve_rtv_cb_size_div_256b_f(gfxpAddSize),
+		patch);
 }
 
 int gr_tu104_commit_global_ctx_buffers(struct gk20a *g,
@@ -218,6 +219,9 @@ int gr_tu104_commit_global_ctx_buffers(struct gk20a *g,
 	int err;
 	u64 addr;
 	u32 size;
+	u32 gfxpaddsize = 0;
+
+	nvgpu_log_fn(g, " ");
 
 	err = gr_gk20a_commit_global_ctx_buffers(g, gr_ctx, patch);
 	if (err != 0) {
@@ -239,7 +243,8 @@ int gr_tu104_commit_global_ctx_buffers(struct gk20a *g,
 	size = (gr_scc_rm_rtv_cb_size_div_256b_default_f() +
 			gr_scc_rm_rtv_cb_size_div_256b_db_adder_f());
 
-	gr_tu104_commit_rtv_circular_buffer(g, gr_ctx, addr, size, patch);
+	gr_tu104_commit_rtv_circular_buffer(g, gr_ctx, addr, size,
+						gfxpaddsize, patch);
 
 	if (patch) {
 		gr_gk20a_ctx_patch_write_end(g, gr_ctx, false);
@@ -248,6 +253,55 @@ int gr_tu104_commit_global_ctx_buffers(struct gk20a *g,
 	return 0;
 }
 
+/* Allocate the per-context GFXP RTV circular buffer; 0 on success. */
+int gr_tu104_alloc_gfxp_rtv_cb(struct gk20a *g,
+		  struct nvgpu_gr_ctx *gr_ctx, struct vm_gk20a *vm)
+{
+	u32 size_div_256b;
+	u32 size_bytes;
+
+	nvgpu_log_fn(g, " ");
+
+	/* Default RTV CB size plus the DB and GFXP adders, in 256B units. */
+	size_div_256b = gr_scc_rm_rtv_cb_size_div_256b_default_f() +
+		gr_scc_rm_rtv_cb_size_div_256b_db_adder_f() +
+		gr_scc_rm_rtv_cb_size_div_256b_gfxp_adder_f();
+	/* Scale the unit count to bytes for the buffer allocator. */
+	size_bytes = size_div_256b *
+		gr_scc_rm_rtv_cb_size_div_256b_byte_granularity_v();
+
+	return gr_gp10b_alloc_buffer(vm, size_bytes,
+				&gr_ctx->gfxp_rtvcb_ctxsw_buffer);
+}
+
+/*
+ * Program the per-context GFXP RTV circular buffer through the TU104 RTV CB
+ * commit helper, passing the GFXP adder as the GFXP reserve size.
+ */
+void gr_tu104_commit_gfxp_rtv_cb(struct gk20a *g,
+		  struct nvgpu_gr_ctx *gr_ctx, bool patch)
+{
+	const u32 align_bits = gr_scc_rm_rtv_cb_base_addr_39_8_align_bits_f();
+	const u64 gpu_va = gr_ctx->gfxp_rtvcb_ctxsw_buffer.gpu_va;
+	u32 gfxp_adder;
+	u32 size_div_256b;
+	u64 base;
+
+	nvgpu_log_fn(g, " ");
+
+	/* Sizes stay in the register's 256B units; no byte scaling here. */
+	gfxp_adder = gr_scc_rm_rtv_cb_size_div_256b_gfxp_adder_f();
+	size_div_256b = gr_scc_rm_rtv_cb_size_div_256b_default_f() +
+		gr_scc_rm_rtv_cb_size_div_256b_db_adder_f() + gfxp_adder;
+
+	/* Both halves are shifted as 32-bit values before being widened. */
+	base = (u64)(u64_lo32(gpu_va) >> align_bits) |
+		(u64)(u64_hi32(gpu_va) << (32U - align_bits));
+
+	gr_tu104_commit_rtv_circular_buffer(g, gr_ctx, base, size_div_256b,
+						gfxp_adder, patch);
+}
+
 void gr_tu104_bundle_cb_defaults(struct gk20a *g)
 {
 	struct gr_gk20a *gr = &g->gr;
@@ -270,6 +324,10 @@ void gr_tu104_cb_size_default(struct gk20a *g)
 	}
 	gr->alpha_cb_default_size =
 		gr_gpc0_ppc0_cbm_alpha_cb_size_v_default_v();
+	gr->attrib_cb_gfxp_default_size =
+		gr_gpc0_ppc0_cbm_beta_cb_size_v_gfxp_v();
+	gr->attrib_cb_gfxp_size =
+		gr_gpc0_ppc0_cbm_beta_cb_size_v_gfxp_v();
 }
 
 int gr_tu104_get_preemption_mode_flags(struct gk20a *g,
@@ -290,6 +348,18 @@ int gr_tu104_get_preemption_mode_flags(struct gk20a *g,
 	return 0;
 }
 
+/* Release the TU104 GFXP RTV CB, then do the common gk20a gr_ctx free. */
+void gr_tu104_free_gr_ctx(struct gk20a *g,
+			  struct vm_gk20a *vm, struct nvgpu_gr_ctx *gr_ctx)
+{
+	nvgpu_log_fn(g, " ");
+
+	if (gr_ctx != NULL) {
+		nvgpu_dma_unmap_free(vm, &gr_ctx->gfxp_rtvcb_ctxsw_buffer);
+	}
+	gr_gk20a_free_gr_ctx(g, vm, gr_ctx); /* runs for NULL gr_ctx too */
+}
+
 void gr_tu104_enable_gpc_exceptions(struct gk20a *g)
 {
 	struct gr_gk20a *gr = &g->gr;
diff --git a/drivers/gpu/nvgpu/tu104/gr_tu104.h b/drivers/gpu/nvgpu/tu104/gr_tu104.h
index 39c0f0db2..5ac378b56 100644
--- a/drivers/gpu/nvgpu/tu104/gr_tu104.h
+++ b/drivers/gpu/nvgpu/tu104/gr_tu104.h
@@ -27,6 +27,7 @@
 
 struct gk20a;
 struct nvgpu_preemption_modes_rec;
+struct nvgpu_gr_ctx;
 
 enum {
 	TURING_CHANNEL_GPFIFO_A	= 0xC46F,
@@ -55,10 +56,6 @@ enum {
 #define NVC597_SET_SM_DISP_CTRL                 0x10c8U
 #define NVC597_SET_SHADER_CUT_COLLECTOR         0x10d0U
 
-/* TODO: merge these into global context buffer list in gr_gk20a.h */
-#define RTV_CIRCULAR_BUFFER		8
-#define RTV_CIRCULAR_BUFFER_VA		5
-
 bool gr_tu104_is_valid_class(struct gk20a *g, u32 class_num);
 bool gr_tu104_is_valid_gfx_class(struct gk20a *g, u32 class_num);
 bool gr_tu104_is_valid_compute_class(struct gk20a *g, u32 class_num);
@@ -88,6 +85,15 @@ int gr_tu104_get_offset_in_gpccs_segment(struct gk20a *g,
 int gr_tu104_handle_sw_method(struct gk20a *g, u32 addr,
 			      u32 class_num, u32 offset, u32 data);
 
+int gr_tu104_alloc_gfxp_rtv_cb(struct gk20a *g,
+		  struct nvgpu_gr_ctx *gr_ctx, struct vm_gk20a *vm);
+void gr_tu104_commit_gfxp_rtv_cb(struct gk20a *g,
+		  struct nvgpu_gr_ctx *gr_ctx, bool patch);
+
+void gr_tu104_free_gr_ctx(struct gk20a *g,
+			struct vm_gk20a *vm,
+			struct nvgpu_gr_ctx *gr_ctx);
+
 void gr_tu104_init_sm_dsm_reg_info(void);
 void gr_tu104_get_sm_dsm_perf_ctrl_regs(struct gk20a *g,
 	u32 *num_sm_dsm_perf_ctrl_regs, u32 **sm_dsm_perf_ctrl_regs,
diff --git a/drivers/gpu/nvgpu/tu104/hal_tu104.c b/drivers/gpu/nvgpu/tu104/hal_tu104.c
index 95c403032..a06a92c6d 100644
--- a/drivers/gpu/nvgpu/tu104/hal_tu104.c
+++ b/drivers/gpu/nvgpu/tu104/hal_tu104.c
@@ -387,9 +387,9 @@ static const struct gpu_ops tu104_ops = {
 		.pagepool_default_size = gr_gv11b_pagepool_default_size,
 		.init_ctx_state = gr_gp10b_init_ctx_state,
 		.alloc_gr_ctx = gr_gp10b_alloc_gr_ctx,
-		.free_gr_ctx = gr_gk20a_free_gr_ctx,
+		.free_gr_ctx = gr_tu104_free_gr_ctx,
 		.update_ctxsw_preemption_mode =
-			gr_gp10b_update_ctxsw_preemption_mode,
+			gr_gv11b_update_ctxsw_preemption_mode,
 		.dump_gr_regs = gr_gv11b_dump_gr_status_regs,
 		.update_pc_sampling = gr_gm20b_update_pc_sampling,
 		.get_fbp_en_mask = gr_gm20b_get_fbp_en_mask,
@@ -458,11 +458,13 @@ static const struct gpu_ops tu104_ops = {
 		.set_czf_bypass = NULL,
 		.pre_process_sm_exception = gr_gv11b_pre_process_sm_exception,
 		.set_preemption_buffer_va = gr_gv11b_set_preemption_buffer_va,
-		.init_preemption_state = NULL,
+		.init_preemption_state = gr_gv11b_init_preemption_state,
 		.update_boosted_ctx = gr_gp10b_update_boosted_ctx,
 		.set_bes_crop_debug3 = gr_gp10b_set_bes_crop_debug3,
 		.set_bes_crop_debug4 = gr_gp10b_set_bes_crop_debug4,
 		.init_ecc = tu104_ecc_init,
+		.alloc_gfxp_rtv_cb = gr_tu104_alloc_gfxp_rtv_cb,
+		.commit_gfxp_rtv_cb = gr_tu104_commit_gfxp_rtv_cb,
 		.set_ctxsw_preemption_mode = gr_gp10b_set_ctxsw_preemption_mode,
 		.is_etpc_addr = gv11b_gr_pri_is_etpc_addr,
 		.egpc_etpc_priv_addr_table = gv11b_gr_egpc_etpc_priv_addr_table,
@@ -485,6 +487,12 @@ static const struct gpu_ops tu104_ops = {
 		.handle_tpc_sm_ecc_exception =
 			gr_gv11b_handle_tpc_sm_ecc_exception,
 		.decode_egpc_addr = gv11b_gr_decode_egpc_addr,
+		.init_ctxsw_hdr_data = gr_gp10b_init_ctxsw_hdr_data,
+		.init_gfxp_wfi_timeout_count =
+				gr_gv11b_init_gfxp_wfi_timeout_count,
+		.get_max_gfxp_wfi_timeout_count =
+			gr_gv11b_get_max_gfxp_wfi_timeout_count,
+		.dump_ctxsw_stats = gr_gp10b_dump_ctxsw_stats,
 		.fecs_host_int_enable = gr_gv11b_fecs_host_int_enable,
 		.handle_ssync_hww = gr_gv11b_handle_ssync_hww,
 		.handle_notify_pending = gk20a_gr_handle_notify_pending,