diff --git a/drivers/gpu/nvgpu/Makefile b/drivers/gpu/nvgpu/Makefile index 4ea56d8ef..c059e464f 100644 --- a/drivers/gpu/nvgpu/Makefile +++ b/drivers/gpu/nvgpu/Makefile @@ -13,6 +13,7 @@ nvgpu-y += \ $(nvgpu-t19x)/gv11b/ce_gv11b.o \ $(nvgpu-t19x)/gv11b/gr_ctx_gv11b.o \ $(nvgpu-t19x)/gv11b/pmu_gv11b.o \ - $(nvgpu-t19x)/gv11b/therm_gv11b.o + $(nvgpu-t19x)/gv11b/therm_gv11b.o \ + $(nvgpu-t19x)/gv11b/subctx_gv11b.o nvgpu-$(CONFIG_TEGRA_GK20A) += $(nvgpu-t19x)/gv11b/platform_gv11b_tegra.o diff --git a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c index bc413a9ee..b9276e094 100644 --- a/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c +++ b/drivers/gpu/nvgpu/gv11b/fifo_gv11b.c @@ -19,6 +19,8 @@ #include "gp10b/fifo_gp10b.h" #include "hw_pbdma_gv11b.h" #include "fifo_gv11b.h" +#include "subctx_gv11b.h" +#include "gr_gv11b.h" #include "hw_fifo_gv11b.h" #include "hw_ram_gv11b.h" #include "hw_ccsr_gv11b.h" @@ -200,6 +202,15 @@ static void gv11b_userd_gp_put(struct gk20a *g, struct channel_gk20a *c) } +static void channel_gv11b_unbind(struct channel_gk20a *ch) +{ + gk20a_dbg_fn(""); + + gv11b_free_subctx_header(ch); + + channel_gk20a_unbind(ch); + +} static u32 gv11b_fifo_get_num_fifos(struct gk20a *g) { @@ -218,4 +229,5 @@ void gv11b_init_fifo(struct gpu_ops *gops) gops->fifo.userd_gp_get = gv11b_userd_gp_get; gops->fifo.userd_gp_put = gv11b_userd_gp_put; gops->fifo.setup_ramfc = channel_gv11b_setup_ramfc; + gops->fifo.unbind_channel = channel_gv11b_unbind; } diff --git a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c index 7f5b8d3f1..bdb96329d 100644 --- a/drivers/gpu/nvgpu/gv11b/gr_gv11b.c +++ b/drivers/gpu/nvgpu/gv11b/gr_gv11b.c @@ -13,6 +13,7 @@ * more details. 
*/ +#include #include "gk20a/gk20a.h" /* FERMI and MAXWELL classes defined here */ #include #include @@ -24,12 +25,16 @@ #include "gm20b/gr_gm20b.h" #include "gv11b/gr_gv11b.h" +#include "gv11b/mm_gv11b.h" +#include "gv11b/subctx_gv11b.h" #include "hw_gr_gv11b.h" #include "hw_fifo_gv11b.h" #include "hw_proj_gv11b.h" #include "hw_ctxsw_prog_gv11b.h" #include "hw_mc_gv11b.h" #include "hw_gr_gv11b.h" +#include "hw_ram_gv11b.h" +#include "hw_pbdma_gv11b.h" #include #include @@ -1583,7 +1588,6 @@ static int gr_gv11b_setup_rop_mapping(struct gk20a *g, struct gr_gk20a *gr) return 0; } - static void gv11b_write_bundle_veid_state(struct gk20a *g, u32 index) { struct av_list_gk20a *sw_veid_bundle_init = @@ -1766,12 +1770,43 @@ static int gr_gv11b_load_smid_config(struct gk20a *g) for (i = 0; i < gr_cwd_sm_id__size_1_v(); i++) gk20a_writel(g, gr_cwd_sm_id_r(i), tpc_sm_id[i]); - kfree(tpc_sm_id); return 0; } +static int gr_gv11b_commit_inst(struct channel_gk20a *c, u64 gpu_va) +{ + u32 addr_lo; + u32 addr_hi; + struct ctx_header_desc *ctx; + + gk20a_dbg_fn(""); + + gv11b_alloc_subctx_header(c); + + gv11b_update_subctx_header(c, gpu_va); + + ctx = &c->ch_ctx.ctx_header; + addr_lo = u64_lo32(ctx->mem.gpu_va) >> ram_in_base_shift_v(); + addr_hi = u64_hi32(ctx->mem.gpu_va); + + /* point this address to engine_wfi_ptr */ + gk20a_mem_wr32(c->g, &c->inst_block, ram_in_engine_wfi_target_w(), + ram_in_engine_cs_wfi_v() | + ram_in_engine_wfi_target_f( + ram_in_engine_wfi_target_sys_mem_ncoh_v()) | + ram_in_engine_wfi_mode_f(ram_in_engine_wfi_mode_virtual_v()) | + ram_in_engine_wfi_ptr_lo_f(addr_lo)); + + gk20a_mem_wr32(c->g, &c->inst_block, ram_in_engine_wfi_ptr_hi_w(), + ram_in_engine_wfi_ptr_hi_f(addr_hi)); + + return 0; +} + + + static int gr_gv11b_commit_global_timeslice(struct gk20a *g, struct channel_gk20a *c, bool patch) { @@ -1828,6 +1863,7 @@ static int gr_gv11b_commit_global_timeslice(struct gk20a *g, void gv11b_init_gr(struct gpu_ops *gops) { gp10b_init_gr(gops); + 
gops->gr.init_preemption_state = NULL; gops->gr.init_fs_state = gr_gv11b_init_fs_state; gops->gr.detect_sm_arch = gr_gv11b_detect_sm_arch; gops->gr.is_valid_class = gr_gv11b_is_valid_class; @@ -1872,4 +1908,6 @@ void gv11b_init_gr(struct gpu_ops *gops) gops->gr.load_smid_config = gr_gv11b_load_smid_config; gops->gr.program_sm_id_numbering = gr_gv11b_program_sm_id_numbering; + gops->gr.commit_inst = gr_gv11b_commit_inst; + } diff --git a/drivers/gpu/nvgpu/gv11b/subctx_gv11b.c b/drivers/gpu/nvgpu/gv11b/subctx_gv11b.c new file mode 100644 index 000000000..3acc53f6e --- /dev/null +++ b/drivers/gpu/nvgpu/gv11b/subctx_gv11b.c @@ -0,0 +1,147 @@ +/* + * Volta GPU series Subcontext + * + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. 
+ */ + +#include "gk20a/gk20a.h" +#include "gk20a/semaphore_gk20a.h" +#include "gv11b/subctx_gv11b.h" +#include "gv11b/hw_ram_gv11b.h" +#include "gv11b/hw_ctxsw_prog_gv11b.h" + +static void gv11b_init_subcontext_pdb(struct channel_gk20a *c, + struct mem_desc *inst_block); + +void gv11b_free_subctx_header(struct channel_gk20a *c) +{ + struct ctx_header_desc *ctx = &c->ch_ctx.ctx_header; + struct gk20a *g = c->g; + + gk20a_dbg_fn(""); + + if (ctx->mem.gpu_va) { + gk20a_gmmu_unmap(c->vm, ctx->mem.gpu_va, + ctx->mem.size, gk20a_mem_flag_none); + + gk20a_gmmu_free_attr(g, DMA_ATTR_NO_KERNEL_MAPPING, &ctx->mem); + } +} + +int gv11b_alloc_subctx_header(struct channel_gk20a *c) +{ + struct ctx_header_desc *ctx = &c->ch_ctx.ctx_header; + struct gk20a *g = c->g; + int ret = 0; + + gk20a_dbg_fn(""); + + if (ctx->mem.gpu_va == 0) { + ret = gk20a_gmmu_alloc_attr_sys(g, + DMA_ATTR_NO_KERNEL_MAPPING, + ctxsw_prog_fecs_header_v(), + &ctx->mem); + if (ret) { + gk20a_err(dev_from_gk20a(g), + "failed to allocate sub ctx header"); + return ret; + } + ctx->mem.gpu_va = gk20a_gmmu_map(c->vm, + &ctx->mem.sgt, + ctx->mem.size, + NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE, + gk20a_mem_flag_none, true, + ctx->mem.aperture); + if (!ctx->mem.gpu_va) { + gk20a_err(dev_from_gk20a(g), + "failed to map ctx header"); + gk20a_gmmu_free_attr(g, DMA_ATTR_NO_KERNEL_MAPPING, + &ctx->mem); + return -ENOMEM; + } + /* Now clear the buffer */ + if (gk20a_mem_begin(g, &ctx->mem)) + return -ENOMEM; + + gk20a_memset(g, &ctx->mem, 0, 0, ctx->mem.size); + gk20a_mem_end(g, &ctx->mem); + + gv11b_init_subcontext_pdb(c, &c->inst_block); + + } + return ret; +} + +static void gv11b_init_subcontext_pdb(struct channel_gk20a *c, + struct mem_desc *inst_block) +{ + struct gk20a *g = c->g; + struct vm_gk20a *vm; + u64 pdb_addr, pdb_addr_lo, pdb_addr_hi; + u32 format_word; + u32 lo, hi; + + gk20a_dbg_fn(""); + /* load main pdb as veid0 pdb also */ + vm = c->vm; + pdb_addr = g->ops.mm.get_iova_addr(g, vm->pdb.mem.sgt->sgl, 0); + 
pdb_addr_lo = u64_lo32(pdb_addr >> ram_in_base_shift_v()); + pdb_addr_hi = u64_hi32(pdb_addr); + format_word = ram_in_sc_page_dir_base_target_f( + ram_in_sc_page_dir_base_target_sys_mem_ncoh_v(), 0) | + ram_in_sc_page_dir_base_vol_f( + ram_in_sc_page_dir_base_vol_true_v(), 0) | + ram_in_sc_page_dir_base_fault_replay_tex_f(0, 0) | + ram_in_sc_page_dir_base_fault_replay_gcc_f(0, 0) | + ram_in_sc_use_ver2_pt_format_f(1, 0) | + ram_in_sc_big_page_size_f(1, 0) | + ram_in_sc_page_dir_base_lo_0_f(pdb_addr_lo); + lo = ram_in_sc_page_dir_base_vol_0_w(); + hi = ram_in_sc_page_dir_base_hi_0_w(); + gk20a_mem_wr32(g, inst_block, lo, format_word); + gk20a_mem_wr32(g, inst_block, hi, pdb_addr_hi); + + /* mark subcontext0 address space as valid */ + /* TODO fix proper hw register definitions */ + gk20a_mem_wr32(g, inst_block, 166, 0x1); + gk20a_mem_wr32(g, inst_block, 167, 0); + gk20a_mem_wr32(g, inst_block, ram_in_engine_wfi_veid_w(), + ram_in_engine_wfi_veid_f(0)); + +} + +int gv11b_update_subctx_header(struct channel_gk20a *c, u64 gpu_va) +{ + struct ctx_header_desc *ctx = &c->ch_ctx.ctx_header; + struct mem_desc *gr_mem; + struct gk20a *g = c->g; + int ret = 0; + u32 addr_lo, addr_hi; + + addr_lo = u64_lo32(gpu_va); + addr_hi = u64_hi32(gpu_va); + + gr_mem = &ctx->mem; + g->ops.mm.l2_flush(g, true); + if (gk20a_mem_begin(g, gr_mem)) + return -ENOMEM; + + gk20a_mem_wr(g, gr_mem, + ctxsw_prog_main_image_context_buffer_ptr_hi_o(), addr_hi); + gk20a_mem_wr(g, gr_mem, + ctxsw_prog_main_image_context_buffer_ptr_o(), addr_lo); + gk20a_mem_end(g, gr_mem); + return ret; +} diff --git a/drivers/gpu/nvgpu/gv11b/subctx_gv11b.h b/drivers/gpu/nvgpu/gv11b/subctx_gv11b.h new file mode 100644 index 000000000..357cd2540 --- /dev/null +++ b/drivers/gpu/nvgpu/gv11b/subctx_gv11b.h @@ -0,0 +1,27 @@ +/* + * + * Volta GPU series Subcontext + * + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. + */ +#ifndef __SUBCONTEXT_GV11B_H__ +#define __SUBCONTEXT_GV11B_H__ + +int gv11b_alloc_subctx_header(struct channel_gk20a *c); + +void gv11b_free_subctx_header(struct channel_gk20a *c); + +int gv11b_update_subctx_header(struct channel_gk20a *c, u64 gpu_va); +#endif /* __SUBCONTEXT_GV11B_H__ */