gpu: nvgpu: support for non-secure/secure ctxsw loading

Code for secure/non-secure ctxsw booting was spread across gr_gk20a.c
and gr_gm20b.c. With this change, that code is moved to the gr falcon unit.

Ctxsw loading is now supported with two common functions:
1. Non-secure boot:
   int nvgpu_gr_falcon_load_ctxsw_ucode(struct gk20a *g);
2. Secure boot:
   int nvgpu_gr_falcon_load_secure_ctxsw_ucode(struct gk20a *g);

The gr ops function "int (*load_ctxsw_ucode)(struct gk20a *g);" is moved to
gr falcon ops, and each chip HAL now sets it to the secure or non-secure
boot function (condensed sketch below).
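For illustration, how a chip HAL ends up selecting the loader (this is the net
effect of the gm20b/gp10b/gv11b HAL changes in this patch, not a complete init
function):

	if (nvgpu_is_enabled(g, NVGPU_SEC_PRIVSECURITY)) {
		gops->gr.falcon.load_ctxsw_ucode =
			nvgpu_gr_falcon_load_secure_ctxsw_ucode;
	} else {
		gops->gr.falcon.load_ctxsw_ucode =
			nvgpu_gr_falcon_load_ctxsw_ucode;
	}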

Non-secure booting: nvgpu_gr_falcon_load_ctxsw_ucode supports ctxsw loading
in two ways: bit-banging the ucode into falcon memory, or booting with the
bootloader.

A. Common and hal functions for non-secure bit-banging ctxsw loading:
Common: static void nvgpu_gr_falcon_load_dmem(struct gk20a *g) ->
Hals: void (*load_gpccs_dmem)(struct gk20a *g,
			 const u32 *ucode_u32_data, u32 size);
      void (*load_fecs_dmem)(struct gk20a *g,
			const u32 *ucode_u32_data, u32 size);
Common: static void nvgpu_gr_falcon_load_imem(struct gk20a *g) ->
Hals:  void (*load_gpccs_imem)(struct gk20a *g,
			 const u32 *ucode_u32_data, u32 size);
       void (*load_fecs_imem)(struct gk20a *g,
			const u32 *ucode_u32_data, u32 size);
Other basic HALs:
void (*configure_fmodel)(struct gk20a *g); -> configure fmodel for ctxsw loading
void (*start_ucode)(struct gk20a *g);  -> start running ctxsw ucode
(A condensed sketch of this bit-banging path follows.)
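Simplified from the bit-banging branch of nvgpu_gr_falcon_load_ctxsw_ucode
added in this patch:

	if (!nvgpu_is_enabled(g, NVGPU_GR_USE_DMA_FOR_FW_BOOTSTRAP)) {
		/* bit-bang ucode words through the falcon dmem/imem ports */
		nvgpu_gr_falcon_load_dmem(g);	/* -> load_gpccs_dmem/load_fecs_dmem */
		nvgpu_gr_falcon_load_imem(g);	/* -> load_gpccs_imem/load_fecs_imem */
		g->ops.gr.falcon.start_ucode(g);
	}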

B. Common and hal functions for non-secure ctxsw loading with the bootloader
(a condensed sketch follows this list).
First the ctxsw ucode is set up with nvgpu_gr_falcon_init_ctxsw_ucode, then:
Common: static void nvgpu_gr_falcon_load_with_bootloader(struct gk20a *g)
        void nvgpu_gr_falcon_bind_instblk(struct gk20a *g) ->
Hal: void (*bind_instblk)(struct gk20a *g, struct nvgpu_mem *mem, u64 inst_ptr);

Common: nvgpu_gr_falcon_load_ctxsw_ucode_segments ->
		nvgpu_gr_falcon_load_ctxsw_ucode_header ->
		nvgpu_gr_falcon_load_ctxsw_ucode_boot for both fecs and gpccs ->
Hals: void (*load_ctxsw_ucode_header)(struct gk20a *g, u32 reg_offset,
	u32 boot_signature, u32 addr_code32, u32 addr_data32,
	u32 code_size, u32 data_size);
void (*load_ctxsw_ucode_boot)(struct gk20a *g, u64 reg_offset, u32 boot_entry,
	u32 addr_load32, u32 blocks, u32 dst);
Other basic HAL to get gpccs start offset:
  u32 (*get_gpccs_start_reg_offset)(void);
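Simplified from nvgpu_gr_falcon_load_with_bootloader in this patch, where
addr_base is the GPU VA of the ctxsw ucode surface:

	nvgpu_gr_falcon_bind_instblk(g);	/* -> bind_instblk HAL */
	nvgpu_gr_falcon_load_ctxsw_ucode_segments(g, addr_base,
			&g->ctxsw_ucode_info.fecs, 0);
	nvgpu_gr_falcon_load_ctxsw_ucode_segments(g, addr_base,
			&g->ctxsw_ucode_info.gpccs,
			g->ops.gr.falcon.get_gpccs_start_reg_offset());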

C. Secure booting is supported with gpmu and acr, together with the following
additional common function in gr falcon (an outline of the cold-boot flow
follows this list).
static void nvgpu_gr_falcon_load_gpccs_with_bootloader(struct gk20a *g) ->
  nvgpu_gr_falcon_bind_instblk and  nvgpu_gr_falcon_load_ctxsw_ucode_segments
Additional basic hals:
void (*start_gpccs)(struct gk20a *g);
void (*start_fecs)(struct gk20a *g);
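Outline of the cold-boot case, simplified from
nvgpu_gr_falcon_load_secure_ctxsw_ucode in this patch:

	if (!nvgpu_is_enabled(g, NVGPU_SEC_SECUREGPCCS)) {
		nvgpu_gr_falcon_load_gpccs_with_bootloader(g);
	} else {
		/* bind WPR VA inst block, then bootstrap FECS/GPCCS
		 * through SEC2 RTOS, LS PMU or HS ACR */
		nvgpu_gr_falcon_bind_instblk(g);
	}
	g->ops.gr.falcon.start_gpccs(g);
	g->ops.gr.falcon.start_fecs(g);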

The following op is removed from gr ops, since chip hals no longer need to set it:
void (*falcon_load_ucode)(struct gk20a *g, u64 addr_base,
	struct gk20a_ctxsw_ucode_segments *segments, u32 reg_offset);

Its work is now handled by a static common function:
static int nvgpu_gr_falcon_copy_ctxsw_ucode_segments(struct gk20a *g,
	struct nvgpu_mem *dst, struct gk20a_ctxsw_ucode_segments *segments,
	u32 *bootimage, u32 *code, u32 *data)
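The body of this helper is not part of the hunks shown below; as a rough
sketch it copies the three blobs into the ucode surface, along the lines of
the pre-existing gk20a code (the nvgpu_mem_wr_n usage is an assumption):

	/* hypothetical sketch only */
	nvgpu_mem_wr_n(g, dst, segments->boot.offset, bootimage,
			segments->boot.size);
	nvgpu_mem_wr_n(g, dst, segments->code.offset, code, segments->code.size);
	nvgpu_mem_wr_n(g, dst, segments->data.offset, data, segments->data.size);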

JIRA NVGPU-1881

Change-Id: I895a03faaf1a21286316befde24765c8b55075cf
Signed-off-by: Seshendra Gadagottu <sgadagottu@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/2083388
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Author: Seshendra Gadagottu
Date: 2019-03-27 18:56:51 -07:00
Committed by: mobile promotions
Parent: b7835b5ead
Commit: 0f1726ae1f
16 changed files with 788 additions and 580 deletions

@@ -195,3 +195,237 @@ clean_up:
return err;
}
static void nvgpu_gr_falcon_load_dmem(struct gk20a *g)
{
u32 ucode_u32_size;
const u32 *ucode_u32_data;
nvgpu_log_fn(g, " ");
ucode_u32_size = g->netlist_vars->ucode.gpccs.data.count;
ucode_u32_data = (const u32 *)g->netlist_vars->ucode.gpccs.data.l;
g->ops.gr.falcon.load_gpccs_dmem(g, ucode_u32_data, ucode_u32_size);
ucode_u32_size = g->netlist_vars->ucode.fecs.data.count;
ucode_u32_data = (const u32 *)g->netlist_vars->ucode.fecs.data.l;
g->ops.gr.falcon.load_fecs_dmem(g, ucode_u32_data, ucode_u32_size);
nvgpu_log_fn(g, "done");
}
static void nvgpu_gr_falcon_load_imem(struct gk20a *g)
{
u32 ucode_u32_size;
const u32 *ucode_u32_data;
nvgpu_log_fn(g, " ");
ucode_u32_size = g->netlist_vars->ucode.gpccs.inst.count;
ucode_u32_data = (const u32 *)g->netlist_vars->ucode.gpccs.inst.l;
g->ops.gr.falcon.load_gpccs_imem(g, ucode_u32_data, ucode_u32_size);
ucode_u32_size = g->netlist_vars->ucode.fecs.inst.count;
ucode_u32_data = (const u32 *)g->netlist_vars->ucode.fecs.inst.l;
g->ops.gr.falcon.load_fecs_imem(g, ucode_u32_data, ucode_u32_size);
nvgpu_log_fn(g, "done");
}
static void nvgpu_gr_falcon_bind_instblk(struct gk20a *g)
{
struct gk20a_ctxsw_ucode_info *ucode_info = &g->ctxsw_ucode_info;
u64 inst_ptr;
inst_ptr = nvgpu_inst_block_addr(g, &ucode_info->inst_blk_desc);
g->ops.gr.falcon.bind_instblk(g, &ucode_info->inst_blk_desc,
inst_ptr);
}
static void nvgpu_gr_falcon_load_ctxsw_ucode_header(struct gk20a *g,
u64 addr_base, struct gk20a_ctxsw_ucode_segments *segments,
u32 reg_offset)
{
u32 addr_code32 = u64_lo32((addr_base + segments->code.offset) >> 8);
u32 addr_data32 = u64_lo32((addr_base + segments->data.offset) >> 8);
g->ops.gr.falcon.load_ctxsw_ucode_header(g, reg_offset,
segments->boot_signature, addr_code32, addr_data32,
segments->code.size, segments->data.size);
}
static void nvgpu_gr_falcon_load_ctxsw_ucode_boot(struct gk20a *g,
u64 addr_base, struct gk20a_ctxsw_ucode_segments *segments,
u32 reg_offset)
{
u32 addr_load32 = u64_lo32((addr_base + segments->boot.offset) >> 8);
u32 blocks = ((segments->boot.size + 0xFFU) & ~0xFFU) >> 8;
u32 dst = segments->boot_imem_offset;
g->ops.gr.falcon.load_ctxsw_ucode_boot(g, reg_offset,
segments->boot_entry, addr_load32, blocks, dst);
}
static void nvgpu_gr_falcon_load_ctxsw_ucode_segments(
struct gk20a *g, u64 addr_base,
struct gk20a_ctxsw_ucode_segments *segments, u32 reg_offset)
{
/* Copy falcon bootloader into dmem */
nvgpu_gr_falcon_load_ctxsw_ucode_header(g, addr_base,
segments, reg_offset);
nvgpu_gr_falcon_load_ctxsw_ucode_boot(g,
addr_base, segments, reg_offset);
}
static void nvgpu_gr_falcon_load_with_bootloader(struct gk20a *g)
{
struct gk20a_ctxsw_ucode_info *ucode_info = &g->ctxsw_ucode_info;
u64 addr_base = ucode_info->surface_desc.gpu_va;
nvgpu_gr_falcon_bind_instblk(g);
nvgpu_gr_falcon_load_ctxsw_ucode_segments(g, addr_base,
&g->ctxsw_ucode_info.fecs, 0);
nvgpu_gr_falcon_load_ctxsw_ucode_segments(g, addr_base,
&g->ctxsw_ucode_info.gpccs,
g->ops.gr.falcon.get_gpccs_start_reg_offset());
}
int nvgpu_gr_falcon_load_ctxsw_ucode(struct gk20a *g)
{
int err;
nvgpu_log_fn(g, " ");
if (nvgpu_is_enabled(g, NVGPU_IS_FMODEL)) {
g->ops.gr.falcon.configure_fmodel(g);
}
/*
* In case bootloader is not supported, revert to the old way of
* loading gr ucode, without the faster bootstrap routine.
*/
if (!nvgpu_is_enabled(g, NVGPU_GR_USE_DMA_FOR_FW_BOOTSTRAP)) {
nvgpu_gr_falcon_load_dmem(g);
nvgpu_gr_falcon_load_imem(g);
g->ops.gr.falcon.start_ucode(g);
} else {
if (!g->gr.skip_ucode_init) {
err = nvgpu_gr_falcon_init_ctxsw_ucode(g);
if (err != 0) {
return err;
}
}
nvgpu_gr_falcon_load_with_bootloader(g);
g->gr.skip_ucode_init = true;
}
nvgpu_log_fn(g, "done");
return 0;
}
static void nvgpu_gr_falcon_load_gpccs_with_bootloader(struct gk20a *g)
{
struct gk20a_ctxsw_ucode_info *ucode_info = &g->ctxsw_ucode_info;
u64 addr_base = ucode_info->surface_desc.gpu_va;
nvgpu_gr_falcon_bind_instblk(g);
nvgpu_gr_falcon_load_ctxsw_ucode_segments(g, addr_base,
&g->ctxsw_ucode_info.gpccs,
g->ops.gr.falcon.get_gpccs_start_reg_offset());
}
int nvgpu_gr_falcon_load_secure_ctxsw_ucode(struct gk20a *g)
{
int err = 0;
u8 falcon_id_mask = 0;
nvgpu_log_fn(g, " ");
if (nvgpu_is_enabled(g, NVGPU_IS_FMODEL)) {
g->ops.gr.falcon.configure_fmodel(g);
}
g->pmu_lsf_loaded_falcon_id = 0;
if (nvgpu_is_enabled(g, NVGPU_PMU_FECS_BOOTSTRAP_DONE)) {
/* this must be recovery so bootstrap fecs and gpccs */
if (!nvgpu_is_enabled(g, NVGPU_SEC_SECUREGPCCS)) {
nvgpu_gr_falcon_load_gpccs_with_bootloader(g);
err = g->ops.pmu.load_lsfalcon_ucode(g,
BIT32(FALCON_ID_FECS));
} else {
/* bind WPR VA inst block */
nvgpu_gr_falcon_bind_instblk(g);
if (nvgpu_is_enabled(g, NVGPU_SUPPORT_SEC2_RTOS)) {
err = nvgpu_sec2_bootstrap_ls_falcons(g,
&g->sec2, FALCON_ID_FECS);
err = nvgpu_sec2_bootstrap_ls_falcons(g,
&g->sec2, FALCON_ID_GPCCS);
} else if (g->support_ls_pmu) {
err = g->ops.pmu.load_lsfalcon_ucode(g,
BIT32(FALCON_ID_FECS) |
BIT32(FALCON_ID_GPCCS));
} else {
err = nvgpu_acr_bootstrap_hs_acr(g, g->acr);
if (err != 0) {
nvgpu_err(g,
"ACR GR LSF bootstrap failed");
}
}
}
if (err != 0) {
nvgpu_err(g, "Unable to recover GR falcon");
return err;
}
} else {
/* cold boot or rg exit */
nvgpu_set_enabled(g, NVGPU_PMU_FECS_BOOTSTRAP_DONE, true);
if (!nvgpu_is_enabled(g, NVGPU_SEC_SECUREGPCCS)) {
nvgpu_gr_falcon_load_gpccs_with_bootloader(g);
} else {
/* bind WPR VA inst block */
nvgpu_gr_falcon_bind_instblk(g);
if (nvgpu_acr_is_lsf_lazy_bootstrap(g, g->acr,
FALCON_ID_FECS)) {
falcon_id_mask |= BIT8(FALCON_ID_FECS);
}
if (nvgpu_acr_is_lsf_lazy_bootstrap(g, g->acr,
FALCON_ID_GPCCS)) {
falcon_id_mask |= BIT8(FALCON_ID_GPCCS);
}
if (nvgpu_is_enabled(g, NVGPU_SUPPORT_SEC2_RTOS)) {
err = nvgpu_sec2_bootstrap_ls_falcons(g,
&g->sec2, FALCON_ID_FECS);
err = nvgpu_sec2_bootstrap_ls_falcons(g,
&g->sec2, FALCON_ID_GPCCS);
} else if (g->support_ls_pmu) {
err = g->ops.pmu.load_lsfalcon_ucode(g,
falcon_id_mask);
} else {
/* GR falcons bootstrapped by ACR */
err = 0;
}
if (err != 0) {
nvgpu_err(g, "Unable to boot GPCCS");
return err;
}
}
}
g->ops.gr.falcon.start_gpccs(g);
g->ops.gr.falcon.start_fecs(g);
nvgpu_log_fn(g, "done");
return 0;
}

@@ -127,8 +127,6 @@ static const struct gpu_ops vgpu_gp10b_ops = {
.get_sm_dsm_perf_regs = gr_gm20b_get_sm_dsm_perf_regs,
.get_sm_dsm_perf_ctrl_regs = gr_gm20b_get_sm_dsm_perf_ctrl_regs,
.set_hww_esr_report_mask = NULL,
.falcon_load_ucode = NULL,
.load_ctxsw_ucode = NULL,
.set_gpc_tpc_mask = NULL,
.alloc_obj_ctx = vgpu_gr_alloc_obj_ctx,
.is_tpc_addr = gr_gm20b_is_tpc_addr,
@@ -294,6 +292,9 @@ static const struct gpu_ops vgpu_gp10b_ops = {
.get_zcull_info = vgpu_gr_get_zcull_info,
.program_zcull_mapping = NULL,
},
.falcon = {
.load_ctxsw_ucode = NULL,
},
#ifdef CONFIG_GK20A_CTXSW_TRACE
.fecs_trace = {
.alloc_user_buffer = vgpu_alloc_user_buffer,

@@ -148,8 +148,6 @@ static const struct gpu_ops vgpu_gv11b_ops = {
.get_sm_dsm_perf_regs = gv11b_gr_get_sm_dsm_perf_regs,
.get_sm_dsm_perf_ctrl_regs = gv11b_gr_get_sm_dsm_perf_ctrl_regs,
.set_hww_esr_report_mask = NULL,
.falcon_load_ucode = NULL,
.load_ctxsw_ucode = NULL,
.set_gpc_tpc_mask = NULL,
.alloc_obj_ctx = vgpu_gr_alloc_obj_ctx,
.is_tpc_addr = gr_gm20b_is_tpc_addr,
@@ -344,6 +342,9 @@ static const struct gpu_ops vgpu_gv11b_ops = {
.align_regs_perf_pma =
gv100_gr_hwpm_map_align_regs_perf_pma,
},
.falcon = {
.load_ctxsw_ucode = NULL,
},
#ifdef CONFIG_GK20A_CTXSW_TRACE
.fecs_trace = {
.alloc_user_buffer = vgpu_alloc_user_buffer,

@@ -69,12 +69,8 @@
#include <nvgpu/hw/gk20a/hw_gr_gk20a.h>
#include <nvgpu/hw/gk20a/hw_ram_gk20a.h>
#define BLK_SIZE (256U)
#define CTXSW_MEM_SCRUBBING_TIMEOUT_MAX 1000U
#define CTXSW_MEM_SCRUBBING_TIMEOUT_DEFAULT 10U
#define FECS_ARB_CMD_TIMEOUT_MAX 40
#define FECS_ARB_CMD_TIMEOUT_DEFAULT 2
static struct channel_gk20a *gk20a_gr_get_channel_from_ctx(
struct gk20a *g, u32 curr_ctx, u32 *curr_tsgid);
@@ -183,126 +179,6 @@ static void gr_report_ctxsw_error(struct gk20a *g, u32 err_type, u32 chid,
}
}
}
static void gr_gk20a_load_falcon_dmem(struct gk20a *g)
{
u32 i, ucode_u32_size;
const u32 *ucode_u32_data;
u32 checksum;
nvgpu_log_fn(g, " ");
gk20a_writel(g, gr_gpccs_dmemc_r(0), (gr_gpccs_dmemc_offs_f(0) |
gr_gpccs_dmemc_blk_f(0) |
gr_gpccs_dmemc_aincw_f(1)));
ucode_u32_size = g->netlist_vars->ucode.gpccs.data.count;
ucode_u32_data = (const u32 *)g->netlist_vars->ucode.gpccs.data.l;
for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
gk20a_writel(g, gr_gpccs_dmemd_r(0), ucode_u32_data[i]);
checksum += ucode_u32_data[i];
}
gk20a_writel(g, gr_fecs_dmemc_r(0), (gr_fecs_dmemc_offs_f(0) |
gr_fecs_dmemc_blk_f(0) |
gr_fecs_dmemc_aincw_f(1)));
ucode_u32_size = g->netlist_vars->ucode.fecs.data.count;
ucode_u32_data = (const u32 *)g->netlist_vars->ucode.fecs.data.l;
for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
gk20a_writel(g, gr_fecs_dmemd_r(0), ucode_u32_data[i]);
checksum += ucode_u32_data[i];
}
nvgpu_log_fn(g, "done");
}
static void gr_gk20a_load_falcon_imem(struct gk20a *g)
{
u32 cfg, fecs_imem_size, gpccs_imem_size, ucode_u32_size;
const u32 *ucode_u32_data;
u32 tag, i, pad_start, pad_end;
u32 checksum;
nvgpu_log_fn(g, " ");
cfg = gk20a_readl(g, gr_fecs_cfg_r());
fecs_imem_size = gr_fecs_cfg_imem_sz_v(cfg);
cfg = gk20a_readl(g, gr_gpc0_cfg_r());
gpccs_imem_size = gr_gpc0_cfg_imem_sz_v(cfg);
/* Use the broadcast address to access all of the GPCCS units. */
gk20a_writel(g, gr_gpccs_imemc_r(0), (gr_gpccs_imemc_offs_f(0) |
gr_gpccs_imemc_blk_f(0) |
gr_gpccs_imemc_aincw_f(1)));
/* Setup the tags for the instruction memory. */
tag = 0;
gk20a_writel(g, gr_gpccs_imemt_r(0), gr_gpccs_imemt_tag_f(tag));
ucode_u32_size = g->netlist_vars->ucode.gpccs.inst.count;
ucode_u32_data = (const u32 *)g->netlist_vars->ucode.gpccs.inst.l;
for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
if ((i != 0U) && ((i % (256U/sizeof(u32))) == 0U)) {
tag++;
gk20a_writel(g, gr_gpccs_imemt_r(0),
gr_gpccs_imemt_tag_f(tag));
}
gk20a_writel(g, gr_gpccs_imemd_r(0), ucode_u32_data[i]);
checksum += ucode_u32_data[i];
}
pad_start = i * 4U;
pad_end = pad_start + (256U - pad_start % 256U) + 256U;
for (i = pad_start;
(i < gpccs_imem_size * 256U) && (i < pad_end);
i += 4U) {
if ((i != 0U) && ((i % 256U) == 0U)) {
tag++;
gk20a_writel(g, gr_gpccs_imemt_r(0),
gr_gpccs_imemt_tag_f(tag));
}
gk20a_writel(g, gr_gpccs_imemd_r(0), 0);
}
gk20a_writel(g, gr_fecs_imemc_r(0), (gr_fecs_imemc_offs_f(0) |
gr_fecs_imemc_blk_f(0) |
gr_fecs_imemc_aincw_f(1)));
/* Setup the tags for the instruction memory. */
tag = 0;
gk20a_writel(g, gr_fecs_imemt_r(0), gr_fecs_imemt_tag_f(tag));
ucode_u32_size = g->netlist_vars->ucode.fecs.inst.count;
ucode_u32_data = (const u32 *)g->netlist_vars->ucode.fecs.inst.l;
for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
if ((i != 0U) && ((i % (256U/sizeof(u32))) == 0U)) {
tag++;
gk20a_writel(g, gr_fecs_imemt_r(0),
gr_fecs_imemt_tag_f(tag));
}
gk20a_writel(g, gr_fecs_imemd_r(0), ucode_u32_data[i]);
checksum += ucode_u32_data[i];
}
pad_start = i * 4U;
pad_end = pad_start + (256U - pad_start % 256U) + 256U;
for (i = pad_start;
(i < fecs_imem_size * 256U) && i < pad_end;
i += 4U) {
if ((i != 0U) && ((i % 256U) == 0U)) {
tag++;
gk20a_writel(g, gr_fecs_imemt_r(0),
gr_fecs_imemt_tag_f(tag));
}
gk20a_writel(g, gr_fecs_imemd_r(0), 0);
}
}
int gr_gk20a_ctx_wait_ucode(struct gk20a *g, u32 mailbox_id,
u32 *mailbox_ret, u32 opc_success,
u32 mailbox_ok, u32 opc_fail,
@@ -1076,288 +952,6 @@ int gr_gk20a_update_hwpm_ctxsw_mode(struct gk20a *g,
return ret;
}
static void gr_gk20a_start_falcon_ucode(struct gk20a *g)
{
nvgpu_log_fn(g, " ");
gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0U),
gr_fecs_ctxsw_mailbox_clear_value_f(~U32(0U)));
gk20a_writel(g, gr_gpccs_dmactl_r(), gr_gpccs_dmactl_require_ctx_f(0U));
gk20a_writel(g, gr_fecs_dmactl_r(), gr_fecs_dmactl_require_ctx_f(0U));
gk20a_writel(g, gr_gpccs_cpuctl_r(), gr_gpccs_cpuctl_startcpu_f(1U));
gk20a_writel(g, gr_fecs_cpuctl_r(), gr_fecs_cpuctl_startcpu_f(1U));
nvgpu_log_fn(g, "done");
}
static void gr_gk20a_wait_for_fecs_arb_idle(struct gk20a *g)
{
int retries = FECS_ARB_CMD_TIMEOUT_MAX / FECS_ARB_CMD_TIMEOUT_DEFAULT;
u32 val;
val = gk20a_readl(g, gr_fecs_arb_ctx_cmd_r());
while ((gr_fecs_arb_ctx_cmd_cmd_v(val) != 0U) && (retries != 0)) {
nvgpu_udelay(FECS_ARB_CMD_TIMEOUT_DEFAULT);
retries--;
val = gk20a_readl(g, gr_fecs_arb_ctx_cmd_r());
}
if (retries == 0) {
nvgpu_err(g, "arbiter cmd timeout, fecs arb ctx cmd: 0x%08x",
gk20a_readl(g, gr_fecs_arb_ctx_cmd_r()));
}
retries = FECS_ARB_CMD_TIMEOUT_MAX / FECS_ARB_CMD_TIMEOUT_DEFAULT;
while (((gk20a_readl(g, gr_fecs_ctxsw_status_1_r()) &
gr_fecs_ctxsw_status_1_arb_busy_m()) != 0U) &&
(retries != 0)) {
nvgpu_udelay(FECS_ARB_CMD_TIMEOUT_DEFAULT);
retries--;
}
if (retries == 0) {
nvgpu_err(g,
"arbiter idle timeout, fecs ctxsw status: 0x%08x",
gk20a_readl(g, gr_fecs_ctxsw_status_1_r()));
}
}
void gr_gk20a_load_falcon_bind_instblk(struct gk20a *g)
{
struct gk20a_ctxsw_ucode_info *ucode_info = &g->ctxsw_ucode_info;
int retries = FECS_ARB_CMD_TIMEOUT_MAX / FECS_ARB_CMD_TIMEOUT_DEFAULT;
u64 inst_ptr_shifted_u64;
u32 inst_ptr_shifted_u32;
while (((gk20a_readl(g, gr_fecs_ctxsw_status_1_r()) &
gr_fecs_ctxsw_status_1_arb_busy_m()) != 0U) &&
(retries != 0)) {
nvgpu_udelay(FECS_ARB_CMD_TIMEOUT_DEFAULT);
retries--;
}
if (retries == 0) {
nvgpu_err(g,
"arbiter idle timeout, status: %08x",
gk20a_readl(g, gr_fecs_ctxsw_status_1_r()));
}
gk20a_writel(g, gr_fecs_arb_ctx_adr_r(), 0x0);
inst_ptr_shifted_u64 = nvgpu_inst_block_addr(g,
&ucode_info->inst_blk_desc);
inst_ptr_shifted_u64 >>= 12;
BUG_ON(u64_hi32(inst_ptr_shifted_u64) != 0U);
inst_ptr_shifted_u32 = (u32)inst_ptr_shifted_u64;
gk20a_writel(g, gr_fecs_new_ctx_r(),
gr_fecs_new_ctx_ptr_f(inst_ptr_shifted_u32) |
nvgpu_aperture_mask(g, &ucode_info->inst_blk_desc,
gr_fecs_new_ctx_target_sys_mem_ncoh_f(),
gr_fecs_new_ctx_target_sys_mem_coh_f(),
gr_fecs_new_ctx_target_vid_mem_f()) |
gr_fecs_new_ctx_valid_m());
gk20a_writel(g, gr_fecs_arb_ctx_ptr_r(),
gr_fecs_arb_ctx_ptr_ptr_f(inst_ptr_shifted_u32) |
nvgpu_aperture_mask(g, &ucode_info->inst_blk_desc,
gr_fecs_arb_ctx_ptr_target_sys_mem_ncoh_f(),
gr_fecs_arb_ctx_ptr_target_sys_mem_coh_f(),
gr_fecs_arb_ctx_ptr_target_vid_mem_f()));
gk20a_writel(g, gr_fecs_arb_ctx_cmd_r(), 0x7);
/* Wait for arbiter command to complete */
gr_gk20a_wait_for_fecs_arb_idle(g);
gk20a_writel(g, gr_fecs_current_ctx_r(),
gr_fecs_current_ctx_ptr_f(inst_ptr_shifted_u32) |
gr_fecs_current_ctx_target_m() |
gr_fecs_current_ctx_valid_m());
/* Send command to arbiter to flush */
gk20a_writel(g, gr_fecs_arb_ctx_cmd_r(), gr_fecs_arb_ctx_cmd_cmd_s());
gr_gk20a_wait_for_fecs_arb_idle(g);
}
void gr_gk20a_load_ctxsw_ucode_header(struct gk20a *g, u64 addr_base,
struct gk20a_ctxsw_ucode_segments *segments, u32 reg_offset)
{
u32 addr_code32;
u32 addr_data32;
addr_code32 = u64_lo32((addr_base + segments->code.offset) >> 8);
addr_data32 = u64_lo32((addr_base + segments->data.offset) >> 8);
/*
* Copy falcon bootloader header into dmem at offset 0.
* Configure dmem port 0 for auto-incrementing writes starting at dmem
* offset 0.
*/
gk20a_writel(g, reg_offset + gr_fecs_dmemc_r(0),
gr_fecs_dmemc_offs_f(0) |
gr_fecs_dmemc_blk_f(0) |
gr_fecs_dmemc_aincw_f(1));
/* Write out the actual data */
switch (segments->boot_signature) {
case FALCON_UCODE_SIG_T18X_GPCCS_WITH_RESERVED:
case FALCON_UCODE_SIG_T21X_FECS_WITH_DMEM_SIZE:
case FALCON_UCODE_SIG_T21X_FECS_WITH_RESERVED:
case FALCON_UCODE_SIG_T21X_GPCCS_WITH_RESERVED:
case FALCON_UCODE_SIG_T12X_FECS_WITH_RESERVED:
case FALCON_UCODE_SIG_T12X_GPCCS_WITH_RESERVED:
gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
/* fallthrough */
case FALCON_UCODE_SIG_T12X_FECS_WITHOUT_RESERVED:
case FALCON_UCODE_SIG_T12X_GPCCS_WITHOUT_RESERVED:
case FALCON_UCODE_SIG_T21X_FECS_WITHOUT_RESERVED:
case FALCON_UCODE_SIG_T21X_FECS_WITHOUT_RESERVED2:
case FALCON_UCODE_SIG_T21X_GPCCS_WITHOUT_RESERVED:
gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 4);
gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0),
addr_code32);
gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0),
segments->code.size);
gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0),
addr_data32);
gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0),
segments->data.size);
break;
case FALCON_UCODE_SIG_T12X_FECS_OLDER:
case FALCON_UCODE_SIG_T12X_GPCCS_OLDER:
gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0),
addr_code32);
gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0),
segments->code.size);
gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0),
addr_data32);
gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0),
segments->data.size);
gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0),
addr_code32);
gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
break;
default:
nvgpu_err(g,
"unknown falcon ucode boot signature 0x%08x"
" with reg_offset 0x%08x",
segments->boot_signature, reg_offset);
BUG();
}
}
void gr_gk20a_load_ctxsw_ucode_boot(struct gk20a *g, u64 addr_base,
struct gk20a_ctxsw_ucode_segments *segments, u32 reg_offset)
{
u32 addr_load32;
u32 blocks;
u32 b;
u32 dst;
addr_load32 = u64_lo32((addr_base + segments->boot.offset) >> 8);
blocks = ((segments->boot.size + 0xFFU) & ~0xFFU) >> 8;
/*
* Set the base FB address for the DMA transfer. Subtract off the 256
* byte IMEM block offset such that the relative FB and IMEM offsets
* match, allowing the IMEM tags to be properly created.
*/
dst = segments->boot_imem_offset;
gk20a_writel(g, reg_offset + gr_fecs_dmatrfbase_r(),
(addr_load32 - (dst >> 8)));
for (b = 0; b < blocks; b++) {
/* Setup destination IMEM offset */
gk20a_writel(g, reg_offset + gr_fecs_dmatrfmoffs_r(),
dst + (b << 8));
/* Setup source offset (relative to BASE) */
gk20a_writel(g, reg_offset + gr_fecs_dmatrffboffs_r(),
dst + (b << 8));
gk20a_writel(g, reg_offset + gr_fecs_dmatrfcmd_r(),
gr_fecs_dmatrfcmd_imem_f(0x01) |
gr_fecs_dmatrfcmd_write_f(0x00) |
gr_fecs_dmatrfcmd_size_f(0x06) |
gr_fecs_dmatrfcmd_ctxdma_f(0));
}
/* Specify the falcon boot vector */
gk20a_writel(g, reg_offset + gr_fecs_bootvec_r(),
gr_fecs_bootvec_vec_f(segments->boot_entry));
}
static void gr_gk20a_load_falcon_with_bootloader(struct gk20a *g)
{
struct gk20a_ctxsw_ucode_info *ucode_info = &g->ctxsw_ucode_info;
u64 addr_base = ucode_info->surface_desc.gpu_va;
gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0), 0x0);
gr_gk20a_load_falcon_bind_instblk(g);
g->ops.gr.falcon_load_ucode(g, addr_base,
&g->ctxsw_ucode_info.fecs, 0);
g->ops.gr.falcon_load_ucode(g, addr_base,
&g->ctxsw_ucode_info.gpccs,
gr_gpcs_gpccs_falcon_hwcfg_r() -
gr_fecs_falcon_hwcfg_r());
}
int gr_gk20a_load_ctxsw_ucode(struct gk20a *g)
{
int err;
nvgpu_log_fn(g, " ");
if (nvgpu_is_enabled(g, NVGPU_IS_FMODEL)) {
gk20a_writel(g, gr_fecs_ctxsw_mailbox_r(7),
gr_fecs_ctxsw_mailbox_value_f(0xc0de7777U));
gk20a_writel(g, gr_gpccs_ctxsw_mailbox_r(7),
gr_gpccs_ctxsw_mailbox_value_f(0xc0de7777U));
}
/*
* In case bootloader is not supported, revert to the old way of
* loading gr ucode, without the faster bootstrap routine.
*/
if (!nvgpu_is_enabled(g, NVGPU_GR_USE_DMA_FOR_FW_BOOTSTRAP)) {
gr_gk20a_load_falcon_dmem(g);
gr_gk20a_load_falcon_imem(g);
gr_gk20a_start_falcon_ucode(g);
} else {
if (!g->gr.skip_ucode_init) {
err = nvgpu_gr_falcon_init_ctxsw_ucode(g);
if (err != 0) {
return err;
}
}
gr_gk20a_load_falcon_with_bootloader(g);
g->gr.skip_ucode_init = true;
}
nvgpu_log_fn(g, "done");
return 0;
}
static int gr_gk20a_wait_ctxsw_ready(struct gk20a *g)
{
int ret;
@@ -1939,7 +1533,7 @@ static int gr_gk20a_init_ctxsw(struct gk20a *g)
{
int err = 0;
err = g->ops.gr.load_ctxsw_ucode(g);
err = g->ops.gr.falcon.load_ctxsw_ucode(g);
if (err != 0) {
goto out;
}

@@ -341,14 +341,6 @@ int gr_gk20a_update_hwpm_ctxsw_mode(struct gk20a *g,
u32 mode);
void gk20a_gr_set_shader_exceptions(struct gk20a *g, u32 data);
int gr_gk20a_load_ctxsw_ucode(struct gk20a *g);
void gr_gk20a_load_falcon_bind_instblk(struct gk20a *g);
void gr_gk20a_load_ctxsw_ucode_header(struct gk20a *g, u64 addr_base,
struct gk20a_ctxsw_ucode_segments *segments, u32 reg_offset);
void gr_gk20a_load_ctxsw_ucode_boot(struct gk20a *g, u64 addr_base,
struct gk20a_ctxsw_ucode_segments *segments, u32 reg_offset);
void gr_gk20a_free_tsg_gr_ctx(struct tsg_gk20a *tsg);
int gr_gk20a_disable_ctxsw(struct gk20a *g);
int gr_gk20a_enable_ctxsw(struct gk20a *g);

@@ -340,23 +340,6 @@ void gr_gm20b_set_gpc_tpc_mask(struct gk20a *g, u32 gpc_index)
}
}
void gr_gm20b_load_ctxsw_ucode_segments(struct gk20a *g, u64 addr_base,
struct gk20a_ctxsw_ucode_segments *segments, u32 reg_offset)
{
gk20a_writel(g, reg_offset + gr_fecs_dmactl_r(),
gr_fecs_dmactl_require_ctx_f(0));
/* Copy falcon bootloader into dmem */
gr_gk20a_load_ctxsw_ucode_header(g, addr_base, segments, reg_offset);
gr_gk20a_load_ctxsw_ucode_boot(g, addr_base, segments, reg_offset);
/* start the falcon immediately if PRIV security is disabled*/
if (!nvgpu_is_enabled(g, NVGPU_SEC_PRIVSECURITY)) {
gk20a_writel(g, reg_offset + gr_fecs_cpuctl_r(),
gr_fecs_cpuctl_startcpu_f(0x01));
}
}
static bool gr_gm20b_is_tpc_addr_shared(struct gk20a *g, u32 addr)
{
u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE);
@@ -394,122 +377,6 @@ u32 gr_gm20b_get_tpc_num(struct gk20a *g, u32 addr)
return 0;
}
static void gr_gm20b_load_gpccs_with_bootloader(struct gk20a *g)
{
struct gk20a_ctxsw_ucode_info *ucode_info = &g->ctxsw_ucode_info;
u64 addr_base = ucode_info->surface_desc.gpu_va;
gr_gk20a_load_falcon_bind_instblk(g);
g->ops.gr.falcon_load_ucode(g, addr_base,
&g->ctxsw_ucode_info.gpccs,
gr_gpcs_gpccs_falcon_hwcfg_r() -
gr_fecs_falcon_hwcfg_r());
}
int gr_gm20b_load_ctxsw_ucode(struct gk20a *g)
{
int err = 0;
u32 reg_offset = gr_gpcs_gpccs_falcon_hwcfg_r() -
gr_fecs_falcon_hwcfg_r();
u8 falcon_id_mask = 0;
nvgpu_log_fn(g, " ");
if (nvgpu_is_enabled(g, NVGPU_IS_FMODEL)) {
gk20a_writel(g, gr_fecs_ctxsw_mailbox_r(7),
gr_fecs_ctxsw_mailbox_value_f(0xc0de7777U));
gk20a_writel(g, gr_gpccs_ctxsw_mailbox_r(7),
gr_gpccs_ctxsw_mailbox_value_f(0xc0de7777U));
}
g->pmu_lsf_loaded_falcon_id = 0;
if (nvgpu_is_enabled(g, NVGPU_PMU_FECS_BOOTSTRAP_DONE)) {
/* this must be recovery so bootstrap fecs and gpccs */
if (!nvgpu_is_enabled(g, NVGPU_SEC_SECUREGPCCS)) {
gr_gm20b_load_gpccs_with_bootloader(g);
err = g->ops.pmu.load_lsfalcon_ucode(g,
BIT32(FALCON_ID_FECS));
} else {
/* bind WPR VA inst block */
gr_gk20a_load_falcon_bind_instblk(g);
if (nvgpu_is_enabled(g, NVGPU_SUPPORT_SEC2_RTOS)) {
err = nvgpu_sec2_bootstrap_ls_falcons(g, &g->sec2,
FALCON_ID_FECS);
err = nvgpu_sec2_bootstrap_ls_falcons(g, &g->sec2,
FALCON_ID_GPCCS);
} else if (g->support_ls_pmu) {
err = g->ops.pmu.load_lsfalcon_ucode(g,
BIT32(FALCON_ID_FECS) |
BIT32(FALCON_ID_GPCCS));
} else {
err = nvgpu_acr_bootstrap_hs_acr(g, g->acr);
if (err != 0) {
nvgpu_err(g, "GR Recovery: ACR GR LSF bootstrap failed");
}
}
}
if (err != 0) {
nvgpu_err(g, "Unable to recover GR falcon");
return err;
}
} else {
/* cold boot or rg exit */
nvgpu_set_enabled(g, NVGPU_PMU_FECS_BOOTSTRAP_DONE, true);
if (!nvgpu_is_enabled(g, NVGPU_SEC_SECUREGPCCS)) {
gr_gm20b_load_gpccs_with_bootloader(g);
} else {
/* bind WPR VA inst block */
gr_gk20a_load_falcon_bind_instblk(g);
if (nvgpu_acr_is_lsf_lazy_bootstrap(g, g->acr, FALCON_ID_FECS)) {
falcon_id_mask |= BIT8(FALCON_ID_FECS);
}
if (nvgpu_acr_is_lsf_lazy_bootstrap(g, g->acr, FALCON_ID_GPCCS)) {
falcon_id_mask |= BIT8(FALCON_ID_GPCCS);
}
if (nvgpu_is_enabled(g, NVGPU_SUPPORT_SEC2_RTOS)) {
err = nvgpu_sec2_bootstrap_ls_falcons(g, &g->sec2,
FALCON_ID_FECS);
err = nvgpu_sec2_bootstrap_ls_falcons(g, &g->sec2,
FALCON_ID_GPCCS);
} else if (g->support_ls_pmu) {
err = g->ops.pmu.load_lsfalcon_ucode(g, falcon_id_mask);
} else {
/* GR falcons bootstrapped by ACR */
err = 0;
}
if (err != 0) {
nvgpu_err(g, "Unable to boot GPCCS");
return err;
}
}
}
/*start gpccs */
if (nvgpu_is_enabled(g, NVGPU_SEC_SECUREGPCCS)) {
gk20a_writel(g, reg_offset +
gr_fecs_cpuctl_alias_r(),
gr_gpccs_cpuctl_startcpu_f(1U));
} else {
gk20a_writel(g, gr_gpccs_dmactl_r(),
gr_gpccs_dmactl_require_ctx_f(0U));
gk20a_writel(g, gr_gpccs_cpuctl_r(),
gr_gpccs_cpuctl_startcpu_f(1U));
}
/* start fecs */
gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0U), ~U32(0U));
gk20a_writel(g, gr_fecs_ctxsw_mailbox_r(1U), 1U);
gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(6U), 0xffffffffU);
gk20a_writel(g, gr_fecs_cpuctl_alias_r(),
gr_fecs_cpuctl_startcpu_f(1U));
nvgpu_log_fn(g, "done");
return 0;
}
void gr_gm20b_detect_sm_arch(struct gk20a *g)
{
u32 v = gk20a_readl(g, gr_gpc0_tpc0_sm_arch_r());

@@ -68,7 +68,6 @@ void gr_gm20b_load_ctxsw_ucode_segments(struct gk20a *g, u64 addr_base,
struct gk20a_ctxsw_ucode_segments *segments, u32 reg_offset);
bool gr_gm20b_is_tpc_addr(struct gk20a *g, u32 addr);
u32 gr_gm20b_get_tpc_num(struct gk20a *g, u32 addr);
int gr_gm20b_load_ctxsw_ucode(struct gk20a *g);
void gr_gm20b_detect_sm_arch(struct gk20a *g);
int gr_gm20b_init_ctxsw_preemption_mode(struct gk20a *g,
struct nvgpu_gr_ctx *gr_ctx, struct vm_gk20a *vm,

@@ -38,6 +38,7 @@
#include <nvgpu/regops.h>
#include <nvgpu/gr/zbc.h>
#include <nvgpu/gr/zcull.h>
#include <nvgpu/gr/gr_falcon.h>
#include "hal/bus/bus_gm20b.h"
#include "hal/bus/bus_gk20a.h"
@@ -252,8 +253,6 @@ static const struct gpu_ops gm20b_ops = {
.get_sm_dsm_perf_regs = gr_gm20b_get_sm_dsm_perf_regs,
.get_sm_dsm_perf_ctrl_regs = gr_gm20b_get_sm_dsm_perf_ctrl_regs,
.set_hww_esr_report_mask = gr_gm20b_set_hww_esr_report_mask,
.falcon_load_ucode = gr_gm20b_load_ctxsw_ucode_segments,
.load_ctxsw_ucode = gr_gk20a_load_ctxsw_ucode,
.set_gpc_tpc_mask = gr_gm20b_set_gpc_tpc_mask,
.alloc_obj_ctx = gk20a_alloc_obj_ctx,
.is_tpc_addr = gr_gm20b_is_tpc_addr,
@@ -488,6 +487,23 @@ static const struct gpu_ops gm20b_ops = {
gm20b_gr_falcon_get_fecs_ctxsw_mailbox_size,
.get_fecs_ctx_state_store_major_rev_id =
gm20b_gr_falcon_get_fecs_ctx_state_store_major_rev_id,
.load_gpccs_dmem = gm20b_gr_falcon_load_gpccs_dmem,
.load_fecs_dmem = gm20b_gr_falcon_load_fecs_dmem,
.load_gpccs_imem = gm20b_gr_falcon_load_gpccs_imem,
.load_fecs_imem = gm20b_gr_falcon_load_fecs_imem,
.configure_fmodel = gm20b_gr_falcon_configure_fmodel,
.start_ucode = gm20b_gr_falcon_start_ucode,
.start_gpccs = gm20b_gr_falcon_start_gpccs,
.start_fecs = gm20b_gr_falcon_start_fecs,
.get_gpccs_start_reg_offset =
gm20b_gr_falcon_get_gpccs_start_reg_offset,
.bind_instblk = gm20b_gr_falcon_bind_instblk,
.load_ctxsw_ucode_header =
gm20b_gr_falcon_load_ctxsw_ucode_header,
.load_ctxsw_ucode_boot =
gm20b_gr_falcon_load_ctxsw_ucode_boot,
.load_ctxsw_ucode =
nvgpu_gr_falcon_load_ctxsw_ucode,
},
},
.fb = {
@@ -1028,7 +1044,8 @@ int gm20b_init_hal(struct gk20a *g)
gops->pmu.init_wpr_region = gm20b_pmu_init_acr;
gops->pmu.load_lsfalcon_ucode = gm20b_load_falcon_ucode;
gops->gr.load_ctxsw_ucode = gr_gm20b_load_ctxsw_ucode;
gops->gr.falcon.load_ctxsw_ucode =
nvgpu_gr_falcon_load_secure_ctxsw_ucode;
} else {
/* Inherit from gk20a */
gops->pmu.pmu_setup_hw_and_bootstrap =
@@ -1037,8 +1054,6 @@ int gm20b_init_hal(struct gk20a *g)
gops->pmu.load_lsfalcon_ucode = NULL;
gops->pmu.init_wpr_region = NULL;
gops->gr.load_ctxsw_ucode = gr_gk20a_load_ctxsw_ucode;
}
nvgpu_set_enabled(g, NVGPU_SUPPORT_ZBC_STENCIL, false);

@@ -38,6 +38,7 @@
#include <nvgpu/regops.h>
#include <nvgpu/gr/zbc.h>
#include <nvgpu/gr/zcull.h>
#include <nvgpu/gr/gr_falcon.h>
#include <nvgpu/gr/fecs_trace.h>
#include "hal/bus/bus_gk20a.h"
@@ -277,8 +278,6 @@ static const struct gpu_ops gp10b_ops = {
.get_sm_dsm_perf_regs = gr_gm20b_get_sm_dsm_perf_regs,
.get_sm_dsm_perf_ctrl_regs = gr_gm20b_get_sm_dsm_perf_ctrl_regs,
.set_hww_esr_report_mask = gr_gm20b_set_hww_esr_report_mask,
.falcon_load_ucode = gr_gm20b_load_ctxsw_ucode_segments,
.load_ctxsw_ucode = gr_gk20a_load_ctxsw_ucode,
.set_gpc_tpc_mask = gr_gp10b_set_gpc_tpc_mask,
.alloc_obj_ctx = gk20a_alloc_obj_ctx,
.is_tpc_addr = gr_gm20b_is_tpc_addr,
@@ -566,6 +565,23 @@ static const struct gpu_ops gp10b_ops = {
gm20b_gr_falcon_get_fecs_ctxsw_mailbox_size,
.get_fecs_ctx_state_store_major_rev_id =
gm20b_gr_falcon_get_fecs_ctx_state_store_major_rev_id,
.load_gpccs_dmem = gm20b_gr_falcon_load_gpccs_dmem,
.load_fecs_dmem = gm20b_gr_falcon_load_fecs_dmem,
.load_gpccs_imem = gm20b_gr_falcon_load_gpccs_imem,
.load_fecs_imem = gm20b_gr_falcon_load_fecs_imem,
.configure_fmodel = gm20b_gr_falcon_configure_fmodel,
.start_ucode = gm20b_gr_falcon_start_ucode,
.start_gpccs = gm20b_gr_falcon_start_gpccs,
.start_fecs = gm20b_gr_falcon_start_fecs,
.get_gpccs_start_reg_offset =
gm20b_gr_falcon_get_gpccs_start_reg_offset,
.bind_instblk = gm20b_gr_falcon_bind_instblk,
.load_ctxsw_ucode_header =
gm20b_gr_falcon_load_ctxsw_ucode_header,
.load_ctxsw_ucode_boot =
gm20b_gr_falcon_load_ctxsw_ucode_boot,
.load_ctxsw_ucode =
nvgpu_gr_falcon_load_ctxsw_ucode,
},
},
.fb = {
@@ -1107,7 +1123,8 @@ int gp10b_init_hal(struct gk20a *g)
gops->pmu.init_wpr_region = gm20b_pmu_init_acr;
gops->pmu.load_lsfalcon_ucode = gp10b_load_falcon_ucode;
gops->gr.load_ctxsw_ucode = gr_gm20b_load_ctxsw_ucode;
gops->gr.falcon.load_ctxsw_ucode =
nvgpu_gr_falcon_load_secure_ctxsw_ucode;
} else {
/* Inherit from gk20a */
gops->pmu.pmu_setup_hw_and_bootstrap =
@@ -1117,7 +1134,6 @@ int gp10b_init_hal(struct gk20a *g)
gops->pmu.load_lsfalcon_ucode = NULL;
gops->pmu.init_wpr_region = NULL;
gops->gr.load_ctxsw_ucode = gr_gk20a_load_ctxsw_ucode;
}
nvgpu_set_enabled(g, NVGPU_SUPPORT_ZBC_STENCIL, false);

@@ -62,6 +62,7 @@
#include "hal/gr/fecs_trace/fecs_trace_gm20b.h"
#include "hal/gr/config/gr_config_gm20b.h"
#include "hal/gr/config/gr_config_gv100.h"
#include "hal/gr/falcon/gr_falcon_gm20b.h"
#include "hal/gr/zbc/zbc_gp10b.h"
#include "hal/gr/zbc/zbc_gv11b.h"
#include "hal/gr/init/gr_init_gm20b.h"
@@ -72,7 +73,6 @@
#include "hal/gr/intr/gr_intr_gv11b.h"
#include "hal/gr/zcull/zcull_gm20b.h"
#include "hal/gr/zcull/zcull_gv11b.h"
#include "hal/gr/falcon/gr_falcon_gm20b.h"
#include "hal/gr/hwpm_map/hwpm_map_gv100.h"
#include "hal/gr/ctxsw_prog/ctxsw_prog_gm20b.h"
#include "hal/gr/ctxsw_prog/ctxsw_prog_gp10b.h"
@@ -166,6 +166,7 @@
#include <nvgpu/regops.h>
#include <nvgpu/gr/zbc.h>
#include <nvgpu/gr/zcull.h>
#include <nvgpu/gr/gr_falcon.h>
#include <nvgpu/gr/fecs_trace.h>
#include <nvgpu/hw/gv100/hw_proj_gv100.h>
@@ -389,8 +390,6 @@ static const struct gpu_ops gv100_ops = {
.get_sm_dsm_perf_regs = gv11b_gr_get_sm_dsm_perf_regs,
.get_sm_dsm_perf_ctrl_regs = gv11b_gr_get_sm_dsm_perf_ctrl_regs,
.set_hww_esr_report_mask = gv11b_gr_set_hww_esr_report_mask,
.falcon_load_ucode = gr_gm20b_load_ctxsw_ucode_segments,
.load_ctxsw_ucode = gr_gm20b_load_ctxsw_ucode,
.set_gpc_tpc_mask = gr_gv100_set_gpc_tpc_mask,
.alloc_obj_ctx = gk20a_alloc_obj_ctx,
.is_tpc_addr = gr_gm20b_is_tpc_addr,
@@ -710,6 +709,23 @@ static const struct gpu_ops gv100_ops = {
gm20b_gr_falcon_get_fecs_ctxsw_mailbox_size,
.get_fecs_ctx_state_store_major_rev_id =
gm20b_gr_falcon_get_fecs_ctx_state_store_major_rev_id,
.load_gpccs_dmem = gm20b_gr_falcon_load_gpccs_dmem,
.load_fecs_dmem = gm20b_gr_falcon_load_fecs_dmem,
.load_gpccs_imem = gm20b_gr_falcon_load_gpccs_imem,
.load_fecs_imem = gm20b_gr_falcon_load_fecs_imem,
.configure_fmodel = gm20b_gr_falcon_configure_fmodel,
.start_ucode = gm20b_gr_falcon_start_ucode,
.start_gpccs = gm20b_gr_falcon_start_gpccs,
.start_fecs = gm20b_gr_falcon_start_fecs,
.get_gpccs_start_reg_offset =
gm20b_gr_falcon_get_gpccs_start_reg_offset,
.bind_instblk = gm20b_gr_falcon_bind_instblk,
.load_ctxsw_ucode_header =
gm20b_gr_falcon_load_ctxsw_ucode_header,
.load_ctxsw_ucode_boot =
gm20b_gr_falcon_load_ctxsw_ucode_boot,
.load_ctxsw_ucode =
nvgpu_gr_falcon_load_secure_ctxsw_ucode,
},
},
.fb = {

@@ -24,6 +24,7 @@
#include <nvgpu/gk20a.h>
#include <nvgpu/fuse.h>
#include <nvgpu/regops.h>
#include <nvgpu/gr/gr_falcon.h>
#include "hal/bus/bus_gk20a.h"
#include "hal/bus/bus_gp10b.h"
@@ -340,8 +341,6 @@ static const struct gpu_ops gv11b_ops = {
.get_sm_dsm_perf_regs = gv11b_gr_get_sm_dsm_perf_regs,
.get_sm_dsm_perf_ctrl_regs = gv11b_gr_get_sm_dsm_perf_ctrl_regs,
.set_hww_esr_report_mask = gv11b_gr_set_hww_esr_report_mask,
.falcon_load_ucode = gr_gm20b_load_ctxsw_ucode_segments,
.load_ctxsw_ucode = gr_gk20a_load_ctxsw_ucode,
.set_gpc_tpc_mask = gr_gv11b_set_gpc_tpc_mask,
.alloc_obj_ctx = gk20a_alloc_obj_ctx,
.is_tpc_addr = gr_gm20b_is_tpc_addr,
@@ -669,6 +668,23 @@ static const struct gpu_ops gv11b_ops = {
gm20b_gr_falcon_get_fecs_ctxsw_mailbox_size,
.get_fecs_ctx_state_store_major_rev_id =
gm20b_gr_falcon_get_fecs_ctx_state_store_major_rev_id,
.load_gpccs_dmem = gm20b_gr_falcon_load_gpccs_dmem,
.load_fecs_dmem = gm20b_gr_falcon_load_fecs_dmem,
.load_gpccs_imem = gm20b_gr_falcon_load_gpccs_imem,
.load_fecs_imem = gm20b_gr_falcon_load_fecs_imem,
.configure_fmodel = gm20b_gr_falcon_configure_fmodel,
.start_ucode = gm20b_gr_falcon_start_ucode,
.start_gpccs = gm20b_gr_falcon_start_gpccs,
.start_fecs = gm20b_gr_falcon_start_fecs,
.get_gpccs_start_reg_offset =
gm20b_gr_falcon_get_gpccs_start_reg_offset,
.bind_instblk = gm20b_gr_falcon_bind_instblk,
.load_ctxsw_ucode_header =
gm20b_gr_falcon_load_ctxsw_ucode_header,
.load_ctxsw_ucode_boot =
gm20b_gr_falcon_load_ctxsw_ucode_boot,
.load_ctxsw_ucode =
nvgpu_gr_falcon_load_ctxsw_ucode,
},
},
.fb = {
@@ -1257,7 +1273,8 @@ int gv11b_init_hal(struct gk20a *g)
/* priv security dependent ops */
if (nvgpu_is_enabled(g, NVGPU_SEC_PRIVSECURITY)) {
gops->gr.load_ctxsw_ucode = gr_gm20b_load_ctxsw_ucode;
gops->gr.falcon.load_ctxsw_ucode =
nvgpu_gr_falcon_load_secure_ctxsw_ucode;
} else {
/* non-secure boot */
gops->pmu.pmu_nsbootstrap = gv11b_pmu_bootstrap;
@@ -1267,7 +1284,6 @@ int gv11b_init_hal(struct gk20a *g)
gops->pmu.load_lsfalcon_ucode = NULL;
gops->pmu.init_wpr_region = NULL;
gops->gr.load_ctxsw_ucode = gr_gk20a_load_ctxsw_ucode;
}
nvgpu_set_enabled(g, NVGPU_PMU_FECS_BOOTSTRAP_DONE, false);

@@ -21,6 +21,7 @@
*/
#include <nvgpu/gk20a.h>
#include <nvgpu/gr/gr_falcon.h>
#include <nvgpu/io.h>
#include <nvgpu/debug.h>
@@ -28,6 +29,402 @@
#include <nvgpu/hw/gm20b/hw_gr_gm20b.h>
#define FECS_ARB_CMD_TIMEOUT_MAX_US 40U
#define FECS_ARB_CMD_TIMEOUT_DEFAULT_US 2U
void gm20b_gr_falcon_load_gpccs_dmem(struct gk20a *g,
const u32 *ucode_u32_data, u32 ucode_u32_size)
{
u32 i, checksum;
/* enable access for gpccs dmem */
nvgpu_writel(g, gr_gpccs_dmemc_r(0), (gr_gpccs_dmemc_offs_f(0) |
gr_gpccs_dmemc_blk_f(0) |
gr_gpccs_dmemc_aincw_f(1)));
for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
nvgpu_writel(g, gr_gpccs_dmemd_r(0), ucode_u32_data[i]);
checksum += ucode_u32_data[i];
}
nvgpu_log_info(g, "gpccs dmem checksum: 0x%x", checksum);
}
void gm20b_gr_falcon_load_fecs_dmem(struct gk20a *g,
const u32 *ucode_u32_data, u32 ucode_u32_size)
{
u32 i, checksum;
/* set access for fecs dmem */
nvgpu_writel(g, gr_fecs_dmemc_r(0), (gr_fecs_dmemc_offs_f(0) |
gr_fecs_dmemc_blk_f(0) |
gr_fecs_dmemc_aincw_f(1)));
for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
nvgpu_writel(g, gr_fecs_dmemd_r(0), ucode_u32_data[i]);
checksum += ucode_u32_data[i];
}
nvgpu_log_info(g, "fecs dmem checksum: 0x%x", checksum);
}
void gm20b_gr_falcon_load_gpccs_imem(struct gk20a *g,
const u32 *ucode_u32_data, u32 ucode_u32_size)
{
u32 cfg, gpccs_imem_size;
u32 tag, i, pad_start, pad_end;
u32 checksum;
/* enable access for gpccs imem */
nvgpu_writel(g, gr_gpccs_imemc_r(0), (gr_gpccs_imemc_offs_f(0) |
gr_gpccs_imemc_blk_f(0) |
gr_gpccs_imemc_aincw_f(1)));
cfg = nvgpu_readl(g, gr_gpc0_cfg_r());
gpccs_imem_size = gr_gpc0_cfg_imem_sz_v(cfg);
/* Setup the tags for the instruction memory. */
tag = 0;
nvgpu_writel(g, gr_gpccs_imemt_r(0), gr_gpccs_imemt_tag_f(tag));
for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
if ((i != 0U) && ((i % (256U/sizeof(u32))) == 0U)) {
tag++;
nvgpu_writel(g, gr_gpccs_imemt_r(0),
gr_gpccs_imemt_tag_f(tag));
}
nvgpu_writel(g, gr_gpccs_imemd_r(0), ucode_u32_data[i]);
checksum += ucode_u32_data[i];
}
pad_start = i * 4U;
pad_end = pad_start + (256U - pad_start % 256U) + 256U;
for (i = pad_start;
(i < gpccs_imem_size * 256U) && (i < pad_end); i += 4U) {
if ((i != 0U) && ((i % 256U) == 0U)) {
tag++;
nvgpu_writel(g, gr_gpccs_imemt_r(0),
gr_gpccs_imemt_tag_f(tag));
}
nvgpu_writel(g, gr_gpccs_imemd_r(0), 0);
}
nvgpu_log_info(g, "gpccs imem checksum: 0x%x", checksum);
}
void gm20b_gr_falcon_load_fecs_imem(struct gk20a *g,
const u32 *ucode_u32_data, u32 ucode_u32_size)
{
u32 cfg, fecs_imem_size;
u32 tag, i, pad_start, pad_end;
u32 checksum;
/* set access for fecs imem */
nvgpu_writel(g, gr_fecs_imemc_r(0), (gr_fecs_imemc_offs_f(0) |
gr_fecs_imemc_blk_f(0) |
gr_fecs_imemc_aincw_f(1)));
cfg = nvgpu_readl(g, gr_fecs_cfg_r());
fecs_imem_size = gr_fecs_cfg_imem_sz_v(cfg);
/* Setup the tags for the instruction memory. */
tag = 0;
nvgpu_writel(g, gr_fecs_imemt_r(0), gr_fecs_imemt_tag_f(tag));
for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
if ((i != 0U) && ((i % (256U/sizeof(u32))) == 0U)) {
tag++;
nvgpu_writel(g, gr_fecs_imemt_r(0),
gr_fecs_imemt_tag_f(tag));
}
nvgpu_writel(g, gr_fecs_imemd_r(0), ucode_u32_data[i]);
checksum += ucode_u32_data[i];
}
pad_start = i * 4U;
pad_end = pad_start + (256U - pad_start % 256U) + 256U;
for (i = pad_start;
(i < fecs_imem_size * 256U) && i < pad_end;
i += 4U) {
if ((i != 0U) && ((i % 256U) == 0U)) {
tag++;
nvgpu_writel(g, gr_fecs_imemt_r(0),
gr_fecs_imemt_tag_f(tag));
}
nvgpu_writel(g, gr_fecs_imemd_r(0), 0);
}
nvgpu_log_info(g, "fecs imem checksum: 0x%x", checksum);
}
u32 gm20b_gr_falcon_get_gpccs_start_reg_offset(void)
{
return (gr_gpcs_gpccs_falcon_hwcfg_r() - gr_fecs_falcon_hwcfg_r());
}
void gm20b_gr_falcon_configure_fmodel(struct gk20a *g)
{
nvgpu_log_fn(g, " ");
nvgpu_writel(g, gr_fecs_ctxsw_mailbox_r(7),
gr_fecs_ctxsw_mailbox_value_f(0xc0de7777U));
nvgpu_writel(g, gr_gpccs_ctxsw_mailbox_r(7),
gr_gpccs_ctxsw_mailbox_value_f(0xc0de7777U));
}
void gm20b_gr_falcon_start_ucode(struct gk20a *g)
{
nvgpu_log_fn(g, " ");
nvgpu_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0U),
gr_fecs_ctxsw_mailbox_clear_value_f(~U32(0U)));
nvgpu_writel(g, gr_gpccs_dmactl_r(), gr_gpccs_dmactl_require_ctx_f(0U));
nvgpu_writel(g, gr_fecs_dmactl_r(), gr_fecs_dmactl_require_ctx_f(0U));
nvgpu_writel(g, gr_gpccs_cpuctl_r(), gr_gpccs_cpuctl_startcpu_f(1U));
nvgpu_writel(g, gr_fecs_cpuctl_r(), gr_fecs_cpuctl_startcpu_f(1U));
nvgpu_log_fn(g, "done");
}
void gm20b_gr_falcon_start_gpccs(struct gk20a *g)
{
u32 reg_offset = gr_gpcs_gpccs_falcon_hwcfg_r() -
gr_fecs_falcon_hwcfg_r();
if (nvgpu_is_enabled(g, NVGPU_SEC_SECUREGPCCS)) {
nvgpu_writel(g, reg_offset +
gr_fecs_cpuctl_alias_r(),
gr_gpccs_cpuctl_startcpu_f(1U));
} else {
nvgpu_writel(g, gr_gpccs_dmactl_r(),
gr_gpccs_dmactl_require_ctx_f(0U));
nvgpu_writel(g, gr_gpccs_cpuctl_r(),
gr_gpccs_cpuctl_startcpu_f(1U));
}
}
void gm20b_gr_falcon_start_fecs(struct gk20a *g)
{
nvgpu_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0U), ~U32(0U));
nvgpu_writel(g, gr_fecs_ctxsw_mailbox_r(1U), 1U);
nvgpu_writel(g, gr_fecs_ctxsw_mailbox_clear_r(6U), 0xffffffffU);
nvgpu_writel(g, gr_fecs_cpuctl_alias_r(),
gr_fecs_cpuctl_startcpu_f(1U));
}
static void gm20b_gr_falcon_wait_for_fecs_arb_idle(struct gk20a *g)
{
int retries = FECS_ARB_CMD_TIMEOUT_MAX_US /
FECS_ARB_CMD_TIMEOUT_DEFAULT_US;
u32 val;
val = nvgpu_readl(g, gr_fecs_arb_ctx_cmd_r());
while ((gr_fecs_arb_ctx_cmd_cmd_v(val) != 0U) && (retries != 0)) {
nvgpu_udelay(FECS_ARB_CMD_TIMEOUT_DEFAULT_US);
retries--;
val = nvgpu_readl(g, gr_fecs_arb_ctx_cmd_r());
}
if (retries == 0) {
nvgpu_err(g, "arbiter cmd timeout, fecs arb ctx cmd: 0x%08x",
nvgpu_readl(g, gr_fecs_arb_ctx_cmd_r()));
}
retries = FECS_ARB_CMD_TIMEOUT_MAX_US /
FECS_ARB_CMD_TIMEOUT_DEFAULT_US;
while (((nvgpu_readl(g, gr_fecs_ctxsw_status_1_r()) &
gr_fecs_ctxsw_status_1_arb_busy_m()) != 0U) &&
(retries != 0)) {
nvgpu_udelay(FECS_ARB_CMD_TIMEOUT_DEFAULT_US);
retries--;
}
if (retries == 0) {
nvgpu_err(g,
"arbiter idle timeout, fecs ctxsw status: 0x%08x",
nvgpu_readl(g, gr_fecs_ctxsw_status_1_r()));
}
}
void gm20b_gr_falcon_bind_instblk(struct gk20a *g,
struct nvgpu_mem *mem, u64 inst_ptr)
{
u32 retries = FECS_ARB_CMD_TIMEOUT_MAX_US /
FECS_ARB_CMD_TIMEOUT_DEFAULT_US;
u32 inst_ptr_u32;
nvgpu_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0), 0x0);
while (((nvgpu_readl(g, gr_fecs_ctxsw_status_1_r()) &
gr_fecs_ctxsw_status_1_arb_busy_m()) != 0U) &&
(retries != 0)) {
nvgpu_udelay(FECS_ARB_CMD_TIMEOUT_DEFAULT_US);
retries--;
}
if (retries == 0) {
nvgpu_err(g,
"arbiter idle timeout, status: %08x",
nvgpu_readl(g, gr_fecs_ctxsw_status_1_r()));
}
nvgpu_writel(g, gr_fecs_arb_ctx_adr_r(), 0x0);
inst_ptr >>= 12;
BUG_ON(u64_hi32(inst_ptr) != 0U);
inst_ptr_u32 = (u32)inst_ptr;
nvgpu_writel(g, gr_fecs_new_ctx_r(),
gr_fecs_new_ctx_ptr_f(inst_ptr_u32) |
nvgpu_aperture_mask(g, mem,
gr_fecs_new_ctx_target_sys_mem_ncoh_f(),
gr_fecs_new_ctx_target_sys_mem_coh_f(),
gr_fecs_new_ctx_target_vid_mem_f()) |
gr_fecs_new_ctx_valid_m());
nvgpu_writel(g, gr_fecs_arb_ctx_ptr_r(),
gr_fecs_arb_ctx_ptr_ptr_f(inst_ptr_u32) |
nvgpu_aperture_mask(g, mem,
gr_fecs_arb_ctx_ptr_target_sys_mem_ncoh_f(),
gr_fecs_arb_ctx_ptr_target_sys_mem_coh_f(),
gr_fecs_arb_ctx_ptr_target_vid_mem_f()));
nvgpu_writel(g, gr_fecs_arb_ctx_cmd_r(), 0x7);
/* Wait for arbiter command to complete */
gm20b_gr_falcon_wait_for_fecs_arb_idle(g);
nvgpu_writel(g, gr_fecs_current_ctx_r(),
gr_fecs_current_ctx_ptr_f(inst_ptr_u32) |
gr_fecs_current_ctx_target_m() |
gr_fecs_current_ctx_valid_m());
/* Send command to arbiter to flush */
nvgpu_writel(g, gr_fecs_arb_ctx_cmd_r(), gr_fecs_arb_ctx_cmd_cmd_s());
gm20b_gr_falcon_wait_for_fecs_arb_idle(g);
}
void gm20b_gr_falcon_load_ctxsw_ucode_header(struct gk20a *g,
u32 reg_offset, u32 boot_signature, u32 addr_code32,
u32 addr_data32, u32 code_size, u32 data_size)
{
nvgpu_writel(g, reg_offset + gr_fecs_dmactl_r(),
gr_fecs_dmactl_require_ctx_f(0));
/*
* Copy falcon bootloader header into dmem at offset 0.
* Configure dmem port 0 for auto-incrementing writes starting at dmem
* offset 0.
*/
nvgpu_writel(g, reg_offset + gr_fecs_dmemc_r(0),
gr_fecs_dmemc_offs_f(0) |
gr_fecs_dmemc_blk_f(0) |
gr_fecs_dmemc_aincw_f(1));
/* Write out the actual data */
switch (boot_signature) {
case FALCON_UCODE_SIG_T18X_GPCCS_WITH_RESERVED:
case FALCON_UCODE_SIG_T21X_FECS_WITH_DMEM_SIZE:
case FALCON_UCODE_SIG_T21X_FECS_WITH_RESERVED:
case FALCON_UCODE_SIG_T21X_GPCCS_WITH_RESERVED:
case FALCON_UCODE_SIG_T12X_FECS_WITH_RESERVED:
case FALCON_UCODE_SIG_T12X_GPCCS_WITH_RESERVED:
nvgpu_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
nvgpu_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
nvgpu_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
nvgpu_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
/* fallthrough */
case FALCON_UCODE_SIG_T12X_FECS_WITHOUT_RESERVED:
case FALCON_UCODE_SIG_T12X_GPCCS_WITHOUT_RESERVED:
case FALCON_UCODE_SIG_T21X_FECS_WITHOUT_RESERVED:
case FALCON_UCODE_SIG_T21X_FECS_WITHOUT_RESERVED2:
case FALCON_UCODE_SIG_T21X_GPCCS_WITHOUT_RESERVED:
nvgpu_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
nvgpu_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
nvgpu_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
nvgpu_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
nvgpu_writel(g, reg_offset + gr_fecs_dmemd_r(0), 4);
nvgpu_writel(g, reg_offset + gr_fecs_dmemd_r(0),
addr_code32);
nvgpu_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
nvgpu_writel(g, reg_offset + gr_fecs_dmemd_r(0),
code_size);
nvgpu_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
nvgpu_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
nvgpu_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
nvgpu_writel(g, reg_offset + gr_fecs_dmemd_r(0),
addr_data32);
nvgpu_writel(g, reg_offset + gr_fecs_dmemd_r(0),
data_size);
break;
case FALCON_UCODE_SIG_T12X_FECS_OLDER:
case FALCON_UCODE_SIG_T12X_GPCCS_OLDER:
nvgpu_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
nvgpu_writel(g, reg_offset + gr_fecs_dmemd_r(0),
addr_code32);
nvgpu_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
nvgpu_writel(g, reg_offset + gr_fecs_dmemd_r(0),
code_size);
nvgpu_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
nvgpu_writel(g, reg_offset + gr_fecs_dmemd_r(0),
addr_data32);
nvgpu_writel(g, reg_offset + gr_fecs_dmemd_r(0),
data_size);
nvgpu_writel(g, reg_offset + gr_fecs_dmemd_r(0),
addr_code32);
nvgpu_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
nvgpu_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
break;
default:
nvgpu_err(g,
"unknown falcon ucode boot signature 0x%08x"
" with reg_offset 0x%08x",
boot_signature, reg_offset);
BUG();
}
}
void gm20b_gr_falcon_load_ctxsw_ucode_boot(struct gk20a *g, u32 reg_offset,
u32 boot_entry, u32 addr_load32, u32 blocks, u32 dst)
{
u32 b;
/*
* Set the base FB address for the DMA transfer. Subtract off the 256
* byte IMEM block offset such that the relative FB and IMEM offsets
* match, allowing the IMEM tags to be properly created.
*/
nvgpu_writel(g, reg_offset + gr_fecs_dmatrfbase_r(),
(addr_load32 - (dst >> 8)));
for (b = 0; b < blocks; b++) {
/* Setup destination IMEM offset */
nvgpu_writel(g, reg_offset + gr_fecs_dmatrfmoffs_r(),
dst + (b << 8));
/* Setup source offset (relative to BASE) */
nvgpu_writel(g, reg_offset + gr_fecs_dmatrffboffs_r(),
dst + (b << 8));
nvgpu_writel(g, reg_offset + gr_fecs_dmatrfcmd_r(),
gr_fecs_dmatrfcmd_imem_f(0x01) |
gr_fecs_dmatrfcmd_write_f(0x00) |
gr_fecs_dmatrfcmd_size_f(0x06) |
gr_fecs_dmatrfcmd_ctxdma_f(0));
}
/* Specify the falcon boot vector */
nvgpu_writel(g, reg_offset + gr_fecs_bootvec_r(),
gr_fecs_bootvec_vec_f(boot_entry));
/* start the falcon immediately if PRIV security is disabled*/
if (!nvgpu_is_enabled(g, NVGPU_SEC_PRIVSECURITY)) {
nvgpu_writel(g, reg_offset + gr_fecs_cpuctl_r(),
gr_fecs_cpuctl_startcpu_f(0x01));
}
}
u32 gm20b_gr_falcon_fecs_base_addr(void)
{
return gr_fecs_irqsset_r();

@@ -32,5 +32,26 @@ u32 gm20b_gr_falcon_gpccs_base_addr(void);
void gm20b_gr_falcon_fecs_dump_stats(struct gk20a *g);
u32 gm20b_gr_falcon_get_fecs_ctx_state_store_major_rev_id(struct gk20a *g);
u32 gm20b_gr_falcon_get_fecs_ctxsw_mailbox_size(void);
void gm20b_gr_falcon_load_gpccs_dmem(struct gk20a *g,
const u32 *ucode_u32_data, u32 ucode_u32_size);
void gm20b_gr_falcon_load_fecs_dmem(struct gk20a *g,
const u32 *ucode_u32_data, u32 ucode_u32_size);
void gm20b_gr_falcon_load_gpccs_imem(struct gk20a *g,
const u32 *ucode_u32_data, u32 ucode_u32_size);
void gm20b_gr_falcon_load_fecs_imem(struct gk20a *g,
const u32 *ucode_u32_data, u32 ucode_u32_size);
void gm20b_gr_falcon_configure_fmodel(struct gk20a *g);
void gm20b_gr_falcon_start_ucode(struct gk20a *g);
void gm20b_gr_falcon_start_gpccs(struct gk20a *g);
void gm20b_gr_falcon_start_fecs(struct gk20a *g);
u32 gm20b_gr_falcon_get_gpccs_start_reg_offset(void);
void gm20b_gr_falcon_bind_instblk(struct gk20a *g,
struct nvgpu_mem *mem, u64 inst_ptr);
void gm20b_gr_falcon_load_ctxsw_ucode_header(struct gk20a *g,
u32 reg_offset, u32 boot_signature, u32 addr_code32,
u32 addr_data32, u32 code_size, u32 data_size);
void gm20b_gr_falcon_load_ctxsw_ucode_boot(struct gk20a *g,
u32 reg_offset, u32 boot_entry, u32 addr_load32, u32 blocks,
u32 dst);
#endif /* NVGPU_GR_FALCON_GM20B_H */

@@ -285,11 +285,6 @@ struct gpu_ops {
u32 *num_ovr_perf_regs,
u32 **ovr_perf_regsr);
void (*set_hww_esr_report_mask)(struct gk20a *g);
void (*falcon_load_ucode)(struct gk20a *g,
u64 addr_base,
struct gk20a_ctxsw_ucode_segments *segments,
u32 reg_offset);
int (*load_ctxsw_ucode)(struct gk20a *g);
void (*set_gpc_tpc_mask)(struct gk20a *g, u32 gpc_index);
int (*alloc_obj_ctx)(struct channel_gk20a *c,
u32 class_num, u32 flags);
@@ -574,9 +569,33 @@ struct gpu_ops {
u32 (*gpccs_base_addr)(void);
void (*dump_stats)(struct gk20a *g);
u32 (*fecs_ctxsw_mailbox_size)(void);
u32 (*get_fecs_ctx_state_store_major_rev_id)
(struct gk20a *g);
u32 (*get_fecs_ctx_state_store_major_rev_id)(
struct gk20a *g);
void (*load_gpccs_dmem)(struct gk20a *g,
const u32 *ucode_u32_data, u32 size);
void (*load_fecs_dmem)(struct gk20a *g,
const u32 *ucode_u32_data, u32 size);
void (*load_gpccs_imem)(struct gk20a *g,
const u32 *ucode_u32_data, u32 size);
void (*load_fecs_imem)(struct gk20a *g,
const u32 *ucode_u32_data, u32 size);
void (*configure_fmodel)(struct gk20a *g);
void (*start_ucode)(struct gk20a *g);
void (*start_gpccs)(struct gk20a *g);
void (*start_fecs)(struct gk20a *g);
u32 (*get_gpccs_start_reg_offset)(void);
void (*bind_instblk)(struct gk20a *g,
struct nvgpu_mem *mem, u64 inst_ptr);
void (*load_ctxsw_ucode_header)(struct gk20a *g,
u32 reg_offset, u32 boot_signature,
u32 addr_code32, u32 addr_data32,
u32 code_size, u32 data_size);
void (*load_ctxsw_ucode_boot)(struct gk20a *g,
u32 reg_offset, u32 boot_entry,
u32 addr_load32, u32 blocks, u32 dst);
int (*load_ctxsw_ucode)(struct gk20a *g);
} falcon;
#ifdef CONFIG_GK20A_CTXSW_TRACE
struct {
int (*init)(struct gk20a *g);

@@ -27,6 +27,9 @@
struct gk20a;
int nvgpu_gr_falcon_init_ctxsw(struct gk20a *g);
int nvgpu_gr_falcon_init_ctxsw_ucode(struct gk20a *g);
int nvgpu_gr_falcon_load_ctxsw_ucode(struct gk20a *g);
int nvgpu_gr_falcon_load_secure_ctxsw_ucode(struct gk20a *g);
#endif /* NVGPU_GR_SUBCTX_H */
#endif /* NVGPU_GR_FALCON_H */

@@ -186,6 +186,7 @@
#include <nvgpu/gr/zbc.h>
#include <nvgpu/gr/fecs_trace.h>
#include <nvgpu/pmu/perf.h>
#include <nvgpu/gr/gr_falcon.h>
#include <nvgpu/hw/tu104/hw_proj_tu104.h>
#include <nvgpu/hw/tu104/hw_top_tu104.h>
@@ -410,8 +411,6 @@ static const struct gpu_ops tu104_ops = {
.get_sm_dsm_perf_regs = gv11b_gr_get_sm_dsm_perf_regs,
.get_sm_dsm_perf_ctrl_regs = gr_tu104_get_sm_dsm_perf_ctrl_regs,
.set_hww_esr_report_mask = gv11b_gr_set_hww_esr_report_mask,
.falcon_load_ucode = gr_gm20b_load_ctxsw_ucode_segments,
.load_ctxsw_ucode = gr_gm20b_load_ctxsw_ucode,
.set_gpc_tpc_mask = gr_gv100_set_gpc_tpc_mask,
.alloc_obj_ctx = gk20a_alloc_obj_ctx,
.is_tpc_addr = gr_gm20b_is_tpc_addr,
@@ -743,6 +742,23 @@ static const struct gpu_ops tu104_ops = {
gm20b_gr_falcon_get_fecs_ctxsw_mailbox_size,
.get_fecs_ctx_state_store_major_rev_id =
gm20b_gr_falcon_get_fecs_ctx_state_store_major_rev_id,
.load_gpccs_dmem = gm20b_gr_falcon_load_gpccs_dmem,
.load_fecs_dmem = gm20b_gr_falcon_load_fecs_dmem,
.load_gpccs_imem = gm20b_gr_falcon_load_gpccs_imem,
.load_fecs_imem = gm20b_gr_falcon_load_fecs_imem,
.configure_fmodel = gm20b_gr_falcon_configure_fmodel,
.start_ucode = gm20b_gr_falcon_start_ucode,
.start_gpccs = gm20b_gr_falcon_start_gpccs,
.start_fecs = gm20b_gr_falcon_start_fecs,
.get_gpccs_start_reg_offset =
gm20b_gr_falcon_get_gpccs_start_reg_offset,
.bind_instblk = gm20b_gr_falcon_bind_instblk,
.load_ctxsw_ucode_header =
gm20b_gr_falcon_load_ctxsw_ucode_header,
.load_ctxsw_ucode_boot =
gm20b_gr_falcon_load_ctxsw_ucode_boot,
.load_ctxsw_ucode =
nvgpu_gr_falcon_load_secure_ctxsw_ucode,
},
},
.fb = {
@@ -1465,7 +1481,8 @@ int tu104_init_hal(struct gk20a *g)
gops->cbc.ctrl = NULL;
gops->cbc.alloc_comptags = NULL;
gops->gr.load_ctxsw_ucode = gr_gk20a_load_ctxsw_ucode;
gops->gr.falcon.load_ctxsw_ucode =
nvgpu_gr_falcon_load_ctxsw_ucode;
/* Disable pmu pstate, as there is no pmu support */
nvgpu_set_enabled(g, NVGPU_PMU_PSTATE, false);