Open source GPL/LGPL release

This commit is contained in:
svcmobrel-release
2022-07-21 16:03:29 -07:00
commit f338182221
2260 changed files with 576813 additions and 0 deletions

File diff suppressed because it is too large.

@@ -0,0 +1,183 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#ifndef NVGPU_GR_CTX_PRIV_H
#define NVGPU_GR_CTX_PRIV_H
struct nvgpu_mem;
/**
* Patch context buffer descriptor structure.
*
* Pointer to this structure is maintained in #nvgpu_gr_ctx structure.
*/
struct patch_desc {
/**
* Memory to hold patch context buffer.
*/
struct nvgpu_mem mem;
/**
* Count of entries written into patch context buffer.
*/
u32 data_count;
};
#ifdef CONFIG_NVGPU_GRAPHICS
struct zcull_ctx_desc {
u64 gpu_va;
u32 ctx_sw_mode;
};
#endif
#ifdef CONFIG_NVGPU_DEBUGGER
struct pm_ctx_desc {
struct nvgpu_mem mem;
u64 gpu_va;
u32 pm_mode;
};
#endif
/**
* GR context descriptor structure.
*
* This structure stores various properties of all GR context buffers.
*/
struct nvgpu_gr_ctx_desc {
/**
* Array to store all GR context buffer sizes.
*/
u32 size[NVGPU_GR_CTX_COUNT];
#ifdef CONFIG_NVGPU_GRAPHICS
bool force_preemption_gfxp;
#endif
#ifdef CONFIG_NVGPU_CILP
bool force_preemption_cilp;
#endif
#ifdef CONFIG_DEBUG_FS
bool dump_ctxsw_stats_on_channel_close;
#endif
};
/**
* Graphics context buffer structure.
*
* This structure stores all the properties of a graphics context
* buffer. One graphics context is allocated per GPU Time Slice
* Group (TSG).
*/
struct nvgpu_gr_ctx {
/**
* Context ID read from graphics context buffer.
*/
u32 ctx_id;
/**
* Flag to indicate if above context ID is valid or not.
*/
bool ctx_id_valid;
/**
* Memory to hold graphics context buffer.
*/
struct nvgpu_mem mem;
#ifdef CONFIG_NVGPU_GFXP
struct nvgpu_mem preempt_ctxsw_buffer;
struct nvgpu_mem spill_ctxsw_buffer;
struct nvgpu_mem betacb_ctxsw_buffer;
struct nvgpu_mem pagepool_ctxsw_buffer;
struct nvgpu_mem gfxp_rtvcb_ctxsw_buffer;
#endif
/**
* Patch context buffer descriptor struct.
*/
struct patch_desc patch_ctx;
#ifdef CONFIG_NVGPU_GRAPHICS
struct zcull_ctx_desc zcull_ctx;
#endif
#ifdef CONFIG_NVGPU_DEBUGGER
struct pm_ctx_desc pm_ctx;
#endif
/**
* Graphics preemption mode of the graphics context.
*/
u32 graphics_preempt_mode;
/**
* Compute preemption mode of the graphics context.
*/
u32 compute_preempt_mode;
#ifdef CONFIG_NVGPU_NON_FUSA
bool golden_img_loaded;
#endif
#ifdef CONFIG_NVGPU_CILP
bool cilp_preempt_pending;
#endif
#ifdef CONFIG_NVGPU_DEBUGGER
bool boosted_ctx;
#endif
/**
* Array to store GPU virtual addresses of all global context
* buffers.
*/
u64 global_ctx_buffer_va[NVGPU_GR_CTX_VA_COUNT];
/**
* Array to store indexes of global context buffers
* corresponding to GPU virtual addresses above.
*/
u32 global_ctx_buffer_index[NVGPU_GR_CTX_VA_COUNT];
/**
* Flag to indicate if global context buffers are mapped and
* #global_ctx_buffer_va array is populated.
*/
bool global_ctx_buffer_mapped;
/**
* TSG identifier corresponding to the graphics context.
*/
u32 tsgid;
#ifdef CONFIG_NVGPU_SM_DIVERSITY
/** SM diversity configuration offset.
* This field is valid only if NVGPU_SUPPORT_SM_DIVERSITY is enabled;
* otherwise the input parameter is ignored.
* A valid offset ranges from 0 to
* (#gk20a.max_sm_diversity_config_count - 1).
*/
u32 sm_diversity_config;
#endif
};
#endif /* NVGPU_GR_CTX_PRIV_H */

@@ -0,0 +1,700 @@
/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <nvgpu/gk20a.h>
#include <nvgpu/nvgpu_init.h>
#include <nvgpu/list.h>
#include <nvgpu/log.h>
#include <nvgpu/log2.h>
#include <nvgpu/mm.h>
#include <nvgpu/circ_buf.h>
#include <nvgpu/timers.h>
#include <nvgpu/enabled.h>
#include <nvgpu/gr/global_ctx.h>
#include <nvgpu/gr/ctx.h>
#include <nvgpu/gr/subctx.h>
#include <nvgpu/gr/fecs_trace.h>
#include <nvgpu/gr/gr_utils.h>
static int nvgpu_gr_fecs_trace_periodic_polling(void *arg);
int nvgpu_gr_fecs_trace_add_context(struct gk20a *g, u32 context_ptr,
pid_t pid, u32 vmid, struct nvgpu_list_node *list)
{
struct nvgpu_gr_fecs_trace *trace = g->fecs_trace;
struct nvgpu_fecs_trace_context_entry *entry;
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_ctxsw,
"adding hash entry context_ptr=%x -> pid=%d, vmid=%d",
context_ptr, pid, vmid);
entry = nvgpu_kzalloc(g, sizeof(*entry));
if (entry == NULL) {
nvgpu_err(g,
"can't alloc new entry for context_ptr=%x pid=%d vmid=%d",
context_ptr, pid, vmid);
return -ENOMEM;
}
nvgpu_init_list_node(&entry->entry);
entry->context_ptr = context_ptr;
entry->pid = pid;
entry->vmid = vmid;
nvgpu_mutex_acquire(&trace->list_lock);
nvgpu_list_add_tail(&entry->entry, list);
nvgpu_mutex_release(&trace->list_lock);
return 0;
}
void nvgpu_gr_fecs_trace_remove_context(struct gk20a *g, u32 context_ptr,
struct nvgpu_list_node *list)
{
struct nvgpu_gr_fecs_trace *trace = g->fecs_trace;
struct nvgpu_fecs_trace_context_entry *entry, *tmp;
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_ctxsw,
"freeing entry context_ptr=%x", context_ptr);
nvgpu_mutex_acquire(&trace->list_lock);
nvgpu_list_for_each_entry_safe(entry, tmp, list,
nvgpu_fecs_trace_context_entry, entry) {
if (entry->context_ptr == context_ptr) {
nvgpu_list_del(&entry->entry);
nvgpu_log(g, gpu_dbg_ctxsw,
"freed entry=%p context_ptr=%x", entry,
entry->context_ptr);
nvgpu_kfree(g, entry);
break;
}
}
nvgpu_mutex_release(&trace->list_lock);
}
void nvgpu_gr_fecs_trace_remove_contexts(struct gk20a *g,
struct nvgpu_list_node *list)
{
struct nvgpu_gr_fecs_trace *trace = g->fecs_trace;
struct nvgpu_fecs_trace_context_entry *entry, *tmp;
nvgpu_mutex_acquire(&trace->list_lock);
nvgpu_list_for_each_entry_safe(entry, tmp, list,
nvgpu_fecs_trace_context_entry, entry) {
nvgpu_list_del(&entry->entry);
nvgpu_kfree(g, entry);
}
nvgpu_mutex_release(&trace->list_lock);
}
void nvgpu_gr_fecs_trace_find_pid(struct gk20a *g, u32 context_ptr,
struct nvgpu_list_node *list, pid_t *pid, u32 *vmid)
{
struct nvgpu_gr_fecs_trace *trace = g->fecs_trace;
struct nvgpu_fecs_trace_context_entry *entry;
nvgpu_mutex_acquire(&trace->list_lock);
nvgpu_list_for_each_entry(entry, list, nvgpu_fecs_trace_context_entry,
entry) {
if (entry->context_ptr == context_ptr) {
nvgpu_log(g, gpu_dbg_ctxsw,
"found context_ptr=%x -> pid=%d, vmid=%d",
entry->context_ptr, entry->pid, entry->vmid);
*pid = entry->pid;
*vmid = entry->vmid;
nvgpu_mutex_release(&trace->list_lock);
return;
}
}
nvgpu_mutex_release(&trace->list_lock);
*pid = 0;
*vmid = 0xffffffffU;
}
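/*
 * Usage sketch (illustration only, not part of the driver): the three list
 * helpers above form a small context_ptr -> (pid, vmid) lookup table. A
 * hypothetical caller would pair them as below; the literal ids and the
 * "list" argument are placeholders.
 */
static int fecs_trace_context_list_sketch(struct gk20a *g,
		struct nvgpu_list_node *list)
{
	pid_t pid;
	u32 vmid;
	int err;

	/* remember which pid/vmid owns the instance block at 0x1234 */
	err = nvgpu_gr_fecs_trace_add_context(g, 0x1234U, 42, 0U, list);
	if (err != 0) {
		return err;
	}

	/* later, resolve the owner of a hardware trace record */
	nvgpu_gr_fecs_trace_find_pid(g, 0x1234U, list, &pid, &vmid);

	/* drop the mapping when the channel is unbound */
	nvgpu_gr_fecs_trace_remove_context(g, 0x1234U, list);

	return 0;
}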
int nvgpu_gr_fecs_trace_init(struct gk20a *g)
{
struct nvgpu_gr_fecs_trace *trace;
if (!is_power_of_2(GK20A_FECS_TRACE_NUM_RECORDS)) {
nvgpu_err(g, "invalid NUM_RECORDS chosen");
nvgpu_set_enabled(g, NVGPU_SUPPORT_FECS_CTXSW_TRACE, false);
return -EINVAL;
}
trace = nvgpu_kzalloc(g, sizeof(struct nvgpu_gr_fecs_trace));
if (trace == NULL) {
nvgpu_err(g, "failed to allocate fecs_trace");
nvgpu_set_enabled(g, NVGPU_SUPPORT_FECS_CTXSW_TRACE, false);
return -ENOMEM;
}
g->fecs_trace = trace;
nvgpu_mutex_init(&trace->poll_lock);
nvgpu_mutex_init(&trace->list_lock);
nvgpu_mutex_init(&trace->enable_lock);
nvgpu_init_list_node(&trace->context_list);
trace->enable_count = 0;
return 0;
}
int nvgpu_gr_fecs_trace_deinit(struct gk20a *g)
{
struct nvgpu_gr_fecs_trace *trace = g->fecs_trace;
if (trace == NULL) {
return 0;
}
/*
* Check if tracer was enabled before attempting to stop the
* tracer thread.
*/
if (trace->enable_count > 0) {
nvgpu_thread_stop(&trace->poll_task);
}
nvgpu_gr_fecs_trace_remove_contexts(g, &trace->context_list);
nvgpu_mutex_destroy(&g->fecs_trace->list_lock);
nvgpu_mutex_destroy(&g->fecs_trace->poll_lock);
nvgpu_mutex_destroy(&g->fecs_trace->enable_lock);
nvgpu_kfree(g, g->fecs_trace);
g->fecs_trace = NULL;
return 0;
}
int nvgpu_gr_fecs_trace_num_ts(struct gk20a *g)
{
return (g->ops.gr.ctxsw_prog.hw_get_ts_record_size_in_bytes()
- sizeof(struct nvgpu_fecs_trace_record)) / sizeof(u64);
}
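/*
 * Worked example (illustration only, sizes are hypothetical): if the
 * ctxsw_prog HAL reported a 128-byte timestamp record and the
 * nvgpu_fecs_trace_record header occupied 32 bytes, the function above
 * would return (128 - 32) / sizeof(u64) = 12 timestamp entries per record.
 */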
struct nvgpu_fecs_trace_record *nvgpu_gr_fecs_trace_get_record(
struct gk20a *g, int idx)
{
struct nvgpu_gr_global_ctx_buffer_desc *gr_global_ctx_buffer =
nvgpu_gr_get_global_ctx_buffer_ptr(g);
struct nvgpu_mem *mem = nvgpu_gr_global_ctx_buffer_get_mem(
gr_global_ctx_buffer,
NVGPU_GR_GLOBAL_CTX_FECS_TRACE_BUFFER);
if (mem == NULL) {
return NULL;
}
return (struct nvgpu_fecs_trace_record *)
((u8 *) mem->cpu_va +
(idx * g->ops.gr.ctxsw_prog.hw_get_ts_record_size_in_bytes()));
}
bool nvgpu_gr_fecs_trace_is_valid_record(struct gk20a *g,
struct nvgpu_fecs_trace_record *r)
{
/*
* testing magic_hi should suffice. magic_lo is sometimes used
* as a sequence number in experimental ucode.
*/
return g->ops.gr.ctxsw_prog.is_ts_valid_record(r->magic_hi);
}
size_t nvgpu_gr_fecs_trace_buffer_size(struct gk20a *g)
{
return GK20A_FECS_TRACE_NUM_RECORDS
* g->ops.gr.ctxsw_prog.hw_get_ts_record_size_in_bytes();
}
int nvgpu_gr_fecs_trace_max_entries(struct gk20a *g,
struct nvgpu_gpu_ctxsw_trace_filter *filter)
{
int n;
int tag;
/* Compute number of entries per record, with given filter */
for (n = 0, tag = 0; tag < nvgpu_gr_fecs_trace_num_ts(g); tag++)
n += (NVGPU_GPU_CTXSW_FILTER_ISSET(tag, filter) != 0);
/* Return max number of entries generated for the whole ring */
return n * GK20A_FECS_TRACE_NUM_RECORDS;
}
int nvgpu_gr_fecs_trace_enable(struct gk20a *g)
{
struct nvgpu_gr_fecs_trace *trace = g->fecs_trace;
int write;
int err = 0;
nvgpu_mutex_acquire(&trace->enable_lock);
trace->enable_count++;
if (trace->enable_count == 1U) {
/* drop data in hw buffer */
if (g->ops.gr.fecs_trace.flush)
g->ops.gr.fecs_trace.flush(g);
write = g->ops.gr.fecs_trace.get_write_index(g);
if (nvgpu_is_enabled(g, NVGPU_FECS_TRACE_FEATURE_CONTROL)) {
/*
* For enabling FECS trace support, MAILBOX1's MSB
* (Bit 31:31) should be set to 1. Bits 30:0 represent the
* actual pointer value.
*/
write = write |
(BIT32(NVGPU_FECS_TRACE_FEATURE_CONTROL_BIT));
}
g->ops.gr.fecs_trace.set_read_index(g, write);
/*
* FECS ucode does a priv holdoff around the assertion of
* context reset. So, pri transactions (e.g. mailbox1 register
* write) might fail due to this. Hence, do write with ack
* i.e. write and read it back to make sure write happened for
* mailbox1.
*/
while (g->ops.gr.fecs_trace.get_read_index(g) != write) {
nvgpu_log(g, gpu_dbg_ctxsw, "mailbox1 update failed");
g->ops.gr.fecs_trace.set_read_index(g, write);
}
err = nvgpu_thread_create(&trace->poll_task, g,
nvgpu_gr_fecs_trace_periodic_polling, __func__);
if (err != 0) {
nvgpu_warn(g, "failed to create FECS polling task");
goto done;
}
}
done:
nvgpu_mutex_release(&trace->enable_lock);
return err;
}
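/*
 * Sketch (illustration only, these helpers are hypothetical): the value
 * programmed into MAILBOX1 above packs the trace-enable flag into bit 31
 * and keeps the record index in bits 30:0 when
 * NVGPU_FECS_TRACE_FEATURE_CONTROL is enabled. The helpers below merely
 * restate that encoding.
 */
static inline u32 fecs_trace_mailbox1_pack(u32 index, bool tracing_on)
{
	u32 val = index & ~BIT32(NVGPU_FECS_TRACE_FEATURE_CONTROL_BIT);

	if (tracing_on) {
		val |= BIT32(NVGPU_FECS_TRACE_FEATURE_CONTROL_BIT);
	}
	return val;
}

static inline u32 fecs_trace_mailbox1_index(u32 val)
{
	return val & ~BIT32(NVGPU_FECS_TRACE_FEATURE_CONTROL_BIT);
}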
int nvgpu_gr_fecs_trace_disable(struct gk20a *g)
{
struct nvgpu_gr_fecs_trace *trace = g->fecs_trace;
int read = 0;
if (trace == NULL) {
return -EINVAL;
}
nvgpu_mutex_acquire(&trace->enable_lock);
if (trace->enable_count <= 0U) {
nvgpu_mutex_release(&trace->enable_lock);
return 0;
}
trace->enable_count--;
if (trace->enable_count == 0U) {
if (nvgpu_is_enabled(g, NVGPU_FECS_TRACE_FEATURE_CONTROL)) {
/*
* For disabling FECS trace support, MAILBOX1's MSB
* (Bit 31:31) should be set to 0.
*/
read = g->ops.gr.fecs_trace.get_read_index(g) &
(~(BIT32(NVGPU_FECS_TRACE_FEATURE_CONTROL_BIT)));
g->ops.gr.fecs_trace.set_read_index(g, read);
/*
* FECS ucode does a priv holdoff around the assertion
* of context reset. So, pri transactions (e.g.
* mailbox1 register write) might fail due to this.
* Hence, do write with ack i.e. write and read it back
* to make sure write happened for mailbox1.
*/
while (g->ops.gr.fecs_trace.get_read_index(g) != read) {
nvgpu_log(g, gpu_dbg_ctxsw,
"mailbox1 update failed");
g->ops.gr.fecs_trace.set_read_index(g, read);
}
}
nvgpu_thread_stop(&trace->poll_task);
}
nvgpu_mutex_release(&trace->enable_lock);
return 0;
}
bool nvgpu_gr_fecs_trace_is_enabled(struct gk20a *g)
{
struct nvgpu_gr_fecs_trace *trace = g->fecs_trace;
return (trace && (trace->enable_count > 0));
}
void nvgpu_gr_fecs_trace_reset_buffer(struct gk20a *g)
{
nvgpu_log(g, gpu_dbg_fn|gpu_dbg_ctxsw, " ");
g->ops.gr.fecs_trace.set_read_index(g,
g->ops.gr.fecs_trace.get_write_index(g));
}
/*
* Converts HW entry format to userspace-facing format and pushes it to the
* queue.
*/
int nvgpu_gr_fecs_trace_ring_read(struct gk20a *g, int index,
u32 *vm_update_mask)
{
int i;
struct nvgpu_gpu_ctxsw_trace_entry entry = { };
struct nvgpu_gr_fecs_trace *trace = g->fecs_trace;
pid_t cur_pid = 0, new_pid = 0;
u32 cur_vmid = 0U, new_vmid = 0U;
u32 vmid = 0U;
int count = 0;
struct nvgpu_fecs_trace_record *r =
nvgpu_gr_fecs_trace_get_record(g, index);
if (r == NULL) {
return -EINVAL;
}
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_ctxsw,
"consuming record trace=%p read=%d record=%p", trace, index, r);
if (!nvgpu_gr_fecs_trace_is_valid_record(g, r)) {
nvgpu_warn(g,
"trace=%p read=%d record=%p magic_lo=%08x magic_hi=%08x (invalid)",
trace, index, r, r->magic_lo, r->magic_hi);
return -EINVAL;
}
/* Clear magic_hi to detect cases where the CPU could read the write index
* before the FECS record is actually written to DRAM. This should not
* happen as we force FECS writes to SYSMEM by reading through PRAMIN.
*/
r->magic_hi = 0;
if ((r->context_ptr != 0U) && (r->context_id != 0U)) {
nvgpu_gr_fecs_trace_find_pid(g, r->context_ptr,
&trace->context_list, &cur_pid, &cur_vmid);
} else {
cur_vmid = 0xffffffffU;
cur_pid = 0;
}
if (r->new_context_ptr != 0U) {
nvgpu_gr_fecs_trace_find_pid(g, r->new_context_ptr,
&trace->context_list, &new_pid, &new_vmid);
} else {
new_vmid = 0xffffffffU;
new_pid = 0;
}
nvgpu_log(g, gpu_dbg_ctxsw,
"context_ptr=%x (vmid=%u pid=%d)",
r->context_ptr, cur_vmid, cur_pid);
nvgpu_log(g, gpu_dbg_ctxsw,
"new_context_ptr=%x (vmid=%u pid=%d)",
r->new_context_ptr, new_vmid, new_pid);
entry.context_id = r->context_id;
/* break out FECS record into trace events */
for (i = 0; i < nvgpu_gr_fecs_trace_num_ts(g); i++) {
entry.tag = g->ops.gr.ctxsw_prog.hw_get_ts_tag(r->ts[i]);
entry.timestamp =
g->ops.gr.ctxsw_prog.hw_record_ts_timestamp(r->ts[i]);
entry.timestamp <<= GK20A_FECS_TRACE_PTIMER_SHIFT;
nvgpu_log(g, gpu_dbg_ctxsw,
"tag=%x timestamp=%llx context_id=%08x new_context_id=%08x",
entry.tag, entry.timestamp, r->context_id,
r->new_context_id);
switch (nvgpu_gpu_ctxsw_tags_to_common_tags(entry.tag)) {
case NVGPU_GPU_CTXSW_TAG_RESTORE_START:
case NVGPU_GPU_CTXSW_TAG_CONTEXT_START:
entry.context_id = r->new_context_id;
entry.pid = new_pid;
entry.vmid = new_vmid;
break;
case NVGPU_GPU_CTXSW_TAG_CTXSW_REQ_BY_HOST:
case NVGPU_GPU_CTXSW_TAG_FE_ACK:
case NVGPU_GPU_CTXSW_TAG_FE_ACK_WFI:
case NVGPU_GPU_CTXSW_TAG_FE_ACK_GFXP:
case NVGPU_GPU_CTXSW_TAG_FE_ACK_CTAP:
case NVGPU_GPU_CTXSW_TAG_FE_ACK_CILP:
case NVGPU_GPU_CTXSW_TAG_SAVE_END:
entry.context_id = r->context_id;
entry.pid = cur_pid;
entry.vmid = cur_vmid;
break;
default:
/* tags are not guaranteed to start at the beginning */
if ((entry.tag != 0) && (entry.tag !=
NVGPU_GPU_CTXSW_TAG_INVALID_TIMESTAMP)) {
nvgpu_warn(g, "TAG not found");
}
continue;
}
nvgpu_log(g, gpu_dbg_ctxsw, "tag=%x context_id=%x pid=%lld",
entry.tag, entry.context_id, entry.pid);
if (!entry.context_id)
continue;
if (g->ops.gr.fecs_trace.vm_dev_write != NULL) {
g->ops.gr.fecs_trace.vm_dev_write(g, entry.vmid,
vm_update_mask, &entry);
} else {
nvgpu_gr_fecs_trace_write_entry(g, &entry);
}
count++;
}
nvgpu_gr_fecs_trace_wake_up(g, vmid);
return count;
}
int nvgpu_gr_fecs_trace_poll(struct gk20a *g)
{
struct nvgpu_gr_fecs_trace *trace = g->fecs_trace;
u32 vm_update_mask = 0U;
int read = 0;
int write = 0;
int cnt;
int err = 0;
nvgpu_mutex_acquire(&trace->poll_lock);
if (trace->enable_count == 0) {
goto done_unlock;
}
err = gk20a_busy(g);
if (err) {
goto done_unlock;
}
write = g->ops.gr.fecs_trace.get_write_index(g);
if ((write < 0) || (write >= GK20A_FECS_TRACE_NUM_RECORDS)) {
nvgpu_err(g,
"failed to acquire write index, write=%d", write);
err = write;
goto done;
}
read = g->ops.gr.fecs_trace.get_read_index(g);
cnt = CIRC_CNT(write, read, GK20A_FECS_TRACE_NUM_RECORDS);
if (!cnt)
goto done;
nvgpu_log(g, gpu_dbg_ctxsw,
"circular buffer: read=%d (mailbox=%d) write=%d cnt=%d",
read, g->ops.gr.fecs_trace.get_read_index(g), write, cnt);
/* Ensure all FECS writes have made it to SYSMEM */
err = g->ops.mm.cache.fb_flush(g);
if (err != 0) {
nvgpu_err(g, "mm.cache.fb_flush() failed err=%d", err);
goto done;
}
if (nvgpu_is_enabled(g, NVGPU_FECS_TRACE_FEATURE_CONTROL)) {
/* Bits 30:0 of MAILBOX1 represent the actual read pointer value */
read = read & (~(BIT32(NVGPU_FECS_TRACE_FEATURE_CONTROL_BIT)));
}
while (read != write) {
cnt = nvgpu_gr_fecs_trace_ring_read(g, read, &vm_update_mask);
if (cnt <= 0) {
break;
}
/* Get to next record. */
read = (read + 1) & (GK20A_FECS_TRACE_NUM_RECORDS - 1);
}
if (nvgpu_is_enabled(g, NVGPU_FECS_TRACE_FEATURE_CONTROL)) {
/*
* In the next step, read pointer is going to be updated.
* So, MSB of read pointer should be set back to 1. This will
* keep FECS trace enabled.
*/
read = read | (BIT32(NVGPU_FECS_TRACE_FEATURE_CONTROL_BIT));
}
/* ensure FECS records have been updated before incrementing read index */
nvgpu_wmb();
g->ops.gr.fecs_trace.set_read_index(g, read);
/*
* FECS ucode does a priv holdoff around the assertion of context
* reset. So, pri transactions (e.g. mailbox1 register write) might
* fail due to this. Hence, do write with ack i.e. write and read
* it back to make sure write happened for mailbox1.
*/
while (g->ops.gr.fecs_trace.get_read_index(g) != read) {
nvgpu_log(g, gpu_dbg_ctxsw, "mailbox1 update failed");
g->ops.gr.fecs_trace.set_read_index(g, read);
}
if (g->ops.gr.fecs_trace.vm_dev_update) {
g->ops.gr.fecs_trace.vm_dev_update(g, vm_update_mask);
}
done:
gk20a_idle(g);
done_unlock:
nvgpu_mutex_release(&trace->poll_lock);
return err;
}
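/*
 * Sketch (illustration only, helper is hypothetical): since
 * GK20A_FECS_TRACE_NUM_RECORDS is required to be a power of two in
 * nvgpu_gr_fecs_trace_init(), the "get to next record" step in the poll
 * loop above is a simple mask rather than a modulo:
 */
static inline int fecs_trace_next_record_idx(int idx)
{
	return (idx + 1) & (GK20A_FECS_TRACE_NUM_RECORDS - 1);
}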
static int nvgpu_gr_fecs_trace_periodic_polling(void *arg)
{
struct gk20a *g = (struct gk20a *)arg;
struct nvgpu_gr_fecs_trace *trace = g->fecs_trace;
nvgpu_log(g, gpu_dbg_ctxsw, "thread running");
while (!nvgpu_thread_should_stop(&trace->poll_task) &&
trace->enable_count > 0U) {
nvgpu_usleep_range(GK20A_FECS_TRACE_FRAME_PERIOD_US,
GK20A_FECS_TRACE_FRAME_PERIOD_US * 2U);
nvgpu_gr_fecs_trace_poll(g);
}
return 0;
}
int nvgpu_gr_fecs_trace_reset(struct gk20a *g)
{
nvgpu_log(g, gpu_dbg_fn|gpu_dbg_ctxsw, " ");
if (!g->ops.gr.fecs_trace.is_enabled(g))
return 0;
nvgpu_gr_fecs_trace_poll(g);
return g->ops.gr.fecs_trace.set_read_index(g, 0);
}
/*
* map global circ_buf to the context space and store the GPU VA
* in the context header.
*/
int nvgpu_gr_fecs_trace_bind_channel(struct gk20a *g,
struct nvgpu_mem *inst_block, struct nvgpu_gr_subctx *subctx,
struct nvgpu_gr_ctx *gr_ctx, pid_t pid, u32 vmid)
{
u64 addr = 0ULL;
struct nvgpu_gr_fecs_trace *trace = g->fecs_trace;
struct nvgpu_mem *mem;
struct nvgpu_gr_global_ctx_buffer_desc *gr_global_ctx_buffer =
nvgpu_gr_get_global_ctx_buffer_ptr(g);
u32 context_ptr;
u32 aperture_mask;
int ret;
if (trace == NULL) {
return -EINVAL;
}
context_ptr = nvgpu_inst_block_ptr(g, inst_block);
nvgpu_log(g, gpu_dbg_fn|gpu_dbg_ctxsw,
"pid=%d context_ptr=%x inst_block=%llx",
pid, context_ptr,
nvgpu_inst_block_addr(g, inst_block));
mem = nvgpu_gr_global_ctx_buffer_get_mem(gr_global_ctx_buffer,
NVGPU_GR_GLOBAL_CTX_FECS_TRACE_BUFFER);
if (mem == NULL) {
return -EINVAL;
}
if (nvgpu_is_enabled(g, NVGPU_FECS_TRACE_VA)) {
addr = nvgpu_gr_ctx_get_global_ctx_va(gr_ctx,
NVGPU_GR_CTX_FECS_TRACE_BUFFER_VA);
nvgpu_log(g, gpu_dbg_ctxsw, "gpu_va=%llx", addr);
aperture_mask = 0;
} else {
addr = nvgpu_inst_block_addr(g, mem);
nvgpu_log(g, gpu_dbg_ctxsw, "pa=%llx", addr);
aperture_mask =
g->ops.gr.ctxsw_prog.get_ts_buffer_aperture_mask(g, mem);
}
if (addr == 0ULL) {
return -ENOMEM;
}
mem = nvgpu_gr_ctx_get_ctx_mem(gr_ctx);
nvgpu_log(g, gpu_dbg_ctxsw, "addr=%llx count=%d", addr,
GK20A_FECS_TRACE_NUM_RECORDS);
g->ops.gr.ctxsw_prog.set_ts_num_records(g, mem,
GK20A_FECS_TRACE_NUM_RECORDS);
if (nvgpu_is_enabled(g, NVGPU_FECS_TRACE_VA) && subctx != NULL) {
mem = nvgpu_gr_subctx_get_ctx_header(subctx);
}
g->ops.gr.ctxsw_prog.set_ts_buffer_ptr(g, mem, addr, aperture_mask);
ret = nvgpu_gr_fecs_trace_add_context(g, context_ptr, pid, vmid,
&trace->context_list);
return ret;
}
int nvgpu_gr_fecs_trace_unbind_channel(struct gk20a *g,
struct nvgpu_mem *inst_block)
{
struct nvgpu_gr_fecs_trace *trace = g->fecs_trace;
u32 context_ptr;
if (trace == NULL) {
return -EINVAL;
}
context_ptr = nvgpu_inst_block_ptr(g, inst_block);
nvgpu_log(g, gpu_dbg_fn|gpu_dbg_ctxsw,
"context_ptr=%x", context_ptr);
if (g->ops.gr.fecs_trace.is_enabled(g)) {
if (g->ops.gr.fecs_trace.flush) {
g->ops.gr.fecs_trace.flush(g);
}
nvgpu_gr_fecs_trace_poll(g);
}
nvgpu_gr_fecs_trace_remove_context(g, context_ptr,
&trace->context_list);
return 0;
}

@@ -0,0 +1,196 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <nvgpu/gk20a.h>
#include <nvgpu/static_analysis.h>
#include <nvgpu/gr/config.h>
#include <nvgpu/gr/fs_state.h>
#include <nvgpu/gr/gr_instances.h>
#include <nvgpu/grmgr.h>
static int gr_load_sm_id_config(struct gk20a *g, struct nvgpu_gr_config *config)
{
int err;
u32 *tpc_sm_id;
u32 sm_id_size = g->ops.gr.init.get_sm_id_size();
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gr, " ");
tpc_sm_id = nvgpu_kcalloc(g, sm_id_size, sizeof(u32));
if (tpc_sm_id == NULL) {
return -ENOMEM;
}
err = g->ops.gr.init.sm_id_config(g, tpc_sm_id, config, NULL, false);
nvgpu_kfree(g, tpc_sm_id);
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gr, "done");
return err;
}
static void gr_load_tpc_mask(struct gk20a *g, struct nvgpu_gr_config *config)
{
u32 pes_tpc_mask = 0;
u32 gpc, pes;
u32 num_tpc_per_gpc = nvgpu_get_litter_value(g,
GPU_LIT_NUM_TPC_PER_GPC);
#ifdef CONFIG_NVGPU_NON_FUSA
u32 max_tpc_count = nvgpu_gr_config_get_max_tpc_count(config);
u32 fuse_tpc_mask;
u32 val;
u32 cur_gr_instance = nvgpu_gr_get_cur_instance_id(g);
u32 gpc_phys_id;
#endif
/* gv11b has 1 GPC and 4 TPC/GPC, so mask will not overflow u32 */
for (gpc = 0; gpc < nvgpu_gr_config_get_gpc_count(config); gpc++) {
for (pes = 0;
pes < nvgpu_gr_config_get_pe_count_per_gpc(config);
pes++) {
pes_tpc_mask |= nvgpu_gr_config_get_pes_tpc_mask(
config, gpc, pes) <<
nvgpu_safe_mult_u32(num_tpc_per_gpc, gpc);
}
}
nvgpu_log_info(g, "pes_tpc_mask %u\n", pes_tpc_mask);
#ifdef CONFIG_NVGPU_NON_FUSA
if (!nvgpu_is_enabled(g, NVGPU_SUPPORT_MIG)) {
/*
* Fuse registers must be queried with physical gpc-id and not
* the logical ones. For tu104 and earlier chips, the logical gpc-id
* is the same as the physical gpc-id in a non-floorswept config, but
* for chips after tu104 this may not hold.
*/
gpc_phys_id = nvgpu_grmgr_get_gr_gpc_phys_id(g,
cur_gr_instance, 0U);
fuse_tpc_mask = g->ops.gr.config.get_gpc_tpc_mask(g, config, gpc_phys_id);
if ((g->tpc_fs_mask_user != 0U) &&
(g->tpc_fs_mask_user != fuse_tpc_mask)) {
if (fuse_tpc_mask == nvgpu_safe_sub_u32(BIT32(max_tpc_count),
U32(1))) {
val = g->tpc_fs_mask_user;
val &= nvgpu_safe_sub_u32(BIT32(max_tpc_count), U32(1));
/*
* skip the TPC to be disabled; disabling the other TPC causes
* channel timeouts
*/
val = nvgpu_safe_sub_u32(BIT32(hweight32(val)), U32(1));
pes_tpc_mask = val;
}
}
}
#endif
g->ops.gr.init.tpc_mask(g, 0, pes_tpc_mask);
}
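/*
 * Worked example (illustration only, numbers are hypothetical): on a part
 * with one GPC, four TPCs per GPC and two PES units whose TPC masks are
 * 0x3 and 0xC, the loop in gr_load_tpc_mask() accumulates
 *
 *   pes_tpc_mask = (0x3 << (4 * 0)) | (0xC << (4 * 0)) = 0xF
 *
 * i.e. all four TPCs of GPC0 enabled. A second GPC would contribute its
 * PES masks shifted left by another num_tpc_per_gpc bits.
 */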
int nvgpu_gr_fs_state_init(struct gk20a *g, struct nvgpu_gr_config *config)
{
u32 tpc_index, gpc_index;
u32 sm_id = 0;
#ifdef CONFIG_NVGPU_NON_FUSA
u32 fuse_tpc_mask;
u32 max_tpc_cnt;
u32 cur_gr_instance = nvgpu_gr_get_cur_instance_id(g);
u32 gpc_phys_id;
#endif
u32 gpc_cnt, tpc_cnt;
u32 num_sm;
int err = 0;
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gr, " ");
g->ops.gr.init.fs_state(g);
err = g->ops.gr.config.init_sm_id_table(g, config);
if (err != 0) {
return err;
}
num_sm = nvgpu_gr_config_get_no_of_sm(config);
nvgpu_assert(num_sm > 0U);
for (sm_id = 0; sm_id < num_sm; sm_id++) {
struct nvgpu_sm_info *sm_info =
nvgpu_gr_config_get_sm_info(config, sm_id);
tpc_index = nvgpu_gr_config_get_sm_info_tpc_index(sm_info);
gpc_index = nvgpu_gr_config_get_sm_info_gpc_index(sm_info);
g->ops.gr.init.sm_id_numbering(g, gpc_index, tpc_index, sm_id,
config, NULL, false);
}
if (!nvgpu_is_enabled(g, NVGPU_SUPPORT_MIG)) {
g->ops.gr.init.pd_tpc_per_gpc(g, config);
}
#ifdef CONFIG_NVGPU_GRAPHICS
if (!nvgpu_is_enabled(g, NVGPU_SUPPORT_MIG)) {
/* gr__setup_pd_mapping */
g->ops.gr.init.rop_mapping(g, config);
g->ops.gr.init.pd_skip_table_gpc(g, config);
}
#endif
gpc_cnt = nvgpu_gr_config_get_gpc_count(config);
tpc_cnt = nvgpu_gr_config_get_tpc_count(config);
#ifdef CONFIG_NVGPU_NON_FUSA
if (!nvgpu_is_enabled(g, NVGPU_SUPPORT_MIG)) {
/*
* Fuse registers must be queried with physical gpc-id and not
* the logical ones. For tu104 and earlier chips, the logical gpc-id
* is the same as the physical gpc-id in a non-floorswept config, but
* for chips after tu104 this may not hold.
*/
gpc_phys_id = nvgpu_grmgr_get_gr_gpc_phys_id(g,
cur_gr_instance, 0U);
fuse_tpc_mask = g->ops.gr.config.get_gpc_tpc_mask(g, config, gpc_phys_id);
max_tpc_cnt = nvgpu_gr_config_get_max_tpc_count(config);
if ((g->tpc_fs_mask_user != 0U) &&
(fuse_tpc_mask ==
nvgpu_safe_sub_u32(BIT32(max_tpc_cnt), U32(1)))) {
u32 val = g->tpc_fs_mask_user;
val &= nvgpu_safe_sub_u32(BIT32(max_tpc_cnt), U32(1));
tpc_cnt = (u32)hweight32(val);
}
}
#endif
g->ops.gr.init.cwd_gpcs_tpcs_num(g, gpc_cnt, tpc_cnt);
gr_load_tpc_mask(g, config);
err = gr_load_sm_id_config(g, config);
if (err != 0) {
nvgpu_err(g, "load_smid_config failed err=%d", err);
}
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gr, "done");
return err;
}
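/*
 * Worked example (illustration only, values are hypothetical): with a fuse
 * TPC mask of 0xF (four TPCs present, max_tpc_cnt = 4) and
 * tpc_fs_mask_user = 0x7, the CONFIG_NVGPU_NON_FUSA block above reduces
 * the reported count to
 *
 *   tpc_cnt = hweight32(0x7 & 0xF) = 3
 *
 * before programming CWD via cwd_gpcs_tpcs_num().
 */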

@@ -0,0 +1,477 @@
/*
* Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <nvgpu/gk20a.h>
#include <nvgpu/log.h>
#include <nvgpu/nvgpu_mem.h>
#include <nvgpu/kmem.h>
#include <nvgpu/bug.h>
#include <nvgpu/dma.h>
#ifdef CONFIG_NVGPU_GR_GOLDEN_CTX_VERIFICATION
#include <nvgpu/static_analysis.h>
#include <nvgpu/string.h>
#endif
#include <nvgpu/gr/global_ctx.h>
#include "global_ctx_priv.h"
#ifdef NVGPU_UNITTEST_FAULT_INJECTION_ENABLEMENT
#include <nvgpu/posix/posix-fault-injection.h>
struct nvgpu_posix_fault_inj *nvgpu_golden_ctx_verif_get_fault_injection(void)
{
struct nvgpu_posix_fault_inj_container *c =
nvgpu_posix_fault_injection_get_container();
return &c->golden_ctx_verif_fi;
}
struct nvgpu_posix_fault_inj *nvgpu_local_golden_image_get_fault_injection(void)
{
struct nvgpu_posix_fault_inj_container *c =
nvgpu_posix_fault_injection_get_container();
return &c->local_golden_image_fi;
}
#endif
struct nvgpu_gr_global_ctx_buffer_desc *
nvgpu_gr_global_ctx_desc_alloc(struct gk20a *g)
{
struct nvgpu_gr_global_ctx_buffer_desc *desc =
nvgpu_kzalloc(g, sizeof(*desc) *
U64(NVGPU_GR_GLOBAL_CTX_COUNT));
return desc;
}
void nvgpu_gr_global_ctx_desc_free(struct gk20a *g,
struct nvgpu_gr_global_ctx_buffer_desc *desc)
{
nvgpu_kfree(g, desc);
}
void nvgpu_gr_global_ctx_set_size(struct nvgpu_gr_global_ctx_buffer_desc *desc,
u32 index, size_t size)
{
nvgpu_assert(index < NVGPU_GR_GLOBAL_CTX_COUNT);
desc[index].size = size;
}
size_t nvgpu_gr_global_ctx_get_size(struct nvgpu_gr_global_ctx_buffer_desc *desc,
u32 index)
{
return desc[index].size;
}
static void nvgpu_gr_global_ctx_buffer_destroy(struct gk20a *g,
struct nvgpu_mem *mem)
{
nvgpu_dma_free(g, mem);
}
void nvgpu_gr_global_ctx_buffer_free(struct gk20a *g,
struct nvgpu_gr_global_ctx_buffer_desc *desc)
{
u32 i;
if (desc == NULL) {
return;
}
for (i = 0; i < NVGPU_GR_GLOBAL_CTX_COUNT; i++) {
if (desc[i].destroy != NULL) {
desc[i].destroy(g, &desc[i].mem);
desc[i].destroy = NULL;
}
}
nvgpu_log_fn(g, "done");
}
static int nvgpu_gr_global_ctx_buffer_alloc_sys(struct gk20a *g,
struct nvgpu_gr_global_ctx_buffer_desc *desc,
u32 index)
{
int err = 0;
nvgpu_log_fn(g, " ");
if (nvgpu_mem_is_valid(&desc[index].mem)) {
return 0;
}
err = nvgpu_dma_alloc_sys(g, desc[index].size,
&desc[index].mem);
if (err != 0) {
return err;
}
desc[index].destroy = nvgpu_gr_global_ctx_buffer_destroy;
return err;
}
#ifdef CONFIG_NVGPU_VPR
static int nvgpu_gr_global_ctx_buffer_alloc_vpr(struct gk20a *g,
struct nvgpu_gr_global_ctx_buffer_desc *desc,
u32 index)
{
int err = 0;
nvgpu_log_fn(g, " ");
if (nvgpu_mem_is_valid(&desc[index].mem)) {
return 0;
}
if (g->ops.secure_alloc != NULL) {
err = g->ops.secure_alloc(g,
&desc[index].mem, desc[index].size,
&desc[index].destroy);
if (err != 0) {
return err;
}
}
return err;
}
#endif
static bool nvgpu_gr_global_ctx_buffer_sizes_are_valid(struct gk20a *g,
struct nvgpu_gr_global_ctx_buffer_desc *desc)
{
if (desc[NVGPU_GR_GLOBAL_CTX_PRIV_ACCESS_MAP].size == 0U) {
return false;
}
if (!nvgpu_is_enabled(g, NVGPU_SUPPORT_MIG)) {
if ((desc[NVGPU_GR_GLOBAL_CTX_CIRCULAR].size == 0U) ||
(desc[NVGPU_GR_GLOBAL_CTX_PAGEPOOL].size == 0U) ||
(desc[NVGPU_GR_GLOBAL_CTX_ATTRIBUTE].size == 0U)) {
return false;
}
#ifdef CONFIG_NVGPU_VPR
if ((desc[NVGPU_GR_GLOBAL_CTX_CIRCULAR_VPR].size == 0U) ||
(desc[NVGPU_GR_GLOBAL_CTX_PAGEPOOL_VPR].size == 0U) ||
(desc[NVGPU_GR_GLOBAL_CTX_ATTRIBUTE_VPR].size == 0U)) {
return false;
}
#endif
}
return true;
}
#ifdef CONFIG_NVGPU_VPR
static int nvgpu_gr_global_ctx_buffer_vpr_alloc(struct gk20a *g,
struct nvgpu_gr_global_ctx_buffer_desc *desc)
{
int err = 0;
/*
* MIG supports only compute class.
* Allocate BUNDLE_CB, PAGEPOOL, ATTRIBUTE_CB and RTV_CB
* if 2D/3D/I2M classes (graphics) are supported.
*/
if (nvgpu_is_enabled(g, NVGPU_SUPPORT_MIG)) {
nvgpu_log(g, gpu_dbg_gr | gpu_dbg_mig,
"2D class is not supported "
"skip BUNDLE_CB, PAGEPOOL, ATTRIBUTE_CB "
"and RTV_CB");
return 0;
}
err = nvgpu_gr_global_ctx_buffer_alloc_vpr(g, desc,
NVGPU_GR_GLOBAL_CTX_CIRCULAR_VPR);
if (err != 0) {
goto fail;
}
err = nvgpu_gr_global_ctx_buffer_alloc_vpr(g, desc,
NVGPU_GR_GLOBAL_CTX_PAGEPOOL_VPR);
if (err != 0) {
goto fail;
}
err = nvgpu_gr_global_ctx_buffer_alloc_vpr(g, desc,
NVGPU_GR_GLOBAL_CTX_ATTRIBUTE_VPR);
if (err != 0) {
goto fail;
}
fail:
return err;
}
#endif
static int nvgpu_gr_global_ctx_buffer_sys_alloc(struct gk20a *g,
struct nvgpu_gr_global_ctx_buffer_desc *desc)
{
int err = 0;
/*
* MIG supports only compute class.
* Allocate BUNDLE_CB, PAGEPOOL, ATTRIBUTE_CB and RTV_CB
* if 2D/3D/I2M classes (graphics) are supported.
*/
if (!nvgpu_is_enabled(g, NVGPU_SUPPORT_MIG)) {
err = nvgpu_gr_global_ctx_buffer_alloc_sys(g, desc,
NVGPU_GR_GLOBAL_CTX_CIRCULAR);
if (err != 0) {
goto fail;
}
err = nvgpu_gr_global_ctx_buffer_alloc_sys(g, desc,
NVGPU_GR_GLOBAL_CTX_PAGEPOOL);
if (err != 0) {
goto fail;
}
err = nvgpu_gr_global_ctx_buffer_alloc_sys(g, desc,
NVGPU_GR_GLOBAL_CTX_ATTRIBUTE);
if (err != 0) {
goto fail;
}
}
err = nvgpu_gr_global_ctx_buffer_alloc_sys(g, desc,
NVGPU_GR_GLOBAL_CTX_PRIV_ACCESS_MAP);
if (err != 0) {
goto fail;
}
fail:
return err;
}
int nvgpu_gr_global_ctx_buffer_alloc(struct gk20a *g,
struct nvgpu_gr_global_ctx_buffer_desc *desc)
{
int err = 0;
if (nvgpu_gr_global_ctx_buffer_sizes_are_valid(g, desc) != true) {
return -EINVAL;
}
err = nvgpu_gr_global_ctx_buffer_sys_alloc(g, desc);
if (err != 0) {
goto clean_up;
}
#ifdef CONFIG_NVGPU_FECS_TRACE
if (desc[NVGPU_GR_GLOBAL_CTX_FECS_TRACE_BUFFER].size != 0U) {
err = nvgpu_gr_global_ctx_buffer_alloc_sys(g, desc,
NVGPU_GR_GLOBAL_CTX_FECS_TRACE_BUFFER);
if (err != 0) {
goto clean_up;
}
}
#endif
#ifdef CONFIG_NVGPU_GRAPHICS
if (!nvgpu_is_enabled(g, NVGPU_SUPPORT_MIG)) {
if (desc[NVGPU_GR_GLOBAL_CTX_RTV_CIRCULAR_BUFFER].size != 0U) {
err = nvgpu_gr_global_ctx_buffer_alloc_sys(g, desc,
NVGPU_GR_GLOBAL_CTX_RTV_CIRCULAR_BUFFER);
if (err != 0) {
goto clean_up;
}
}
}
#endif
#ifdef CONFIG_NVGPU_VPR
if (nvgpu_gr_global_ctx_buffer_vpr_alloc(g, desc) != 0) {
goto clean_up;
}
#endif
return err;
clean_up:
nvgpu_gr_global_ctx_buffer_free(g, desc);
return err;
}
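/*
 * Usage sketch (illustration only, not part of the driver): a hypothetical
 * caller of the descriptor API above, assuming a non-MIG, non-VPR build.
 * The literal sizes and the zero mapping flags are placeholders; real
 * callers query per-chip HALs for every buffer size before allocating.
 */
static int global_ctx_buffer_usage_sketch(struct gk20a *g, struct vm_gk20a *vm)
{
	struct nvgpu_gr_global_ctx_buffer_desc *desc;
	u64 gpu_va;
	int err;

	desc = nvgpu_gr_global_ctx_desc_alloc(g);
	if (desc == NULL) {
		return -ENOMEM;
	}

	/* placeholder sizes; all of these must be non-zero to pass validation */
	nvgpu_gr_global_ctx_set_size(desc, NVGPU_GR_GLOBAL_CTX_CIRCULAR, 0x10000U);
	nvgpu_gr_global_ctx_set_size(desc, NVGPU_GR_GLOBAL_CTX_PAGEPOOL, 0x20000U);
	nvgpu_gr_global_ctx_set_size(desc, NVGPU_GR_GLOBAL_CTX_ATTRIBUTE, 0x30000U);
	nvgpu_gr_global_ctx_set_size(desc, NVGPU_GR_GLOBAL_CTX_PRIV_ACCESS_MAP, 0x1000U);

	err = nvgpu_gr_global_ctx_buffer_alloc(g, desc);
	if (err != 0) {
		goto free_desc;
	}

	gpu_va = nvgpu_gr_global_ctx_buffer_map(desc,
			NVGPU_GR_GLOBAL_CTX_PRIV_ACCESS_MAP, vm, 0U, true);
	if (gpu_va == 0ULL) {
		err = -ENOMEM;
		goto free_buffers;
	}

	/* ... program gpu_va into the context image, use the buffers ... */

	nvgpu_gr_global_ctx_buffer_unmap(desc,
			NVGPU_GR_GLOBAL_CTX_PRIV_ACCESS_MAP, vm, gpu_va);
free_buffers:
	nvgpu_gr_global_ctx_buffer_free(g, desc);
free_desc:
	nvgpu_gr_global_ctx_desc_free(g, desc);
	return err;
}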
u64 nvgpu_gr_global_ctx_buffer_map(struct nvgpu_gr_global_ctx_buffer_desc *desc,
u32 index,
struct vm_gk20a *vm, u32 flags, bool priv)
{
u64 gpu_va;
if (!nvgpu_mem_is_valid(&desc[index].mem)) {
return 0;
}
gpu_va = nvgpu_gmmu_map(vm, &desc[index].mem, desc[index].mem.size,
flags, gk20a_mem_flag_none, priv,
desc[index].mem.aperture);
return gpu_va;
}
void nvgpu_gr_global_ctx_buffer_unmap(
struct nvgpu_gr_global_ctx_buffer_desc *desc,
u32 index,
struct vm_gk20a *vm, u64 gpu_va)
{
if (nvgpu_mem_is_valid(&desc[index].mem)) {
nvgpu_gmmu_unmap(vm, &desc[index].mem, gpu_va);
}
}
struct nvgpu_mem *nvgpu_gr_global_ctx_buffer_get_mem(
struct nvgpu_gr_global_ctx_buffer_desc *desc,
u32 index)
{
if (nvgpu_mem_is_valid(&desc[index].mem)) {
return &desc[index].mem;
}
return NULL;
}
bool nvgpu_gr_global_ctx_buffer_ready(
struct nvgpu_gr_global_ctx_buffer_desc *desc,
u32 index)
{
if (nvgpu_mem_is_valid(&desc[index].mem)) {
return true;
}
return false;
}
struct nvgpu_gr_global_ctx_local_golden_image *
nvgpu_gr_global_ctx_init_local_golden_image(struct gk20a *g,
struct nvgpu_mem *source_mem, size_t size)
{
struct nvgpu_gr_global_ctx_local_golden_image *local_golden_image;
#ifdef NVGPU_UNITTEST_FAULT_INJECTION_ENABLEMENT
if (nvgpu_posix_fault_injection_handle_call(
nvgpu_local_golden_image_get_fault_injection())) {
return NULL;
}
#endif
local_golden_image = nvgpu_kzalloc(g, sizeof(*local_golden_image));
if (local_golden_image == NULL) {
return NULL;
}
local_golden_image->context = nvgpu_vzalloc(g, size);
if (local_golden_image->context == NULL) {
nvgpu_kfree(g, local_golden_image);
return NULL;
}
local_golden_image->size = size;
nvgpu_mem_rd_n(g, source_mem, 0, local_golden_image->context,
nvgpu_safe_cast_u64_to_u32(size));
return local_golden_image;
}
#ifdef CONFIG_NVGPU_GR_GOLDEN_CTX_VERIFICATION
bool nvgpu_gr_global_ctx_compare_golden_images(struct gk20a *g,
bool is_sysmem,
struct nvgpu_gr_global_ctx_local_golden_image *local_golden_image1,
struct nvgpu_gr_global_ctx_local_golden_image *local_golden_image2,
size_t size)
{
bool is_identical = true;
u32 *data1 = local_golden_image1->context;
u32 *data2 = local_golden_image2->context;
#ifdef CONFIG_NVGPU_DGPU
u32 i;
#endif
#ifdef NVGPU_UNITTEST_FAULT_INJECTION_ENABLEMENT
if (nvgpu_posix_fault_injection_handle_call(
nvgpu_golden_ctx_verif_get_fault_injection())) {
return false;
}
#endif
/*
* In case of sysmem, direct mem compare can be used.
* For vidmem, word by word comparison only works and
* it is too early to use ce engine for read operations.
*/
if (is_sysmem) {
if (nvgpu_memcmp((u8 *)data1, (u8 *)data2, size) != 0) {
is_identical = false;
}
} else {
#ifdef CONFIG_NVGPU_DGPU
for (i = 0U; i < nvgpu_safe_cast_u64_to_u32(size / sizeof(u32));
i = nvgpu_safe_add_u32(i, 1U)) {
if (*(data1 + i) != *(data2 + i)) {
is_identical = false;
nvgpu_log_info(g,
"mismatch i = %u golden1: %u golden2 %u",
i, *(data1 + i), *(data2 + i));
break;
}
}
#else
is_identical = false;
#endif
}
nvgpu_log_info(g, "%s result %u", __func__, is_identical);
return is_identical;
}
#endif
void nvgpu_gr_global_ctx_load_local_golden_image(struct gk20a *g,
struct nvgpu_gr_global_ctx_local_golden_image *local_golden_image,
struct nvgpu_mem *target_mem)
{
/* Channel gr_ctx buffer is gpu cacheable.
* Flush and invalidate before cpu update. */
if (g->ops.mm.cache.l2_flush(g, true) != 0) {
nvgpu_err(g, "l2_flush failed");
}
nvgpu_mem_wr_n(g, target_mem, 0, local_golden_image->context,
nvgpu_safe_cast_u64_to_u32(local_golden_image->size));
nvgpu_log(g, gpu_dbg_gr, "loaded saved golden image into gr_ctx");
}
void nvgpu_gr_global_ctx_deinit_local_golden_image(struct gk20a *g,
struct nvgpu_gr_global_ctx_local_golden_image *local_golden_image)
{
nvgpu_vfree(g, local_golden_image->context);
nvgpu_kfree(g, local_golden_image);
}
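/*
 * Usage sketch (illustration only, not part of the driver): a minimal,
 * hypothetical lifecycle for the local golden image helpers above.
 * "golden_mem", "gr_ctx_mem" and "golden_size" are placeholders supplied
 * by the caller.
 */
static int golden_image_usage_sketch(struct gk20a *g,
		struct nvgpu_mem *golden_mem, struct nvgpu_mem *gr_ctx_mem,
		size_t golden_size)
{
	struct nvgpu_gr_global_ctx_local_golden_image *img;

	/* snapshot the golden context from its source memory */
	img = nvgpu_gr_global_ctx_init_local_golden_image(g, golden_mem,
			golden_size);
	if (img == NULL) {
		return -ENOMEM;
	}

	/* replay the snapshot into a freshly allocated gr_ctx buffer */
	nvgpu_gr_global_ctx_load_local_golden_image(g, img, gr_ctx_mem);

	/* release the CPU-side copy once it is no longer needed */
	nvgpu_gr_global_ctx_deinit_local_golden_image(g, img);
	return 0;
}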
#ifdef CONFIG_NVGPU_DEBUGGER
u32 *nvgpu_gr_global_ctx_get_local_golden_image_ptr(
struct nvgpu_gr_global_ctx_local_golden_image *local_golden_image)
{
return local_golden_image->context;
}
#endif

@@ -0,0 +1,68 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#ifndef NVGPU_GR_GLOBAL_CTX_PRIV_H
#define NVGPU_GR_GLOBAL_CTX_PRIV_H
/**
* Global context buffer descriptor structure.
*
* This structure stores properties applicable to each global
* context buffer.
*/
struct nvgpu_gr_global_ctx_buffer_desc {
/**
* Memory to hold global context buffer.
*/
struct nvgpu_mem mem;
/**
* Size of global context buffer.
*/
size_t size;
/**
* Function pointer to free global context buffer.
*/
global_ctx_mem_destroy_fn destroy;
};
/**
* Local Golden context image descriptor structure.
*
* This structure stores details of a local Golden context image.
* Pointer to this struct is maintained in
* #nvgpu_gr_obj_ctx_golden_image structure.
*/
struct nvgpu_gr_global_ctx_local_golden_image {
/**
* Pointer to local Golden context image memory.
*/
u32 *context;
/**
* Size of local Golden context image.
*/
size_t size;
};
#endif /* NVGPU_GR_GLOBAL_CTX_PRIV_H */

File diff suppressed because it is too large.

@@ -0,0 +1,864 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <nvgpu/gk20a.h>
#include <nvgpu/io.h>
#include <nvgpu/static_analysis.h>
#include <nvgpu/gr/config.h>
#include <nvgpu/gr/gr_instances.h>
#include <nvgpu/grmgr.h>
#include "gr_config_priv.h"
static void gr_config_init_pes_tpc(struct gk20a *g,
struct nvgpu_gr_config *config,
u32 gpc_index)
{
u32 pes_index;
u32 pes_tpc_mask;
u32 pes_tpc_count;
for (pes_index = 0; pes_index < config->pe_count_per_gpc;
pes_index++) {
pes_tpc_mask = g->ops.gr.config.get_pes_tpc_mask(g,
config, gpc_index, pes_index);
pes_tpc_count = hweight32(pes_tpc_mask);
/* detect PES presence by seeing if there are
* TPCs connected to it.
*/
if (pes_tpc_count != 0U) {
config->gpc_ppc_count[gpc_index] = nvgpu_safe_add_u32(
config->gpc_ppc_count[gpc_index], 1U);
}
config->pes_tpc_count[pes_index][gpc_index] = pes_tpc_count;
config->pes_tpc_mask[pes_index][gpc_index] = pes_tpc_mask;
}
}
static void gr_config_init_gpc_skip_mask(struct nvgpu_gr_config *config,
u32 gpc_index)
{
u32 pes_heavy_index;
u32 gpc_new_skip_mask = 0U;
u32 pes_tpc_cnt = 0U, pes_tpc_mask = 0U;
if (config->pe_count_per_gpc <= 1U) {
goto skip_mask_end;
}
pes_tpc_cnt = nvgpu_safe_add_u32(
config->pes_tpc_count[0][gpc_index],
config->pes_tpc_count[1][gpc_index]);
pes_heavy_index =
(config->pes_tpc_count[0][gpc_index] >
config->pes_tpc_count[1][gpc_index]) ? 0U : 1U;
if ((pes_tpc_cnt == 5U) || ((pes_tpc_cnt == 4U) &&
(config->pes_tpc_count[0][gpc_index] !=
config->pes_tpc_count[1][gpc_index]))) {
pes_tpc_mask = nvgpu_safe_sub_u32(
config->pes_tpc_mask[pes_heavy_index][gpc_index], 1U);
gpc_new_skip_mask =
config->pes_tpc_mask[pes_heavy_index][gpc_index] ^
(config->pes_tpc_mask[pes_heavy_index][gpc_index] &
pes_tpc_mask);
}
skip_mask_end:
config->gpc_skip_mask[gpc_index] = gpc_new_skip_mask;
}
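/*
 * Worked example (illustration only, numbers are hypothetical): for a GPC
 * with two PES units where pes_tpc_count = {3, 2} and the heavier PES has
 * pes_tpc_mask = 0x7, the branch above computes
 *
 *   0x7 ^ (0x7 & (0x7 - 1)) = 0x7 ^ 0x6 = 0x1
 *
 * i.e. the skip mask isolates the lowest TPC of the heavier PES so that
 * work is balanced 2/2 across the two PES units.
 */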
static void gr_config_log_info(struct gk20a *g,
struct nvgpu_gr_config *config)
{
u32 gpc_index, pes_index;
nvgpu_log(g, gpu_dbg_info | gpu_dbg_gr, "max_gpc_count: %d", config->max_gpc_count);
nvgpu_log(g, gpu_dbg_info | gpu_dbg_gr, "gpc_count: %d", config->gpc_count);
nvgpu_log(g, gpu_dbg_info | gpu_dbg_gr, "gpc_mask: 0x%x", config->gpc_mask);
nvgpu_log(g, gpu_dbg_info | gpu_dbg_gr, "max_tpc_per_gpc_count: %d", config->max_tpc_per_gpc_count);
nvgpu_log(g, gpu_dbg_info | gpu_dbg_gr, "max_tpc_count: %d", config->max_tpc_count);
nvgpu_log(g, gpu_dbg_info | gpu_dbg_gr, "tpc_count: %d", config->tpc_count);
nvgpu_log(g, gpu_dbg_info | gpu_dbg_gr, "sm_count_per_tpc: %d", config->sm_count_per_tpc);
#ifdef CONFIG_NVGPU_GRAPHICS
nvgpu_log(g, gpu_dbg_info | gpu_dbg_gr, "max_zcull_per_gpc_count: %d", config->max_zcull_per_gpc_count);
nvgpu_log(g, gpu_dbg_info | gpu_dbg_gr, "zcb_count: %d", config->zcb_count);
#endif
nvgpu_log(g, gpu_dbg_info | gpu_dbg_gr, "pe_count_per_gpc: %d", config->pe_count_per_gpc);
nvgpu_log(g, gpu_dbg_info | gpu_dbg_gr, "ppc_count: %d", config->ppc_count);
for (gpc_index = 0; gpc_index < config->gpc_count; gpc_index++) {
nvgpu_log(g, gpu_dbg_info | gpu_dbg_gr, "gpc_tpc_count[%d] : %d",
gpc_index, config->gpc_tpc_count[gpc_index]);
}
for (gpc_index = 0; gpc_index < config->max_gpc_count; gpc_index++) {
nvgpu_log(g, gpu_dbg_info | gpu_dbg_gr, "gpc_tpc_mask[%d] : 0x%x",
gpc_index, config->gpc_tpc_mask[gpc_index]);
}
#ifdef CONFIG_NVGPU_GRAPHICS
for (gpc_index = 0; gpc_index < config->gpc_count; gpc_index++) {
nvgpu_log(g, gpu_dbg_info | gpu_dbg_gr, "gpc_zcb_count[%d] : %d",
gpc_index, config->gpc_zcb_count != NULL ?
config->gpc_zcb_count[gpc_index] : 0U);
}
#endif
for (gpc_index = 0; gpc_index < config->gpc_count; gpc_index++) {
nvgpu_log(g, gpu_dbg_info | gpu_dbg_gr, "gpc_ppc_count[%d] : %d",
gpc_index, config->gpc_ppc_count[gpc_index]);
}
for (gpc_index = 0; gpc_index < config->gpc_count; gpc_index++) {
nvgpu_log(g, gpu_dbg_info | gpu_dbg_gr, "gpc_skip_mask[%d] : 0x%x",
gpc_index, config->gpc_skip_mask[gpc_index]);
}
for (gpc_index = 0; gpc_index < config->gpc_count; gpc_index++) {
for (pes_index = 0;
pes_index < config->pe_count_per_gpc;
pes_index++) {
nvgpu_log(g, gpu_dbg_info | gpu_dbg_gr, "pes_tpc_count[%d][%d] : %d",
pes_index, gpc_index,
config->pes_tpc_count[pes_index][gpc_index]);
}
}
for (gpc_index = 0; gpc_index < config->gpc_count; gpc_index++) {
for (pes_index = 0;
pes_index < config->pe_count_per_gpc;
pes_index++) {
nvgpu_log(g, gpu_dbg_info | gpu_dbg_gr, "pes_tpc_mask[%d][%d] : 0x%x",
pes_index, gpc_index,
config->pes_tpc_mask[pes_index][gpc_index]);
}
}
}
static void gr_config_set_gpc_mask(struct gk20a *g,
struct nvgpu_gr_config *config)
{
#ifdef CONFIG_NVGPU_DGPU
if (g->ops.gr.config.get_gpc_mask != NULL) {
config->gpc_mask = g->ops.gr.config.get_gpc_mask(g);
} else
#endif
{
config->gpc_mask = nvgpu_safe_sub_u32(BIT32(config->gpc_count),
1U);
}
}
static bool gr_config_alloc_valid(struct nvgpu_gr_config *config)
{
if ((config->gpc_tpc_count == NULL) || (config->gpc_tpc_mask == NULL) ||
(config->gpc_ppc_count == NULL) ||
(config->gpc_skip_mask == NULL)) {
return false;
}
#ifdef CONFIG_NVGPU_GRAPHICS
if (!nvgpu_is_enabled(config->g, NVGPU_SUPPORT_MIG) &&
(config->gpc_zcb_count == NULL)) {
return false;
}
#endif
return true;
}
static void gr_config_free_mem(struct gk20a *g,
struct nvgpu_gr_config *config)
{
u32 pes_index;
for (pes_index = 0U; pes_index < config->pe_count_per_gpc; pes_index++) {
nvgpu_kfree(g, config->pes_tpc_count[pes_index]);
nvgpu_kfree(g, config->pes_tpc_mask[pes_index]);
}
nvgpu_kfree(g, config->gpc_skip_mask);
nvgpu_kfree(g, config->gpc_ppc_count);
#ifdef CONFIG_NVGPU_GRAPHICS
nvgpu_kfree(g, config->gpc_zcb_count);
#endif
nvgpu_kfree(g, config->gpc_tpc_mask);
nvgpu_kfree(g, config->gpc_tpc_count);
}
static bool gr_config_alloc_struct_mem(struct gk20a *g,
struct nvgpu_gr_config *config)
{
u32 pes_index;
u32 total_tpc_cnt;
size_t sm_info_size;
size_t gpc_size, sm_size, max_gpc_cnt;
size_t pd_tbl_size;
total_tpc_cnt = nvgpu_safe_mult_u32(config->gpc_count,
config->max_tpc_per_gpc_count);
sm_size = nvgpu_safe_mult_u64((size_t)config->sm_count_per_tpc,
sizeof(struct nvgpu_sm_info));
/* allocate for max tpc per gpc */
sm_info_size = nvgpu_safe_mult_u64((size_t)total_tpc_cnt, sm_size);
config->sm_to_cluster = nvgpu_kzalloc(g, sm_info_size);
if (config->sm_to_cluster == NULL) {
nvgpu_err(g, "sm_to_cluster == NULL");
goto alloc_err;
}
#ifdef CONFIG_NVGPU_SM_DIVERSITY
if (nvgpu_is_enabled(g, NVGPU_SUPPORT_SM_DIVERSITY)) {
config->sm_to_cluster_redex_config =
nvgpu_kzalloc(g, sm_info_size);
if (config->sm_to_cluster_redex_config == NULL) {
nvgpu_err(g, "sm_to_cluster_redex_config == NULL");
goto clean_alloc_mem;
}
}
#endif
config->no_of_sm = 0;
gpc_size = nvgpu_safe_mult_u64((size_t)config->gpc_count, sizeof(u32));
max_gpc_cnt = nvgpu_safe_mult_u64((size_t)config->max_gpc_count, sizeof(u32));
config->gpc_tpc_count = nvgpu_kzalloc(g, gpc_size);
config->gpc_tpc_mask = nvgpu_kzalloc(g, max_gpc_cnt);
#ifdef CONFIG_NVGPU_GRAPHICS
if (!nvgpu_is_enabled(g, NVGPU_SUPPORT_MIG)) {
config->max_zcull_per_gpc_count = nvgpu_get_litter_value(g,
GPU_LIT_NUM_ZCULL_BANKS);
config->gpc_zcb_count = nvgpu_kzalloc(g, gpc_size);
}
#endif
config->gpc_ppc_count = nvgpu_kzalloc(g, gpc_size);
pd_tbl_size = nvgpu_safe_mult_u64(
(size_t)g->ops.gr.config.get_pd_dist_skip_table_size(),
sizeof(u32));
pd_tbl_size = nvgpu_safe_mult_u64(pd_tbl_size, 4UL);
config->gpc_skip_mask = nvgpu_kzalloc(g, pd_tbl_size);
if (gr_config_alloc_valid(config) == false) {
goto clean_alloc_mem;
}
for (pes_index = 0U; pes_index < config->pe_count_per_gpc; pes_index++) {
config->pes_tpc_count[pes_index] = nvgpu_kzalloc(g, gpc_size);
config->pes_tpc_mask[pes_index] = nvgpu_kzalloc(g, gpc_size);
if ((config->pes_tpc_count[pes_index] == NULL) ||
(config->pes_tpc_mask[pes_index] == NULL)) {
goto clean_alloc_mem;
}
}
return true;
clean_alloc_mem:
nvgpu_kfree(g, config->sm_to_cluster);
config->sm_to_cluster = NULL;
#ifdef CONFIG_NVGPU_SM_DIVERSITY
if (config->sm_to_cluster_redex_config != NULL) {
nvgpu_kfree(g, config->sm_to_cluster_redex_config);
config->sm_to_cluster_redex_config = NULL;
}
#endif
gr_config_free_mem(g, config);
alloc_err:
return false;
}
static int gr_config_init_mig_gpcs(struct nvgpu_gr_config *config)
{
struct gk20a *g = config->g;
u32 cur_gr_instance = nvgpu_gr_get_cur_instance_id(g);
config->max_gpc_count = nvgpu_grmgr_get_max_gpc_count(g);
config->gpc_count = nvgpu_grmgr_get_gr_num_gpcs(g, cur_gr_instance);
if (config->gpc_count == 0U) {
nvgpu_err(g, "gpc_count==0!");
return -EINVAL;
}
config->gpc_mask = nvgpu_grmgr_get_gr_logical_gpc_mask(
g, cur_gr_instance);
return 0;
}
static int gr_config_init_gpcs(struct nvgpu_gr_config *config)
{
struct gk20a *g = config->g;
config->max_gpc_count = g->ops.top.get_max_gpc_count(g);
config->gpc_count = g->ops.priv_ring.get_gpc_count(g);
if (config->gpc_count == 0U) {
nvgpu_err(g, "gpc_count==0!");
return -EINVAL;
}
gr_config_set_gpc_mask(g, config);
return 0;
}
struct nvgpu_gr_config *nvgpu_gr_config_init(struct gk20a *g)
{
struct nvgpu_gr_config *config;
u32 cur_gr_instance = nvgpu_gr_get_cur_instance_id(g);
u32 gpc_index;
u32 gpc_phys_id;
int err;
config = nvgpu_kzalloc(g, sizeof(*config));
if (config == NULL) {
return NULL;
}
config->g = g;
if (nvgpu_is_enabled(g, NVGPU_SUPPORT_MIG)) {
err = gr_config_init_mig_gpcs(config);
if (err < 0) {
nvgpu_err(g, "MIG GPC config init failed");
nvgpu_kfree(g, config);
return NULL;
}
} else {
err = gr_config_init_gpcs(config);
if (err < 0) {
nvgpu_err(g, "GPC config init failed");
nvgpu_kfree(g, config);
return NULL;
}
}
/* Required to read gpc_tpc_mask below */
config->max_tpc_per_gpc_count = g->ops.top.get_max_tpc_per_gpc_count(g);
config->max_tpc_count = nvgpu_safe_mult_u32(config->max_gpc_count,
config->max_tpc_per_gpc_count);
config->pe_count_per_gpc = nvgpu_get_litter_value(g,
GPU_LIT_NUM_PES_PER_GPC);
if (config->pe_count_per_gpc > GK20A_GR_MAX_PES_PER_GPC) {
nvgpu_err(g, "too many pes per gpc");
goto clean_up_init;
}
config->sm_count_per_tpc =
nvgpu_get_litter_value(g, GPU_LIT_NUM_SM_PER_TPC);
if (config->sm_count_per_tpc == 0U) {
nvgpu_err(g, "sm_count_per_tpc==0!");
goto clean_up_init;
}
if (gr_config_alloc_struct_mem(g, config) == false) {
goto clean_up_init;
}
for (gpc_index = 0; gpc_index < config->gpc_count; gpc_index++) {
/*
* Fuse registers must be queried with physical gpc-id and not
* the logical ones. For tu104 and earlier chips, the logical gpc-id
* is the same as the physical gpc-id in a non-floorswept config, but
* for chips after tu104 this may not hold.
*/
gpc_phys_id = nvgpu_grmgr_get_gr_gpc_phys_id(g,
cur_gr_instance, gpc_index);
config->gpc_tpc_mask[gpc_index] =
g->ops.gr.config.get_gpc_tpc_mask(g, config, gpc_phys_id);
}
config->ppc_count = 0;
config->tpc_count = 0;
#ifdef CONFIG_NVGPU_GRAPHICS
config->zcb_count = 0;
#endif
for (gpc_index = 0; gpc_index < config->gpc_count; gpc_index++) {
config->gpc_tpc_count[gpc_index] =
g->ops.gr.config.get_tpc_count_in_gpc(g, config,
gpc_index);
config->tpc_count = nvgpu_safe_add_u32(config->tpc_count,
config->gpc_tpc_count[gpc_index]);
#ifdef CONFIG_NVGPU_GRAPHICS
if (!nvgpu_is_enabled(g, NVGPU_SUPPORT_MIG)) {
config->gpc_zcb_count[gpc_index] =
g->ops.gr.config.get_zcull_count_in_gpc(g, config,
gpc_index);
config->zcb_count = nvgpu_safe_add_u32(config->zcb_count,
config->gpc_zcb_count[gpc_index]);
}
#endif
gr_config_init_pes_tpc(g, config, gpc_index);
config->ppc_count = nvgpu_safe_add_u32(config->ppc_count,
config->gpc_ppc_count[gpc_index]);
gr_config_init_gpc_skip_mask(config, gpc_index);
}
gr_config_log_info(g, config);
return config;
clean_up_init:
nvgpu_kfree(g, config);
return NULL;
}
#ifdef CONFIG_NVGPU_GRAPHICS
static u32 prime_set[18] = {
2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61 };
/*
* Return map tiles count for given index
* Return 0 if index is out-of-bounds
*/
u32 nvgpu_gr_config_get_map_tile_count(struct nvgpu_gr_config *config, u32 index)
{
if (index >= config->map_tile_count) {
return 0;
}
return config->map_tiles[index];
}
u8 *nvgpu_gr_config_get_map_tiles(struct nvgpu_gr_config *config)
{
return config->map_tiles;
}
u32 nvgpu_gr_config_get_map_row_offset(struct nvgpu_gr_config *config)
{
return config->map_row_offset;
}
int nvgpu_gr_config_init_map_tiles(struct gk20a *g,
struct nvgpu_gr_config *config)
{
s32 comm_denom;
s32 mul_factor;
s32 *init_frac = NULL;
s32 *init_err = NULL;
s32 *run_err = NULL;
u32 *sorted_num_tpcs = NULL;
u32 *sorted_to_unsorted_gpc_map = NULL;
u32 gpc_index;
u32 gpc_mark = 0;
u32 num_tpc;
u32 max_tpc_count = 0;
u32 swap;
u32 tile_count;
u32 index;
bool delete_map = false;
bool gpc_sorted;
int ret = 0;
u32 num_gpcs = nvgpu_get_litter_value(g, GPU_LIT_NUM_GPCS);
u32 num_tpc_per_gpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_TPC_PER_GPC);
u32 map_tile_count = num_gpcs * num_tpc_per_gpc;
nvgpu_log(g, gpu_dbg_gr, " ");
init_frac = nvgpu_kzalloc(g, num_gpcs * sizeof(s32));
init_err = nvgpu_kzalloc(g, num_gpcs * sizeof(s32));
run_err = nvgpu_kzalloc(g, num_gpcs * sizeof(s32));
sorted_num_tpcs =
nvgpu_kzalloc(g, (size_t)num_gpcs *
(size_t)num_tpc_per_gpc *
sizeof(s32));
sorted_to_unsorted_gpc_map =
nvgpu_kzalloc(g, (size_t)num_gpcs * sizeof(s32));
if (!((init_frac != NULL) &&
(init_err != NULL) &&
(run_err != NULL) &&
(sorted_num_tpcs != NULL) &&
(sorted_to_unsorted_gpc_map != NULL))) {
ret = -ENOMEM;
goto clean_up;
}
config->map_row_offset = 0xFFFFFFFFU;
if (config->tpc_count == 3U) {
config->map_row_offset = 2;
} else if (config->tpc_count < 3U) {
config->map_row_offset = 1;
} else {
config->map_row_offset = 3;
for (index = 1U; index < 18U; index++) {
u32 prime = prime_set[index];
if ((config->tpc_count % prime) != 0U) {
config->map_row_offset = prime;
break;
}
}
}
switch (config->tpc_count) {
case 15:
config->map_row_offset = 6;
break;
case 14:
config->map_row_offset = 5;
break;
case 13:
config->map_row_offset = 2;
break;
case 11:
config->map_row_offset = 7;
break;
case 10:
config->map_row_offset = 6;
break;
case 7:
case 5:
config->map_row_offset = 1;
break;
default:
nvgpu_log(g, gpu_dbg_info | gpu_dbg_gr, "unsupported tpc count = %u",
config->tpc_count);
break;
}
if (config->map_tiles != NULL) {
if (config->map_tile_count != config->tpc_count) {
delete_map = true;
}
for (tile_count = 0; tile_count < config->map_tile_count; tile_count++) {
if (nvgpu_gr_config_get_map_tile_count(config, tile_count)
>= config->tpc_count) {
delete_map = true;
}
}
if (delete_map) {
nvgpu_kfree(g, config->map_tiles);
config->map_tiles = NULL;
config->map_tile_count = 0;
}
}
if (config->map_tiles == NULL) {
config->map_tiles = nvgpu_kzalloc(g, map_tile_count * sizeof(u8));
if (config->map_tiles == NULL) {
ret = -ENOMEM;
goto clean_up;
}
config->map_tile_count = map_tile_count;
for (gpc_index = 0; gpc_index < config->gpc_count; gpc_index++) {
sorted_num_tpcs[gpc_index] = config->gpc_tpc_count[gpc_index];
sorted_to_unsorted_gpc_map[gpc_index] = gpc_index;
}
gpc_sorted = false;
while (!gpc_sorted) {
gpc_sorted = true;
for (gpc_index = 0U; gpc_index < config->gpc_count - 1U; gpc_index++) {
if (sorted_num_tpcs[gpc_index + 1U] > sorted_num_tpcs[gpc_index]) {
gpc_sorted = false;
swap = sorted_num_tpcs[gpc_index];
sorted_num_tpcs[gpc_index] = sorted_num_tpcs[gpc_index + 1U];
sorted_num_tpcs[gpc_index + 1U] = swap;
swap = sorted_to_unsorted_gpc_map[gpc_index];
sorted_to_unsorted_gpc_map[gpc_index] =
sorted_to_unsorted_gpc_map[gpc_index + 1U];
sorted_to_unsorted_gpc_map[gpc_index + 1U] = swap;
}
}
}
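		/*
		 * At this point sorted_num_tpcs[] holds the per-GPC TPC counts
		 * in descending order (simple bubble sort above), and
		 * sorted_to_unsorted_gpc_map[] maps each sorted slot back to
		 * its original GPC index.
		 */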
for (gpc_index = 0; gpc_index < config->gpc_count; gpc_index++) {
if (config->gpc_tpc_count[gpc_index] > max_tpc_count) {
max_tpc_count = config->gpc_tpc_count[gpc_index];
}
}
mul_factor = S32(config->gpc_count) * S32(max_tpc_count);
if ((U32(mul_factor) & 0x1U) != 0U) {
mul_factor = 2;
} else {
mul_factor = 1;
}
comm_denom = S32(config->gpc_count) * S32(max_tpc_count) * mul_factor;
for (gpc_index = 0; gpc_index < config->gpc_count; gpc_index++) {
num_tpc = sorted_num_tpcs[gpc_index];
init_frac[gpc_index] = S32(num_tpc) * S32(config->gpc_count) * mul_factor;
if (num_tpc != 0U) {
init_err[gpc_index] = S32(gpc_index) * S32(max_tpc_count) * mul_factor - comm_denom/2;
} else {
init_err[gpc_index] = 0;
}
run_err[gpc_index] = init_frac[gpc_index] + init_err[gpc_index];
}
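		/*
		 * Distribute GPC indices across map_tiles[] roughly in
		 * proportion to each GPC's TPC count. This is effectively an
		 * error-diffusion (Bresenham-style) interleave: a GPC emits a
		 * tile whenever its accumulated error crosses half the common
		 * denominator.
		 */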
while (gpc_mark < config->tpc_count) {
for (gpc_index = 0; gpc_index < config->gpc_count; gpc_index++) {
if ((run_err[gpc_index] * 2) >= comm_denom) {
config->map_tiles[gpc_mark++] = (u8)sorted_to_unsorted_gpc_map[gpc_index];
run_err[gpc_index] += init_frac[gpc_index] - comm_denom;
} else {
run_err[gpc_index] += init_frac[gpc_index];
}
}
}
}
clean_up:
nvgpu_kfree(g, init_frac);
nvgpu_kfree(g, init_err);
nvgpu_kfree(g, run_err);
nvgpu_kfree(g, sorted_num_tpcs);
nvgpu_kfree(g, sorted_to_unsorted_gpc_map);
if (ret != 0) {
nvgpu_err(g, "fail");
} else {
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gr, "done");
}
return ret;
}
u32 nvgpu_gr_config_get_max_zcull_per_gpc_count(struct nvgpu_gr_config *config)
{
return config->max_zcull_per_gpc_count;
}
u32 nvgpu_gr_config_get_zcb_count(struct nvgpu_gr_config *config)
{
return config->zcb_count;
}
u32 nvgpu_gr_config_get_gpc_zcb_count(struct nvgpu_gr_config *config,
u32 gpc_index)
{
return config->gpc_zcb_count[gpc_index];
}
#endif
void nvgpu_gr_config_deinit(struct gk20a *g, struct nvgpu_gr_config *config)
{
if (config == NULL) {
return;
}
gr_config_free_mem(g, config);
#ifdef CONFIG_NVGPU_GRAPHICS
nvgpu_kfree(g, config->map_tiles);
#endif
nvgpu_kfree(g, config->sm_to_cluster);
#ifdef CONFIG_NVGPU_SM_DIVERSITY
if (config->sm_to_cluster_redex_config != NULL) {
nvgpu_kfree(g, config->sm_to_cluster_redex_config);
config->sm_to_cluster_redex_config = NULL;
}
#endif
}
u32 nvgpu_gr_config_get_max_gpc_count(struct nvgpu_gr_config *config)
{
return config->max_gpc_count;
}
u32 nvgpu_gr_config_get_max_tpc_per_gpc_count(struct nvgpu_gr_config *config)
{
return config->max_tpc_per_gpc_count;
}
u32 nvgpu_gr_config_get_max_tpc_count(struct nvgpu_gr_config *config)
{
return config->max_tpc_count;
}
u32 nvgpu_gr_config_get_gpc_count(struct nvgpu_gr_config *config)
{
return config->gpc_count;
}
u32 nvgpu_gr_config_get_tpc_count(struct nvgpu_gr_config *config)
{
return config->tpc_count;
}
u32 nvgpu_gr_config_get_ppc_count(struct nvgpu_gr_config *config)
{
return config->ppc_count;
}
u32 nvgpu_gr_config_get_pe_count_per_gpc(struct nvgpu_gr_config *config)
{
return config->pe_count_per_gpc;
}
u32 nvgpu_gr_config_get_sm_count_per_tpc(struct nvgpu_gr_config *config)
{
return config->sm_count_per_tpc;
}
u32 nvgpu_gr_config_get_gpc_ppc_count(struct nvgpu_gr_config *config,
u32 gpc_index)
{
nvgpu_assert(gpc_index < nvgpu_gr_config_get_gpc_count(config));
return config->gpc_ppc_count[gpc_index];
}
u32 *nvgpu_gr_config_get_gpc_tpc_count_base(struct nvgpu_gr_config *config)
{
return config->gpc_tpc_count;
}
u32 nvgpu_gr_config_get_gpc_tpc_count(struct nvgpu_gr_config *config,
u32 gpc_index)
{
if (gpc_index >= config->gpc_count) {
return 0;
}
return config->gpc_tpc_count[gpc_index];
}
u32 nvgpu_gr_config_get_pes_tpc_count(struct nvgpu_gr_config *config,
u32 gpc_index, u32 pes_index)
{
nvgpu_assert(gpc_index < nvgpu_gr_config_get_gpc_count(config));
nvgpu_assert(pes_index < nvgpu_gr_config_get_pe_count_per_gpc(config));
return config->pes_tpc_count[pes_index][gpc_index];
}
u32 *nvgpu_gr_config_get_gpc_tpc_mask_base(struct nvgpu_gr_config *config)
{
return config->gpc_tpc_mask;
}
u32 nvgpu_gr_config_get_gpc_tpc_mask(struct nvgpu_gr_config *config,
u32 gpc_index)
{
nvgpu_assert(gpc_index < nvgpu_gr_config_get_gpc_count(config));
return config->gpc_tpc_mask[gpc_index];
}
void nvgpu_gr_config_set_gpc_tpc_mask(struct nvgpu_gr_config *config,
u32 gpc_index, u32 val)
{
nvgpu_assert(gpc_index < nvgpu_gr_config_get_gpc_count(config));
config->gpc_tpc_mask[gpc_index] = val;
}
u32 nvgpu_gr_config_get_gpc_skip_mask(struct nvgpu_gr_config *config,
u32 gpc_index)
{
if (gpc_index >= config->gpc_count) {
return 0;
}
return config->gpc_skip_mask[gpc_index];
}
u32 nvgpu_gr_config_get_pes_tpc_mask(struct nvgpu_gr_config *config,
u32 gpc_index, u32 pes_index)
{
nvgpu_assert(gpc_index < nvgpu_gr_config_get_gpc_count(config));
nvgpu_assert(pes_index < nvgpu_gr_config_get_pe_count_per_gpc(config));
return config->pes_tpc_mask[pes_index][gpc_index];
}
u32 nvgpu_gr_config_get_gpc_mask(struct nvgpu_gr_config *config)
{
return config->gpc_mask;
}
u32 nvgpu_gr_config_get_no_of_sm(struct nvgpu_gr_config *config)
{
return config->no_of_sm;
}
void nvgpu_gr_config_set_no_of_sm(struct nvgpu_gr_config *config, u32 no_of_sm)
{
config->no_of_sm = no_of_sm;
}
struct nvgpu_sm_info *nvgpu_gr_config_get_sm_info(struct nvgpu_gr_config *config,
u32 sm_id)
{
return &config->sm_to_cluster[sm_id];
}
#ifdef CONFIG_NVGPU_SM_DIVERSITY
struct nvgpu_sm_info *nvgpu_gr_config_get_redex_sm_info(
struct nvgpu_gr_config *config, u32 sm_id)
{
return &config->sm_to_cluster_redex_config[sm_id];
}
#endif
u32 nvgpu_gr_config_get_sm_info_gpc_index(struct nvgpu_sm_info *sm_info)
{
return sm_info->gpc_index;
}
void nvgpu_gr_config_set_sm_info_gpc_index(struct nvgpu_sm_info *sm_info,
u32 gpc_index)
{
sm_info->gpc_index = gpc_index;
}
u32 nvgpu_gr_config_get_sm_info_tpc_index(struct nvgpu_sm_info *sm_info)
{
return sm_info->tpc_index;
}
void nvgpu_gr_config_set_sm_info_tpc_index(struct nvgpu_sm_info *sm_info,
u32 tpc_index)
{
sm_info->tpc_index = tpc_index;
}
u32 nvgpu_gr_config_get_sm_info_global_tpc_index(struct nvgpu_sm_info *sm_info)
{
return sm_info->global_tpc_index;
}
void nvgpu_gr_config_set_sm_info_global_tpc_index(struct nvgpu_sm_info *sm_info,
u32 global_tpc_index)
{
sm_info->global_tpc_index = global_tpc_index;
}
u32 nvgpu_gr_config_get_sm_info_sm_index(struct nvgpu_sm_info *sm_info)
{
return sm_info->sm_index;
}
void nvgpu_gr_config_set_sm_info_sm_index(struct nvgpu_sm_info *sm_info,
u32 sm_index)
{
sm_info->sm_index = sm_index;
}

View File

@@ -0,0 +1,172 @@
/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#ifndef NVGPU_GR_CONFIG_PRIV_H
#define NVGPU_GR_CONFIG_PRIV_H
#include <nvgpu/types.h>
/**
* Max possible PES count per GPC.
*/
#define GK20A_GR_MAX_PES_PER_GPC 3U
struct gk20a;
/**
* Detailed information of SM indexes in GR engine.
*/
struct nvgpu_sm_info {
/**
* Index of GPC for SM.
*/
u32 gpc_index;
/**
* Index of TPC for SM.
*/
u32 tpc_index;
/**
* Index of SM within TPC.
*/
u32 sm_index;
/**
* Global TPC index for SM.
*/
u32 global_tpc_index;
};
/**
* GR engine configuration data.
*
 * This data is populated during GR initialization and is referenced
 * across the GPU driver through public APIs.
*/
struct nvgpu_gr_config {
/**
* Pointer to GPU driver struct.
*/
struct gk20a *g;
/**
* Max possible number of GPCs in GR engine.
*/
u32 max_gpc_count;
/**
* Max possible number of TPCs per GPC in GR engine.
*/
u32 max_tpc_per_gpc_count;
/**
* Max possible number of TPCs in GR engine.
*/
u32 max_tpc_count;
/**
* Number of GPCs in GR engine.
*/
u32 gpc_count;
/**
* Number of TPCs in GR engine.
*/
u32 tpc_count;
/**
* Number of PPCs in GR engine.
*/
u32 ppc_count;
/**
* Number of PES per GPC in GR engine.
*/
u32 pe_count_per_gpc;
/**
* Number of SMs per TPC in GR engine.
*/
u32 sm_count_per_tpc;
/**
* Array to hold number of PPC units per GPC.
* Array is indexed by GPC index.
*/
u32 *gpc_ppc_count;
/**
* Array to hold number of TPCs per GPC.
* Array is indexed by GPC index.
*/
u32 *gpc_tpc_count;
/**
* 2-D array to hold number of TPCs attached to a PES unit
* in a GPC.
*/
u32 *pes_tpc_count[GK20A_GR_MAX_PES_PER_GPC];
/**
* Mask of GPCs. A set bit indicates GPC is available, otherwise
* it is not available.
*/
u32 gpc_mask;
/**
* Array to hold mask of TPCs per GPC.
* Array is indexed by GPC index.
*/
u32 *gpc_tpc_mask;
/**
* 2-D array to hold mask of TPCs attached to a PES unit
* in a GPC.
*/
u32 *pes_tpc_mask[GK20A_GR_MAX_PES_PER_GPC];
/**
* Array to hold skip mask of TPCs per GPC.
* Array is indexed by GPC index.
*/
u32 *gpc_skip_mask;
/**
* Number of SMs in GR engine.
*/
u32 no_of_sm;
/**
* Pointer to SM information struct.
*/
struct nvgpu_sm_info *sm_to_cluster;
#ifdef CONFIG_NVGPU_SM_DIVERSITY
/**
* Pointer to redundant execution config SM information struct.
	 * It is valid only if NVGPU_SUPPORT_SM_DIVERSITY is enabled.
*/
struct nvgpu_sm_info *sm_to_cluster_redex_config;
#endif
#ifdef CONFIG_NVGPU_GRAPHICS
u32 max_zcull_per_gpc_count;
u32 zcb_count;
u32 *gpc_zcb_count;
u8 *map_tiles;
u32 map_tile_count;
u32 map_row_offset;
#endif
};
#endif /* NVGPU_GR_CONFIG_PRIV_H */

View File

@@ -0,0 +1,269 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <nvgpu/gr/gr_ecc.h>
#include <nvgpu/gr/gr_utils.h>
#include <nvgpu/gr/config.h>
#include <nvgpu/string.h>
#include <nvgpu/gk20a.h>
#include <nvgpu/kmem.h>
#include <nvgpu/ecc.h>
int nvgpu_ecc_counter_init_per_gr(struct gk20a *g,
struct nvgpu_ecc_stat **stat, const char *name)
{
struct nvgpu_ecc_stat *stats;
u32 i;
char gr_str[10] = {0};
stats = nvgpu_kzalloc(g, nvgpu_safe_mult_u64(sizeof(*stats),
g->num_gr_instances));
if (stats == NULL) {
return -ENOMEM;
}
for (i = 0; i < g->num_gr_instances; i++) {
/**
* Store stats name as below:
* gr<gr_index>_<name_string>
*/
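		/* e.g. "gr0_<name>" for GR instance 0 (illustrative only). */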
(void)strcpy(stats[i].name, "gr");
(void)nvgpu_strnadd_u32(gr_str, i, sizeof(gr_str), 10U);
(void)strncat(stats[i].name, gr_str,
NVGPU_ECC_STAT_NAME_MAX_SIZE -
strlen(stats[i].name));
(void)strncat(stats[i].name, "_",
NVGPU_ECC_STAT_NAME_MAX_SIZE -
strlen(stats[i].name));
(void)strncat(stats[i].name, name,
NVGPU_ECC_STAT_NAME_MAX_SIZE -
strlen(stats[i].name));
nvgpu_ecc_stat_add(g, &stats[i]);
}
*stat = stats;
return 0;
}
int nvgpu_ecc_counter_init_per_tpc(struct gk20a *g,
struct nvgpu_ecc_stat ***stat, const char *name)
{
struct nvgpu_ecc_stat **stats;
struct nvgpu_gr_config *gr_config = nvgpu_gr_get_config_ptr(g);
u32 gpc_count = nvgpu_gr_config_get_gpc_count(gr_config);
u32 gpc, tpc;
char gpc_str[10] = {0}, tpc_str[10] = {0};
int err = 0;
stats = nvgpu_kzalloc(g, nvgpu_safe_mult_u64(sizeof(*stats),
gpc_count));
if (stats == NULL) {
return -ENOMEM;
}
for (gpc = 0; gpc < gpc_count; gpc++) {
stats[gpc] = nvgpu_kzalloc(g,
nvgpu_safe_mult_u64(sizeof(*stats[gpc]),
nvgpu_gr_config_get_gpc_tpc_count(gr_config,
gpc)));
if (stats[gpc] == NULL) {
err = -ENOMEM;
goto fail;
}
}
for (gpc = 0; gpc < gpc_count; gpc++) {
for (tpc = 0;
tpc < nvgpu_gr_config_get_gpc_tpc_count(gr_config, gpc);
tpc++) {
/**
* Store stats name as below:
* gpc<gpc_value>_tpc<tpc_value>_<name_string>
*/
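			/* e.g. "gpc0_tpc1_<name>" (illustrative only). */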
(void)strcpy(stats[gpc][tpc].name, "gpc");
(void)nvgpu_strnadd_u32(gpc_str, gpc,
sizeof(gpc_str), 10U);
(void)strncat(stats[gpc][tpc].name, gpc_str,
NVGPU_ECC_STAT_NAME_MAX_SIZE -
strlen(stats[gpc][tpc].name));
(void)strncat(stats[gpc][tpc].name, "_tpc",
NVGPU_ECC_STAT_NAME_MAX_SIZE -
strlen(stats[gpc][tpc].name));
(void)nvgpu_strnadd_u32(tpc_str, tpc,
sizeof(tpc_str), 10U);
(void)strncat(stats[gpc][tpc].name, tpc_str,
NVGPU_ECC_STAT_NAME_MAX_SIZE -
strlen(stats[gpc][tpc].name));
(void)strncat(stats[gpc][tpc].name, "_",
NVGPU_ECC_STAT_NAME_MAX_SIZE -
strlen(stats[gpc][tpc].name));
(void)strncat(stats[gpc][tpc].name, name,
NVGPU_ECC_STAT_NAME_MAX_SIZE -
strlen(stats[gpc][tpc].name));
nvgpu_ecc_stat_add(g, &stats[gpc][tpc]);
}
}
*stat = stats;
fail:
if (err != 0) {
while (gpc-- != 0u) {
nvgpu_kfree(g, stats[gpc]);
}
nvgpu_kfree(g, stats);
}
return err;
}
int nvgpu_ecc_counter_init_per_gpc(struct gk20a *g,
struct nvgpu_ecc_stat **stat, const char *name)
{
struct nvgpu_ecc_stat *stats;
struct nvgpu_gr_config *gr_config = nvgpu_gr_get_config_ptr(g);
u32 gpc_count = nvgpu_gr_config_get_gpc_count(gr_config);
u32 gpc;
char gpc_str[10] = {0};
stats = nvgpu_kzalloc(g, nvgpu_safe_mult_u64(sizeof(*stats),
gpc_count));
if (stats == NULL) {
return -ENOMEM;
}
for (gpc = 0; gpc < gpc_count; gpc++) {
/**
* Store stats name as below:
* gpc<gpc_value>_<name_string>
*/
(void)strcpy(stats[gpc].name, "gpc");
(void)nvgpu_strnadd_u32(gpc_str, gpc, sizeof(gpc_str), 10U);
(void)strncat(stats[gpc].name, gpc_str,
NVGPU_ECC_STAT_NAME_MAX_SIZE -
strlen(stats[gpc].name));
(void)strncat(stats[gpc].name, "_",
NVGPU_ECC_STAT_NAME_MAX_SIZE -
strlen(stats[gpc].name));
(void)strncat(stats[gpc].name, name,
NVGPU_ECC_STAT_NAME_MAX_SIZE -
strlen(stats[gpc].name));
nvgpu_ecc_stat_add(g, &stats[gpc]);
}
*stat = stats;
return 0;
}
void nvgpu_ecc_counter_deinit_per_gr(struct gk20a *g,
struct nvgpu_ecc_stat **stats_p)
{
struct nvgpu_ecc_stat *stats = NULL;
u32 i;
if (*stats_p != NULL) {
stats = *stats_p;
for (i = 0; i < g->num_gr_instances; i++) {
nvgpu_ecc_stat_del(g, &stats[i]);
}
nvgpu_kfree(g, stats);
*stats_p = NULL;
}
}
void nvgpu_ecc_counter_deinit_per_tpc(struct gk20a *g,
struct nvgpu_ecc_stat ***stats_p)
{
struct nvgpu_gr_config *gr_config = nvgpu_gr_get_config_ptr(g);
struct nvgpu_ecc_stat **stats = NULL;
u32 gpc_count;
u32 gpc, tpc;
if (*stats_p != NULL) {
gpc_count = nvgpu_gr_config_get_gpc_count(gr_config);
stats = *stats_p;
for (gpc = 0; gpc < gpc_count; gpc++) {
if (stats[gpc] == NULL) {
continue;
}
for (tpc = 0;
tpc < nvgpu_gr_config_get_gpc_tpc_count(gr_config, gpc);
tpc++) {
nvgpu_ecc_stat_del(g, &stats[gpc][tpc]);
}
nvgpu_kfree(g, stats[gpc]);
stats[gpc] = NULL;
}
nvgpu_kfree(g, stats);
*stats_p = NULL;
}
}
void nvgpu_ecc_counter_deinit_per_gpc(struct gk20a *g,
struct nvgpu_ecc_stat **stats_p)
{
struct nvgpu_gr_config *gr_config = nvgpu_gr_get_config_ptr(g);
struct nvgpu_ecc_stat *stats = NULL;
u32 gpc_count;
u32 gpc;
if (*stats_p != NULL) {
gpc_count = nvgpu_gr_config_get_gpc_count(gr_config);
stats = *stats_p;
for (gpc = 0; gpc < gpc_count; gpc++) {
nvgpu_ecc_stat_del(g, &stats[gpc]);
}
nvgpu_kfree(g, stats);
*stats_p = NULL;
}
}
void nvgpu_gr_ecc_free(struct gk20a *g)
{
struct nvgpu_gr_config *gr_config = nvgpu_gr_get_config_ptr(g);
nvgpu_log(g, gpu_dbg_gr, " ");
if (gr_config == NULL) {
return;
}
if (g->ops.gr.ecc.fecs_ecc_deinit != NULL) {
g->ops.gr.ecc.fecs_ecc_deinit(g);
}
if (g->ops.gr.ecc.gpc_tpc_ecc_deinit != NULL) {
g->ops.gr.ecc.gpc_tpc_ecc_deinit(g);
}
}

View File

@@ -0,0 +1,755 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <nvgpu/gk20a.h>
#include <nvgpu/netlist.h>
#include <nvgpu/gr/gr_falcon.h>
#include <nvgpu/enabled.h>
#include <nvgpu/debug.h>
#include <nvgpu/gr/hwpm_map.h>
#include <nvgpu/firmware.h>
#include <nvgpu/sizes.h>
#include <nvgpu/mm.h>
#include <nvgpu/acr.h>
#include <nvgpu/gr/gr_utils.h>
#ifdef CONFIG_NVGPU_LS_PMU
#include <nvgpu/pmu/lsfm.h>
#include <nvgpu/pmu/pmu_pg.h>
#endif
#ifdef CONFIG_NVGPU_DGPU
#include <nvgpu/sec2/lsfm.h>
#endif
#include <nvgpu/dma.h>
#include <nvgpu/static_analysis.h>
#include "gr_falcon_priv.h"
#define NVGPU_FECS_UCODE_IMAGE "fecs.bin"
#define NVGPU_GPCCS_UCODE_IMAGE "gpccs.bin"
struct nvgpu_gr_falcon *nvgpu_gr_falcon_init_support(struct gk20a *g)
{
struct nvgpu_gr_falcon *falcon;
nvgpu_log_fn(g, " ");
falcon = nvgpu_kzalloc(g, sizeof(*falcon));
if (falcon == NULL) {
return falcon;
}
nvgpu_mutex_init(&falcon->fecs_mutex);
falcon->coldboot_bootstrap_done = false;
return falcon;
}
void nvgpu_gr_falcon_remove_support(struct gk20a *g,
struct nvgpu_gr_falcon *falcon)
{
nvgpu_log_fn(g, " ");
if (falcon == NULL) {
return;
}
nvgpu_kfree(g, falcon);
}
#ifdef CONFIG_NVGPU_POWER_PG
int nvgpu_gr_falcon_bind_fecs_elpg(struct gk20a *g)
{
#ifdef CONFIG_NVGPU_LS_PMU
struct nvgpu_pmu *pmu = g->pmu;
struct mm_gk20a *mm = &g->mm;
struct vm_gk20a *vm = mm->pmu.vm;
int err = 0;
u32 size;
u32 data;
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gr, " ");
size = 0;
err = g->ops.gr.falcon.ctrl_ctxsw(g,
NVGPU_GR_FALCON_METHOD_REGLIST_DISCOVER_IMAGE_SIZE, 0U, &size);
if (err != 0) {
nvgpu_err(g,
"fail to query fecs pg buffer size");
return err;
}
nvgpu_log(g, gpu_dbg_gr, "FECS PG buffer size = %u", size);
if (nvgpu_pmu_pg_buf_get_cpu_va(g, pmu) == NULL) {
err = nvgpu_dma_alloc_map_sys(vm, size, nvgpu_pmu_pg_buf(g, pmu));
if (err != 0) {
nvgpu_err(g, "failed to allocate memory");
return -ENOMEM;
}
}
data = g->ops.gr.falcon.get_fecs_current_ctx_data(g,
&mm->pmu.inst_block);
err = g->ops.gr.falcon.ctrl_ctxsw(g,
NVGPU_GR_FALCON_METHOD_REGLIST_BIND_INSTANCE, data, NULL);
if (err != 0) {
nvgpu_err(g,
"fail to bind pmu inst to gr");
return err;
}
data = u64_lo32(nvgpu_pmu_pg_buf_get_gpu_va(g, pmu) >> 8);
err = g->ops.gr.falcon.ctrl_ctxsw(g,
NVGPU_GR_FALCON_METHOD_REGLIST_SET_VIRTUAL_ADDRESS, data, NULL);
if (err != 0) {
nvgpu_err(g,
"fail to set pg buffer pmu va");
return err;
}
nvgpu_log(g, gpu_dbg_gr, "done");
return err;
#else
return 0;
#endif
}
#endif
int nvgpu_gr_falcon_init_ctxsw(struct gk20a *g, struct nvgpu_gr_falcon *falcon)
{
int err = 0;
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gr, " ");
err = g->ops.gr.falcon.load_ctxsw_ucode(g, falcon);
if (err != 0) {
goto out;
}
err = g->ops.gr.falcon.wait_ctxsw_ready(g);
out:
if (err != 0) {
nvgpu_err(g, "fail");
} else {
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gr, "done");
}
return err;
}
int nvgpu_gr_falcon_init_ctx_state(struct gk20a *g,
struct nvgpu_gr_falcon *falcon)
{
struct nvgpu_gr_falcon_query_sizes *sizes = &falcon->sizes;
int err = 0;
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gr, " ");
/* fecs init ramchain */
err = g->ops.gr.falcon.init_ctx_state(g, sizes);
if (err != 0) {
goto out;
}
out:
if (err != 0) {
nvgpu_err(g, "fail");
} else {
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gr, "done");
}
return err;
}
u32 nvgpu_gr_falcon_get_golden_image_size(struct nvgpu_gr_falcon *falcon)
{
return falcon->sizes.golden_image_size;
}
#ifdef CONFIG_NVGPU_DEBUGGER
u32 nvgpu_gr_falcon_get_pm_ctxsw_image_size(struct nvgpu_gr_falcon *falcon)
{
return falcon->sizes.pm_ctxsw_image_size;
}
#endif
#ifdef CONFIG_NVGPU_GFXP
u32 nvgpu_gr_falcon_get_preempt_image_size(struct nvgpu_gr_falcon *falcon)
{
return falcon->sizes.preempt_image_size;
}
#endif /* CONFIG_NVGPU_GFXP */
#ifdef CONFIG_NVGPU_GRAPHICS
u32 nvgpu_gr_falcon_get_zcull_image_size(struct nvgpu_gr_falcon *falcon)
{
return falcon->sizes.zcull_image_size;
}
#endif /* CONFIG_NVGPU_GRAPHICS */
static int nvgpu_gr_falcon_init_ctxsw_ucode_vaspace(struct gk20a *g,
struct nvgpu_gr_falcon *falcon)
{
struct mm_gk20a *mm = &g->mm;
struct vm_gk20a *vm = mm->pmu.vm;
struct nvgpu_ctxsw_ucode_info *ucode_info = &falcon->ctxsw_ucode_info;
int err;
err = nvgpu_alloc_inst_block(g, &ucode_info->inst_blk_desc);
if (err != 0) {
return err;
}
g->ops.mm.init_inst_block(&ucode_info->inst_blk_desc, vm, 0);
/* Map ucode surface to GMMU */
ucode_info->surface_desc.gpu_va = nvgpu_gmmu_map(vm,
&ucode_info->surface_desc,
ucode_info->surface_desc.size,
0, /* flags */
gk20a_mem_flag_read_only,
false,
ucode_info->surface_desc.aperture);
if (ucode_info->surface_desc.gpu_va == 0ULL) {
nvgpu_err(g, "failed to update gmmu ptes");
return -ENOMEM;
}
return 0;
}
static void nvgpu_gr_falcon_init_ctxsw_ucode_segment(
struct nvgpu_ctxsw_ucode_segment *p_seg, u32 *offset, u32 size)
{
u32 ucode_offset;
p_seg->offset = *offset;
p_seg->size = size;
ucode_offset = nvgpu_safe_add_u32(*offset, size);
*offset = NVGPU_ALIGN(ucode_offset, 256U);
}
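/*
 * Resulting ucode surface layout (each segment aligned to 256 bytes):
 * [FECS boot | FECS code | FECS data | GPCCS boot | GPCCS code | GPCCS data]
 */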
static void nvgpu_gr_falcon_init_ctxsw_ucode_segments(
struct nvgpu_ctxsw_ucode_segments *segments, u32 *offset,
struct nvgpu_ctxsw_bootloader_desc *bootdesc,
u32 code_size, u32 data_size)
{
u32 boot_size = NVGPU_ALIGN(bootdesc->size, sizeof(u32));
segments->boot_entry = bootdesc->entry_point;
segments->boot_imem_offset = bootdesc->imem_offset;
nvgpu_gr_falcon_init_ctxsw_ucode_segment(&segments->boot,
offset, boot_size);
nvgpu_gr_falcon_init_ctxsw_ucode_segment(&segments->code,
offset, code_size);
nvgpu_gr_falcon_init_ctxsw_ucode_segment(&segments->data,
offset, data_size);
}
static void nvgpu_gr_falcon_copy_ctxsw_ucode_segments(
struct gk20a *g,
struct nvgpu_mem *dst,
struct nvgpu_ctxsw_ucode_segments *segments,
u32 *bootimage,
u32 *code, u32 *data)
{
unsigned int i;
nvgpu_mem_wr_n(g, dst, segments->boot.offset, bootimage,
segments->boot.size);
nvgpu_mem_wr_n(g, dst, segments->code.offset, code,
segments->code.size);
nvgpu_mem_wr_n(g, dst, segments->data.offset, data,
segments->data.size);
/* compute a "checksum" for the boot binary to detect its version */
segments->boot_signature = 0;
for (i = 0; i < (segments->boot.size / sizeof(u32)); i++) {
segments->boot_signature = nvgpu_gr_checksum_u32(
segments->boot_signature, bootimage[i]);
}
}
int nvgpu_gr_falcon_init_ctxsw_ucode(struct gk20a *g,
struct nvgpu_gr_falcon *falcon)
{
struct nvgpu_ctxsw_bootloader_desc *fecs_boot_desc;
struct nvgpu_ctxsw_bootloader_desc *gpccs_boot_desc;
struct nvgpu_firmware *fecs_fw;
struct nvgpu_firmware *gpccs_fw;
u32 *fecs_boot_image;
u32 *gpccs_boot_image;
struct nvgpu_ctxsw_ucode_info *ucode_info = &falcon->ctxsw_ucode_info;
u32 ucode_size;
int err = 0;
nvgpu_log(g, gpu_dbg_gr, "Requst and copy FECS/GPCCS firmwares");
fecs_fw = nvgpu_request_firmware(g, NVGPU_FECS_UCODE_IMAGE, 0);
if (fecs_fw == NULL) {
nvgpu_err(g, "failed to load fecs ucode!!");
return -ENOENT;
}
fecs_boot_desc = (void *)fecs_fw->data;
fecs_boot_image = (void *)(fecs_fw->data +
sizeof(struct nvgpu_ctxsw_bootloader_desc));
gpccs_fw = nvgpu_request_firmware(g, NVGPU_GPCCS_UCODE_IMAGE, 0);
if (gpccs_fw == NULL) {
nvgpu_release_firmware(g, fecs_fw);
nvgpu_err(g, "failed to load gpccs ucode!!");
return -ENOENT;
}
gpccs_boot_desc = (void *)gpccs_fw->data;
gpccs_boot_image = (void *)(gpccs_fw->data +
sizeof(struct nvgpu_ctxsw_bootloader_desc));
ucode_size = 0;
nvgpu_gr_falcon_init_ctxsw_ucode_segments(&ucode_info->fecs,
&ucode_size, fecs_boot_desc,
nvgpu_safe_mult_u32(
nvgpu_netlist_get_fecs_inst_count(g), (u32)sizeof(u32)),
nvgpu_safe_mult_u32(
nvgpu_netlist_get_fecs_data_count(g), (u32)sizeof(u32)));
nvgpu_gr_falcon_init_ctxsw_ucode_segments(&ucode_info->gpccs,
&ucode_size, gpccs_boot_desc,
nvgpu_safe_mult_u32(
nvgpu_netlist_get_gpccs_inst_count(g), (u32)sizeof(u32)),
nvgpu_safe_mult_u32(
nvgpu_netlist_get_gpccs_data_count(g), (u32)sizeof(u32)));
err = nvgpu_dma_alloc_sys(g, ucode_size, &ucode_info->surface_desc);
if (err != 0) {
goto clean_up;
}
nvgpu_gr_falcon_copy_ctxsw_ucode_segments(g,
&ucode_info->surface_desc,
&ucode_info->fecs,
fecs_boot_image,
nvgpu_netlist_get_fecs_inst_list(g),
nvgpu_netlist_get_fecs_data_list(g));
nvgpu_release_firmware(g, fecs_fw);
fecs_fw = NULL;
nvgpu_gr_falcon_copy_ctxsw_ucode_segments(g,
&ucode_info->surface_desc,
&ucode_info->gpccs,
gpccs_boot_image,
nvgpu_netlist_get_gpccs_inst_list(g),
nvgpu_netlist_get_gpccs_data_list(g));
nvgpu_release_firmware(g, gpccs_fw);
gpccs_fw = NULL;
err = nvgpu_gr_falcon_init_ctxsw_ucode_vaspace(g, falcon);
if (err != 0) {
goto clean_up;
}
return 0;
clean_up:
nvgpu_dma_free(g, &ucode_info->surface_desc);
if (gpccs_fw != NULL) {
nvgpu_release_firmware(g, gpccs_fw);
gpccs_fw = NULL;
}
if (fecs_fw != NULL) {
nvgpu_release_firmware(g, fecs_fw);
fecs_fw = NULL;
}
return err;
}
static void nvgpu_gr_falcon_bind_instblk(struct gk20a *g,
struct nvgpu_gr_falcon *falcon)
{
struct nvgpu_ctxsw_ucode_info *ucode_info =
&falcon->ctxsw_ucode_info;
u64 inst_ptr;
if (g->ops.gr.falcon.bind_instblk == NULL) {
return;
}
inst_ptr = nvgpu_inst_block_addr(g, &ucode_info->inst_blk_desc);
g->ops.gr.falcon.bind_instblk(g, &ucode_info->inst_blk_desc,
inst_ptr);
}
#ifdef CONFIG_NVGPU_GR_FALCON_NON_SECURE_BOOT
static void nvgpu_gr_falcon_load_dmem(struct gk20a *g)
{
u32 ucode_u32_size;
const u32 *ucode_u32_data;
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gr, " ");
ucode_u32_size = nvgpu_netlist_get_gpccs_data_count(g);
ucode_u32_data = (const u32 *)nvgpu_netlist_get_gpccs_data_list(g);
g->ops.gr.falcon.load_gpccs_dmem(g, ucode_u32_data, ucode_u32_size);
ucode_u32_size = nvgpu_netlist_get_fecs_data_count(g);
ucode_u32_data = (const u32 *)nvgpu_netlist_get_fecs_data_list(g);
g->ops.gr.falcon.load_fecs_dmem(g, ucode_u32_data, ucode_u32_size);
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gr, "done");
}
static void nvgpu_gr_falcon_load_imem(struct gk20a *g)
{
u32 ucode_u32_size;
const u32 *ucode_u32_data;
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gr, " ");
ucode_u32_size = nvgpu_netlist_get_gpccs_inst_count(g);
ucode_u32_data = (const u32 *)nvgpu_netlist_get_gpccs_inst_list(g);
g->ops.gr.falcon.load_gpccs_imem(g, ucode_u32_data, ucode_u32_size);
ucode_u32_size = nvgpu_netlist_get_fecs_inst_count(g);
ucode_u32_data = (const u32 *)nvgpu_netlist_get_fecs_inst_list(g);
g->ops.gr.falcon.load_fecs_imem(g, ucode_u32_data, ucode_u32_size);
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gr, "done");
}
static void nvgpu_gr_falcon_load_ctxsw_ucode_header(struct gk20a *g,
u64 addr_base, struct nvgpu_ctxsw_ucode_segments *segments,
u32 reg_offset)
{
u32 addr_code32 = u64_lo32(nvgpu_safe_add_u64(addr_base,
segments->code.offset) >> 8);
u32 addr_data32 = u64_lo32(nvgpu_safe_add_u64(addr_base,
segments->data.offset) >> 8);
g->ops.gr.falcon.load_ctxsw_ucode_header(g, reg_offset,
segments->boot_signature, addr_code32, addr_data32,
segments->code.size, segments->data.size);
}
static void nvgpu_gr_falcon_load_ctxsw_ucode_boot(struct gk20a *g,
u64 addr_base, struct nvgpu_ctxsw_ucode_segments *segments,
u32 reg_offset)
{
u32 addr_load32 = u64_lo32(nvgpu_safe_add_u64(addr_base,
segments->boot.offset) >> 8);
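	/* Boot image size rounded up to whole 256-byte IMEM blocks. */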
u32 blocks = (nvgpu_safe_add_u32(segments->boot.size, 0xFFU)
& ~0xFFU) >> 8;
u32 dst = segments->boot_imem_offset;
g->ops.gr.falcon.load_ctxsw_ucode_boot(g, reg_offset,
segments->boot_entry, addr_load32, blocks, dst);
}
static void nvgpu_gr_falcon_load_ctxsw_ucode_segments(
struct gk20a *g, u64 addr_base,
struct nvgpu_ctxsw_ucode_segments *segments, u32 reg_offset)
{
/* Copy falcon bootloader into dmem */
nvgpu_gr_falcon_load_ctxsw_ucode_header(g, addr_base,
segments, reg_offset);
nvgpu_gr_falcon_load_ctxsw_ucode_boot(g,
addr_base, segments, reg_offset);
}
static void nvgpu_gr_falcon_load_with_bootloader(struct gk20a *g,
struct nvgpu_gr_falcon *falcon)
{
struct nvgpu_ctxsw_ucode_info *ucode_info =
&falcon->ctxsw_ucode_info;
u64 addr_base = ucode_info->surface_desc.gpu_va;
nvgpu_log(g, gpu_dbg_gr, " ");
nvgpu_gr_falcon_bind_instblk(g, falcon);
nvgpu_gr_falcon_load_ctxsw_ucode_segments(g, addr_base,
&falcon->ctxsw_ucode_info.fecs, 0);
nvgpu_gr_falcon_load_ctxsw_ucode_segments(g, addr_base,
&falcon->ctxsw_ucode_info.gpccs,
g->ops.gr.falcon.get_gpccs_start_reg_offset());
}
int nvgpu_gr_falcon_load_ctxsw_ucode(struct gk20a *g,
struct nvgpu_gr_falcon *falcon)
{
int err;
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gr, " ");
#ifdef CONFIG_NVGPU_SIM
if (nvgpu_is_enabled(g, NVGPU_IS_FMODEL)) {
g->ops.gr.falcon.configure_fmodel(g);
}
#endif
/*
* In case bootloader is not supported, revert to the old way of
* loading gr ucode, without the faster bootstrap routine.
*/
if (!nvgpu_is_enabled(g, NVGPU_GR_USE_DMA_FOR_FW_BOOTSTRAP)) {
nvgpu_gr_falcon_load_dmem(g);
nvgpu_gr_falcon_load_imem(g);
g->ops.gr.falcon.start_ucode(g);
} else {
if (!falcon->skip_ucode_init) {
err = nvgpu_gr_falcon_init_ctxsw_ucode(g, falcon);
if (err != 0) {
return err;
}
}
nvgpu_gr_falcon_load_with_bootloader(g, falcon);
falcon->skip_ucode_init = true;
}
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gr, "done");
return 0;
}
static void nvgpu_gr_falcon_load_gpccs_with_bootloader(struct gk20a *g,
struct nvgpu_gr_falcon *falcon)
{
struct nvgpu_ctxsw_ucode_info *ucode_info =
&falcon->ctxsw_ucode_info;
u64 addr_base = ucode_info->surface_desc.gpu_va;
nvgpu_gr_falcon_bind_instblk(g, falcon);
nvgpu_gr_falcon_load_ctxsw_ucode_segments(g, addr_base,
&falcon->ctxsw_ucode_info.gpccs,
g->ops.gr.falcon.get_gpccs_start_reg_offset());
}
#endif
#if defined(CONFIG_NVGPU_DGPU) || defined(CONFIG_NVGPU_LS_PMU)
static int gr_falcon_sec2_or_ls_pmu_bootstrap(struct gk20a *g,
bool *bootstrap, u32 falcon_id_mask)
{
int err = 0;
bool bootstrap_set = false;
#ifdef CONFIG_NVGPU_DGPU
if (nvgpu_is_enabled(g, NVGPU_SUPPORT_SEC2_RTOS)) {
bootstrap_set = true;
nvgpu_log(g, gpu_dbg_gr, "bootstrap by SEC2");
err = nvgpu_sec2_bootstrap_ls_falcons(g,
&g->sec2, FALCON_ID_FECS);
if (err == 0) {
err = nvgpu_sec2_bootstrap_ls_falcons(g,
&g->sec2, FALCON_ID_GPCCS);
}
} else
#endif
#ifdef CONFIG_NVGPU_LS_PMU
if (g->support_ls_pmu) {
bootstrap_set = true;
nvgpu_log(g, gpu_dbg_gr, "bootstrap by LS PMU");
err = nvgpu_pmu_lsfm_bootstrap_ls_falcon(g,
g->pmu, g->pmu->lsfm,
falcon_id_mask);
}
#endif
*bootstrap = bootstrap_set;
return err;
}
static int gr_falcon_sec2_or_ls_pmu_recovery_bootstrap(struct gk20a *g)
{
int err = 0;
bool bootstrap = false;
u32 falcon_idmask = BIT32(FALCON_ID_FECS) | BIT32(FALCON_ID_GPCCS);
err = gr_falcon_sec2_or_ls_pmu_bootstrap(g,
&bootstrap,
falcon_idmask);
if ((err == 0) && (!bootstrap)) {
err = nvgpu_acr_bootstrap_hs_acr(g, g->acr);
if (err != 0) {
nvgpu_err(g,
"ACR GR LSF bootstrap failed");
}
}
return err;
}
static int gr_falcon_sec2_or_ls_pmu_coldboot_bootstrap(struct gk20a *g)
{
int err = 0;
u8 falcon_id_mask = 0;
bool bootstrap = false;
if (!nvgpu_is_enabled(g, NVGPU_SEC_SECUREGPCCS)) {
return err;
}
if (nvgpu_acr_is_lsf_lazy_bootstrap(g, g->acr,
FALCON_ID_FECS)) {
falcon_id_mask |= BIT8(FALCON_ID_FECS);
}
if (nvgpu_acr_is_lsf_lazy_bootstrap(g, g->acr,
FALCON_ID_GPCCS)) {
falcon_id_mask |= BIT8(FALCON_ID_GPCCS);
}
err = gr_falcon_sec2_or_ls_pmu_bootstrap(g,
&bootstrap,
(u32)falcon_id_mask);
if ((err == 0) && (!bootstrap)) {
/* GR falcons bootstrapped by ACR */
nvgpu_log(g, gpu_dbg_gr, "bootstrap by ACR");
err = 0;
}
return err;
}
#endif
static int gr_falcon_recovery_bootstrap(struct gk20a *g,
struct nvgpu_gr_falcon *falcon)
{
int err = 0;
#ifdef CONFIG_NVGPU_GR_FALCON_NON_SECURE_BOOT
if (!nvgpu_is_enabled(g, NVGPU_SEC_SECUREGPCCS)) {
nvgpu_gr_falcon_load_gpccs_with_bootloader(g, falcon);
#ifdef CONFIG_NVGPU_LS_PMU
err = nvgpu_pmu_lsfm_bootstrap_ls_falcon(g, g->pmu,
g->pmu->lsfm, BIT32(FALCON_ID_FECS));
#endif
} else
#endif
{
/* bind WPR VA inst block */
nvgpu_gr_falcon_bind_instblk(g, falcon);
#if defined(CONFIG_NVGPU_DGPU) || defined(CONFIG_NVGPU_LS_PMU)
err = gr_falcon_sec2_or_ls_pmu_recovery_bootstrap(g);
#else
err = nvgpu_acr_bootstrap_hs_acr(g, g->acr);
if (err != 0) {
nvgpu_err(g,
"ACR GR LSF bootstrap failed");
}
#endif
}
return err;
}
static void gr_falcon_coldboot_bootstrap(struct gk20a *g,
struct nvgpu_gr_falcon *falcon)
{
#ifdef CONFIG_NVGPU_GR_FALCON_NON_SECURE_BOOT
if (!nvgpu_is_enabled(g, NVGPU_SEC_SECUREGPCCS)) {
nvgpu_gr_falcon_load_gpccs_with_bootloader(g, falcon);
} else
#endif
{
/* bind WPR VA inst block */
nvgpu_gr_falcon_bind_instblk(g, falcon);
}
}
int nvgpu_gr_falcon_load_secure_ctxsw_ucode(struct gk20a *g,
struct nvgpu_gr_falcon *falcon)
{
int err = 0;
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gr, " ");
#ifdef CONFIG_NVGPU_SIM
if (nvgpu_is_enabled(g, NVGPU_IS_FMODEL)) {
g->ops.gr.falcon.configure_fmodel(g);
}
#endif
if (falcon->coldboot_bootstrap_done) {
nvgpu_log(g, gpu_dbg_gr, "recovery bootstrap");
/* this must be recovery so bootstrap fecs and gpccs */
err = gr_falcon_recovery_bootstrap(g, falcon);
if (err != 0) {
nvgpu_err(g, "Unable to recover GR falcon");
return err;
}
} else {
nvgpu_log(g, gpu_dbg_gr, "coldboot bootstrap");
		/* cold boot or rail-gate (RG) exit */
falcon->coldboot_bootstrap_done = true;
gr_falcon_coldboot_bootstrap(g, falcon);
#if defined(CONFIG_NVGPU_DGPU) || defined(CONFIG_NVGPU_LS_PMU)
err = gr_falcon_sec2_or_ls_pmu_coldboot_bootstrap(g);
if (err != 0) {
nvgpu_err(g, "Unable to boot GPCCS");
return err;
}
#endif
}
g->ops.gr.falcon.start_gpccs(g);
g->ops.gr.falcon.start_fecs(g);
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gr, "done");
return 0;
}
struct nvgpu_ctxsw_ucode_segments *nvgpu_gr_falcon_get_fecs_ucode_segments(
struct nvgpu_gr_falcon *falcon)
{
return &falcon->ctxsw_ucode_info.fecs;
}
struct nvgpu_ctxsw_ucode_segments *nvgpu_gr_falcon_get_gpccs_ucode_segments(
struct nvgpu_gr_falcon *falcon)
{
return &falcon->ctxsw_ucode_info.gpccs;
}
void *nvgpu_gr_falcon_get_surface_desc_cpu_va(struct nvgpu_gr_falcon *falcon)
{
return falcon->ctxsw_ucode_info.surface_desc.cpu_va;
}
#ifdef CONFIG_NVGPU_ENGINE_RESET
struct nvgpu_mutex *nvgpu_gr_falcon_get_fecs_mutex(
struct nvgpu_gr_falcon *falcon)
{
return &falcon->fecs_mutex;
}
#endif

View File

@@ -0,0 +1,213 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#ifndef NVGPU_GR_FALCON_PRIV_H
#define NVGPU_GR_FALCON_PRIV_H
#include <nvgpu/types.h>
#include <nvgpu/nvgpu_mem.h>
struct nvgpu_ctxsw_ucode_segments;
/** GPCCS boot signature for T18X chip, type: with reserved. */
#define FALCON_UCODE_SIG_T18X_GPCCS_WITH_RESERVED 0x68edab34U
/** FECS boot signature for T21X chip, type: with DMEM size. */
#define FALCON_UCODE_SIG_T21X_FECS_WITH_DMEM_SIZE 0x9121ab5cU
/** FECS boot signature for T21X chip, type: with reserved. */
#define FALCON_UCODE_SIG_T21X_FECS_WITH_RESERVED 0x9125ab5cU
/** FECS boot signature for T21X chip, type: without reserved. */
#define FALCON_UCODE_SIG_T21X_FECS_WITHOUT_RESERVED 0x93671b7dU
/** FECS boot signature for T21X chip, type: without reserved2. */
#define FALCON_UCODE_SIG_T21X_FECS_WITHOUT_RESERVED2 0x4d6cbc10U
/** GPCCS boot signature for T21X chip, type: with reserved. */
#define FALCON_UCODE_SIG_T21X_GPCCS_WITH_RESERVED 0x3d3d65e2U
/** GPCCS boot signature for T21X chip, type: without reserved. */
#define FALCON_UCODE_SIG_T21X_GPCCS_WITHOUT_RESERVED 0x393161daU
/** FECS boot signature for T12X chip, type: with reserved. */
#define FALCON_UCODE_SIG_T12X_FECS_WITH_RESERVED 0x8a621f78U
/** FECS boot signature for T12X chip, type: without reserved. */
#define FALCON_UCODE_SIG_T12X_FECS_WITHOUT_RESERVED 0x67e5344bU
/** FECS boot signature for T12X chip, type: older. */
#define FALCON_UCODE_SIG_T12X_FECS_OLDER 0x56da09fU
/** GPCCS boot signature for T12X chip, type: with reserved. */
#define FALCON_UCODE_SIG_T12X_GPCCS_WITH_RESERVED 0x303465d5U
/** GPCCS boot signature for T12X chip, type: without reserved. */
#define FALCON_UCODE_SIG_T12X_GPCCS_WITHOUT_RESERVED 0x3fdd33d3U
/** GPCCS boot signature for T12X chip, type: older. */
#define FALCON_UCODE_SIG_T12X_GPCCS_OLDER 0x53d7877U
enum wait_ucode_status {
/** Status of ucode wait operation : LOOP. */
WAIT_UCODE_LOOP,
/** Status of ucode wait operation : timedout. */
WAIT_UCODE_TIMEOUT,
/** Status of ucode wait operation : error. */
WAIT_UCODE_ERROR,
/** Status of ucode wait operation : success. */
WAIT_UCODE_OK
};
/** Falcon operation condition : EQUAL. */
#define GR_IS_UCODE_OP_EQUAL 0U
/** Falcon operation condition : NOT_EQUAL. */
#define GR_IS_UCODE_OP_NOT_EQUAL 1U
/** Falcon operation condition : AND. */
#define GR_IS_UCODE_OP_AND 2U
/** Falcon operation condition : LESSER. */
#define GR_IS_UCODE_OP_LESSER 3U
/** Falcon operation condition : LESSER_EQUAL. */
#define GR_IS_UCODE_OP_LESSER_EQUAL 4U
/** Falcon operation condition : SKIP. */
#define GR_IS_UCODE_OP_SKIP 5U
/** Mailbox value in case of successful operation. */
#define FALCON_UCODE_HANDSHAKE_INIT_COMPLETE 1U
struct fecs_mthd_op_method {
/** Method address to send to FECS microcontroller. */
u32 addr;
/** Method data to send to FECS microcontroller. */
u32 data;
};
struct fecs_mthd_op_mailbox {
/** Mailbox ID to perform operation. */
u32 id;
/** Mailbox data to be written. */
u32 data;
/** Mailbox clear value. */
u32 clr;
/** Last read mailbox value. */
u32 *ret;
/** Mailbox value in case of operation success. */
u32 ok;
/** Mailbox value in case of operation failure. */
u32 fail;
};
struct fecs_mthd_op_cond {
/** Operation success condition. */
u32 ok;
/** Operation fail condition. */
u32 fail;
};
/**
* FECS method operation structure.
*
* This structure defines the protocol for communication with FECS
* microcontroller.
*/
struct nvgpu_fecs_method_op {
/** Method struct */
struct fecs_mthd_op_method method;
/** Mailbox struct */
struct fecs_mthd_op_mailbox mailbox;
/** Condition struct */
struct fecs_mthd_op_cond cond;
};
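/*
 * Illustrative example (field values are hypothetical): a caller that
 * submits a method and polls mailbox 0 for the init handshake might fill
 * this roughly as:
 *
 *   struct nvgpu_fecs_method_op op = {
 *       .method  = { .addr = mthd_addr, .data = mthd_data },
 *       .mailbox = { .id = 0U, .data = 0U, .clr = ~U32(0U), .ret = NULL,
 *                    .ok = FALCON_UCODE_HANDSHAKE_INIT_COMPLETE, .fail = 0U },
 *       .cond    = { .ok = GR_IS_UCODE_OP_EQUAL, .fail = GR_IS_UCODE_OP_SKIP },
 *   };
 */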
/**
* CTXSW falcon bootloader descriptor structure.
*/
struct nvgpu_ctxsw_bootloader_desc {
/** Start offset, unused. */
u32 start_offset;
/** Size, unused. */
u32 size;
/** IMEM offset. */
u32 imem_offset;
/** Falcon boot vector. */
u32 entry_point;
};
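/*
 * Note: this descriptor is laid out at the start of the fecs.bin/gpccs.bin
 * firmware images; the boot image data immediately follows it (see
 * nvgpu_gr_falcon_init_ctxsw_ucode()).
 */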
/**
* CTXSW ucode information structure.
*/
struct nvgpu_ctxsw_ucode_info {
/** Memory to store ucode instance block. */
struct nvgpu_mem inst_blk_desc;
/** Memory to store ucode contents locally. */
struct nvgpu_mem surface_desc;
/** Ucode segments for FECS. */
struct nvgpu_ctxsw_ucode_segments fecs;
/** Ucode segments for GPCCS. */
struct nvgpu_ctxsw_ucode_segments gpccs;
};
/**
* Structure to store various sizes queried from FECS
*/
struct nvgpu_gr_falcon_query_sizes {
/** Size of golden context image. */
u32 golden_image_size;
#ifdef CONFIG_NVGPU_DEBUGGER
u32 pm_ctxsw_image_size;
#endif
#ifdef CONFIG_NVGPU_GFXP
u32 preempt_image_size;
#endif
#ifdef CONFIG_NVGPU_GRAPHICS
u32 zcull_image_size;
#endif
};
/**
* GR falcon data structure.
*
* This structure stores all data required to load and boot CTXSW ucode,
* and also to communicate with FECS microcontroller.
*/
struct nvgpu_gr_falcon {
/**
* CTXSW ucode information structure.
*/
struct nvgpu_ctxsw_ucode_info ctxsw_ucode_info;
/**
* Mutex to protect all FECS methods.
*/
struct nvgpu_mutex fecs_mutex;
/**
* Flag to skip ucode initialization if it is already done.
*/
bool skip_ucode_init;
/**
* Flag to trigger recovery bootstrap in case coldboot bootstrap
* was already done.
*/
bool coldboot_bootstrap_done;
/**
* Structure to hold various sizes that are queried from FECS
* microcontroller.
*/
struct nvgpu_gr_falcon_query_sizes sizes;
};
#endif /* NVGPU_GR_FALCON_PRIV_H */

View File

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,224 @@
/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#ifndef NVGPU_GR_INTR_PRIV_H
#define NVGPU_GR_INTR_PRIV_H
#include <nvgpu/types.h>
#include <nvgpu/lock.h>
#include <include/nvgpu/gr/gr_falcon.h>
struct nvgpu_channel;
/**
* Size of lookup buffer used for context translation to GPU channel
* and TSG identifiers.
* This value must be a power of 2.
*/
#define GR_CHANNEL_MAP_TLB_SIZE 2U
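/*
 * A power-of-2 size presumably lets the replacement index wrap with a
 * simple mask, e.g. (idx + 1U) & (GR_CHANNEL_MAP_TLB_SIZE - 1U), rather
 * than a modulo (see channel_tlb_flush_index below).
 */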
/**
* GR interrupt information struct.
*
* This structure maintains information on pending GR engine interrupts.
*/
struct nvgpu_gr_intr_info {
/**
* This value is set in case notification interrupt is pending.
* Same value is used to clear the interrupt.
*/
u32 notify;
/**
* This value is set in case semaphore interrupt is pending.
* Same value is used to clear the interrupt.
*/
u32 semaphore;
/**
* This value is set in case illegal notify interrupt is pending.
* Same value is used to clear the interrupt.
*/
u32 illegal_notify;
/**
* This value is set in case illegal method interrupt is pending.
* Same value is used to clear the interrupt.
*/
u32 illegal_method;
/**
* This value is set in case illegal class interrupt is pending.
* Same value is used to clear the interrupt.
*/
u32 illegal_class;
/**
* This value is set in case FECS error interrupt is pending.
* Same value is used to clear the interrupt.
*/
u32 fecs_error;
/**
	 * This value is set in case class error interrupt is pending.
* Same value is used to clear the interrupt.
*/
u32 class_error;
/**
* This value is set in case firmware method interrupt is pending.
* Same value is used to clear the interrupt.
*/
u32 fw_method;
/**
* This value is set in case exception is pending in graphics pipe.
* Same value is used to clear the interrupt.
*/
u32 exception;
/*
	 * This value is set when the FE receives a valid method that
	 * matches the value configured in the PRI_FE_DEBUG_METHOD_* pri
	 * registers; on a match, the FE drops that method.
	 * This gives SW a way to turn off HW decoding of a method and
	 * handle it as a SW method instead.
*/
u32 debug_method;
/*
* This value is set on the completion of a LaunchDma method with
* InterruptType field configured to INTERRUPT.
*/
u32 buffer_notify;
};
/**
* TPC exception data structure.
*
 * A TPC exception can be decomposed into exceptions triggered by its
 * subunits. This structure keeps track of which subunits have
 * triggered an exception.
*/
struct nvgpu_gr_tpc_exception {
/**
* This flag is set in case TEX exception is pending.
*/
bool tex_exception;
/**
* This flag is set in case SM exception is pending.
*/
bool sm_exception;
/**
* This flag is set in case MPC exception is pending.
*/
bool mpc_exception;
/**
* This flag is set in case PE exception is pending.
*/
bool pe_exception;
};
/**
* GR ISR data structure.
*
* This structure holds all necessary information to handle all GR engine
* error/exception interrupts.
*/
struct nvgpu_gr_isr_data {
/**
* Contents of TRAPPED_ADDR register used to decode below
* fields.
*/
u32 addr;
/**
* Low word of the trapped method data.
*/
u32 data_lo;
/**
* High word of the trapped method data.
*/
u32 data_hi;
/**
* Information of current context.
*/
u32 curr_ctx;
/**
* Pointer to faulted GPU channel.
*/
struct nvgpu_channel *ch;
/**
* Address of the trapped method.
*/
u32 offset;
/**
* Subchannel ID of the trapped method.
*/
u32 sub_chan;
/**
* Class ID corresponding to above subchannel.
*/
u32 class_num;
/**
* Value read from fecs_host_int_status h/w reg.
*/
u32 fecs_intr;
/**
* S/W defined status for fecs_host_int_status.
*/
struct nvgpu_fecs_host_intr_status fecs_host_intr_status;
};
/**
* Details of lookup buffer used to translate context to GPU
* channel/TSG identifiers.
*/
struct gr_channel_map_tlb_entry {
/**
* Information of context.
*/
u32 curr_ctx;
/**
* GPU channel ID.
*/
u32 chid;
/**
* GPU Time Slice Group ID.
*/
u32 tsgid;
};
/**
* GR interrupt management data structure.
*
* This structure holds various fields to manage GR engine interrupt
* handling.
*/
struct nvgpu_gr_intr {
/**
* Lookup buffer structure used to translate context to GPU
* channel and TSG identifiers.
*/
struct gr_channel_map_tlb_entry chid_tlb[GR_CHANNEL_MAP_TLB_SIZE];
/**
* Entry in lookup buffer that should be overwritten if there is
* no remaining free entry.
*/
u32 channel_tlb_flush_index;
/**
* Spinlock for all lookup buffer accesses.
*/
struct nvgpu_spinlock ch_tlb_lock;
};
#endif /* NVGPU_GR_INTR_PRIV_H */

View File

@@ -0,0 +1,143 @@
/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#ifndef NVGPU_GR_PRIV_H
#define NVGPU_GR_PRIV_H
#include <nvgpu/types.h>
#include <nvgpu/cond.h>
struct nvgpu_gr_ctx_desc;
struct nvgpu_gr_global_ctx_buffer_desc;
struct nvgpu_gr_obj_ctx_golden_image;
struct nvgpu_gr_config;
#ifdef CONFIG_NVGPU_GRAPHICS
struct nvgpu_gr_zbc;
struct nvgpu_gr_zcull;
#endif
#ifdef CONFIG_NVGPU_DEBUGGER
struct nvgpu_gr_hwpm_map;
#endif
/**
* GR engine data structure.
*
* This is the parent structure to all other GR engine data structures,
 * and holds pointers to all of them. This structure also stores
* various fields to track GR engine initialization state.
*
* Pointer to this structure is maintained in GPU driver structure.
*/
struct nvgpu_gr {
/**
* Pointer to GPU driver struct.
*/
struct gk20a *g;
/**
* Instance ID of GR engine.
*/
u32 instance_id;
/**
* Condition variable for GR initialization.
* Waiters shall wait on this condition to ensure GR engine
* is initialized.
*/
struct nvgpu_cond init_wq;
/**
* Flag to indicate if GR engine is initialized.
*/
bool initialized;
/**
* Syspipe ID of the GR instance.
*/
u32 syspipe_id;
/**
* Pointer to global context buffer descriptor structure.
*/
struct nvgpu_gr_global_ctx_buffer_desc *global_ctx_buffer;
/**
* Pointer to Golden context image structure.
*/
struct nvgpu_gr_obj_ctx_golden_image *golden_image;
/**
* Pointer to GR context descriptor structure.
*/
struct nvgpu_gr_ctx_desc *gr_ctx_desc;
/**
* Pointer to GR configuration structure.
*/
struct nvgpu_gr_config *config;
/**
* Pointer to GR falcon data structure.
*/
struct nvgpu_gr_falcon *falcon;
/**
* Pointer to GR interrupt data structure.
*/
struct nvgpu_gr_intr *intr;
/**
* Function pointer to remove GR s/w support.
*/
void (*remove_support)(struct gk20a *g);
/**
* Flag to indicate GR s/w has been initialized.
*/
bool sw_ready;
#ifdef CONFIG_NVGPU_DEBUGGER
struct nvgpu_gr_hwpm_map *hwpm_map;
#endif
#ifdef CONFIG_NVGPU_GRAPHICS
struct nvgpu_gr_zcull *zcull;
struct nvgpu_gr_zbc *zbc;
#endif
#ifdef CONFIG_NVGPU_NON_FUSA
u32 fecs_feature_override_ecc_val;
#endif
#ifdef CONFIG_NVGPU_CILP
u32 cilp_preempt_pending_chid;
#endif
#if defined(CONFIG_NVGPU_RECOVERY) || defined(CONFIG_NVGPU_DEBUGGER)
struct nvgpu_mutex ctxsw_disable_mutex;
int ctxsw_disable_count;
#endif
};
#endif /* NVGPU_GR_PRIV_H */

View File

@@ -0,0 +1,396 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <nvgpu/log.h>
#include <nvgpu/gk20a.h>
#include <nvgpu/gr/ctx.h>
#include <nvgpu/gr/subctx.h>
#include <nvgpu/gr/obj_ctx.h>
#ifdef CONFIG_NVGPU_GRAPHICS
#include <nvgpu/gr/zcull.h>
#endif
#include <nvgpu/gr/setup.h>
#include <nvgpu/gr/gr_instances.h>
#include <nvgpu/channel.h>
#include <nvgpu/preempt.h>
#include "gr_priv.h"
#ifdef CONFIG_NVGPU_GRAPHICS
static int nvgpu_gr_setup_zcull(struct gk20a *g, struct nvgpu_channel *c,
struct nvgpu_gr_ctx *gr_ctx)
{
int ret = 0;
nvgpu_log_fn(g, " ");
ret = nvgpu_channel_disable_tsg(g, c);
if (ret != 0) {
nvgpu_err(g, "failed to disable channel/TSG");
return ret;
}
ret = nvgpu_preempt_channel(g, c);
if (ret != 0) {
nvgpu_err(g, "failed to preempt channel/TSG");
goto out;
}
ret = nvgpu_gr_zcull_ctx_setup(g, c->subctx, gr_ctx);
if (ret != 0) {
nvgpu_err(g, "failed to setup zcull");
goto out;
}
/* no error at this point */
ret = nvgpu_channel_enable_tsg(g, c);
if (ret != 0) {
nvgpu_err(g, "failed to re-enable channel/TSG");
}
return ret;
out:
/*
	 * Control reaches here if the preempt or the nvgpu_gr_zcull_ctx_setup
	 * call failed. Propagate the corresponding error.
*/
if (nvgpu_channel_enable_tsg(g, c) != 0) {
/* ch might not be bound to tsg */
nvgpu_err(g, "failed to enable channel/TSG");
}
return ret;
}
int nvgpu_gr_setup_bind_ctxsw_zcull(struct gk20a *g, struct nvgpu_channel *c,
u64 zcull_va, u32 mode)
{
struct nvgpu_tsg *tsg;
struct nvgpu_gr_ctx *gr_ctx;
tsg = nvgpu_tsg_from_ch(c);
if (tsg == NULL) {
return -EINVAL;
}
gr_ctx = tsg->gr_ctx;
nvgpu_gr_ctx_set_zcull_ctx(g, gr_ctx, mode, zcull_va);
return nvgpu_gr_setup_zcull(g, c, gr_ctx);
}
#endif
static int nvgpu_gr_setup_validate_channel_and_class(struct gk20a *g,
struct nvgpu_channel *c, u32 class_num)
{
int err = 0;
/* an address space needs to have been bound at this point.*/
if (!nvgpu_channel_as_bound(c)) {
nvgpu_err(g,
"not bound to address space at time"
" of grctx allocation");
return -EINVAL;
}
if (!g->ops.gpu_class.is_valid(class_num)) {
nvgpu_err(g,
"invalid obj class 0x%x", class_num);
err = -EINVAL;
}
return err;
}
static int nvgpu_gr_setup_alloc_subctx(struct gk20a *g, struct nvgpu_channel *c)
{
int err = 0;
if (nvgpu_is_enabled(g, NVGPU_SUPPORT_TSG_SUBCONTEXTS)) {
if (c->subctx == NULL) {
c->subctx = nvgpu_gr_subctx_alloc(g, c->vm);
if (c->subctx == NULL) {
err = -ENOMEM;
}
}
}
return err;
}
int nvgpu_gr_setup_alloc_obj_ctx(struct nvgpu_channel *c, u32 class_num,
u32 flags)
{
struct gk20a *g = c->g;
struct nvgpu_gr_ctx *gr_ctx;
struct nvgpu_tsg *tsg = NULL;
int err = 0;
struct nvgpu_gr *gr = nvgpu_gr_get_cur_instance_ptr(g);
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gr,
"GR%u: allocate object context for channel %u",
gr->instance_id, c->chid);
err = nvgpu_gr_setup_validate_channel_and_class(g, c, class_num);
if (err != 0) {
goto out;
}
c->obj_class = class_num;
#ifndef CONFIG_NVGPU_HAL_NON_FUSA
/*
* Only compute and graphics classes need object context.
* Return success for valid non-compute and non-graphics classes.
* Invalid classes are already captured in
* nvgpu_gr_setup_validate_channel_and_class() function.
*/
if (!g->ops.gpu_class.is_valid_compute(class_num) &&
!g->ops.gpu_class.is_valid_gfx(class_num)) {
return 0;
}
#endif
tsg = nvgpu_tsg_from_ch(c);
if (tsg == NULL) {
return -EINVAL;
}
err = nvgpu_gr_setup_alloc_subctx(g, c);
if (err != 0) {
nvgpu_err(g, "failed to allocate gr subctx buffer");
goto out;
}
nvgpu_mutex_acquire(&tsg->ctx_init_lock);
gr_ctx = tsg->gr_ctx;
if (!nvgpu_mem_is_valid(nvgpu_gr_ctx_get_ctx_mem(gr_ctx))) {
tsg->vm = c->vm;
nvgpu_vm_get(tsg->vm);
err = nvgpu_gr_obj_ctx_alloc(g, gr->golden_image,
gr->global_ctx_buffer, gr->gr_ctx_desc,
gr->config, gr_ctx, c->subctx,
tsg->vm, &c->inst_block, class_num, flags,
c->cde, c->vpr);
if (err != 0) {
nvgpu_err(g,
"failed to allocate gr ctx buffer");
nvgpu_mutex_release(&tsg->ctx_init_lock);
nvgpu_vm_put(tsg->vm);
tsg->vm = NULL;
goto out;
}
nvgpu_gr_ctx_set_tsgid(gr_ctx, tsg->tsgid);
} else {
/* commit gr ctx buffer */
nvgpu_gr_obj_ctx_commit_inst(g, &c->inst_block, gr_ctx,
c->subctx, nvgpu_gr_ctx_get_ctx_mem(gr_ctx)->gpu_va);
}
#ifdef CONFIG_NVGPU_FECS_TRACE
if (g->ops.gr.fecs_trace.bind_channel && !c->vpr) {
err = g->ops.gr.fecs_trace.bind_channel(g, &c->inst_block,
c->subctx, gr_ctx, tsg->tgid, 0);
if (err != 0) {
nvgpu_warn(g,
"fail to bind channel for ctxsw trace");
}
}
#endif
#ifdef CONFIG_NVGPU_DEBUGGER
if ((g->num_sys_perfmon == 0U) &&
(g->ops.perf.get_num_hwpm_perfmon != NULL) &&
(err == 0)) {
g->ops.perf.get_num_hwpm_perfmon(g, &g->num_sys_perfmon,
&g->num_fbp_perfmon, &g->num_gpc_perfmon);
nvgpu_log(g, gpu_dbg_gr | gpu_dbg_gpu_dbg,
"num_sys_perfmon[%u] num_fbp_perfmon[%u] "
"num_gpc_perfmon[%u] ",
g->num_sys_perfmon, g->num_fbp_perfmon,
g->num_gpc_perfmon);
nvgpu_assert((g->num_sys_perfmon != 0U) &&
(g->num_fbp_perfmon != 0U) &&
(g->num_gpc_perfmon != 0U));
}
#endif
nvgpu_mutex_release(&tsg->ctx_init_lock);
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gr, "done");
return 0;
out:
if (c->subctx != NULL) {
nvgpu_gr_subctx_free(g, c->subctx, c->vm);
c->subctx = NULL;
}
/*
 * 1. gr_ctx, patch_ctx and global ctx buffer mapping
 *    can be reused so no need to release them.
 * 2. golden image init and load is a one time thing so if
 *    they pass, no need to undo.
 */
nvgpu_err(g, "fail");
return err;
}
void nvgpu_gr_setup_free_gr_ctx(struct gk20a *g,
struct vm_gk20a *vm, struct nvgpu_gr_ctx *gr_ctx)
{
nvgpu_log_fn(g, " ");
if (gr_ctx != NULL) {
#ifdef CONFIG_DEBUG_FS
if ((g->ops.gr.ctxsw_prog.dump_ctxsw_stats != NULL) &&
nvgpu_gr_ctx_desc_dump_ctxsw_stats_on_channel_close(
g->gr->gr_ctx_desc)) {
g->ops.gr.ctxsw_prog.dump_ctxsw_stats(g,
nvgpu_gr_ctx_get_ctx_mem(gr_ctx));
}
#endif
nvgpu_gr_ctx_free(g, gr_ctx, g->gr->global_ctx_buffer, vm);
}
}
void nvgpu_gr_setup_free_subctx(struct nvgpu_channel *c)
{
nvgpu_log_fn(c->g, " ");
if (!nvgpu_is_enabled(c->g, NVGPU_SUPPORT_TSG_SUBCONTEXTS)) {
return;
}
if (c->subctx != NULL) {
nvgpu_gr_subctx_free(c->g, c->subctx, c->vm);
c->subctx = NULL;
}
}
static bool nvgpu_gr_setup_validate_preemption_mode(u32 *graphics_preempt_mode,
u32 *compute_preempt_mode,
struct nvgpu_gr_ctx *gr_ctx)
{
#ifdef CONFIG_NVGPU_GRAPHICS
/* Clear a requested mode that is already set; nothing to do if none remain. */
if ((*graphics_preempt_mode != 0U) &&
(*graphics_preempt_mode ==
nvgpu_gr_ctx_get_graphics_preemption_mode(gr_ctx))) {
*graphics_preempt_mode = 0;
}
#endif /* CONFIG_NVGPU_GRAPHICS */
if ((*compute_preempt_mode != 0U) &&
(*compute_preempt_mode ==
nvgpu_gr_ctx_get_compute_preemption_mode(gr_ctx))) {
*compute_preempt_mode = 0;
}
if ((*graphics_preempt_mode == 0U) && (*compute_preempt_mode == 0U)) {
return false;
}
return true;
}
int nvgpu_gr_setup_set_preemption_mode(struct nvgpu_channel *ch,
u32 graphics_preempt_mode, u32 compute_preempt_mode,
u32 gr_instance_id)
{
struct nvgpu_gr_ctx *gr_ctx;
struct gk20a *g = ch->g;
struct nvgpu_tsg *tsg;
struct vm_gk20a *vm;
struct nvgpu_gr *gr;
u32 class_num;
int err = 0;
gr = &g->gr[gr_instance_id];
class_num = ch->obj_class;
if (class_num == 0U) {
return -EINVAL;
}
if (!g->ops.gpu_class.is_valid(class_num)) {
nvgpu_err(g, "invalid obj class 0x%x", class_num);
return -EINVAL;
}
tsg = nvgpu_tsg_from_ch(ch);
if (tsg == NULL) {
return -EINVAL;
}
vm = tsg->vm;
gr_ctx = tsg->gr_ctx;
if (nvgpu_gr_setup_validate_preemption_mode(&graphics_preempt_mode,
&compute_preempt_mode, gr_ctx) == false) {
return 0;
}
nvgpu_log(g, gpu_dbg_gr | gpu_dbg_sched, "chid=%d tsgid=%d pid=%d "
"graphics_preempt_mode=%u compute_preempt_mode=%u",
ch->chid, ch->tsgid, ch->tgid,
graphics_preempt_mode, compute_preempt_mode);
err = nvgpu_gr_obj_ctx_set_ctxsw_preemption_mode(g, gr->config,
gr->gr_ctx_desc, gr_ctx, vm, class_num,
graphics_preempt_mode, compute_preempt_mode);
if (err != 0) {
nvgpu_err(g, "set_ctxsw_preemption_mode failed");
return err;
}
g->ops.tsg.disable(tsg);
err = nvgpu_preempt_channel(g, ch);
if (err != 0) {
nvgpu_err(g, "failed to preempt channel/TSG");
goto enable_ch;
}
nvgpu_gr_obj_ctx_update_ctxsw_preemption_mode(g, gr->config, gr_ctx,
ch->subctx);
if (!nvgpu_is_enabled(g, NVGPU_SUPPORT_MIG)) {
nvgpu_gr_ctx_patch_write_begin(g, gr_ctx, true);
g->ops.gr.init.commit_global_cb_manager(g, gr->config, gr_ctx,
true);
nvgpu_gr_ctx_patch_write_end(g, gr_ctx, true);
}
g->ops.tsg.enable(tsg);
return err;
enable_ch:
g->ops.tsg.enable(tsg);
return err;
}
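/*
* Hedged usage sketch (not part of the original file): the typical ordering
* of the setup entry points above for a hypothetical channel bring-up path.
* The example_* name, the zero class/flag/instance values and the calling
* context are assumptions for illustration only; the block is kept under
* "#if 0" so it does not affect the build.
*/
#if 0
static int example_channel_gr_setup(struct nvgpu_channel *c, u32 class_num,
		u32 obj_flags)
{
	int err;

	/* Allocate (or reuse) the TSG gr ctx and commit it to this channel. */
	err = nvgpu_gr_setup_alloc_obj_ctx(c, class_num, obj_flags);
	if (err != 0) {
		return err;
	}

	/*
	 * Optionally adjust preemption modes afterwards; passing 0 for a mode
	 * leaves it unchanged, as the validation above then skips it.
	 */
	return nvgpu_gr_setup_set_preemption_mode(c, 0U, 0U,
			0U /* gr_instance_id, assumed */);
}
#endif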

View File

@@ -0,0 +1,143 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <nvgpu/gk20a.h>
#include <nvgpu/types.h>
#include <nvgpu/gr/gr_utils.h>
#include <nvgpu/gr/gr_instances.h>
#include <nvgpu/gr/config.h>
#include "gr_priv.h"
u32 nvgpu_gr_checksum_u32(u32 a, u32 b)
{
return nvgpu_safe_cast_u64_to_u32(((u64)a + (u64)b) & (U32_MAX));
}
struct nvgpu_gr_falcon *nvgpu_gr_get_falcon_ptr(struct gk20a *g)
{
struct nvgpu_gr *gr = nvgpu_gr_get_cur_instance_ptr(g);
return gr->falcon;
}
struct nvgpu_gr_config *nvgpu_gr_get_config_ptr(struct gk20a *g)
{
struct nvgpu_gr *gr = nvgpu_gr_get_cur_instance_ptr(g);
return gr->config;
}
struct nvgpu_gr_config *nvgpu_gr_get_gr_instance_config_ptr(struct gk20a *g,
u32 gr_instance_id)
{
return g->gr[gr_instance_id].config;
}
struct nvgpu_gr_intr *nvgpu_gr_get_intr_ptr(struct gk20a *g)
{
struct nvgpu_gr *gr = nvgpu_gr_get_cur_instance_ptr(g);
return gr->intr;
}
#ifdef CONFIG_NVGPU_NON_FUSA
u32 nvgpu_gr_get_override_ecc_val(struct gk20a *g)
{
struct nvgpu_gr *gr = nvgpu_gr_get_cur_instance_ptr(g);
return gr->fecs_feature_override_ecc_val;
}
void nvgpu_gr_override_ecc_val(struct nvgpu_gr *gr, u32 ecc_val)
{
gr->fecs_feature_override_ecc_val = ecc_val;
}
#endif
#ifdef CONFIG_NVGPU_GRAPHICS
struct nvgpu_gr_zcull *nvgpu_gr_get_zcull_ptr(struct gk20a *g)
{
struct nvgpu_gr *gr = nvgpu_gr_get_cur_instance_ptr(g);
return gr->zcull;
}
struct nvgpu_gr_zbc *nvgpu_gr_get_zbc_ptr(struct gk20a *g)
{
struct nvgpu_gr *gr = nvgpu_gr_get_cur_instance_ptr(g);
return gr->zbc;
}
#endif
#ifdef CONFIG_NVGPU_FECS_TRACE
struct nvgpu_gr_global_ctx_buffer_desc *nvgpu_gr_get_global_ctx_buffer_ptr(
struct gk20a *g)
{
struct nvgpu_gr *gr = nvgpu_gr_get_cur_instance_ptr(g);
return gr->global_ctx_buffer;
}
#endif
#ifdef CONFIG_NVGPU_CILP
u32 nvgpu_gr_get_cilp_preempt_pending_chid(struct gk20a *g)
{
struct nvgpu_gr *gr = nvgpu_gr_get_cur_instance_ptr(g);
return gr->cilp_preempt_pending_chid;
}
void nvgpu_gr_clear_cilp_preempt_pending_chid(struct gk20a *g)
{
struct nvgpu_gr *gr = nvgpu_gr_get_cur_instance_ptr(g);
gr->cilp_preempt_pending_chid =
NVGPU_INVALID_CHANNEL_ID;
}
#endif
#ifdef CONFIG_NVGPU_DEBUGGER
struct nvgpu_gr_obj_ctx_golden_image *nvgpu_gr_get_golden_image_ptr(
struct gk20a *g)
{
struct nvgpu_gr *gr = nvgpu_gr_get_cur_instance_ptr(g);
return gr->golden_image;
}
struct nvgpu_gr_hwpm_map *nvgpu_gr_get_hwpm_map_ptr(struct gk20a *g)
{
struct nvgpu_gr *gr = nvgpu_gr_get_cur_instance_ptr(g);
return gr->hwpm_map;
}
void nvgpu_gr_reset_falcon_ptr(struct gk20a *g)
{
struct nvgpu_gr *gr = nvgpu_gr_get_cur_instance_ptr(g);
gr->falcon = NULL;
}
void nvgpu_gr_reset_golden_image_ptr(struct gk20a *g)
{
struct nvgpu_gr *gr = nvgpu_gr_get_cur_instance_ptr(g);
gr->golden_image = NULL;
}
#endif

View File

@@ -0,0 +1,615 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <nvgpu/gk20a.h>
#include <nvgpu/netlist.h>
#include <nvgpu/log.h>
#include <nvgpu/sort.h>
#include <nvgpu/kmem.h>
#include <nvgpu/bsearch.h>
#include <nvgpu/fbp.h>
#include <nvgpu/gr/config.h>
#include <nvgpu/gr/hwpm_map.h>
/* needed for pri_is_ppc_addr_shared */
#include "hal/gr/gr/gr_pri_gk20a.h"
#define NV_PCFG_BASE 0x00088000U
#define NV_PERF_PMM_FBP_ROUTER_STRIDE 0x0200U
#define NV_PERF_PMMGPCROUTER_STRIDE 0x0200U
#define NV_XBAR_MXBAR_PRI_GPC_GNIC_STRIDE 0x0020U
/* Dummy address for ctxsw'ed pri reg checksum. */
#define CTXSW_PRI_CHECKSUM_DUMMY_REG 0x00ffffffU
int nvgpu_gr_hwpm_map_init(struct gk20a *g, struct nvgpu_gr_hwpm_map **hwpm_map,
u32 size)
{
struct nvgpu_gr_hwpm_map *tmp_map;
nvgpu_log(g, gpu_dbg_gr, "size = %u", size);
if (size == 0U) {
return -EINVAL;
}
tmp_map = nvgpu_kzalloc(g, sizeof(*tmp_map));
if (tmp_map == NULL) {
return -ENOMEM;
}
tmp_map->pm_ctxsw_image_size = size;
tmp_map->init = false;
*hwpm_map = tmp_map;
return 0;
}
void nvgpu_gr_hwpm_map_deinit(struct gk20a *g,
struct nvgpu_gr_hwpm_map *hwpm_map)
{
if (hwpm_map->init) {
nvgpu_big_free(g, hwpm_map->map);
}
nvgpu_kfree(g, hwpm_map);
}
u32 nvgpu_gr_hwpm_map_get_size(struct nvgpu_gr_hwpm_map *hwpm_map)
{
return hwpm_map->pm_ctxsw_image_size;
}
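/*
* Illustrative lifecycle sketch (not part of the original file): how a
* hypothetical caller might create, query and tear down an HWPM map with the
* helpers above. The example_* name and the calling context are assumptions;
* the block is kept under "#if 0" so it does not affect the build.
*/
#if 0
static int example_hwpm_map_lifecycle(struct gk20a *g, u32 pm_ctxsw_size)
{
	struct nvgpu_gr_hwpm_map *hwpm_map = NULL;
	int err;

	/* Allocate the map descriptor; the address map itself is built lazily. */
	err = nvgpu_gr_hwpm_map_init(g, &hwpm_map, pm_ctxsw_size);
	if (err != 0) {
		return err;
	}

	nvgpu_log_info(g, "PM ctxsw image size = %u",
			nvgpu_gr_hwpm_map_get_size(hwpm_map));

	/* Frees the lazily built map (if any) and the descriptor itself. */
	nvgpu_gr_hwpm_map_deinit(g, hwpm_map);
	return 0;
}
#endif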
static int map_cmp(const void *a, const void *b)
{
const struct ctxsw_buf_offset_map_entry *e1;
const struct ctxsw_buf_offset_map_entry *e2;
e1 = (const struct ctxsw_buf_offset_map_entry *)a;
e2 = (const struct ctxsw_buf_offset_map_entry *)b;
if (e1->addr < e2->addr) {
return -1;
}
if (e1->addr > e2->addr) {
return 1;
}
return 0;
}
static int add_ctxsw_buffer_map_entries_pmsys(
struct ctxsw_buf_offset_map_entry *map,
struct netlist_aiv_list *regs, u32 *count, u32 *offset,
u32 max_cnt, u32 base, u32 mask)
{
u32 idx;
u32 cnt = *count;
u32 off = *offset;
if ((cnt + regs->count) > max_cnt) {
return -EINVAL;
}
for (idx = 0; idx < regs->count; idx++) {
if ((base + (regs->l[idx].addr & mask)) < 0xFFFU) {
map[cnt].addr = base + (regs->l[idx].addr & mask)
+ NV_PCFG_BASE;
} else {
map[cnt].addr = base + (regs->l[idx].addr & mask);
}
map[cnt++].offset = off;
off += 4U;
}
*count = cnt;
*offset = off;
return 0;
}
static int add_ctxsw_buffer_map_entries_pmgpc(struct gk20a *g,
struct ctxsw_buf_offset_map_entry *map,
struct netlist_aiv_list *regs,
u32 *count, u32 *offset,
u32 max_cnt, u32 base, u32 mask)
{
u32 idx;
u32 cnt = *count;
u32 off = *offset;
if ((cnt + regs->count) > max_cnt) {
return -EINVAL;
}
/*
* NOTE: The PPC offsets get added to the pm_gpc list if numPpc <= 1.
* To handle the case of PPC registers getting added into the GPC list,
* the code below specifically checks for any PPC offsets and adds them
* using the proper PPC mask.
*/
for (idx = 0; idx < regs->count; idx++) {
/* Check if the address is PPC address */
if (pri_is_ppc_addr_shared(g, regs->l[idx].addr & mask)) {
u32 ppc_in_gpc_base = nvgpu_get_litter_value(g,
GPU_LIT_PPC_IN_GPC_BASE);
u32 ppc_in_gpc_stride = nvgpu_get_litter_value(g,
GPU_LIT_PPC_IN_GPC_STRIDE);
/* Use PPC mask instead of the GPC mask provided */
u32 ppcmask = ppc_in_gpc_stride - 1U;
map[cnt].addr = base + ppc_in_gpc_base
+ (regs->l[idx].addr & ppcmask);
} else {
map[cnt].addr = base + (regs->l[idx].addr & mask);
}
map[cnt++].offset = off;
off += 4U;
}
*count = cnt;
*offset = off;
return 0;
}
static int add_ctxsw_buffer_map_entries(struct ctxsw_buf_offset_map_entry *map,
struct netlist_aiv_list *regs,
u32 *count, u32 *offset,
u32 max_cnt, u32 base, u32 mask)
{
u32 idx;
u32 cnt = *count;
u32 off = *offset;
if ((cnt + regs->count) > max_cnt) {
return -EINVAL;
}
for (idx = 0; idx < regs->count; idx++) {
map[cnt].addr = base + (regs->l[idx].addr & mask);
map[cnt++].offset = off;
off += 4U;
}
*count = cnt;
*offset = off;
return 0;
}
/* Helper function to add register entries to the register map for all
* subunits
*/
static int add_ctxsw_buffer_map_entries_subunits(
struct ctxsw_buf_offset_map_entry *map,
struct netlist_aiv_list *regs,
u32 *count, u32 *offset,
u32 max_cnt, u32 base, u32 num_units,
u32 active_unit_mask, u32 stride, u32 mask)
{
u32 unit;
u32 idx;
u32 cnt = *count;
u32 off = *offset;
if ((cnt + (regs->count * num_units)) > max_cnt) {
return -EINVAL;
}
/* Data is interleaved for units in ctxsw buffer */
for (idx = 0; idx < regs->count; idx++) {
for (unit = 0; unit < num_units; unit++) {
if ((active_unit_mask & BIT32(unit)) != 0U) {
map[cnt].addr = base +
(regs->l[idx].addr & mask) +
(unit * stride);
map[cnt++].offset = off;
off += 4U;
/*
* The ucode computes and saves the checksum of
* all ctxsw'ed register values within a list.
* Entries with addr=0x00ffffff are placeholder
* for these checksums.
*
* There is only one checksum for a list
* even if it contains multiple subunits. Hence,
* skip iterating over all subunits for this
* entry.
*/
if (regs->l[idx].addr ==
CTXSW_PRI_CHECKSUM_DUMMY_REG) {
break;
}
}
}
}
*count = cnt;
*offset = off;
return 0;
}
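/*
* Worked example (illustrative values only, not from a real netlist): with
* two registers R0/R1 and three active units U0..U2, the loops above emit
* map entries in register-major, unit-minor order, 4 bytes apart:
*
*   R0@U0, R0@U1, R0@U2, R1@U0, R1@U1, R1@U2
*
* i.e. at offsets +0x0, +0x4, +0x8, +0xc, +0x10 and +0x14 relative to the
* incoming *offset value.
*/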
static int add_ctxsw_buffer_map_entries_gpcs(struct gk20a *g,
struct ctxsw_buf_offset_map_entry *map,
u32 *count, u32 *offset, u32 max_cnt,
struct nvgpu_gr_config *config)
{
u32 num_gpcs = nvgpu_gr_config_get_gpc_count(config);
u32 num_ppcs, num_tpcs, gpc_num, base;
u32 gpc_base = nvgpu_get_litter_value(g, GPU_LIT_GPC_BASE);
u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
u32 ppc_in_gpc_base = nvgpu_get_litter_value(g, GPU_LIT_PPC_IN_GPC_BASE);
u32 ppc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_PPC_IN_GPC_STRIDE);
u32 tpc_in_gpc_base = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_BASE);
u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE);
for (gpc_num = 0; gpc_num < num_gpcs; gpc_num++) {
num_tpcs = nvgpu_gr_config_get_gpc_tpc_count(config, gpc_num);
base = gpc_base + (gpc_stride * gpc_num) + tpc_in_gpc_base;
if (add_ctxsw_buffer_map_entries_subunits(map,
nvgpu_netlist_get_pm_tpc_ctxsw_regs(g),
count, offset, max_cnt, base,
num_tpcs, ~U32(0U), tpc_in_gpc_stride,
(tpc_in_gpc_stride - 1U)) != 0) {
return -EINVAL;
}
num_ppcs = nvgpu_gr_config_get_gpc_ppc_count(config, gpc_num);
base = gpc_base + (gpc_stride * gpc_num) + ppc_in_gpc_base;
if (add_ctxsw_buffer_map_entries_subunits(map,
nvgpu_netlist_get_pm_ppc_ctxsw_regs(g),
count, offset, max_cnt, base, num_ppcs,
~U32(0U), ppc_in_gpc_stride,
(ppc_in_gpc_stride - 1U)) != 0) {
return -EINVAL;
}
base = gpc_base + (gpc_stride * gpc_num);
if (add_ctxsw_buffer_map_entries_pmgpc(g, map,
nvgpu_netlist_get_pm_gpc_ctxsw_regs(g),
count, offset, max_cnt, base,
(gpc_stride - 1U)) != 0) {
return -EINVAL;
}
base = NV_XBAR_MXBAR_PRI_GPC_GNIC_STRIDE * gpc_num;
if (add_ctxsw_buffer_map_entries(map,
nvgpu_netlist_get_pm_ucgpc_ctxsw_regs(g),
count, offset, max_cnt, base, ~U32(0U)) != 0) {
return -EINVAL;
}
base = (g->ops.perf.get_pmmgpc_per_chiplet_offset() * gpc_num);
if (add_ctxsw_buffer_map_entries(map,
nvgpu_netlist_get_perf_gpc_ctxsw_regs(g),
count, offset, max_cnt, base, ~U32(0U)) != 0) {
return -EINVAL;
}
base = (NV_PERF_PMMGPCROUTER_STRIDE * gpc_num);
if (add_ctxsw_buffer_map_entries(map,
nvgpu_netlist_get_gpc_router_ctxsw_regs(g),
count, offset, max_cnt, base, ~U32(0U)) != 0) {
return -EINVAL;
}
/* Counter Aggregation Unit, if available */
if (nvgpu_netlist_get_pm_cau_ctxsw_regs(g)->count != 0U) {
base = gpc_base + (gpc_stride * gpc_num)
+ tpc_in_gpc_base;
if (add_ctxsw_buffer_map_entries_subunits(map,
nvgpu_netlist_get_pm_cau_ctxsw_regs(g),
count, offset, max_cnt, base, num_tpcs,
~U32(0U), tpc_in_gpc_stride,
(tpc_in_gpc_stride - 1U)) != 0) {
return -EINVAL;
}
}
*offset = NVGPU_ALIGN(*offset, 256U);
base = (g->ops.perf.get_pmmgpc_per_chiplet_offset() * gpc_num);
if (add_ctxsw_buffer_map_entries(map,
nvgpu_netlist_get_perf_gpc_control_ctxsw_regs(g),
count, offset, max_cnt, base, ~U32(0U)) != 0) {
return -EINVAL;
}
*offset = NVGPU_ALIGN(*offset, 256U);
}
return 0;
}
/*
* PM CTXSW BUFFER LAYOUT:
*|=============================================|0x00 <----PM CTXSW BUFFER BASE
*| LIST_compressed_pm_ctx_reg_SYS |Space allocated: numRegs words
*| LIST_compressed_nv_perf_ctx_reg_SYS |Space allocated: numRegs words
*| LIST_compressed_nv_perf_ctx_reg_sysrouter |Space allocated: numRegs words
*| PADDING for 256 byte alignment on Maxwell+ |
*|=============================================|<----256 byte aligned on Maxwell and later
*| LIST_compressed_nv_perf_sys_control_ctx_regs|Space allocated: numRegs words (+ padding)
*| PADDING for 256 byte alignment |(If reg list is empty, 0 bytes allocated.)
*|=============================================|<----256 byte aligned
*| LIST_compressed_nv_perf_ctx_reg_PMA |Space allocated: numRegs words (+ padding)
*| PADDING for 256 byte alignment |
*|=============================================|<----256 byte aligned (if prev segment exists)
*| LIST_compressed_nv_perf_pma_control_ctx_regs|Space allocated: numRegs words (+ padding)
*| PADDING for 256 byte alignment |(If reg list is empty, 0 bytes allocated.)
*|=============================================|<----256 byte aligned
*| LIST_compressed_nv_perf_fbp_ctx_regs |Space allocated: numRegs * n words (for n FB units)
*| LIST_compressed_nv_perf_fbprouter_ctx_regs |Space allocated: numRegs * n words (for n FB units)
*| LIST_compressed_pm_fbpa_ctx_regs |Space allocated: numRegs * n words (for n FB units)
*| LIST_compressed_pm_rop_ctx_regs |Space allocated: numRegs * n words (for n FB units)
*| LIST_compressed_pm_ltc_ctx_regs |
*| LTC0 LTS0 |
*| LTC1 LTS0 |Space allocated: numRegs * n words (for n LTC units)
*| LTCn LTS0 |
*| LTC0 LTS1 |
*| LTC1 LTS1 |
*| LTCn LTS1 |
*| LTC0 LTSn |
*| LTC1 LTSn |
*| LTCn LTSn |
*| PADDING for 256 byte alignment |
*|=============================================|<----256 byte aligned on Maxwell and later
*| LIST_compressed_nv_perf_fbp_control_ctx_regs|Space allocated: numRegs words + padding
*| PADDING for 256 byte alignment |(If reg list is empty, 0 bytes allocated.)
*|=============================================|<----256 byte aligned on Maxwell and later
*
* Each "GPCn PRI register" segment above has this layout:
*|=============================================|<----256 byte aligned
*| GPC0 REG0 TPC0 |Each GPC has space allocated to accommodate
*| REG0 TPC1 | all the GPC/TPC register lists
*| Lists in each GPC region: REG0 TPCn |Per GPC allocated space is always 256 byte aligned
*| LIST_pm_ctx_reg_TPC REG1 TPC0 |
*| * numTpcs REG1 TPC1 |
*| LIST_pm_ctx_reg_PPC REG1 TPCn |
*| * numPpcs REGn TPC0 |
*| LIST_pm_ctx_reg_GPC REGn TPC1 |
*| List_pm_ctx_reg_uc_GPC REGn TPCn |
*| LIST_nv_perf_ctx_reg_GPC |
*| LIST_nv_perf_gpcrouter_ctx_reg |
*| LIST_nv_perf_ctx_reg_CAU (Tur) |
*|=============================================|
*| LIST_compressed_nv_perf_gpc_control_ctx_regs|Space allocated: numRegs words + padding
*| PADDING for 256 byte alignment |(If reg list is empty, 0 bytes allocated.)
*|=============================================|<----256 byte aligned on Maxwell and later
*/
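/*
* Worked alignment example (illustrative numbers only): if the SYS register
* segments end at byte offset 0x104, the next 256-byte aligned segment in
* the layout above starts at NVGPU_ALIGN(0x104, 256U) = 0x200, and the
* 0xfc bytes in between are the padding called out in the diagram.
*/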
static int nvgpu_gr_hwpm_map_create(struct gk20a *g,
struct nvgpu_gr_hwpm_map *hwpm_map, struct nvgpu_gr_config *config)
{
u32 hwpm_ctxsw_buffer_size = hwpm_map->pm_ctxsw_image_size;
struct ctxsw_buf_offset_map_entry *map;
u32 hwpm_ctxsw_reg_count_max;
u32 map_size;
u32 i, count = 0;
u32 offset = 0;
int ret;
u32 active_fbpa_mask;
u32 num_fbps = nvgpu_fbp_get_num_fbps(g->fbp);
u32 ltc_stride = nvgpu_get_litter_value(g, GPU_LIT_LTC_STRIDE);
u32 num_fbpas = nvgpu_get_litter_value(g, GPU_LIT_NUM_FBPAS);
u32 fbpa_stride = nvgpu_get_litter_value(g, GPU_LIT_FBPA_STRIDE);
u32 num_ltc = g->ops.top.get_max_ltc_per_fbp(g) *
g->ops.priv_ring.get_fbp_count(g);
if (hwpm_ctxsw_buffer_size == 0U) {
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg,
"no PM Ctxsw buffer memory in context buffer");
return -EINVAL;
}
hwpm_ctxsw_reg_count_max = hwpm_ctxsw_buffer_size >> 2;
map_size = hwpm_ctxsw_reg_count_max * (u32)sizeof(*map);
map = nvgpu_big_zalloc(g, map_size);
if (map == NULL) {
return -ENOMEM;
}
/* Add entries from _LIST_pm_ctx_reg_SYS */
if (add_ctxsw_buffer_map_entries_pmsys(map,
nvgpu_netlist_get_pm_sys_ctxsw_regs(g),
&count, &offset, hwpm_ctxsw_reg_count_max, 0, ~U32(0U)) != 0) {
goto cleanup;
}
/* Add entries from _LIST_nv_perf_ctx_reg_SYS */
if (add_ctxsw_buffer_map_entries(map,
nvgpu_netlist_get_perf_sys_ctxsw_regs(g),
&count, &offset, hwpm_ctxsw_reg_count_max, 0, ~U32(0U)) != 0) {
goto cleanup;
}
/* Add entries from _LIST_nv_perf_sysrouter_ctx_reg*/
if (add_ctxsw_buffer_map_entries(map,
nvgpu_netlist_get_perf_sys_router_ctxsw_regs(g),
&count, &offset, hwpm_ctxsw_reg_count_max, 0, ~U32(0U)) != 0) {
goto cleanup;
}
/* Add entries from _LIST_nv_perf_sys_control_ctx_reg*/
if (nvgpu_netlist_get_perf_sys_control_ctxsw_regs(g)->count > 0U) {
offset = NVGPU_ALIGN(offset, 256U);
ret = add_ctxsw_buffer_map_entries(map,
nvgpu_netlist_get_perf_sys_control_ctxsw_regs(g),
&count, &offset,
hwpm_ctxsw_reg_count_max, 0, ~U32(0U));
if (ret != 0) {
goto cleanup;
}
}
if (g->ops.gr.hwpm_map.align_regs_perf_pma) {
g->ops.gr.hwpm_map.align_regs_perf_pma(&offset);
}
/* Add entries from _LIST_nv_perf_pma_ctx_reg*/
ret = add_ctxsw_buffer_map_entries(map,
nvgpu_netlist_get_perf_pma_ctxsw_regs(g), &count, &offset,
hwpm_ctxsw_reg_count_max, 0, ~U32(0U));
if (ret != 0) {
goto cleanup;
}
offset = NVGPU_ALIGN(offset, 256U);
/* Add entries from _LIST_nv_perf_pma_control_ctx_reg*/
ret = add_ctxsw_buffer_map_entries(map,
nvgpu_netlist_get_perf_pma_control_ctxsw_regs(g), &count, &offset,
hwpm_ctxsw_reg_count_max, 0, ~U32(0U));
if (ret != 0) {
goto cleanup;
}
offset = NVGPU_ALIGN(offset, 256U);
/* Add entries from _LIST_nv_perf_fbp_ctx_regs */
if (add_ctxsw_buffer_map_entries_subunits(map,
nvgpu_netlist_get_fbp_ctxsw_regs(g), &count, &offset,
hwpm_ctxsw_reg_count_max, 0, num_fbps, ~U32(0U),
g->ops.perf.get_pmmfbp_per_chiplet_offset(),
~U32(0U)) != 0) {
goto cleanup;
}
/* Add entries from _LIST_nv_perf_fbprouter_ctx_regs */
if (add_ctxsw_buffer_map_entries_subunits(map,
nvgpu_netlist_get_fbp_router_ctxsw_regs(g),
&count, &offset, hwpm_ctxsw_reg_count_max, 0,
num_fbps, ~U32(0U), NV_PERF_PMM_FBP_ROUTER_STRIDE,
~U32(0U)) != 0) {
goto cleanup;
}
if (g->ops.gr.hwpm_map.get_active_fbpa_mask) {
active_fbpa_mask = g->ops.gr.hwpm_map.get_active_fbpa_mask(g);
} else {
active_fbpa_mask = ~U32(0U);
}
/* Add entries from _LIST_nv_pm_fbpa_ctx_regs */
if (add_ctxsw_buffer_map_entries_subunits(map,
nvgpu_netlist_get_pm_fbpa_ctxsw_regs(g),
&count, &offset, hwpm_ctxsw_reg_count_max, 0,
num_fbpas, active_fbpa_mask, fbpa_stride, ~U32(0U))
!= 0) {
goto cleanup;
}
/* Add entries from _LIST_nv_pm_rop_ctx_regs */
if (add_ctxsw_buffer_map_entries(map,
nvgpu_netlist_get_pm_rop_ctxsw_regs(g), &count, &offset,
hwpm_ctxsw_reg_count_max, 0, ~U32(0U)) != 0) {
goto cleanup;
}
/* Add entries from _LIST_compressed_nv_pm_ltc_ctx_regs */
if (add_ctxsw_buffer_map_entries_subunits(map,
nvgpu_netlist_get_pm_ltc_ctxsw_regs(g), &count, &offset,
hwpm_ctxsw_reg_count_max, 0, num_ltc, ~U32(0U),
ltc_stride, ~U32(0U)) != 0) {
goto cleanup;
}
offset = NVGPU_ALIGN(offset, 256U);
/* Add entries from _LIST_nv_perf_fbp_control_ctx_regs */
if (add_ctxsw_buffer_map_entries_subunits(map,
nvgpu_netlist_get_perf_fbp_control_ctxsw_regs(g),
&count, &offset, hwpm_ctxsw_reg_count_max, 0,
num_fbps, ~U32(0U),
g->ops.perf.get_pmmfbp_per_chiplet_offset(),
~U32(0U)) != 0) {
goto cleanup;
}
offset = NVGPU_ALIGN(offset, 256U);
/* Add GPC entries */
if (add_ctxsw_buffer_map_entries_gpcs(g, map, &count, &offset,
hwpm_ctxsw_reg_count_max, config) != 0) {
goto cleanup;
}
if (offset > hwpm_ctxsw_buffer_size) {
nvgpu_err(g, "offset > buffer size");
goto cleanup;
}
sort(map, count, sizeof(*map), map_cmp, NULL);
hwpm_map->map = map;
hwpm_map->count = count;
hwpm_map->init = true;
nvgpu_log_info(g, "Reg Addr => HWPM Ctxt switch buffer offset");
for (i = 0; i < count; i++) {
nvgpu_log_info(g, "%08x => %08x", map[i].addr, map[i].offset);
}
return 0;
cleanup:
nvgpu_err(g, "Failed to create HWPM buffer offset map");
nvgpu_big_free(g, map);
return -EINVAL;
}
/*
* Return the 32-bit offset for a priv register if it is present in the
* PM context buffer.
*/
int nvgpu_gr_hwmp_map_find_priv_offset(struct gk20a *g,
struct nvgpu_gr_hwpm_map *hwpm_map,
u32 addr, u32 *priv_offset, struct nvgpu_gr_config *config)
{
struct ctxsw_buf_offset_map_entry *map, *result, map_key;
int err = 0;
u32 count;
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
/* Create map of pri address and pm offset if necessary */
if (!hwpm_map->init) {
err = nvgpu_gr_hwpm_map_create(g, hwpm_map, config);
if (err != 0) {
return err;
}
}
*priv_offset = 0;
map = hwpm_map->map;
count = hwpm_map->count;
map_key.addr = addr;
result = nvgpu_bsearch(&map_key, map, count, sizeof(*map), map_cmp);
if (result != NULL) {
*priv_offset = result->offset;
} else {
nvgpu_err(g, "Lookup failed for address 0x%x", addr);
err = -EINVAL;
}
return err;
}
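/*
* Hedged usage sketch (not part of the original file): a hypothetical caller
* translating a priv register address into its PM context buffer offset.
* The example_* name and the calling context are assumptions for
* illustration only; the block is kept under "#if 0" so it does not affect
* the build.
*/
#if 0
static int example_lookup_pm_offset(struct gk20a *g,
		struct nvgpu_gr_hwpm_map *hwpm_map,
		struct nvgpu_gr_config *config, u32 reg_addr)
{
	u32 pm_offset = 0U;
	int err;

	/* Builds the sorted addr->offset map on first use, then bsearches it. */
	err = nvgpu_gr_hwmp_map_find_priv_offset(g, hwpm_map, reg_addr,
			&pm_offset, config);
	if (err != 0) {
		return err;
	}

	nvgpu_log_info(g, "reg 0x%08x -> PM buffer offset 0x%08x",
			reg_addr, pm_offset);
	return 0;
}
#endif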

View File

@@ -0,0 +1,982 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <nvgpu/gk20a.h>
#include <nvgpu/log.h>
#include <nvgpu/io.h>
#include <nvgpu/mm.h>
#ifdef CONFIG_NVGPU_POWER_PG
#include <nvgpu/pmu/pmu_pg.h>
#include <nvgpu/power_features/pg.h>
#endif
#include <nvgpu/gr/ctx.h>
#include <nvgpu/gr/subctx.h>
#include <nvgpu/gr/global_ctx.h>
#include <nvgpu/gr/obj_ctx.h>
#include <nvgpu/gr/config.h>
#include <nvgpu/netlist.h>
#include <nvgpu/gr/gr_falcon.h>
#include <nvgpu/gr/fs_state.h>
#include <nvgpu/power_features/cg.h>
#include <nvgpu/static_analysis.h>
#include "obj_ctx_priv.h"
void nvgpu_gr_obj_ctx_commit_inst_gpu_va(struct gk20a *g,
struct nvgpu_mem *inst_block, u64 gpu_va)
{
g->ops.ramin.set_gr_ptr(g, inst_block, gpu_va);
}
void nvgpu_gr_obj_ctx_commit_inst(struct gk20a *g, struct nvgpu_mem *inst_block,
struct nvgpu_gr_ctx *gr_ctx, struct nvgpu_gr_subctx *subctx, u64 gpu_va)
{
struct nvgpu_mem *ctxheader;
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gr, " ");
if (nvgpu_is_enabled(g, NVGPU_SUPPORT_TSG_SUBCONTEXTS)) {
nvgpu_gr_subctx_load_ctx_header(g, subctx, gr_ctx, gpu_va);
ctxheader = nvgpu_gr_subctx_get_ctx_header(subctx);
nvgpu_gr_obj_ctx_commit_inst_gpu_va(g, inst_block,
ctxheader->gpu_va);
} else {
nvgpu_gr_obj_ctx_commit_inst_gpu_va(g, inst_block, gpu_va);
}
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gr, "done");
}
#if defined(CONFIG_NVGPU_GFXP) || defined(CONFIG_NVGPU_CILP)
static int nvgpu_gr_obj_ctx_init_ctxsw_preemption_mode(struct gk20a *g,
struct nvgpu_gr_config *config, struct nvgpu_gr_ctx_desc *gr_ctx_desc,
struct nvgpu_gr_ctx *gr_ctx, struct vm_gk20a *vm,
u32 class_num, u32 flags)
{
int err;
u32 graphics_preempt_mode = 0U;
u32 compute_preempt_mode = 0U;
u32 default_graphics_preempt_mode = 0U;
u32 default_compute_preempt_mode = 0U;
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gr, " ");
/* Skip for engines other than GR */
if (!g->ops.gpu_class.is_valid_compute(class_num) &&
!g->ops.gpu_class.is_valid_gfx(class_num)) {
return 0;
}
g->ops.gr.init.get_default_preemption_modes(
&default_graphics_preempt_mode,
&default_compute_preempt_mode);
#ifdef CONFIG_NVGPU_GFXP
if ((flags & NVGPU_OBJ_CTX_FLAGS_SUPPORT_GFXP) != 0U) {
graphics_preempt_mode = NVGPU_PREEMPTION_MODE_GRAPHICS_GFXP;
}
if (g->ops.gpu_class.is_valid_gfx(class_num) &&
nvgpu_gr_ctx_desc_force_preemption_gfxp(gr_ctx_desc)) {
graphics_preempt_mode = NVGPU_PREEMPTION_MODE_GRAPHICS_GFXP;
}
#endif
#ifdef CONFIG_NVGPU_CILP
if ((flags & NVGPU_OBJ_CTX_FLAGS_SUPPORT_CILP) != 0U) {
compute_preempt_mode = NVGPU_PREEMPTION_MODE_COMPUTE_CILP;
}
if (g->ops.gpu_class.is_valid_compute(class_num) &&
nvgpu_gr_ctx_desc_force_preemption_cilp(gr_ctx_desc)) {
compute_preempt_mode = NVGPU_PREEMPTION_MODE_COMPUTE_CILP;
}
#endif
if (compute_preempt_mode == 0U) {
compute_preempt_mode = default_compute_preempt_mode;
}
if (graphics_preempt_mode == 0U) {
graphics_preempt_mode = default_graphics_preempt_mode;
}
err = nvgpu_gr_obj_ctx_set_ctxsw_preemption_mode(g, config,
gr_ctx_desc, gr_ctx, vm, class_num, graphics_preempt_mode,
compute_preempt_mode);
if (err != 0) {
nvgpu_err(g, "set_ctxsw_preemption_mode failed");
return err;
}
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gr, "done");
return 0;
}
#endif
#ifdef CONFIG_NVGPU_GRAPHICS
static int nvgpu_gr_obj_ctx_set_graphics_preemption_mode(struct gk20a *g,
struct nvgpu_gr_config *config, struct nvgpu_gr_ctx_desc *gr_ctx_desc,
struct nvgpu_gr_ctx *gr_ctx, struct vm_gk20a *vm,
u32 graphics_preempt_mode)
{
int err = 0;
/* set preemption modes */
switch (graphics_preempt_mode) {
#ifdef CONFIG_NVGPU_GFXP
case NVGPU_PREEMPTION_MODE_GRAPHICS_GFXP:
{
u32 rtv_cb_size;
u32 spill_size = g->ops.gr.init.get_ctx_spill_size(g);
u32 pagepool_size = g->ops.gr.init.get_ctx_pagepool_size(g);
u32 betacb_size = g->ops.gr.init.get_ctx_betacb_size(g);
u32 attrib_cb_size =
g->ops.gr.init.get_ctx_attrib_cb_size(g, betacb_size,
nvgpu_gr_config_get_tpc_count(config),
nvgpu_gr_config_get_max_tpc_count(config));
nvgpu_log_info(g, "gfxp context spill_size=%d", spill_size);
nvgpu_log_info(g, "gfxp context pagepool_size=%d", pagepool_size);
nvgpu_log_info(g, "gfxp context attrib_cb_size=%d",
attrib_cb_size);
nvgpu_gr_ctx_set_size(gr_ctx_desc,
NVGPU_GR_CTX_SPILL_CTXSW, spill_size);
nvgpu_gr_ctx_set_size(gr_ctx_desc,
NVGPU_GR_CTX_BETACB_CTXSW, attrib_cb_size);
nvgpu_gr_ctx_set_size(gr_ctx_desc,
NVGPU_GR_CTX_PAGEPOOL_CTXSW, pagepool_size);
if (g->ops.gr.init.get_gfxp_rtv_cb_size != NULL) {
rtv_cb_size = g->ops.gr.init.get_gfxp_rtv_cb_size(g);
nvgpu_gr_ctx_set_size(gr_ctx_desc,
NVGPU_GR_CTX_GFXP_RTVCB_CTXSW, rtv_cb_size);
}
err = nvgpu_gr_ctx_alloc_ctxsw_buffers(g, gr_ctx,
gr_ctx_desc, vm);
if (err != 0) {
nvgpu_err(g, "cannot allocate ctxsw buffers");
return err;
}
nvgpu_gr_ctx_init_graphics_preemption_mode(gr_ctx,
graphics_preempt_mode);
break;
}
#endif
case NVGPU_PREEMPTION_MODE_GRAPHICS_WFI:
nvgpu_gr_ctx_init_graphics_preemption_mode(gr_ctx,
graphics_preempt_mode);
break;
default:
nvgpu_log_info(g, "graphics_preempt_mode=%u",
graphics_preempt_mode);
break;
}
return err;
}
#endif
static int nvgpu_gr_obj_ctx_set_compute_preemption_mode(struct gk20a *g,
struct nvgpu_gr_ctx *gr_ctx, u32 class_num, u32 compute_preempt_mode)
{
if (g->ops.gpu_class.is_valid_compute(class_num)
#ifdef CONFIG_NVGPU_GRAPHICS
|| g->ops.gpu_class.is_valid_gfx(class_num)
#endif
) {
nvgpu_gr_ctx_init_compute_preemption_mode(gr_ctx,
compute_preempt_mode);
return 0;
} else {
return -EINVAL;
}
}
int nvgpu_gr_obj_ctx_set_ctxsw_preemption_mode(struct gk20a *g,
struct nvgpu_gr_config *config, struct nvgpu_gr_ctx_desc *gr_ctx_desc,
struct nvgpu_gr_ctx *gr_ctx, struct vm_gk20a *vm, u32 class_num,
u32 graphics_preempt_mode, u32 compute_preempt_mode)
{
int err = 0;
/* check for invalid combinations */
if (nvgpu_gr_ctx_check_valid_preemption_mode(g, gr_ctx,
graphics_preempt_mode, compute_preempt_mode) == false) {
err = -EINVAL;
goto fail;
}
nvgpu_log(g, gpu_dbg_gr, "graphics_preempt_mode=%u compute_preempt_mode=%u",
graphics_preempt_mode, compute_preempt_mode);
#ifdef CONFIG_NVGPU_GRAPHICS
err = nvgpu_gr_obj_ctx_set_graphics_preemption_mode(g, config,
gr_ctx_desc, gr_ctx, vm, graphics_preempt_mode);
if (err != 0) {
goto fail;
}
#endif
err = nvgpu_gr_obj_ctx_set_compute_preemption_mode(g, gr_ctx,
class_num, compute_preempt_mode);
fail:
return err;
}
void nvgpu_gr_obj_ctx_update_ctxsw_preemption_mode(struct gk20a *g,
struct nvgpu_gr_config *config,
struct nvgpu_gr_ctx *gr_ctx, struct nvgpu_gr_subctx *subctx)
{
#ifdef CONFIG_NVGPU_GFXP
u64 addr;
u32 size;
struct nvgpu_mem *mem;
#endif
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gr, " ");
nvgpu_gr_ctx_set_preemption_modes(g, gr_ctx);
#ifdef CONFIG_NVGPU_GFXP
if (!nvgpu_is_enabled(g, NVGPU_SUPPORT_PREEMPTION_GFXP)) {
goto done;
}
if (!nvgpu_mem_is_valid(
nvgpu_gr_ctx_get_preempt_ctxsw_buffer(gr_ctx))) {
goto done;
}
if (subctx != NULL) {
nvgpu_gr_subctx_set_preemption_buffer_va(g, subctx,
gr_ctx);
} else {
nvgpu_gr_ctx_set_preemption_buffer_va(g, gr_ctx);
}
nvgpu_gr_ctx_patch_write_begin(g, gr_ctx, true);
addr = nvgpu_gr_ctx_get_betacb_ctxsw_buffer(gr_ctx)->gpu_va;
g->ops.gr.init.commit_global_attrib_cb(g, gr_ctx,
nvgpu_gr_config_get_tpc_count(config),
nvgpu_gr_config_get_max_tpc_count(config), addr,
true);
mem = nvgpu_gr_ctx_get_pagepool_ctxsw_buffer(gr_ctx);
addr = mem->gpu_va;
nvgpu_assert(mem->size <= U32_MAX);
size = (u32)mem->size;
g->ops.gr.init.commit_global_pagepool(g, gr_ctx, addr, size,
true, false);
mem = nvgpu_gr_ctx_get_spill_ctxsw_buffer(gr_ctx);
addr = mem->gpu_va;
nvgpu_assert(mem->size <= U32_MAX);
size = (u32)mem->size;
g->ops.gr.init.commit_ctxsw_spill(g, gr_ctx, addr, size, true);
g->ops.gr.init.commit_cbes_reserve(g, gr_ctx, true);
if (g->ops.gr.init.gfxp_wfi_timeout != NULL) {
g->ops.gr.init.gfxp_wfi_timeout(g, gr_ctx, true);
}
if (g->ops.gr.init.commit_gfxp_rtv_cb != NULL) {
g->ops.gr.init.commit_gfxp_rtv_cb(g, gr_ctx, true);
}
nvgpu_gr_ctx_patch_write_end(g, gr_ctx, true);
done:
#endif
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gr, "done");
}
void nvgpu_gr_obj_ctx_commit_global_ctx_buffers(struct gk20a *g,
struct nvgpu_gr_global_ctx_buffer_desc *global_ctx_buffer,
struct nvgpu_gr_config *config, struct nvgpu_gr_ctx *gr_ctx, bool patch)
{
u64 addr;
u32 size;
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gr, " ");
if (patch) {
nvgpu_gr_ctx_patch_write_begin(g, gr_ctx, false);
}
/*
* MIG supports only compute class.
* Skip BUNDLE_CB, PAGEPOOL, ATTRIBUTE_CB and RTV_CB
* if 2D/3D/I2M (graphics) classes are not supported.
*/
if (!nvgpu_is_enabled(g, NVGPU_SUPPORT_MIG)) {
/* global pagepool buffer */
addr = nvgpu_gr_ctx_get_global_ctx_va(gr_ctx,
NVGPU_GR_CTX_PAGEPOOL_VA);
size = nvgpu_safe_cast_u64_to_u32(nvgpu_gr_global_ctx_get_size(
global_ctx_buffer,
NVGPU_GR_GLOBAL_CTX_PAGEPOOL));
g->ops.gr.init.commit_global_pagepool(g, gr_ctx, addr, size,
patch, true);
/* global bundle cb */
addr = nvgpu_gr_ctx_get_global_ctx_va(gr_ctx,
NVGPU_GR_CTX_CIRCULAR_VA);
size = nvgpu_safe_cast_u64_to_u32(
g->ops.gr.init.get_bundle_cb_default_size(g));
g->ops.gr.init.commit_global_bundle_cb(g, gr_ctx, addr, size,
patch);
/* global attrib cb */
addr = nvgpu_gr_ctx_get_global_ctx_va(gr_ctx,
NVGPU_GR_CTX_ATTRIBUTE_VA);
g->ops.gr.init.commit_global_attrib_cb(g, gr_ctx,
nvgpu_gr_config_get_tpc_count(config),
nvgpu_gr_config_get_max_tpc_count(config), addr, patch);
g->ops.gr.init.commit_global_cb_manager(g, config, gr_ctx,
patch);
#ifdef CONFIG_NVGPU_GRAPHICS
if (g->ops.gr.init.commit_rtv_cb != NULL) {
/* RTV circular buffer */
addr = nvgpu_gr_ctx_get_global_ctx_va(gr_ctx,
NVGPU_GR_CTX_RTV_CIRCULAR_BUFFER_VA);
g->ops.gr.init.commit_rtv_cb(g, addr, gr_ctx, patch);
}
#endif
}
#ifdef CONFIG_NVGPU_SM_DIVERSITY
if ((nvgpu_is_enabled(g, NVGPU_SUPPORT_SM_DIVERSITY)) &&
(nvgpu_gr_ctx_get_sm_diversity_config(gr_ctx) !=
NVGPU_DEFAULT_SM_DIVERSITY_CONFIG) &&
(g->ops.gr.init.commit_sm_id_programming != NULL)) {
int err;
err = g->ops.gr.init.commit_sm_id_programming(
g, config, gr_ctx, patch);
if (err != 0) {
nvgpu_err(g,
"commit_sm_id_programming failed err=%d", err);
}
}
#endif
if (patch) {
nvgpu_gr_ctx_patch_write_end(g, gr_ctx, false);
}
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gr, "done");
}
static int nvgpu_gr_obj_ctx_alloc_sw_bundle(struct gk20a *g)
{
int err = 0;
struct netlist_av_list *sw_bundle_init =
nvgpu_netlist_get_sw_bundle_init_av_list(g);
struct netlist_av_list *sw_veid_bundle_init =
nvgpu_netlist_get_sw_veid_bundle_init_av_list(g);
#ifdef CONFIG_NVGPU_DGPU
struct netlist_av64_list *sw_bundle64_init =
nvgpu_netlist_get_sw_bundle64_init_av64_list(g);
#endif
/* enable pipe mode override */
g->ops.gr.init.pipe_mode_override(g, true);
/* load bundle init */
err = g->ops.gr.init.load_sw_bundle_init(g, sw_bundle_init);
if (err != 0) {
goto error;
}
if (g->ops.gr.init.load_sw_veid_bundle != NULL) {
err = g->ops.gr.init.load_sw_veid_bundle(g,
sw_veid_bundle_init);
if (err != 0) {
goto error;
}
}
#ifdef CONFIG_NVGPU_DGPU
if (g->ops.gr.init.load_sw_bundle64 != NULL) {
err = g->ops.gr.init.load_sw_bundle64(g, sw_bundle64_init);
if (err != 0) {
goto error;
}
}
#endif
/* disable pipe mode override */
g->ops.gr.init.pipe_mode_override(g, false);
err = g->ops.gr.init.wait_idle(g);
return err;
error:
/* in case of error skip waiting for GR idle - just restore state */
g->ops.gr.init.pipe_mode_override(g, false);
return err;
}
static int nvgpu_gr_obj_ctx_init_hw_state(struct gk20a *g,
struct nvgpu_mem *inst_block)
{
int err = 0;
u32 data;
u32 i;
struct netlist_aiv_list *sw_ctx_load =
nvgpu_netlist_get_sw_ctx_load_aiv_list(g);
nvgpu_log(g, gpu_dbg_gr, " ");
err = g->ops.gr.init.fe_pwr_mode_force_on(g, true);
if (err != 0) {
goto clean_up;
}
g->ops.gr.init.override_context_reset(g);
err = g->ops.gr.init.fe_pwr_mode_force_on(g, false);
if (err != 0) {
goto clean_up;
}
data = g->ops.gr.falcon.get_fecs_current_ctx_data(g, inst_block);
err = g->ops.gr.falcon.ctrl_ctxsw(g,
NVGPU_GR_FALCON_METHOD_ADDRESS_BIND_PTR, data, NULL);
if (err != 0) {
goto clean_up;
}
err = g->ops.gr.init.wait_idle(g);
/* load ctx init */
nvgpu_log_info(g, "begin: netlist: sw_ctx_load: register writes");
for (i = 0U; i < sw_ctx_load->count; i++) {
nvgpu_writel(g, sw_ctx_load->l[i].addr,
sw_ctx_load->l[i].value);
}
nvgpu_log_info(g, "end: netlist: sw_ctx_load: register writes");
nvgpu_log_info(g, "configure sm_hww_esr_report mask after sw_ctx_load");
g->ops.gr.intr.set_hww_esr_report_mask(g);
#ifdef CONFIG_NVGPU_GFXP
if (g->ops.gr.init.preemption_state != NULL) {
err = g->ops.gr.init.preemption_state(g);
if (err != 0) {
goto clean_up;
}
}
#endif
nvgpu_cg_blcg_gr_load_enable(g);
err = g->ops.gr.init.wait_idle(g);
clean_up:
if (err == 0) {
nvgpu_log(g, gpu_dbg_gr, "done");
}
return err;
}
static int nvgpu_gr_obj_ctx_commit_hw_state(struct gk20a *g,
struct nvgpu_gr_global_ctx_buffer_desc *global_ctx_buffer,
struct nvgpu_gr_config *config, struct nvgpu_gr_ctx *gr_ctx)
{
int err = 0;
struct netlist_av_list *sw_method_init =
nvgpu_netlist_get_sw_method_init_av_list(g);
#ifdef CONFIG_NVGPU_GR_GOLDEN_CTX_VERIFICATION
struct netlist_av_list *sw_bundle_init =
nvgpu_netlist_get_sw_bundle_init_av_list(g);
#endif
nvgpu_log(g, gpu_dbg_gr, " ");
/* disable fe_go_idle */
g->ops.gr.init.fe_go_idle_timeout(g, false);
nvgpu_gr_obj_ctx_commit_global_ctx_buffers(g, global_ctx_buffer,
config, gr_ctx, false);
if (!nvgpu_is_enabled(g, NVGPU_SUPPORT_MIG)) {
/* override a few ctx state registers */
g->ops.gr.init.commit_global_timeslice(g);
}
/* floorsweep anything left */
err = nvgpu_gr_fs_state_init(g, config);
if (err != 0) {
goto restore_fe_go_idle;
}
err = g->ops.gr.init.wait_idle(g);
if (err != 0) {
goto restore_fe_go_idle;
}
#if defined(CONFIG_NVGPU_HAL_NON_FUSA) && defined(CONFIG_NVGPU_NEXT)
if (g->ops.gr.init.auto_go_idle != NULL) {
g->ops.gr.init.auto_go_idle(g, false);
}
#endif
err = nvgpu_gr_obj_ctx_alloc_sw_bundle(g);
if (err != 0) {
goto restore_fe_go_idle;
}
#if defined(CONFIG_NVGPU_HAL_NON_FUSA) && defined(CONFIG_NVGPU_NEXT)
if (g->ops.gr.init.auto_go_idle != NULL) {
g->ops.gr.init.auto_go_idle(g, true);
}
#endif
/* restore fe_go_idle */
g->ops.gr.init.fe_go_idle_timeout(g, true);
/* load method init */
g->ops.gr.init.load_method_init(g, sw_method_init);
#ifdef CONFIG_NVGPU_GR_GOLDEN_CTX_VERIFICATION
/* restore stats bundle data through mme shadow methods */
if (g->ops.gr.init.restore_stats_counter_bundle_data != NULL) {
g->ops.gr.init.restore_stats_counter_bundle_data(g,
sw_bundle_init);
}
#endif
err = g->ops.gr.init.wait_idle(g);
if (err != 0) {
goto clean_up;
}
nvgpu_log(g, gpu_dbg_gr, "done");
return 0;
restore_fe_go_idle:
/* restore fe_go_idle */
g->ops.gr.init.fe_go_idle_timeout(g, true);
#if defined(CONFIG_NVGPU_HAL_NON_FUSA) && defined(CONFIG_NVGPU_NEXT)
if (g->ops.gr.init.auto_go_idle != NULL) {
g->ops.gr.init.auto_go_idle(g, true);
}
#endif
clean_up:
return err;
}
static int nvgpu_gr_obj_ctx_save_golden_ctx(struct gk20a *g,
struct nvgpu_gr_obj_ctx_golden_image *golden_image,
struct nvgpu_gr_ctx *gr_ctx, struct nvgpu_mem *inst_block)
{
int err = 0;
struct nvgpu_mem *gr_mem;
u64 size;
u32 data;
#ifdef CONFIG_NVGPU_GR_GOLDEN_CTX_VERIFICATION
struct nvgpu_gr_global_ctx_local_golden_image *local_golden_image_temp =
NULL;
#endif
nvgpu_log(g, gpu_dbg_gr, " ");
gr_mem = nvgpu_gr_ctx_get_ctx_mem(gr_ctx);
#ifdef CONFIG_NVGPU_GR_GOLDEN_CTX_VERIFICATION
/*
* Save ctx data before first golden context save. Restore same data
* before second golden context save. This temporary copy is
* saved in local_golden_image_temp.
*/
size = nvgpu_gr_obj_ctx_get_golden_image_size(golden_image);
local_golden_image_temp =
nvgpu_gr_global_ctx_init_local_golden_image(g, gr_mem, size);
if (local_golden_image_temp == NULL) {
err = -ENOMEM;
goto clean_up;
}
#endif
data = g->ops.gr.falcon.get_fecs_current_ctx_data(g, inst_block);
err = g->ops.gr.falcon.ctrl_ctxsw(g,
NVGPU_GR_FALCON_METHOD_GOLDEN_IMAGE_SAVE, data, NULL);
if (err != 0) {
goto clean_up;
}
size = nvgpu_gr_obj_ctx_get_golden_image_size(golden_image);
golden_image->local_golden_image =
nvgpu_gr_global_ctx_init_local_golden_image(g, gr_mem, size);
if (golden_image->local_golden_image == NULL) {
err = -ENOMEM;
goto clean_up;
}
#ifdef CONFIG_NVGPU_GR_GOLDEN_CTX_VERIFICATION
/* Before the second golden context save, restore the earlier saved state */
nvgpu_gr_global_ctx_load_local_golden_image(g,
local_golden_image_temp, gr_mem);
/* free local copy now */
nvgpu_gr_global_ctx_deinit_local_golden_image(g,
local_golden_image_temp);
local_golden_image_temp = NULL;
/* Initiate second golden context save */
data = g->ops.gr.falcon.get_fecs_current_ctx_data(g, inst_block);
err = g->ops.gr.falcon.ctrl_ctxsw(g,
NVGPU_GR_FALCON_METHOD_GOLDEN_IMAGE_SAVE, data, NULL);
if (err != 0) {
goto clean_up;
}
/* Copy the data to local buffer */
local_golden_image_temp =
nvgpu_gr_global_ctx_init_local_golden_image(g, gr_mem, size);
if (local_golden_image_temp == NULL) {
err = -ENOMEM;
goto clean_up;
}
/* Compare two golden context images */
if (!nvgpu_gr_global_ctx_compare_golden_images(g,
nvgpu_mem_is_sysmem(gr_mem),
golden_image->local_golden_image,
local_golden_image_temp,
size)) {
nvgpu_err(g, "golden context mismatch");
err = -ENOMEM;
}
#endif
clean_up:
#ifdef CONFIG_NVGPU_GR_GOLDEN_CTX_VERIFICATION
if (local_golden_image_temp != NULL) {
nvgpu_gr_global_ctx_deinit_local_golden_image(g,
local_golden_image_temp);
}
#endif
if (err == 0) {
nvgpu_log(g, gpu_dbg_gr, "golden image saved with size = %llu", size);
}
return err;
}
/*
* init global golden image from a fresh gr_ctx in channel ctx.
* save a copy in local_golden_image.
*/
int nvgpu_gr_obj_ctx_alloc_golden_ctx_image(struct gk20a *g,
struct nvgpu_gr_obj_ctx_golden_image *golden_image,
struct nvgpu_gr_global_ctx_buffer_desc *global_ctx_buffer,
struct nvgpu_gr_config *config,
struct nvgpu_gr_ctx *gr_ctx,
struct nvgpu_mem *inst_block)
{
int err = 0;
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gr, " ");
/*
* The golden ctx is global to all channels. Although only the first
* channel initializes the golden image, the driver needs to prevent
* multiple channels from initializing the golden ctx at the same time.
*/
nvgpu_mutex_acquire(&golden_image->ctx_mutex);
if (golden_image->ready) {
nvgpu_log(g, gpu_dbg_gr, "golden image already saved");
goto clean_up;
}
err = nvgpu_gr_obj_ctx_init_hw_state(g, inst_block);
if (err != 0) {
goto clean_up;
}
err = nvgpu_gr_obj_ctx_commit_hw_state(g, global_ctx_buffer,
config, gr_ctx);
if (err != 0) {
goto clean_up;
}
#ifdef CONFIG_NVGPU_GRAPHICS
if (!nvgpu_is_enabled(g, NVGPU_SUPPORT_MIG)) {
err = nvgpu_gr_ctx_init_zcull(g, gr_ctx);
if (err != 0) {
goto clean_up;
}
}
#endif
err = nvgpu_gr_obj_ctx_save_golden_ctx(g, golden_image,
gr_ctx, inst_block);
if (err != 0) {
goto clean_up;
}
golden_image->ready = true;
#ifdef CONFIG_NVGPU_POWER_PG
nvgpu_pmu_set_golden_image_initialized(g, true);
#endif
g->ops.gr.falcon.set_current_ctx_invalid(g);
clean_up:
if (err != 0) {
nvgpu_err(g, "fail");
} else {
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gr, "done");
}
nvgpu_mutex_release(&golden_image->ctx_mutex);
return err;
}
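/*
* Illustrative sketch (not part of the original file): the one-time golden
* image creation above would typically be gated on a readiness check such as
* the one below. The example_* name is an assumption for illustration only;
* the block is kept under "#if 0" so it does not affect the build.
*/
#if 0
static bool example_golden_image_needed(
		struct nvgpu_gr_obj_ctx_golden_image *golden_image)
{
	/* Takes ctx_mutex internally, so it is safe against concurrent init. */
	return !nvgpu_gr_obj_ctx_is_golden_image_ready(golden_image);
}
#endif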
static int nvgpu_gr_obj_ctx_gr_ctx_alloc(struct gk20a *g,
struct nvgpu_gr_obj_ctx_golden_image *golden_image,
struct nvgpu_gr_ctx_desc *gr_ctx_desc, struct nvgpu_gr_ctx *gr_ctx,
struct vm_gk20a *vm)
{
u64 size;
int err = 0;
nvgpu_log_fn(g, " ");
size = nvgpu_gr_obj_ctx_get_golden_image_size(golden_image);
nvgpu_gr_ctx_set_size(gr_ctx_desc, NVGPU_GR_CTX_CTX,
nvgpu_safe_cast_u64_to_u32(size));
nvgpu_log(g, gpu_dbg_gr, "gr_ctx size = %llu", size);
err = nvgpu_gr_ctx_alloc(g, gr_ctx, gr_ctx_desc, vm);
if (err != 0) {
return err;
}
return 0;
}
int nvgpu_gr_obj_ctx_alloc(struct gk20a *g,
struct nvgpu_gr_obj_ctx_golden_image *golden_image,
struct nvgpu_gr_global_ctx_buffer_desc *global_ctx_buffer,
struct nvgpu_gr_ctx_desc *gr_ctx_desc,
struct nvgpu_gr_config *config,
struct nvgpu_gr_ctx *gr_ctx,
struct nvgpu_gr_subctx *subctx,
struct vm_gk20a *vm,
struct nvgpu_mem *inst_block,
u32 class_num, u32 flags,
bool cde, bool vpr)
{
int err = 0;
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gr, " ");
err = nvgpu_gr_obj_ctx_gr_ctx_alloc(g, golden_image, gr_ctx_desc,
gr_ctx, vm);
if (err != 0) {
nvgpu_err(g, "fail to allocate TSG gr ctx buffer");
goto out;
}
/* allocate patch buffer */
if (!nvgpu_mem_is_valid(nvgpu_gr_ctx_get_patch_ctx_mem(gr_ctx))) {
nvgpu_gr_ctx_set_patch_ctx_data_count(gr_ctx, 0);
nvgpu_gr_ctx_set_size(gr_ctx_desc,
NVGPU_GR_CTX_PATCH_CTX,
nvgpu_safe_mult_u32(
g->ops.gr.init.get_patch_slots(g, config),
PATCH_CTX_SLOTS_REQUIRED_PER_ENTRY));
err = nvgpu_gr_ctx_alloc_patch_ctx(g, gr_ctx, gr_ctx_desc, vm);
if (err != 0) {
nvgpu_err(g, "fail to allocate patch buffer");
goto out;
}
}
#if defined(CONFIG_NVGPU_GFXP) || defined(CONFIG_NVGPU_CILP)
err = nvgpu_gr_obj_ctx_init_ctxsw_preemption_mode(g, config,
gr_ctx_desc, gr_ctx, vm, class_num, flags);
if (err != 0) {
nvgpu_err(g, "fail to init preemption mode");
goto out;
}
#endif
/* map global buffer to channel gpu_va and commit */
err = nvgpu_gr_ctx_map_global_ctx_buffers(g, gr_ctx,
global_ctx_buffer, vm, vpr);
if (err != 0) {
nvgpu_err(g, "fail to map global ctx buffer");
goto out;
}
nvgpu_gr_obj_ctx_commit_global_ctx_buffers(g, global_ctx_buffer,
config, gr_ctx, true);
/* commit gr ctx buffer */
nvgpu_gr_obj_ctx_commit_inst(g, inst_block, gr_ctx, subctx,
nvgpu_gr_ctx_get_ctx_mem(gr_ctx)->gpu_va);
/* init golden image */
err = nvgpu_gr_obj_ctx_alloc_golden_ctx_image(g, golden_image,
global_ctx_buffer, config, gr_ctx, inst_block);
if (err != 0) {
nvgpu_err(g, "fail to init golden ctx image");
goto out;
}
#ifdef CONFIG_NVGPU_POWER_PG
/*
* Re-enable ELPG now that the golden image has been initialized.
* The PMU PG init code may already have tried to enable ELPG, but
* would not have been able to complete this action since the golden
* image hadn't been initialized yet, so do this now.
*/
err = nvgpu_pmu_reenable_elpg(g);
if (err != 0) {
nvgpu_err(g, "fail to re-enable elpg");
goto out;
}
#endif
/* load golden image */
nvgpu_gr_ctx_load_golden_ctx_image(g, gr_ctx,
golden_image->local_golden_image, cde);
nvgpu_gr_obj_ctx_update_ctxsw_preemption_mode(g, config, gr_ctx,
subctx);
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gr, "done");
return 0;
out:
/*
* 1. gr_ctx, patch_ctx and global ctx buffer mapping
* can be reused so no need to release them.
* 2. golden image init and load is a one time thing so if
* they pass, no need to undo.
*/
nvgpu_err(g, "fail");
return err;
}
void nvgpu_gr_obj_ctx_set_golden_image_size(
struct nvgpu_gr_obj_ctx_golden_image *golden_image,
size_t size)
{
golden_image->size = size;
}
size_t nvgpu_gr_obj_ctx_get_golden_image_size(
struct nvgpu_gr_obj_ctx_golden_image *golden_image)
{
return golden_image->size;
}
#ifdef CONFIG_NVGPU_DEBUGGER
u32 *nvgpu_gr_obj_ctx_get_local_golden_image_ptr(
struct nvgpu_gr_obj_ctx_golden_image *golden_image)
{
return nvgpu_gr_global_ctx_get_local_golden_image_ptr(
golden_image->local_golden_image);
}
#endif
bool nvgpu_gr_obj_ctx_is_golden_image_ready(
struct nvgpu_gr_obj_ctx_golden_image *golden_image)
{
bool ready;
nvgpu_mutex_acquire(&golden_image->ctx_mutex);
ready = golden_image->ready;
nvgpu_mutex_release(&golden_image->ctx_mutex);
return ready;
}
int nvgpu_gr_obj_ctx_init(struct gk20a *g,
struct nvgpu_gr_obj_ctx_golden_image **gr_golden_image, u32 size)
{
struct nvgpu_gr_obj_ctx_golden_image *golden_image;
nvgpu_log(g, gpu_dbg_gr, "size = %u", size);
golden_image = nvgpu_kzalloc(g, sizeof(*golden_image));
if (golden_image == NULL) {
return -ENOMEM;
}
nvgpu_gr_obj_ctx_set_golden_image_size(golden_image, size);
nvgpu_mutex_init(&golden_image->ctx_mutex);
*gr_golden_image = golden_image;
return 0;
}
void nvgpu_gr_obj_ctx_deinit(struct gk20a *g,
struct nvgpu_gr_obj_ctx_golden_image *golden_image)
{
if (golden_image == NULL) {
return;
}
if (golden_image->local_golden_image != NULL) {
nvgpu_gr_global_ctx_deinit_local_golden_image(g,
golden_image->local_golden_image);
golden_image->local_golden_image = NULL;
}
#ifdef CONFIG_NVGPU_POWER_PG
nvgpu_pmu_set_golden_image_initialized(g, false);
#endif
golden_image->ready = false;
nvgpu_kfree(g, golden_image);
}

View File

@@ -0,0 +1,58 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#ifndef NVGPU_GR_OBJ_CTX_PRIV_H
#define NVGPU_GR_OBJ_CTX_PRIV_H
#include <nvgpu/types.h>
#include <nvgpu/lock.h>
struct nvgpu_gr_global_ctx_local_golden_image;
/**
* Golden context image descriptor structure.
*
* This structure stores details of the Golden context image.
*/
struct nvgpu_gr_obj_ctx_golden_image {
/**
* Flag to indicate if Golden context image is ready or not.
*/
bool ready;
/**
* Mutex to hold for accesses to Golden context image.
*/
struct nvgpu_mutex ctx_mutex;
/**
* Size of Golden context image.
*/
size_t size;
/**
* Pointer to local Golden context image struct.
*/
struct nvgpu_gr_global_ctx_local_golden_image *local_golden_image;
};
#endif /* NVGPU_GR_OBJ_CTX_PRIV_H */

View File

@@ -0,0 +1,167 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <nvgpu/gk20a.h>
#include <nvgpu/gr/subctx.h>
#include <nvgpu/gr/ctx.h>
#include <nvgpu/gmmu.h>
#include <nvgpu/dma.h>
#include "common/gr/subctx_priv.h"
struct nvgpu_gr_subctx *nvgpu_gr_subctx_alloc(struct gk20a *g,
struct vm_gk20a *vm)
{
struct nvgpu_gr_subctx *subctx;
int err = 0;
nvgpu_log_fn(g, " ");
subctx = nvgpu_kzalloc(g, sizeof(*subctx));
if (subctx == NULL) {
return NULL;
}
err = nvgpu_dma_alloc_sys(g,
g->ops.gr.ctxsw_prog.hw_get_fecs_header_size(),
&subctx->ctx_header);
if (err != 0) {
nvgpu_err(g, "failed to allocate sub ctx header");
goto err_free_subctx;
}
subctx->ctx_header.gpu_va = nvgpu_gmmu_map(vm,
&subctx->ctx_header,
subctx->ctx_header.size,
0, /* not GPU-cacheable */
gk20a_mem_flag_none, true,
subctx->ctx_header.aperture);
if (subctx->ctx_header.gpu_va == 0ULL) {
nvgpu_err(g, "failed to map ctx header");
goto err_free_ctx_header;
}
return subctx;
err_free_ctx_header:
nvgpu_dma_free(g, &subctx->ctx_header);
err_free_subctx:
nvgpu_kfree(g, subctx);
return NULL;
}
void nvgpu_gr_subctx_free(struct gk20a *g,
struct nvgpu_gr_subctx *subctx,
struct vm_gk20a *vm)
{
nvgpu_log_fn(g, " ");
nvgpu_gmmu_unmap(vm, &subctx->ctx_header,
subctx->ctx_header.gpu_va);
nvgpu_dma_free(g, &subctx->ctx_header);
nvgpu_kfree(g, subctx);
}
void nvgpu_gr_subctx_load_ctx_header(struct gk20a *g,
struct nvgpu_gr_subctx *subctx,
struct nvgpu_gr_ctx *gr_ctx, u64 gpu_va)
{
struct nvgpu_mem *ctxheader = &subctx->ctx_header;
int err = 0;
err = g->ops.mm.cache.l2_flush(g, true);
if (err != 0) {
nvgpu_err(g, "l2_flush failed");
}
/* set priv access map */
g->ops.gr.ctxsw_prog.set_priv_access_map_addr(g, ctxheader,
nvgpu_gr_ctx_get_global_ctx_va(gr_ctx,
NVGPU_GR_CTX_PRIV_ACCESS_MAP_VA));
g->ops.gr.ctxsw_prog.set_patch_addr(g, ctxheader,
nvgpu_gr_ctx_get_patch_ctx_mem(gr_ctx)->gpu_va);
#ifdef CONFIG_NVGPU_DEBUGGER
g->ops.gr.ctxsw_prog.set_pm_ptr(g, ctxheader,
nvgpu_gr_ctx_get_pm_ctx_mem(gr_ctx)->gpu_va);
#endif
#ifdef CONFIG_NVGPU_GRAPHICS
g->ops.gr.ctxsw_prog.set_zcull_ptr(g, ctxheader,
nvgpu_gr_ctx_get_zcull_ctx_va(gr_ctx));
#endif
g->ops.gr.ctxsw_prog.set_context_buffer_ptr(g, ctxheader, gpu_va);
g->ops.gr.ctxsw_prog.set_type_per_veid_header(g, ctxheader);
}
struct nvgpu_mem *nvgpu_gr_subctx_get_ctx_header(struct nvgpu_gr_subctx *subctx)
{
return &subctx->ctx_header;
}
#ifdef CONFIG_NVGPU_HAL_NON_FUSA
void nvgpu_gr_subctx_set_patch_ctx(struct gk20a *g,
struct nvgpu_gr_subctx *subctx, struct nvgpu_gr_ctx *gr_ctx)
{
g->ops.gr.ctxsw_prog.set_patch_addr(g, &subctx->ctx_header,
nvgpu_gr_ctx_get_patch_ctx_mem(gr_ctx)->gpu_va);
}
#endif
#ifdef CONFIG_NVGPU_GRAPHICS
void nvgpu_gr_subctx_zcull_setup(struct gk20a *g, struct nvgpu_gr_subctx *subctx,
struct nvgpu_gr_ctx *gr_ctx)
{
nvgpu_log_fn(g, " ");
g->ops.gr.ctxsw_prog.set_zcull_ptr(g, &subctx->ctx_header,
nvgpu_gr_ctx_get_zcull_ctx_va(gr_ctx));
}
#endif /* CONFIG_NVGPU_GRAPHICS */
#ifdef CONFIG_NVGPU_GFXP
void nvgpu_gr_subctx_set_preemption_buffer_va(struct gk20a *g,
struct nvgpu_gr_subctx *subctx, struct nvgpu_gr_ctx *gr_ctx)
{
g->ops.gr.ctxsw_prog.set_full_preemption_ptr(g, &subctx->ctx_header,
nvgpu_gr_ctx_get_preempt_ctxsw_buffer(gr_ctx)->gpu_va);
if (g->ops.gr.ctxsw_prog.set_full_preemption_ptr_veid0 != NULL) {
g->ops.gr.ctxsw_prog.set_full_preemption_ptr_veid0(g,
&subctx->ctx_header,
nvgpu_gr_ctx_get_preempt_ctxsw_buffer(gr_ctx)->gpu_va);
}
}
#endif /* CONFIG_NVGPU_GFXP */
#ifdef CONFIG_NVGPU_DEBUGGER
void nvgpu_gr_subctx_set_hwpm_mode(struct gk20a *g,
struct nvgpu_gr_subctx *subctx, struct nvgpu_gr_ctx *gr_ctx)
{
g->ops.gr.ctxsw_prog.set_pm_ptr(g, &subctx->ctx_header,
nvgpu_gr_ctx_get_pm_ctx_mem(gr_ctx)->gpu_va);
}
#endif
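
The functions above form the whole per-channel subcontext lifecycle, so a brief usage sketch may help tie them together. This is an illustrative caller only, assuming the channel's vm_gk20a, the TSG's nvgpu_gr_ctx and the GPU VA of its graphics context buffer are already available; example_bind_gr_subctx is a hypothetical name, not part of this file.

/* Hypothetical caller; assumes vm, gr_ctx and gr_ctx_gpu_va are valid. */
static int example_bind_gr_subctx(struct gk20a *g, struct vm_gk20a *vm,
		struct nvgpu_gr_ctx *gr_ctx, u64 gr_ctx_gpu_va)
{
	struct nvgpu_gr_subctx *subctx;

	/* Allocate and map the per-channel context header in vm. */
	subctx = nvgpu_gr_subctx_alloc(g, vm);
	if (subctx == NULL) {
		return -ENOMEM;
	}

	/* Point the header at the TSG's graphics context buffers. */
	nvgpu_gr_subctx_load_ctx_header(g, subctx, gr_ctx, gr_ctx_gpu_va);

	/* ... channel runs ... */

	/* Unmap and free the header on channel teardown. */
	nvgpu_gr_subctx_free(g, subctx, vm);
	return 0;
}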

View File

@@ -0,0 +1,40 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#ifndef NVGPU_GR_SUBCTX_PRIV_H
#define NVGPU_GR_SUBCTX_PRIV_H
struct nvgpu_mem;
/**
* GR subcontext data structure.
*
* One subcontext is allocated per GPU channel.
*/
struct nvgpu_gr_subctx {
/**
* Memory to hold subcontext header image.
*/
struct nvgpu_mem ctx_header;
};
#endif /* NVGPU_GR_SUBCTX_PRIV_H */

View File

@@ -0,0 +1,690 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <nvgpu/gk20a.h>
#include <nvgpu/io.h>
#include <nvgpu/bug.h>
#include <nvgpu/string.h>
#include <nvgpu/power_features/pg.h>
#ifdef CONFIG_NVGPU_LS_PMU
#include <nvgpu/pmu/pmu_pg.h>
#endif
#include "zbc_priv.h"
#define ZBC_ENTRY_UPDATED 1
#define ZBC_ENTRY_ADDED 2
static void nvgpu_gr_zbc_update_stencil_reg(struct gk20a *g,
struct nvgpu_gr_zbc_entry *stencil_val, u32 index)
{
/* update l2 table */
if (g->ops.ltc.set_zbc_s_entry != NULL) {
g->ops.ltc.set_zbc_s_entry(g, stencil_val->stencil, index);
}
/* update zbc stencil registers */
g->ops.gr.zbc.add_stencil(g, stencil_val, index);
}
static int nvgpu_gr_zbc_add_stencil(struct gk20a *g, struct nvgpu_gr_zbc *zbc,
struct nvgpu_gr_zbc_entry *stencil_val)
{
struct zbc_stencil_table *s_tbl;
u32 i;
int entry_added = -ENOSPC;
bool entry_exist = false;
/* search existing tables */
for (i = zbc->min_stencil_index; i <= zbc->max_used_stencil_index;
i++) {
s_tbl = &zbc->zbc_s_tbl[i];
if ((s_tbl->ref_cnt != 0U) &&
(s_tbl->stencil == stencil_val->stencil) &&
(s_tbl->format == stencil_val->format)) {
s_tbl->ref_cnt = nvgpu_safe_add_u32(s_tbl->ref_cnt, 1U);
entry_exist = true;
entry_added = ZBC_ENTRY_UPDATED;
break;
}
}
/* add new table */
if (!entry_exist &&
(zbc->max_used_stencil_index < zbc->max_stencil_index)) {
/* Increment used index and add new entry at that index */
zbc->max_used_stencil_index =
nvgpu_safe_add_u32(zbc->max_used_stencil_index, 1U);
s_tbl = &zbc->zbc_s_tbl[zbc->max_used_stencil_index];
WARN_ON(s_tbl->ref_cnt != 0U);
/* update sw copy */
s_tbl->stencil = stencil_val->stencil;
s_tbl->format = stencil_val->format;
s_tbl->ref_cnt = nvgpu_safe_add_u32(s_tbl->ref_cnt, 1U);
nvgpu_gr_zbc_update_stencil_reg(g, stencil_val,
zbc->max_used_stencil_index);
entry_added = ZBC_ENTRY_ADDED;
}
return entry_added;
}
static void nvgpu_gr_zbc_update_depth_reg(struct gk20a *g,
struct nvgpu_gr_zbc_entry *depth_val, u32 index)
{
/* update l2 table */
g->ops.ltc.set_zbc_depth_entry(g, depth_val->depth, index);
/* update zbc registers */
g->ops.gr.zbc.add_depth(g, depth_val, index);
}
static int nvgpu_gr_zbc_add_depth(struct gk20a *g, struct nvgpu_gr_zbc *zbc,
struct nvgpu_gr_zbc_entry *depth_val)
{
struct zbc_depth_table *d_tbl;
u32 i;
int entry_added = -ENOSPC;
bool entry_exist = false;
/* search existing tables */
for (i = zbc->min_depth_index; i <= zbc->max_used_depth_index; i++) {
d_tbl = &zbc->zbc_dep_tbl[i];
if ((d_tbl->ref_cnt != 0U) &&
(d_tbl->depth == depth_val->depth) &&
(d_tbl->format == depth_val->format)) {
d_tbl->ref_cnt = nvgpu_safe_add_u32(d_tbl->ref_cnt, 1U);
entry_exist = true;
entry_added = ZBC_ENTRY_UPDATED;
break;
}
}
/* add new table */
if (!entry_exist &&
(zbc->max_used_depth_index < zbc->max_depth_index)) {
/* Increment used index and add new entry at that index */
zbc->max_used_depth_index =
nvgpu_safe_add_u32(zbc->max_used_depth_index, 1U);
d_tbl = &zbc->zbc_dep_tbl[zbc->max_used_depth_index];
WARN_ON(d_tbl->ref_cnt != 0U);
/* update sw copy */
d_tbl->depth = depth_val->depth;
d_tbl->format = depth_val->format;
d_tbl->ref_cnt = nvgpu_safe_add_u32(d_tbl->ref_cnt, 1U);
nvgpu_gr_zbc_update_depth_reg(g, depth_val,
zbc->max_used_depth_index);
entry_added = ZBC_ENTRY_ADDED;
}
return entry_added;
}
static void nvgpu_gr_zbc_update_color_reg(struct gk20a *g,
struct nvgpu_gr_zbc_entry *color_val, u32 index)
{
/* update l2 table */
g->ops.ltc.set_zbc_color_entry(g, color_val->color_l2, index);
/* update zbc registers */
g->ops.gr.zbc.add_color(g, color_val, index);
}
static int nvgpu_gr_zbc_add_color(struct gk20a *g, struct nvgpu_gr_zbc *zbc,
struct nvgpu_gr_zbc_entry *color_val)
{
struct zbc_color_table *c_tbl;
u32 i;
int entry_added = -ENOSPC;
bool entry_exist = false;
/* search existing table */
for (i = zbc->min_color_index; i <= zbc->max_used_color_index; i++) {
c_tbl = &zbc->zbc_col_tbl[i];
if ((c_tbl->ref_cnt != 0U) &&
(c_tbl->format == color_val->format) &&
(nvgpu_memcmp((u8 *)c_tbl->color_ds,
(u8 *)color_val->color_ds,
sizeof(color_val->color_ds)) == 0) &&
(nvgpu_memcmp((u8 *)c_tbl->color_l2,
(u8 *)color_val->color_l2,
sizeof(color_val->color_l2)) == 0)) {
c_tbl->ref_cnt = nvgpu_safe_add_u32(c_tbl->ref_cnt, 1U);
entry_exist = true;
entry_added = ZBC_ENTRY_UPDATED;
break;
}
}
/* add new entry */
if (!entry_exist &&
(zbc->max_used_color_index < zbc->max_color_index)) {
/* Increment used index and add new entry at that index */
zbc->max_used_color_index =
nvgpu_safe_add_u32(zbc->max_used_color_index, 1U);
c_tbl = &zbc->zbc_col_tbl[zbc->max_used_color_index];
WARN_ON(c_tbl->ref_cnt != 0U);
/* update local copy */
for (i = 0; i < NVGPU_GR_ZBC_COLOR_VALUE_SIZE; i++) {
c_tbl->color_l2[i] = color_val->color_l2[i];
c_tbl->color_ds[i] = color_val->color_ds[i];
}
c_tbl->format = color_val->format;
c_tbl->ref_cnt = nvgpu_safe_add_u32(c_tbl->ref_cnt, 1U);
nvgpu_gr_zbc_update_color_reg(g, color_val,
zbc->max_used_color_index);
entry_added = ZBC_ENTRY_ADDED;
}
return entry_added;
}
static int nvgpu_gr_zbc_add(struct gk20a *g, struct nvgpu_gr_zbc *zbc,
struct nvgpu_gr_zbc_entry *zbc_val)
{
	int added = 0;
#if defined(CONFIG_NVGPU_LS_PMU) && defined(CONFIG_NVGPU_POWER_PG)
u32 entries;
#endif
/* no endian swap ? */
nvgpu_mutex_acquire(&zbc->zbc_lock);
nvgpu_speculation_barrier();
switch (zbc_val->type) {
case NVGPU_GR_ZBC_TYPE_COLOR:
added = nvgpu_gr_zbc_add_color(g, zbc, zbc_val);
break;
case NVGPU_GR_ZBC_TYPE_DEPTH:
added = nvgpu_gr_zbc_add_depth(g, zbc, zbc_val);
break;
case NVGPU_GR_ZBC_TYPE_STENCIL:
if (nvgpu_is_enabled(g, NVGPU_SUPPORT_ZBC_STENCIL)) {
added = nvgpu_gr_zbc_add_stencil(g, zbc, zbc_val);
		} else {
			nvgpu_err(g,
				"zbc stencil table not supported");
			added = -EINVAL;
			goto err_mutex;
		}
break;
default:
nvgpu_err(g,
"invalid zbc table type %d", zbc_val->type);
added = -EINVAL;
goto err_mutex;
}
#if defined(CONFIG_NVGPU_LS_PMU) && defined(CONFIG_NVGPU_POWER_PG)
if (added == ZBC_ENTRY_ADDED) {
/* update zbc for elpg only when new entry is added */
entries = max(
nvgpu_safe_sub_u32(zbc->max_used_color_index,
zbc->min_color_index),
nvgpu_safe_sub_u32(zbc->max_used_depth_index,
zbc->min_depth_index));
if (g->elpg_enabled) {
nvgpu_pmu_save_zbc(g, entries);
}
}
#endif
err_mutex:
nvgpu_mutex_release(&zbc->zbc_lock);
if (added < 0) {
return added;
}
return 0;
}
int nvgpu_gr_zbc_set_table(struct gk20a *g, struct nvgpu_gr_zbc *zbc,
struct nvgpu_gr_zbc_entry *zbc_val)
{
nvgpu_log(g, gpu_dbg_zbc, " zbc_val->type %u", zbc_val->type);
return nvgpu_pg_elpg_protected_call(g,
nvgpu_gr_zbc_add(g, zbc, zbc_val));
}
/*
 * Get a zbc table entry specified by index.
 * When the type is NVGPU_GR_ZBC_TYPE_INVALID, return the color table
 * size in index_size instead.
 */
int nvgpu_gr_zbc_query_table(struct gk20a *g, struct nvgpu_gr_zbc *zbc,
struct nvgpu_gr_zbc_query_params *query_params)
{
u32 index = query_params->index_size;
u32 i;
nvgpu_speculation_barrier();
switch (query_params->type) {
case NVGPU_GR_ZBC_TYPE_INVALID:
nvgpu_log(g, gpu_dbg_zbc, "Query zbc size");
query_params->index_size = nvgpu_safe_add_u32(
nvgpu_safe_sub_u32(zbc->max_color_index,
zbc->min_color_index), 1U);
break;
case NVGPU_GR_ZBC_TYPE_COLOR:
if ((index < zbc->min_color_index) ||
(index > zbc->max_color_index)) {
nvgpu_err(g, "invalid zbc color table index %u", index);
return -EINVAL;
}
nvgpu_log(g, gpu_dbg_zbc, "Query zbc color at index %u", index);
nvgpu_speculation_barrier();
for (i = 0; i < NVGPU_GR_ZBC_COLOR_VALUE_SIZE; i++) {
query_params->color_l2[i] =
zbc->zbc_col_tbl[index].color_l2[i];
query_params->color_ds[i] =
zbc->zbc_col_tbl[index].color_ds[i];
}
query_params->format = zbc->zbc_col_tbl[index].format;
query_params->ref_cnt = zbc->zbc_col_tbl[index].ref_cnt;
break;
case NVGPU_GR_ZBC_TYPE_DEPTH:
if ((index < zbc->min_depth_index) ||
(index > zbc->max_depth_index)) {
nvgpu_err(g, "invalid zbc depth table index %u", index);
return -EINVAL;
}
nvgpu_log(g, gpu_dbg_zbc, "Query zbc depth at index %u", index);
nvgpu_speculation_barrier();
query_params->depth = zbc->zbc_dep_tbl[index].depth;
query_params->format = zbc->zbc_dep_tbl[index].format;
query_params->ref_cnt = zbc->zbc_dep_tbl[index].ref_cnt;
break;
case NVGPU_GR_ZBC_TYPE_STENCIL:
if (nvgpu_is_enabled(g, NVGPU_SUPPORT_ZBC_STENCIL)) {
if ((index < zbc->min_stencil_index) ||
(index > zbc->max_stencil_index)) {
nvgpu_err(g,
"invalid zbc stencil table index %u",
index);
return -EINVAL;
}
nvgpu_log(g, gpu_dbg_zbc,
"Query zbc stencil at index %u", index);
nvgpu_speculation_barrier();
query_params->stencil = zbc->zbc_s_tbl[index].stencil;
query_params->format = zbc->zbc_s_tbl[index].format;
query_params->ref_cnt = zbc->zbc_s_tbl[index].ref_cnt;
} else {
nvgpu_err(g, "invalid zbc table type");
return -EINVAL;
}
break;
default:
nvgpu_err(g, "invalid zbc table type");
return -EINVAL;
}
return 0;
}
/*
* Update zbc table registers as per sw copy of zbc tables
*/
void nvgpu_gr_zbc_load_table(struct gk20a *g, struct nvgpu_gr_zbc *zbc)
{
unsigned int i;
for (i = zbc->min_color_index; i <= zbc->max_used_color_index; i++) {
struct zbc_color_table *c_tbl = &zbc->zbc_col_tbl[i];
struct nvgpu_gr_zbc_entry zbc_val;
zbc_val.type = NVGPU_GR_ZBC_TYPE_COLOR;
nvgpu_memcpy((u8 *)zbc_val.color_ds,
(u8 *)c_tbl->color_ds, sizeof(zbc_val.color_ds));
nvgpu_memcpy((u8 *)zbc_val.color_l2,
(u8 *)c_tbl->color_l2, sizeof(zbc_val.color_l2));
zbc_val.format = c_tbl->format;
nvgpu_gr_zbc_update_color_reg(g, &zbc_val, i);
}
for (i = zbc->min_depth_index; i <= zbc->max_used_depth_index; i++) {
struct zbc_depth_table *d_tbl = &zbc->zbc_dep_tbl[i];
struct nvgpu_gr_zbc_entry zbc_val;
zbc_val.type = NVGPU_GR_ZBC_TYPE_DEPTH;
zbc_val.depth = d_tbl->depth;
zbc_val.format = d_tbl->format;
nvgpu_gr_zbc_update_depth_reg(g, &zbc_val, i);
}
if (nvgpu_is_enabled(g, NVGPU_SUPPORT_ZBC_STENCIL)) {
for (i = zbc->min_stencil_index;
i <= zbc->max_used_stencil_index; i++) {
struct zbc_stencil_table *s_tbl = &zbc->zbc_s_tbl[i];
struct nvgpu_gr_zbc_entry zbc_val;
zbc_val.type = NVGPU_GR_ZBC_TYPE_STENCIL;
zbc_val.stencil = s_tbl->stencil;
zbc_val.format = s_tbl->format;
nvgpu_gr_zbc_update_stencil_reg(g, &zbc_val, i);
}
}
}
static void nvgpu_gr_zbc_load_default_sw_stencil_table(struct gk20a *g,
struct nvgpu_gr_zbc *zbc)
{
u32 index = zbc->min_stencil_index;
zbc->zbc_s_tbl[index].stencil = 0x0;
zbc->zbc_s_tbl[index].format = GR_ZBC_STENCIL_CLEAR_FMT_U8;
zbc->zbc_s_tbl[index].ref_cnt =
nvgpu_safe_add_u32(zbc->zbc_s_tbl[index].ref_cnt, 1U);
index = nvgpu_safe_add_u32(index, 1U);
zbc->zbc_s_tbl[index].stencil = 0x1;
zbc->zbc_s_tbl[index].format = GR_ZBC_STENCIL_CLEAR_FMT_U8;
zbc->zbc_s_tbl[index].ref_cnt =
nvgpu_safe_add_u32(zbc->zbc_s_tbl[index].ref_cnt, 1U);
index = nvgpu_safe_add_u32(index, 1U);
zbc->zbc_s_tbl[index].stencil = 0xff;
zbc->zbc_s_tbl[index].format = GR_ZBC_STENCIL_CLEAR_FMT_U8;
zbc->zbc_s_tbl[index].ref_cnt =
nvgpu_safe_add_u32(zbc->zbc_s_tbl[index].ref_cnt, 1U);
zbc->max_used_stencil_index = index;
}
static void nvgpu_gr_zbc_load_default_sw_depth_table(struct gk20a *g,
struct nvgpu_gr_zbc *zbc)
{
u32 index = zbc->min_depth_index;
zbc->zbc_dep_tbl[index].format = GR_ZBC_Z_FMT_VAL_FP32;
zbc->zbc_dep_tbl[index].depth = 0x3f800000;
zbc->zbc_dep_tbl[index].ref_cnt =
nvgpu_safe_add_u32(zbc->zbc_dep_tbl[index].ref_cnt, 1U);
index = nvgpu_safe_add_u32(index, 1U);
zbc->zbc_dep_tbl[index].format = GR_ZBC_Z_FMT_VAL_FP32;
zbc->zbc_dep_tbl[index].depth = 0;
zbc->zbc_dep_tbl[index].ref_cnt =
nvgpu_safe_add_u32(zbc->zbc_dep_tbl[index].ref_cnt, 1U);
zbc->max_used_depth_index = index;
}
static void nvgpu_gr_zbc_load_default_sw_color_table(struct gk20a *g,
struct nvgpu_gr_zbc *zbc)
{
u32 i;
u32 index = zbc->min_color_index;
/* Opaque black (i.e. solid black, fmt 0x28 = A8B8G8R8) */
zbc->zbc_col_tbl[index].format = GR_ZBC_SOLID_BLACK_COLOR_FMT;
for (i = 0U; i < NVGPU_GR_ZBC_COLOR_VALUE_SIZE; i++) {
zbc->zbc_col_tbl[index].color_ds[i] = 0U;
zbc->zbc_col_tbl[index].color_l2[i] = 0xff000000U;
}
zbc->zbc_col_tbl[index].color_ds[3] = 0x3f800000U;
zbc->zbc_col_tbl[index].ref_cnt =
nvgpu_safe_add_u32(zbc->zbc_col_tbl[index].ref_cnt, 1U);
index = nvgpu_safe_add_u32(index, 1U);
/* Transparent black = (fmt 1 = zero) */
zbc->zbc_col_tbl[index].format = GR_ZBC_TRANSPARENT_BLACK_COLOR_FMT;
for (i = 0; i < NVGPU_GR_ZBC_COLOR_VALUE_SIZE; i++) {
zbc->zbc_col_tbl[index].color_ds[i] = 0U;
zbc->zbc_col_tbl[index].color_l2[i] = 0U;
}
zbc->zbc_col_tbl[index].ref_cnt =
nvgpu_safe_add_u32(zbc->zbc_col_tbl[index].ref_cnt, 1U);
index = nvgpu_safe_add_u32(index, 1U);
/* Opaque white (i.e. solid white) = (fmt 2 = uniform 1) */
zbc->zbc_col_tbl[index].format = GR_ZBC_SOLID_WHITE_COLOR_FMT;
for (i = 0; i < NVGPU_GR_ZBC_COLOR_VALUE_SIZE; i++) {
zbc->zbc_col_tbl[index].color_ds[i] = 0x3f800000U;
zbc->zbc_col_tbl[index].color_l2[i] = 0xffffffffU;
}
zbc->zbc_col_tbl[index].ref_cnt =
nvgpu_safe_add_u32(zbc->zbc_col_tbl[index].ref_cnt, 1U);
zbc->max_used_color_index = index;
}
static void nvgpu_gr_zbc_init_indices(struct gk20a *g, struct nvgpu_gr_zbc *zbc)
{
struct nvgpu_gr_zbc_table_indices zbc_indices;
g->ops.gr.zbc.init_table_indices(g, &zbc_indices);
zbc->min_color_index = zbc_indices.min_color_index;
zbc->max_color_index = zbc_indices.max_color_index;
zbc->min_depth_index = zbc_indices.min_depth_index;
zbc->max_depth_index = zbc_indices.max_depth_index;
zbc->min_stencil_index = zbc_indices.min_stencil_index;
zbc->max_stencil_index = zbc_indices.max_stencil_index;
nvgpu_log(g, gpu_dbg_zbc, "zbc->min_color_index %u",
zbc->min_color_index);
nvgpu_log(g, gpu_dbg_zbc, "zbc->max_color_index %u",
zbc->max_color_index);
nvgpu_log(g, gpu_dbg_zbc, "zbc->min_depth_index %u",
zbc->min_depth_index);
nvgpu_log(g, gpu_dbg_zbc, "zbc->max_depth_index %u",
zbc->max_depth_index);
nvgpu_log(g, gpu_dbg_zbc, "zbc->min_stencil_index %u",
zbc->min_stencil_index);
nvgpu_log(g, gpu_dbg_zbc, "zbc->max_stencil_index %u",
zbc->max_stencil_index);
}
static void nvgpu_gr_zbc_load_default_sw_table(struct gk20a *g,
struct nvgpu_gr_zbc *zbc)
{
nvgpu_mutex_init(&zbc->zbc_lock);
nvgpu_gr_zbc_load_default_sw_color_table(g, zbc);
nvgpu_gr_zbc_load_default_sw_depth_table(g, zbc);
if (nvgpu_is_enabled(g, NVGPU_SUPPORT_ZBC_STENCIL)) {
nvgpu_gr_zbc_load_default_sw_stencil_table(g, zbc);
}
}
static int gr_zbc_allocate_local_tbls(struct gk20a *g, struct nvgpu_gr_zbc *zbc)
{
u32 zbc_col_size = nvgpu_safe_add_u32(zbc->max_color_index,
zbc->min_color_index);
u32 zbc_dep_size = nvgpu_safe_add_u32(zbc->max_depth_index,
zbc->min_depth_index);
u32 zbc_s_size = nvgpu_safe_add_u32(zbc->max_stencil_index,
zbc->min_stencil_index);
zbc->zbc_col_tbl = nvgpu_kzalloc(g,
sizeof(struct zbc_color_table) * zbc_col_size);
if (zbc->zbc_col_tbl == NULL) {
goto alloc_col_tbl_err;
}
zbc->zbc_dep_tbl = nvgpu_kzalloc(g,
sizeof(struct zbc_depth_table) * zbc_dep_size);
if (zbc->zbc_dep_tbl == NULL) {
goto alloc_dep_tbl_err;
}
zbc->zbc_s_tbl = nvgpu_kzalloc(g,
sizeof(struct zbc_stencil_table) * zbc_s_size);
if (zbc->zbc_s_tbl == NULL) {
goto alloc_s_tbl_err;
}
return 0;
alloc_s_tbl_err:
nvgpu_kfree(g, zbc->zbc_dep_tbl);
alloc_dep_tbl_err:
nvgpu_kfree(g, zbc->zbc_col_tbl);
alloc_col_tbl_err:
return -ENOMEM;
}
/* allocate the struct and load the table */
int nvgpu_gr_zbc_init(struct gk20a *g, struct nvgpu_gr_zbc **zbc)
{
int ret = -ENOMEM;
struct nvgpu_gr_zbc *gr_zbc = NULL;
*zbc = NULL;
gr_zbc = nvgpu_kzalloc(g, sizeof(*gr_zbc));
if (gr_zbc == NULL) {
return ret;
}
nvgpu_gr_zbc_init_indices(g, gr_zbc);
ret = gr_zbc_allocate_local_tbls(g, gr_zbc);
if (ret != 0) {
goto alloc_err;
}
nvgpu_gr_zbc_load_default_sw_table(g, gr_zbc);
*zbc = gr_zbc;
return ret;
alloc_err:
nvgpu_kfree(g, gr_zbc);
return ret;
}
/* deallocate the memory for the struct */
void nvgpu_gr_zbc_deinit(struct gk20a *g, struct nvgpu_gr_zbc *zbc)
{
if (zbc == NULL) {
return;
}
nvgpu_kfree(g, zbc->zbc_col_tbl);
nvgpu_kfree(g, zbc->zbc_dep_tbl);
nvgpu_kfree(g, zbc->zbc_s_tbl);
nvgpu_kfree(g, zbc);
}
struct nvgpu_gr_zbc_entry *nvgpu_gr_zbc_entry_alloc(struct gk20a *g)
{
return nvgpu_kzalloc(g, sizeof(struct nvgpu_gr_zbc_entry));
}
void nvgpu_gr_zbc_entry_free(struct gk20a *g, struct nvgpu_gr_zbc_entry *entry)
{
nvgpu_kfree(g, entry);
}
u32 nvgpu_gr_zbc_get_entry_color_ds(struct nvgpu_gr_zbc_entry *entry,
int idx)
{
return entry->color_ds[idx];
}
void nvgpu_gr_zbc_set_entry_color_ds(struct nvgpu_gr_zbc_entry *entry,
int idx, u32 ds)
{
entry->color_ds[idx] = ds;
}
u32 nvgpu_gr_zbc_get_entry_color_l2(struct nvgpu_gr_zbc_entry *entry,
int idx)
{
return entry->color_l2[idx];
}
void nvgpu_gr_zbc_set_entry_color_l2(struct nvgpu_gr_zbc_entry *entry,
int idx, u32 l2)
{
entry->color_l2[idx] = l2;
}
u32 nvgpu_gr_zbc_get_entry_depth(struct nvgpu_gr_zbc_entry *entry)
{
return entry->depth;
}
void nvgpu_gr_zbc_set_entry_depth(struct nvgpu_gr_zbc_entry *entry,
u32 depth)
{
entry->depth = depth;
}
u32 nvgpu_gr_zbc_get_entry_stencil(struct nvgpu_gr_zbc_entry *entry)
{
return entry->stencil;
}
void nvgpu_gr_zbc_set_entry_stencil(struct nvgpu_gr_zbc_entry *entry,
u32 stencil)
{
entry->stencil = stencil;
}
u32 nvgpu_gr_zbc_get_entry_type(struct nvgpu_gr_zbc_entry *entry)
{
return entry->type;
}
void nvgpu_gr_zbc_set_entry_type(struct nvgpu_gr_zbc_entry *entry,
u32 type)
{
entry->type = type;
}
u32 nvgpu_gr_zbc_get_entry_format(struct nvgpu_gr_zbc_entry *entry)
{
return entry->format;
}
void nvgpu_gr_zbc_set_entry_format(struct nvgpu_gr_zbc_entry *entry,
u32 format)
{
entry->format = format;
}
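
Since the entry accessors above are the only way callers populate an nvgpu_gr_zbc_entry, a short sketch of the intended flow follows. It is illustrative only: the 0x28 format and the colour words are assumptions mirroring the default colour table, and example_add_solid_red is a hypothetical caller, not driver code.

/* Hypothetical caller: register a solid-red A8B8G8R8 clear value. */
static int example_add_solid_red(struct gk20a *g, struct nvgpu_gr_zbc *zbc)
{
	struct nvgpu_gr_zbc_entry *entry;
	u32 i;
	int err;

	entry = nvgpu_gr_zbc_entry_alloc(g);
	if (entry == NULL) {
		return -ENOMEM;
	}

	nvgpu_gr_zbc_set_entry_type(entry, NVGPU_GR_ZBC_TYPE_COLOR);
	nvgpu_gr_zbc_set_entry_format(entry, 0x28U); /* A8B8G8R8 (assumed) */
	for (i = 0U; i < NVGPU_GR_ZBC_COLOR_VALUE_SIZE; i++) {
		nvgpu_gr_zbc_set_entry_color_ds(entry, (int)i, 0U);
		nvgpu_gr_zbc_set_entry_color_l2(entry, (int)i, 0xff0000ffU);
	}
	nvgpu_gr_zbc_set_entry_color_ds(entry, 0, 0x3f800000U); /* R = 1.0f */
	nvgpu_gr_zbc_set_entry_color_ds(entry, 3, 0x3f800000U); /* A = 1.0f */

	/* Ref-counts an existing match or programs a new L2/GR entry. */
	err = nvgpu_gr_zbc_set_table(g, zbc, entry);

	nvgpu_gr_zbc_entry_free(g, entry);
	return err;
}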

View File

@@ -0,0 +1,89 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#ifndef NVGPU_GR_ZBC_PRIV_H
#define NVGPU_GR_ZBC_PRIV_H
#include <nvgpu/gr/zbc.h>
/* Opaque black (i.e. solid black, fmt 0x28 = A8B8G8R8) */
#define GR_ZBC_SOLID_BLACK_COLOR_FMT 0x28
/* Transparent black = (fmt 1 = zero) */
#define GR_ZBC_TRANSPARENT_BLACK_COLOR_FMT 0x1
/* Opaque white (i.e. solid white) = (fmt 2 = uniform 1) */
#define GR_ZBC_SOLID_WHITE_COLOR_FMT 0x2
/* z format with fp32 */
#define GR_ZBC_Z_FMT_VAL_FP32 0x1
#define GR_ZBC_STENCIL_CLEAR_FMT_INVAILD 0U
#define GR_ZBC_STENCIL_CLEAR_FMT_U8 1U
struct zbc_color_table {
u32 color_ds[NVGPU_GR_ZBC_COLOR_VALUE_SIZE];
u32 color_l2[NVGPU_GR_ZBC_COLOR_VALUE_SIZE];
u32 format;
u32 ref_cnt;
};
struct zbc_depth_table {
u32 depth;
u32 format;
u32 ref_cnt;
};
struct zbc_stencil_table {
u32 stencil;
u32 format;
u32 ref_cnt;
};
struct nvgpu_gr_zbc_entry {
u32 color_ds[NVGPU_GR_ZBC_COLOR_VALUE_SIZE];
u32 color_l2[NVGPU_GR_ZBC_COLOR_VALUE_SIZE];
u32 depth;
u32 stencil;
u32 type;
u32 format;
};
/*
* HW ZBC table valid entries start at index 1.
* Entry 0 is reserved to mean "no matching entry found, do not use ZBC"
*/
struct nvgpu_gr_zbc {
struct nvgpu_mutex zbc_lock; /* Lock to access zbc table */
struct zbc_color_table *zbc_col_tbl; /* SW zbc color table pointer */
struct zbc_depth_table *zbc_dep_tbl; /* SW zbc depth table pointer */
struct zbc_stencil_table *zbc_s_tbl; /* SW zbc stencil table pointer */
u32 min_color_index; /* Minimum valid color table index */
u32 min_depth_index; /* Minimum valid depth table index */
u32 min_stencil_index; /* Minimum valid stencil table index */
u32 max_color_index; /* Maximum valid color table index */
u32 max_depth_index; /* Maximum valid depth table index */
u32 max_stencil_index; /* Maximum valid stencil table index */
u32 max_used_color_index; /* Max used color table index */
u32 max_used_depth_index; /* Max used depth table index */
u32 max_used_stencil_index; /* Max used stencil table index */
};
#endif /* NVGPU_GR_ZBC_PRIV_H */
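
A small worked illustration of the index scheme noted above; the bounds are assumed values for the sketch, not read from any particular chip's HAL, and example_zbc_sw_color_slots is a hypothetical helper.

/*
 * Illustration only: with min_color_index = 1 and max_color_index = 15,
 * HW entries 1..15 are usable, index 0 is the reserved "no ZBC" slot,
 * and a SW copy of max + min = 16 slots can be indexed directly by the
 * HW index (this mirrors the sizing done in gr_zbc_allocate_local_tbls).
 */
static inline u32 example_zbc_sw_color_slots(const struct nvgpu_gr_zbc *zbc)
{
	return zbc->max_color_index + zbc->min_color_index;
}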

View File

@@ -0,0 +1,176 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <nvgpu/log.h>
#include <nvgpu/io.h>
#include <nvgpu/gk20a.h>
#include <nvgpu/gr/subctx.h>
#include <nvgpu/gr/ctx.h>
#include <nvgpu/gr/zcull.h>
#include <nvgpu/gr/config.h>
#include "zcull_priv.h"
int nvgpu_gr_zcull_init(struct gk20a *g, struct nvgpu_gr_zcull **gr_zcull,
u32 size, struct nvgpu_gr_config *config)
{
struct nvgpu_gr_zcull *zcull;
int err = 0;
nvgpu_log(g, gpu_dbg_gr, "size = %u", size);
zcull = nvgpu_kzalloc(g, sizeof(*zcull));
if (zcull == NULL) {
err = -ENOMEM;
goto exit;
}
zcull->g = g;
zcull->zcull_ctxsw_image_size = size;
zcull->aliquot_width = nvgpu_gr_config_get_tpc_count(config) * 16U;
zcull->aliquot_height = 16;
zcull->width_align_pixels =
nvgpu_gr_config_get_tpc_count(config) * 16U;
zcull->height_align_pixels = 32;
zcull->aliquot_size =
zcull->aliquot_width * zcull->aliquot_height;
/* assume no floor sweeping since we only have 1 tpc in 1 gpc */
zcull->pixel_squares_by_aliquots =
nvgpu_gr_config_get_zcb_count(config) * 16U * 16U *
nvgpu_gr_config_get_tpc_count(config) /
(nvgpu_gr_config_get_gpc_count(config) *
nvgpu_gr_config_get_gpc_tpc_count(config, 0U));
exit:
*gr_zcull = zcull;
return err;
}
void nvgpu_gr_zcull_deinit(struct gk20a *g, struct nvgpu_gr_zcull *gr_zcull)
{
if (gr_zcull == NULL) {
return;
}
nvgpu_kfree(g, gr_zcull);
}
u32 nvgpu_gr_get_ctxsw_zcull_size(struct gk20a *g,
struct nvgpu_gr_zcull *gr_zcull)
{
/* assuming zcull has already been initialized */
return gr_zcull->zcull_ctxsw_image_size;
}
int nvgpu_gr_zcull_init_hw(struct gk20a *g,
struct nvgpu_gr_zcull *gr_zcull,
struct nvgpu_gr_config *gr_config)
{
u32 *zcull_map_tiles, *zcull_bank_counters;
u32 map_counter;
u32 num_gpcs = nvgpu_get_litter_value(g, GPU_LIT_NUM_GPCS);
u32 num_tpc_per_gpc = nvgpu_get_litter_value(g,
GPU_LIT_NUM_TPC_PER_GPC);
u32 zcull_alloc_num = num_gpcs * num_tpc_per_gpc;
u32 map_tile_count;
int ret = 0;
nvgpu_log(g, gpu_dbg_gr, " ");
if (nvgpu_gr_config_get_map_tiles(gr_config) == NULL) {
return -1;
}
	if (zcull_alloc_num % 8U != 0U) {
		/* Total 8 fields per map reg i.e. tile_0 to tile_7 */
		zcull_alloc_num += 8U - (zcull_alloc_num % 8U);
	}
zcull_map_tiles = nvgpu_kzalloc(g, zcull_alloc_num * sizeof(u32));
if (zcull_map_tiles == NULL) {
		nvgpu_err(g,
			"failed to allocate zcull map tiles");
return -ENOMEM;
}
zcull_bank_counters = nvgpu_kzalloc(g, zcull_alloc_num * sizeof(u32));
if (zcull_bank_counters == NULL) {
nvgpu_err(g,
"failed to allocate zcull bank counters");
nvgpu_kfree(g, zcull_map_tiles);
return -ENOMEM;
}
for (map_counter = 0;
map_counter < nvgpu_gr_config_get_tpc_count(gr_config);
map_counter++) {
map_tile_count =
nvgpu_gr_config_get_map_tile_count(gr_config,
map_counter);
zcull_map_tiles[map_counter] =
zcull_bank_counters[map_tile_count];
zcull_bank_counters[map_tile_count]++;
}
if (g->ops.gr.zcull.program_zcull_mapping != NULL) {
g->ops.gr.zcull.program_zcull_mapping(g, zcull_alloc_num,
zcull_map_tiles);
}
nvgpu_kfree(g, zcull_map_tiles);
nvgpu_kfree(g, zcull_bank_counters);
if (g->ops.gr.zcull.init_zcull_hw != NULL) {
ret = g->ops.gr.zcull.init_zcull_hw(g, gr_zcull, gr_config);
if (ret != 0) {
nvgpu_err(g, "failed to init zcull hw. err:%d", ret);
return ret;
}
}
nvgpu_log(g, gpu_dbg_gr, "done");
return 0;
}
int nvgpu_gr_zcull_ctx_setup(struct gk20a *g, struct nvgpu_gr_subctx *subctx,
struct nvgpu_gr_ctx *gr_ctx)
{
int ret = 0;
if (subctx != NULL) {
ret = nvgpu_gr_ctx_zcull_setup(g, gr_ctx, false);
if (ret == 0) {
nvgpu_gr_subctx_zcull_setup(g, subctx, gr_ctx);
}
} else {
ret = nvgpu_gr_ctx_zcull_setup(g, gr_ctx, true);
}
return ret;
}
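
The 8-fields-per-map-register padding in nvgpu_gr_zcull_init_hw is easy to misread, so a minimal sketch of the intended round-up follows; example_zcull_padded_alloc is a hypothetical helper, not driver code.

/* Illustration: pad the tile count up to the 8 tile fields per map reg. */
static inline u32 example_zcull_padded_alloc(u32 num_gpcs, u32 num_tpc_per_gpc)
{
	u32 n = num_gpcs * num_tpc_per_gpc;

	if ((n % 8U) != 0U) {
		n += 8U - (n % 8U);
	}
	return n;
}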

View File

@@ -0,0 +1,45 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#ifndef NVGPU_GR_ZCULL_PRIV_H
#define NVGPU_GR_ZCULL_PRIV_H
#include <nvgpu/types.h>
struct gk20a;
struct nvgpu_gr_zcull {
struct gk20a *g;
u32 aliquot_width;
u32 aliquot_height;
u32 aliquot_size;
u32 total_aliquots;
u32 width_align_pixels;
u32 height_align_pixels;
u32 pixel_squares_by_aliquots;
u32 zcull_ctxsw_image_size;
};
#endif /* NVGPU_GR_ZCULL_PRIV_H */