mirror of
git://nv-tegra.nvidia.com/linux-nvgpu.git
synced 2025-12-23 01:50:07 +03:00
Basic units like fifo and rc had a dependency on gr_falcon. Avoid dependencies on gr_falcon from outside the gr unit by moving the following functions to gr:

int nvgpu_gr_falcon_disable_ctxsw(struct gk20a *g, struct nvgpu_gr_falcon *falcon); -> int nvgpu_gr_disable_ctxsw(struct gk20a *g);
int nvgpu_gr_falcon_enable_ctxsw(struct gk20a *g, struct nvgpu_gr_falcon *falcon); -> int nvgpu_gr_enable_ctxsw(struct gk20a *g);
int nvgpu_gr_falcon_halt_pipe(struct gk20a *g); -> int nvgpu_gr_halt_pipe(struct gk20a *g);

The corresponding HALs are moved accordingly and the code is updated to reflect this. Also moved the following data back from gr_falcon to gr:

struct nvgpu_mutex ctxsw_disable_mutex;
int ctxsw_disable_count;

JIRA NVGPU-3168

Change-Id: I2bdd4a646b6f87df4c835638fc83c061acf4051e
Signed-off-by: Seshendra Gadagottu <sgadagottu@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/2100009
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
2532 lines
71 KiB
C
/*
 * GK20A Graphics
 *
 * Copyright (c) 2011-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include <nvgpu/dma.h>
#include <nvgpu/kmem.h>
#include <nvgpu/gmmu.h>
#include <nvgpu/timers.h>
#include <nvgpu/nvgpu_common.h>
#include <nvgpu/log.h>
#include <nvgpu/bug.h>
#include <nvgpu/firmware.h>
#include <nvgpu/enabled.h>
#include <nvgpu/debug.h>
#include <nvgpu/barrier.h>
#include <nvgpu/mm.h>
#include <nvgpu/debugger.h>
#include <nvgpu/netlist.h>
#include <nvgpu/error_notifier.h>
#include <nvgpu/ecc.h>
#include <nvgpu/io.h>
#include <nvgpu/utils.h>
#include <nvgpu/fifo.h>
#include <nvgpu/gk20a.h>
#include <nvgpu/channel.h>
#include <nvgpu/string.h>
#include <nvgpu/regops.h>
#include <nvgpu/gr/global_ctx.h>
#include <nvgpu/gr/subctx.h>
#include <nvgpu/gr/ctx.h>
#include <nvgpu/gr/gr.h>
#include <nvgpu/gr/gr_intr.h>
#include <nvgpu/gr/gr_falcon.h>
#include <nvgpu/gr/obj_ctx.h>
#include <nvgpu/gr/config.h>
#include <nvgpu/gr/fecs_trace.h>
#include <nvgpu/gr/hwpm_map.h>
#include <nvgpu/engines.h>
#include <nvgpu/engine_status.h>
#include <nvgpu/nvgpu_err.h>
#include <nvgpu/power_features/cg.h>
#include <nvgpu/power_features/pg.h>
#include <nvgpu/preempt.h>

#include "gr_gk20a.h"
#include "gr_pri_gk20a.h"

#include "common/gr/gr_priv.h"

#include <nvgpu/hw/gk20a/hw_fifo_gk20a.h>
#include <nvgpu/hw/gk20a/hw_gr_gk20a.h>

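/*
 * Report an SM machine check exception to the error reporting module.
 * Collects the current context, channel/TSG ids and the warp ESR
 * status/PC into a gr_err_info record and forwards it through the
 * report_gr_err HAL, when one is registered.
 */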
static void nvgpu_report_gr_sm_exception(struct gk20a *g, u32 gpc, u32 tpc,
		u32 sm, u32 hww_warp_esr_status, u64 hww_warp_esr_pc)
{
	int ret;
	struct gr_sm_mcerr_info err_info;
	struct channel_gk20a *ch;
	struct gr_err_info info;
	u32 tsgid, chid, curr_ctx, inst = 0;

	if (g->ops.gr.err_ops.report_gr_err == NULL) {
		return;
	}

	tsgid = NVGPU_INVALID_TSG_ID;
	curr_ctx = g->ops.gr.falcon.get_current_ctx(g);
	ch = nvgpu_gr_intr_get_channel_from_ctx(g, curr_ctx, &tsgid);
	chid = ch != NULL ? ch->chid : FIFO_INVAL_CHANNEL_ID;
	if (ch != NULL) {
		gk20a_channel_put(ch);
	}

	(void) memset(&err_info, 0, sizeof(err_info));
	(void) memset(&info, 0, sizeof(info));
	err_info.curr_ctx = curr_ctx;
	err_info.chid = chid;
	err_info.tsgid = tsgid;
	err_info.hww_warp_esr_pc = hww_warp_esr_pc;
	err_info.hww_warp_esr_status = hww_warp_esr_status;
	err_info.gpc = gpc;
	err_info.tpc = tpc;
	err_info.sm = sm;
	info.sm_mcerr_info = &err_info;
	ret = g->ops.gr.err_ops.report_gr_err(g,
			NVGPU_ERR_MODULE_SM, inst, GPU_SM_MACHINE_CHECK_ERROR,
			&info);
	if (ret != 0) {
		nvgpu_err(g, "failed to report SM_EXCEPTION "
				"gpc=%u, tpc=%u, sm=%u, esr_status=%x",
				gpc, tpc, sm, hww_warp_esr_status);
	}
}

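/*
 * Report a FECS context-switch error (e.g. watchdog timeout, CRC
 * mismatch, fault during ctxsw) through the report_ctxsw_err HAL,
 * together with a snapshot of the FECS ctxsw status registers and
 * the mailbox value that triggered the report.
 */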
static void gr_report_ctxsw_error(struct gk20a *g, u32 err_type, u32 chid,
		u32 mailbox_value)
{
	int ret = 0;
	struct ctxsw_err_info err_info;

	err_info.curr_ctx = g->ops.gr.falcon.get_current_ctx(g);
	err_info.ctxsw_status0 = g->ops.gr.falcon.read_fecs_ctxsw_status0(g);
	err_info.ctxsw_status1 = g->ops.gr.falcon.read_fecs_ctxsw_status1(g);
	err_info.mailbox_value = mailbox_value;
	err_info.chid = chid;

	if (g->ops.gr.err_ops.report_ctxsw_err != NULL) {
		ret = g->ops.gr.err_ops.report_ctxsw_err(g,
				NVGPU_ERR_MODULE_FECS,
				err_type, (void *)&err_info);
		if (ret != 0) {
			nvgpu_err(g, "Failed to report FECS CTXSW error: %d",
					err_type);
		}
	}
}

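/*
 * Enable or disable SMPC context switching for the TSG that owns
 * channel 'c'. The channel/TSG is disabled and preempted before the
 * gr context is updated, and re-enabled on exit.
 */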
int gr_gk20a_update_smpc_ctxsw_mode(struct gk20a *g,
				    struct channel_gk20a *c,
				    bool enable_smpc_ctxsw)
{
	struct tsg_gk20a *tsg;
	int ret;

	nvgpu_log_fn(g, " ");

	tsg = tsg_gk20a_from_ch(c);
	if (tsg == NULL) {
		return -EINVAL;
	}

	ret = gk20a_disable_channel_tsg(g, c);
	if (ret != 0) {
		nvgpu_err(g, "failed to disable channel/TSG");
		goto out;
	}
	ret = nvgpu_preempt_channel(g, c);
	if (ret != 0) {
		gk20a_enable_channel_tsg(g, c);
		nvgpu_err(g, "failed to preempt channel/TSG");
		goto out;
	}

	ret = nvgpu_gr_ctx_set_smpc_mode(g, tsg->gr_ctx, enable_smpc_ctxsw);

out:
	gk20a_enable_channel_tsg(g, c);
	return ret;
}

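/*
 * Set the HWPM context-switch mode for the TSG that owns channel 'c'.
 * For any mode other than NO_CTXSW this allocates the PM context
 * buffer on first use; the channel/TSG is then disabled and preempted
 * before the mode is committed to the gr context (and to every
 * channel subcontext when subcontexts are in use).
 */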
int gr_gk20a_update_hwpm_ctxsw_mode(struct gk20a *g,
				    struct channel_gk20a *c,
				    u64 gpu_va,
				    u32 mode)
{
	struct tsg_gk20a *tsg;
	struct nvgpu_gr_ctx *gr_ctx;
	bool skip_update = false;
	int ret;

	nvgpu_log_fn(g, " ");

	tsg = tsg_gk20a_from_ch(c);
	if (tsg == NULL) {
		return -EINVAL;
	}

	gr_ctx = tsg->gr_ctx;

	if (mode != NVGPU_GR_CTX_HWPM_CTXSW_MODE_NO_CTXSW) {
		nvgpu_gr_ctx_set_size(g->gr->gr_ctx_desc,
			NVGPU_GR_CTX_PM_CTX,
			nvgpu_gr_hwpm_map_get_size(g->gr->hwpm_map));

		ret = nvgpu_gr_ctx_alloc_pm_ctx(g, gr_ctx,
			g->gr->gr_ctx_desc, c->vm,
			gpu_va);
		if (ret != 0) {
			nvgpu_err(g,
				"failed to allocate pm ctxt buffer");
			return ret;
		}

		if ((mode == NVGPU_GR_CTX_HWPM_CTXSW_MODE_STREAM_OUT_CTXSW) &&
		    (g->ops.gr.init_hwpm_pmm_register != NULL)) {
			g->ops.gr.init_hwpm_pmm_register(g);
		}
	}

	ret = nvgpu_gr_ctx_prepare_hwpm_mode(g, gr_ctx, mode, &skip_update);
	if (ret != 0) {
		return ret;
	}
	if (skip_update) {
		return 0;
	}

	ret = gk20a_disable_channel_tsg(g, c);
	if (ret != 0) {
		nvgpu_err(g, "failed to disable channel/TSG");
		return ret;
	}

	ret = nvgpu_preempt_channel(g, c);
	if (ret != 0) {
		gk20a_enable_channel_tsg(g, c);
		nvgpu_err(g, "failed to preempt channel/TSG");
		return ret;
	}

	if (c->subctx != NULL) {
		struct channel_gk20a *ch;

		nvgpu_rwsem_down_read(&tsg->ch_list_lock);
		nvgpu_list_for_each_entry(ch, &tsg->ch_list, channel_gk20a, ch_entry) {
			ret = nvgpu_gr_ctx_set_hwpm_mode(g, gr_ctx, false);
			if (ret == 0) {
				nvgpu_gr_subctx_set_hwpm_mode(g, ch->subctx,
					gr_ctx);
			}
		}
		nvgpu_rwsem_up_read(&tsg->ch_list_lock);
	} else {
		ret = nvgpu_gr_ctx_set_hwpm_mode(g, gr_ctx, true);
	}

	/* enable channel */
	gk20a_enable_channel_tsg(g, c);

	return ret;
}

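/*
 * Handle a pending FECS host interrupt: decode the interrupt status
 * (unimplemented firmware method, watchdog, ctxsw_intr0, fault during
 * ctxsw), report the error, and clear the handled bits. Returns
 * non-zero for errors treated as fatal.
 */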
int gk20a_gr_handle_fecs_error(struct gk20a *g, struct channel_gk20a *ch,
		struct nvgpu_gr_isr_data *isr_data)
{
	u32 gr_fecs_intr, mailbox_value;
	int ret = 0;
	struct nvgpu_fecs_host_intr_status fecs_host_intr;
	u32 chid = isr_data->ch != NULL ?
		isr_data->ch->chid : FIFO_INVAL_CHANNEL_ID;
	u32 mailbox_id = NVGPU_GR_FALCON_FECS_CTXSW_MAILBOX6;

	gr_fecs_intr = g->ops.gr.falcon.fecs_host_intr_status(g,
						&fecs_host_intr);
	if (gr_fecs_intr == 0U) {
		return 0;
	}

	if (fecs_host_intr.unimp_fw_method_active) {
		mailbox_value = g->ops.gr.falcon.read_fecs_ctxsw_mailbox(g,
								mailbox_id);
		nvgpu_gr_intr_set_error_notifier(g, isr_data,
			NVGPU_ERR_NOTIFIER_FECS_ERR_UNIMP_FIRMWARE_METHOD);
		nvgpu_err(g,
			"firmware method error 0x%08x for offset 0x%04x",
			mailbox_value,
			isr_data->data_lo);
		ret = -1;
	} else if (fecs_host_intr.watchdog_active) {
		gr_report_ctxsw_error(g, GPU_FECS_CTXSW_WATCHDOG_TIMEOUT,
				chid, 0);
		/* currently, recovery is not initiated */
		nvgpu_err(g, "fecs watchdog triggered for channel %u, "
				"cannot ctxsw anymore !!", chid);
		g->ops.gr.falcon.dump_stats(g);
	} else if (fecs_host_intr.ctxsw_intr0 != 0U) {
		mailbox_value = g->ops.gr.falcon.read_fecs_ctxsw_mailbox(g,
								mailbox_id);
#ifdef CONFIG_GK20A_CTXSW_TRACE
		if (mailbox_value ==
			g->ops.gr.fecs_trace.get_buffer_full_mailbox_val()) {
			nvgpu_info(g, "ctxsw intr0 set by ucode, "
					"timestamp buffer full");
			nvgpu_gr_fecs_trace_reset_buffer(g);
		} else
#endif
		/*
		 * The mailbox values may vary across chips hence keeping it
		 * as a HAL.
		 */
		if ((g->ops.gr.get_ctxsw_checksum_mismatch_mailbox_val != NULL)
			&& (mailbox_value ==
			g->ops.gr.get_ctxsw_checksum_mismatch_mailbox_val())) {

			gr_report_ctxsw_error(g, GPU_FECS_CTXSW_CRC_MISMATCH,
					chid, mailbox_value);
			nvgpu_err(g, "ctxsw intr0 set by ucode, "
					"ctxsw checksum mismatch");
			ret = -1;
		} else {
			/*
			 * Other errors are also treated as fatal and channel
			 * recovery is initiated and error is reported to
			 * 3LSS.
			 */
			gr_report_ctxsw_error(g, GPU_FECS_FAULT_DURING_CTXSW,
					chid, mailbox_value);
			nvgpu_err(g,
				"ctxsw intr0 set by ucode, error_code: 0x%08x",
				mailbox_value);
			ret = -1;
		}
	} else if (fecs_host_intr.fault_during_ctxsw_active) {
		gr_report_ctxsw_error(g, GPU_FECS_FAULT_DURING_CTXSW,
				chid, 0);
		nvgpu_err(g, "fecs fault during ctxsw for channel %u", chid);
		ret = -1;
	} else {
		nvgpu_err(g,
			"unhandled fecs error interrupt 0x%08x for channel %u",
			gr_fecs_intr, chid);
		g->ops.gr.falcon.dump_stats(g);
	}

	g->ops.gr.falcon.fecs_host_clear_intr(g, gr_fecs_intr);

	return ret;
}

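/*
 * Assert the stop trigger for the given SM and wait for it to lock
 * down, honoring the caller-supplied global ESR mask.
 */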
int gk20a_gr_lock_down_sm(struct gk20a *g,
			  u32 gpc, u32 tpc, u32 sm, u32 global_esr_mask,
			  bool check_errors)
{
	u32 offset = nvgpu_gr_gpc_offset(g, gpc) + nvgpu_gr_tpc_offset(g, tpc);
	u32 dbgr_control0;

	nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
			"GPC%d TPC%d SM%d: assert stop trigger", gpc, tpc, sm);

	/* assert stop trigger */
	dbgr_control0 =
		gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_control0_r() + offset);
	dbgr_control0 |= gr_gpc0_tpc0_sm_dbgr_control0_stop_trigger_enable_f();
	gk20a_writel(g,
		gr_gpc0_tpc0_sm_dbgr_control0_r() + offset, dbgr_control0);

	return g->ops.gr.wait_for_sm_lock_down(g, gpc, tpc, sm, global_esr_mask,
			check_errors);
}

bool gk20a_gr_sm_debugger_attached(struct gk20a *g)
{
	u32 dbgr_control0 = gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_control0_r());

	/* check if an sm debugger is attached.
	 * assumption: all SMs will have debug mode enabled/disabled
	 * uniformly. */
	if (gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_v(dbgr_control0) ==
			gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_on_v()) {
		return true;
	}

	return false;
}

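/*
 * Service an SM exception: read the global/warp ESR state, report any
 * fatal warp error, optionally stop forwarding further TPC exceptions,
 * and lock the SM down for the debugger when a warp sync is required.
 * '*post_event' is set when a debugger event should be posted.
 */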
int gr_gk20a_handle_sm_exception(struct gk20a *g, u32 gpc, u32 tpc, u32 sm,
		bool *post_event, struct channel_gk20a *fault_ch,
		u32 *hww_global_esr)
{
	int ret = 0;
	bool do_warp_sync = false, early_exit = false, ignore_debugger = false;
	bool disable_sm_exceptions = true;
	u32 offset = nvgpu_gr_gpc_offset(g, gpc) + nvgpu_gr_tpc_offset(g, tpc);
	bool sm_debugger_attached;
	u32 global_esr, warp_esr, global_mask;
	u64 hww_warp_esr_pc = 0;

	nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, " ");

	sm_debugger_attached = g->ops.gr.sm_debugger_attached(g);

	global_esr = g->ops.gr.get_sm_hww_global_esr(g, gpc, tpc, sm);
	*hww_global_esr = global_esr;
	warp_esr = g->ops.gr.get_sm_hww_warp_esr(g, gpc, tpc, sm);
	global_mask = g->ops.gr.get_sm_no_lock_down_hww_global_esr_mask(g);

	if (!sm_debugger_attached) {
		nvgpu_err(g, "sm hww global 0x%08x warp 0x%08x",
			  global_esr, warp_esr);
		return -EFAULT;
	}

	nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
		  "sm hww global 0x%08x warp 0x%08x", global_esr, warp_esr);

	/*
	 * Check and report any fatal warp errors.
	 */
	if ((global_esr & ~global_mask) != 0U) {
		if (g->ops.gr.get_sm_hww_warp_esr_pc != NULL) {
			hww_warp_esr_pc = g->ops.gr.get_sm_hww_warp_esr_pc(g,
					offset);
		}
		nvgpu_report_gr_sm_exception(g, gpc, tpc, sm, warp_esr,
				hww_warp_esr_pc);
	}
	nvgpu_pg_elpg_protected_call(g,
		g->ops.gr.record_sm_error_state(g, gpc, tpc, sm, fault_ch));

	if (g->ops.gr.pre_process_sm_exception != NULL) {
		ret = g->ops.gr.pre_process_sm_exception(g, gpc, tpc, sm,
				global_esr, warp_esr,
				sm_debugger_attached,
				fault_ch,
				&early_exit,
				&ignore_debugger);
		if (ret != 0) {
			nvgpu_err(g, "could not pre-process sm error!");
			return ret;
		}
	}

	if (early_exit) {
		nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
				"returning early");
		return ret;
	}

	/*
	 * Disable forwarding of tpc exceptions,
	 * the debugger will reenable exceptions after servicing them.
	 *
	 * Do not disable exceptions if the only SM exception is BPT_INT
	 */
	if ((g->ops.gr.esr_bpt_pending_events(global_esr,
			NVGPU_EVENT_ID_BPT_INT)) && (warp_esr == 0U)) {
		disable_sm_exceptions = false;
	}

	if (!ignore_debugger && disable_sm_exceptions) {
		g->ops.gr.intr.tpc_exception_sm_disable(g, offset);
		nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
			  "SM Exceptions disabled");
	}

	/* if a debugger is present and an error has occurred, do a warp sync */
	if (!ignore_debugger &&
	    ((warp_esr != 0U) || ((global_esr & ~global_mask) != 0U))) {
		nvgpu_log(g, gpu_dbg_intr, "warp sync needed");
		do_warp_sync = true;
	}

	if (do_warp_sync) {
		ret = g->ops.gr.lock_down_sm(g, gpc, tpc, sm,
				global_mask, true);
		if (ret != 0) {
			nvgpu_err(g, "sm did not lock down!");
			return ret;
		}
	}

	if (ignore_debugger) {
		nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
			"ignore_debugger set, skipping event posting");
	} else {
		*post_event = true;
	}

	return ret;
}

void gk20a_gr_get_esr_sm_sel(struct gk20a *g, u32 gpc, u32 tpc,
				u32 *esr_sm_sel)
{
	*esr_sm_sel = 1;
}

static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g,
					       u32 addr,
					       bool is_quad, u32 quad,
					       u32 *context_buffer,
					       u32 context_buffer_size,
					       u32 *priv_offset);

/* This function will decode a priv address and return the partition type and numbers. */
int gr_gk20a_decode_priv_addr(struct gk20a *g, u32 addr,
			      enum ctxsw_addr_type *addr_type,
			      u32 *gpc_num, u32 *tpc_num, u32 *ppc_num, u32 *be_num,
			      u32 *broadcast_flags)
{
	u32 gpc_addr;

	nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);

	/* setup defaults */
	*addr_type = CTXSW_ADDR_TYPE_SYS;
	*broadcast_flags = PRI_BROADCAST_FLAGS_NONE;
	*gpc_num = 0;
	*tpc_num = 0;
	*ppc_num = 0;
	*be_num = 0;

	if (pri_is_gpc_addr(g, addr)) {
		*addr_type = CTXSW_ADDR_TYPE_GPC;
		gpc_addr = pri_gpccs_addr_mask(addr);
		if (pri_is_gpc_addr_shared(g, addr)) {
			*addr_type = CTXSW_ADDR_TYPE_GPC;
			*broadcast_flags |= PRI_BROADCAST_FLAGS_GPC;
		} else {
			*gpc_num = pri_get_gpc_num(g, addr);
		}

		if (pri_is_ppc_addr(g, gpc_addr)) {
			*addr_type = CTXSW_ADDR_TYPE_PPC;
			if (pri_is_ppc_addr_shared(g, gpc_addr)) {
				*broadcast_flags |= PRI_BROADCAST_FLAGS_PPC;
				return 0;
			}
		}
		if (g->ops.gr.is_tpc_addr(g, gpc_addr)) {
			*addr_type = CTXSW_ADDR_TYPE_TPC;
			if (pri_is_tpc_addr_shared(g, gpc_addr)) {
				*broadcast_flags |= PRI_BROADCAST_FLAGS_TPC;
				return 0;
			}
			*tpc_num = g->ops.gr.get_tpc_num(g, gpc_addr);
		}
		return 0;
	} else if (pri_is_be_addr(g, addr)) {
		*addr_type = CTXSW_ADDR_TYPE_BE;
		if (pri_is_be_addr_shared(g, addr)) {
			*broadcast_flags |= PRI_BROADCAST_FLAGS_BE;
			return 0;
		}
		*be_num = pri_get_be_num(g, addr);
		return 0;
	} else if (g->ops.ltc.pri_is_ltc_addr(g, addr)) {
		*addr_type = CTXSW_ADDR_TYPE_LTCS;
		if (g->ops.ltc.is_ltcs_ltss_addr(g, addr)) {
			*broadcast_flags |= PRI_BROADCAST_FLAGS_LTCS;
		} else if (g->ops.ltc.is_ltcn_ltss_addr(g, addr)) {
			*broadcast_flags |= PRI_BROADCAST_FLAGS_LTSS;
		}
		return 0;
	} else if (pri_is_fbpa_addr(g, addr)) {
		*addr_type = CTXSW_ADDR_TYPE_FBPA;
		if (pri_is_fbpa_addr_shared(g, addr)) {
			*broadcast_flags |= PRI_BROADCAST_FLAGS_FBPA;
			return 0;
		}
		return 0;
	} else if ((g->ops.gr.is_egpc_addr != NULL) &&
			g->ops.gr.is_egpc_addr(g, addr)) {
		return g->ops.gr.decode_egpc_addr(g,
				addr, addr_type, gpc_num,
				tpc_num, broadcast_flags);
	} else {
		*addr_type = CTXSW_ADDR_TYPE_SYS;
		return 0;
	}
	/* PPC!?!?!?! */

	/*NOTREACHED*/
	return -EINVAL;
}

void gr_gk20a_split_fbpa_broadcast_addr(struct gk20a *g, u32 addr,
	u32 num_fbpas,
	u32 *priv_addr_table, u32 *t)
{
	u32 fbpa_id;

	for (fbpa_id = 0; fbpa_id < num_fbpas; fbpa_id++) {
		priv_addr_table[(*t)++] = pri_fbpa_addr(g,
				pri_fbpa_addr_mask(g, addr), fbpa_id);
	}
}

int gr_gk20a_split_ppc_broadcast_addr(struct gk20a *g, u32 addr,
				      u32 gpc_num,
				      u32 *priv_addr_table, u32 *t)
{
	u32 ppc_num;

	nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);

	for (ppc_num = 0;
	     ppc_num < nvgpu_gr_config_get_gpc_ppc_count(g->gr->config, gpc_num);
	     ppc_num++) {
		priv_addr_table[(*t)++] = pri_ppc_addr(g, pri_ppccs_addr_mask(addr),
				gpc_num, ppc_num);
	}

	return 0;
}

/*
 * The context buffer is indexed using BE broadcast addresses and GPC/TPC
 * unicast addresses. This function will convert a BE unicast address to a BE
 * broadcast address and split a GPC/TPC broadcast address into a table of
 * GPC/TPC addresses.  The addresses generated by this function can be
 * successfully processed by gr_gk20a_find_priv_offset_in_buffer
 */
int gr_gk20a_create_priv_addr_table(struct gk20a *g,
				    u32 addr,
				    u32 *priv_addr_table,
				    u32 *num_registers)
{
	enum ctxsw_addr_type addr_type;
	u32 gpc_num, tpc_num, ppc_num, be_num;
	u32 priv_addr, gpc_addr;
	u32 broadcast_flags;
	u32 t;
	int err;
	struct nvgpu_gr_config *gr_config = g->gr->config;

	t = 0;
	*num_registers = 0;

	nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);

	err = g->ops.gr.decode_priv_addr(g, addr, &addr_type,
			&gpc_num, &tpc_num, &ppc_num, &be_num,
			&broadcast_flags);
	nvgpu_log(g, gpu_dbg_gpu_dbg, "addr_type = %d", addr_type);
	if (err != 0) {
		return err;
	}

	if ((addr_type == CTXSW_ADDR_TYPE_SYS) ||
	    (addr_type == CTXSW_ADDR_TYPE_BE)) {
		/* The BE broadcast registers are included in the compressed PRI
		 * table. Convert a BE unicast address to a broadcast address
		 * so that we can look up the offset. */
		if ((addr_type == CTXSW_ADDR_TYPE_BE) &&
		    ((broadcast_flags & PRI_BROADCAST_FLAGS_BE) == 0U)) {
			priv_addr_table[t++] = pri_be_shared_addr(g, addr);
		} else {
			priv_addr_table[t++] = addr;
		}

		*num_registers = t;
		return 0;
	}

	/* The GPC/TPC unicast registers are included in the compressed PRI
	 * tables. Convert a GPC/TPC broadcast address to unicast addresses so
	 * that we can look up the offsets. */
	if ((broadcast_flags & PRI_BROADCAST_FLAGS_GPC) != 0U) {
		for (gpc_num = 0;
		     gpc_num < nvgpu_gr_config_get_gpc_count(gr_config);
		     gpc_num++) {

			if ((broadcast_flags & PRI_BROADCAST_FLAGS_TPC) != 0U) {
				for (tpc_num = 0;
				     tpc_num < nvgpu_gr_config_get_gpc_tpc_count(gr_config, gpc_num);
				     tpc_num++) {
					priv_addr_table[t++] =
						pri_tpc_addr(g, pri_tpccs_addr_mask(addr),
							gpc_num, tpc_num);
				}

			} else if ((broadcast_flags & PRI_BROADCAST_FLAGS_PPC) != 0U) {
				err = gr_gk20a_split_ppc_broadcast_addr(g, addr, gpc_num,
						priv_addr_table, &t);
				if (err != 0) {
					return err;
				}
			} else {
				priv_addr = pri_gpc_addr(g,
						pri_gpccs_addr_mask(addr),
						gpc_num);

				gpc_addr = pri_gpccs_addr_mask(priv_addr);
				tpc_num = g->ops.gr.get_tpc_num(g, gpc_addr);
				if (tpc_num >= nvgpu_gr_config_get_gpc_tpc_count(gr_config, gpc_num)) {
					continue;
				}

				priv_addr_table[t++] = priv_addr;
			}
		}
	} else if (((addr_type == CTXSW_ADDR_TYPE_EGPC) ||
		    (addr_type == CTXSW_ADDR_TYPE_ETPC)) &&
		   (g->ops.gr.egpc_etpc_priv_addr_table != NULL)) {
		nvgpu_log(g, gpu_dbg_gpu_dbg, "addr_type : EGPC/ETPC");
		g->ops.gr.egpc_etpc_priv_addr_table(g, addr, gpc_num, tpc_num,
				broadcast_flags, priv_addr_table, &t);
	} else if ((broadcast_flags & PRI_BROADCAST_FLAGS_LTSS) != 0U) {
		g->ops.ltc.split_lts_broadcast_addr(g, addr,
				priv_addr_table, &t);
	} else if ((broadcast_flags & PRI_BROADCAST_FLAGS_LTCS) != 0U) {
		g->ops.ltc.split_ltc_broadcast_addr(g, addr,
				priv_addr_table, &t);
	} else if ((broadcast_flags & PRI_BROADCAST_FLAGS_FBPA) != 0U) {
		g->ops.gr.split_fbpa_broadcast_addr(g, addr,
				nvgpu_get_litter_value(g, GPU_LIT_NUM_FBPAS),
				priv_addr_table, &t);
	} else if ((broadcast_flags & PRI_BROADCAST_FLAGS_GPC) == 0U) {
		if ((broadcast_flags & PRI_BROADCAST_FLAGS_TPC) != 0U) {
			for (tpc_num = 0;
			     tpc_num < nvgpu_gr_config_get_gpc_tpc_count(gr_config, gpc_num);
			     tpc_num++) {
				priv_addr_table[t++] =
					pri_tpc_addr(g, pri_tpccs_addr_mask(addr),
						gpc_num, tpc_num);
			}
		} else if ((broadcast_flags & PRI_BROADCAST_FLAGS_PPC) != 0U) {
			err = gr_gk20a_split_ppc_broadcast_addr(g,
					addr, gpc_num, priv_addr_table, &t);
		} else {
			priv_addr_table[t++] = addr;
		}
	}

	*num_registers = t;
	return 0;
}

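/*
 * Translate a priv register address into byte offsets into the golden
 * context image. Broadcast addresses are first expanded into a table
 * of unicast addresses, then each entry is located in the buffer.
 */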
int gr_gk20a_get_ctx_buffer_offsets(struct gk20a *g,
				    u32 addr,
				    u32 max_offsets,
				    u32 *offsets, u32 *offset_addrs,
				    u32 *num_offsets,
				    bool is_quad, u32 quad)
{
	u32 i;
	u32 priv_offset = 0;
	u32 *priv_registers;
	u32 num_registers = 0;
	int err = 0;
	struct nvgpu_gr *gr = g->gr;
	u32 sm_per_tpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_SM_PER_TPC);
	u32 potential_offsets = nvgpu_gr_config_get_max_gpc_count(gr->config) *
		nvgpu_gr_config_get_max_tpc_per_gpc_count(gr->config) *
		sm_per_tpc;

	nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);

	/* implementation is crossed-up if either of these happen */
	if (max_offsets > potential_offsets) {
		nvgpu_log_fn(g, "max_offsets > potential_offsets");
		return -EINVAL;
	}

	if (!g->gr->ctx_vars.golden_image_initialized) {
		return -ENODEV;
	}

	priv_registers = nvgpu_kzalloc(g, sizeof(u32) * potential_offsets);
	if (priv_registers == NULL) {
		nvgpu_log_fn(g, "failed alloc for potential_offsets=%d", potential_offsets);
		err = -ENOMEM;
		goto cleanup;
	}
	(void) memset(offsets, 0, sizeof(u32) * max_offsets);
	(void) memset(offset_addrs, 0, sizeof(u32) * max_offsets);
	*num_offsets = 0;

	g->ops.gr.create_priv_addr_table(g, addr, &priv_registers[0],
			&num_registers);

	if ((max_offsets > 1U) && (num_registers > max_offsets)) {
		nvgpu_log_fn(g, "max_offsets = %d, num_registers = %d",
				max_offsets, num_registers);
		err = -EINVAL;
		goto cleanup;
	}

	if ((max_offsets == 1U) && (num_registers > 1U)) {
		num_registers = 1;
	}

	if (!g->gr->ctx_vars.golden_image_initialized) {
		nvgpu_log_fn(g, "no context switch header info to work with");
		err = -EINVAL;
		goto cleanup;
	}

	for (i = 0; i < num_registers; i++) {
		err = gr_gk20a_find_priv_offset_in_buffer(g,
				priv_registers[i],
				is_quad, quad,
				nvgpu_gr_obj_ctx_get_local_golden_image_ptr(
					g->gr->golden_image),
				nvgpu_gr_obj_ctx_get_golden_image_size(
					g->gr->golden_image),
				&priv_offset);
		if (err != 0) {
			nvgpu_log_fn(g, "Could not determine priv_offset for addr:0x%x",
					addr); /*, grPriRegStr(addr)));*/
			goto cleanup;
		}

		offsets[i] = priv_offset;
		offset_addrs[i] = priv_registers[i];
	}

	*num_offsets = num_registers;
cleanup:
	if (!IS_ERR_OR_NULL(priv_registers)) {
		nvgpu_kfree(g, priv_registers);
	}

	return err;
}

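/*
 * Same lookup as gr_gk20a_get_ctx_buffer_offsets(), except that the
 * offsets are resolved against the PM (HWPM) context buffer via the
 * hwpm map rather than the golden context image.
 */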
int gr_gk20a_get_pm_ctx_buffer_offsets(struct gk20a *g,
				       u32 addr,
				       u32 max_offsets,
				       u32 *offsets, u32 *offset_addrs,
				       u32 *num_offsets)
{
	u32 i;
	u32 priv_offset = 0;
	u32 *priv_registers;
	u32 num_registers = 0;
	int err = 0;
	struct nvgpu_gr *gr = g->gr;
	u32 sm_per_tpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_SM_PER_TPC);
	u32 potential_offsets = nvgpu_gr_config_get_max_gpc_count(gr->config) *
		nvgpu_gr_config_get_max_tpc_per_gpc_count(gr->config) *
		sm_per_tpc;

	nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);

	/* implementation is crossed-up if either of these happen */
	if (max_offsets > potential_offsets) {
		return -EINVAL;
	}

	if (!g->gr->ctx_vars.golden_image_initialized) {
		return -ENODEV;
	}

	priv_registers = nvgpu_kzalloc(g, sizeof(u32) * potential_offsets);
	if (priv_registers == NULL) {
		nvgpu_log_fn(g, "failed alloc for potential_offsets=%d", potential_offsets);
		return -ENOMEM;
	}
	(void) memset(offsets, 0, sizeof(u32) * max_offsets);
	(void) memset(offset_addrs, 0, sizeof(u32) * max_offsets);
	*num_offsets = 0;

	g->ops.gr.create_priv_addr_table(g, addr, priv_registers,
			&num_registers);

	if ((max_offsets > 1U) && (num_registers > max_offsets)) {
		err = -EINVAL;
		goto cleanup;
	}

	if ((max_offsets == 1U) && (num_registers > 1U)) {
		num_registers = 1;
	}

	if (!g->gr->ctx_vars.golden_image_initialized) {
		nvgpu_log_fn(g, "no context switch header info to work with");
		err = -EINVAL;
		goto cleanup;
	}

	for (i = 0; i < num_registers; i++) {
		err = nvgpu_gr_hwmp_map_find_priv_offset(g, g->gr->hwpm_map,
						priv_registers[i],
						&priv_offset, gr->config);
		if (err != 0) {
			nvgpu_log_fn(g, "Could not determine priv_offset for addr:0x%x",
					addr); /*, grPriRegStr(addr)));*/
			goto cleanup;
		}

		offsets[i] = priv_offset;
		offset_addrs[i] = priv_registers[i];
	}

	*num_offsets = num_registers;
cleanup:
	nvgpu_kfree(g, priv_registers);

	return err;
}

/* Setup some register tables.  This looks hacky; our
 * register/offset functions are just that, functions.
 * So they can't be used as initializers... TBD: fix to
 * generate consts at least on an as-needed basis.
 */
static const u32 _num_ovr_perf_regs = 17;
static u32 _ovr_perf_regs[17] = { 0, };
/* Following are the blocks of registers that the ucode
   stores in the extended region.*/

void gk20a_gr_init_ovr_sm_dsm_perf(void)
{
	if (_ovr_perf_regs[0] != 0U) {
		return;
	}

	_ovr_perf_regs[0]  = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control_sel0_r();
	_ovr_perf_regs[1]  = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control_sel1_r();
	_ovr_perf_regs[2]  = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control0_r();
	_ovr_perf_regs[3]  = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control5_r();
	_ovr_perf_regs[4]  = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_status1_r();
	_ovr_perf_regs[5]  = gr_pri_gpc0_tpc0_sm_dsm_perf_counter0_control_r();
	_ovr_perf_regs[6]  = gr_pri_gpc0_tpc0_sm_dsm_perf_counter1_control_r();
	_ovr_perf_regs[7]  = gr_pri_gpc0_tpc0_sm_dsm_perf_counter2_control_r();
	_ovr_perf_regs[8]  = gr_pri_gpc0_tpc0_sm_dsm_perf_counter3_control_r();
	_ovr_perf_regs[9]  = gr_pri_gpc0_tpc0_sm_dsm_perf_counter4_control_r();
	_ovr_perf_regs[10] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter5_control_r();
	_ovr_perf_regs[11] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter6_control_r();
	_ovr_perf_regs[12] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter7_control_r();
	_ovr_perf_regs[13] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter4_r();
	_ovr_perf_regs[14] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter5_r();
	_ovr_perf_regs[15] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter6_r();
	_ovr_perf_regs[16] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter7_r();
}

/* TBD: would like to handle this elsewhere, at a higher level.
 * these are currently constructed in a "test-then-write" style
 * which makes it impossible to know externally whether a ctx
 * write will actually occur. so later we should put a lazy,
 * map-and-hold system in the patch write state */
static int gr_gk20a_ctx_patch_smpc(struct gk20a *g,
			    struct channel_gk20a *ch,
			    u32 addr, u32 data,
			    struct nvgpu_gr_ctx *gr_ctx)
{
	u32 num_gpc = nvgpu_gr_config_get_gpc_count(g->gr->config);
	u32 num_tpc;
	u32 tpc, gpc, reg;
	u32 chk_addr;
	u32 num_ovr_perf_regs = 0;
	u32 *ovr_perf_regs = NULL;
	u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
	u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE);

	g->ops.gr.init_ovr_sm_dsm_perf();
	g->ops.gr.init_sm_dsm_reg_info();
	g->ops.gr.get_ovr_perf_regs(g, &num_ovr_perf_regs, &ovr_perf_regs);

	nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);

	for (reg = 0; reg < num_ovr_perf_regs; reg++) {
		for (gpc = 0; gpc < num_gpc; gpc++) {
			num_tpc = nvgpu_gr_config_get_gpc_tpc_count(g->gr->config, gpc);
			for (tpc = 0; tpc < num_tpc; tpc++) {
				chk_addr = ((gpc_stride * gpc) +
					    (tpc_in_gpc_stride * tpc) +
					    ovr_perf_regs[reg]);
				if (chk_addr != addr) {
					continue;
				}
				/* reset the patch count from previous
				   runs,if ucode has already processed
				   it */
				nvgpu_gr_ctx_reset_patch_count(g, gr_ctx);

				nvgpu_gr_ctx_patch_write(g, gr_ctx,
							 addr, data, true);

				if (ch->subctx != NULL) {
					nvgpu_gr_ctx_set_patch_ctx(g, gr_ctx,
							false);
					nvgpu_gr_subctx_set_patch_ctx(g,
							ch->subctx, gr_ctx);
				} else {
					nvgpu_gr_ctx_set_patch_ctx(g, gr_ctx,
							true);
				}

				/* we're not caching these on cpu side,
				   but later watch for it */
				return 0;
			}
		}
	}

	return 0;
}

#define ILLEGAL_ID	~U32(0U)

void gk20a_gr_get_ovr_perf_regs(struct gk20a *g, u32 *num_ovr_perf_regs,
				u32 **ovr_perf_regs)
{
	*num_ovr_perf_regs = _num_ovr_perf_regs;
	*ovr_perf_regs = _ovr_perf_regs;
}

static int gr_gk20a_find_priv_offset_in_ext_buffer(struct gk20a *g,
						   u32 addr,
						   bool is_quad, u32 quad,
						   u32 *context_buffer,
						   u32 context_buffer_size,
						   u32 *priv_offset)
{
	u32 i;
	u32 gpc_num, tpc_num;
	u32 num_gpcs;
	u32 chk_addr;
	u32 ext_priv_offset, ext_priv_size;
	u8 *context;
	u32 offset_to_segment, offset_to_segment_end;
	u32 sm_dsm_perf_reg_id = ILLEGAL_ID;
	u32 sm_dsm_perf_ctrl_reg_id = ILLEGAL_ID;
	u32 num_ext_gpccs_ext_buffer_segments;
	u32 inter_seg_offset;
	u32 max_tpc_count;
	u32 *sm_dsm_perf_ctrl_regs = NULL;
	u32 num_sm_dsm_perf_ctrl_regs = 0;
	u32 *sm_dsm_perf_regs = NULL;
	u32 num_sm_dsm_perf_regs = 0;
	u32 buffer_segments_size = 0;
	u32 marker_size = 0;
	u32 control_register_stride = 0;
	u32 perf_register_stride = 0;
	struct nvgpu_gr *gr = g->gr;
	u32 gpc_base = nvgpu_get_litter_value(g, GPU_LIT_GPC_BASE);
	u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
	u32 tpc_in_gpc_base = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_BASE);
	u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE);
	u32 tpc_gpc_mask = (tpc_in_gpc_stride - 1U);

	/* Only have TPC registers in extended region, so if not a TPC reg,
	   then return error so caller can look elsewhere. */
	if (pri_is_gpc_addr(g, addr)) {
		u32 gpc_addr = 0;
		gpc_num = pri_get_gpc_num(g, addr);
		gpc_addr = pri_gpccs_addr_mask(addr);
		if (g->ops.gr.is_tpc_addr(g, gpc_addr)) {
			tpc_num = g->ops.gr.get_tpc_num(g, gpc_addr);
		} else {
			return -EINVAL;
		}

		nvgpu_log_info(g, " gpc = %d tpc = %d",
				gpc_num, tpc_num);
	} else if ((g->ops.gr.is_etpc_addr != NULL) &&
			g->ops.gr.is_etpc_addr(g, addr)) {
		g->ops.gr.get_egpc_etpc_num(g, addr, &gpc_num, &tpc_num);
		gpc_base = g->ops.gr.get_egpc_base(g);
	} else {
		nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg,
				"does not exist in extended region");
		return -EINVAL;
	}

	buffer_segments_size = g->ops.gr.ctxsw_prog.hw_get_extended_buffer_segments_size_in_bytes();
	/* note below is in words/num_registers */
	marker_size = g->ops.gr.ctxsw_prog.hw_extended_marker_size_in_bytes() >> 2;

	context = (u8 *)context_buffer;
	/* sanity check main header */
	if (!g->ops.gr.ctxsw_prog.check_main_image_header_magic(context)) {
		nvgpu_err(g,
			"Invalid main header: magic value");
		return -EINVAL;
	}
	num_gpcs = g->ops.gr.ctxsw_prog.get_num_gpcs(context);
	if (gpc_num >= num_gpcs) {
		nvgpu_err(g,
			"GPC 0x%08x is greater than total count 0x%08x!",
			gpc_num, num_gpcs);
		return -EINVAL;
	}

	g->ops.gr.ctxsw_prog.get_extended_buffer_size_offset(context,
		&ext_priv_size, &ext_priv_offset);
	if (0U == ext_priv_size) {
		nvgpu_log_info(g, " No extended memory in context buffer");
		return -EINVAL;
	}

	offset_to_segment = ext_priv_offset * 256U;
	offset_to_segment_end = offset_to_segment +
		(ext_priv_size * buffer_segments_size);

	/* check local header magic */
	context += g->ops.gr.ctxsw_prog.hw_get_fecs_header_size();
	if (!g->ops.gr.ctxsw_prog.check_local_header_magic(context)) {
		nvgpu_err(g,
			"Invalid local header: magic value");
		return -EINVAL;
	}

	/*
	 * See if the incoming register address is in the first table of
	 * registers. We check this by decoding only the TPC addr portion.
	 * If we get a hit on the TPC bit, we then double check the address
	 * by computing it from the base gpc/tpc strides. Then make sure
	 * it is a real match.
	 */
	g->ops.gr.get_sm_dsm_perf_regs(g, &num_sm_dsm_perf_regs,
				       &sm_dsm_perf_regs,
				       &perf_register_stride);

	g->ops.gr.init_sm_dsm_reg_info();

	for (i = 0; i < num_sm_dsm_perf_regs; i++) {
		if ((addr & tpc_gpc_mask) == (sm_dsm_perf_regs[i] & tpc_gpc_mask)) {
			sm_dsm_perf_reg_id = i;

			nvgpu_log_info(g, "register match: 0x%08x",
					sm_dsm_perf_regs[i]);

			chk_addr = (gpc_base + gpc_stride * gpc_num) +
				   tpc_in_gpc_base +
				   (tpc_in_gpc_stride * tpc_num) +
				   (sm_dsm_perf_regs[sm_dsm_perf_reg_id] & tpc_gpc_mask);

			if (chk_addr != addr) {
				nvgpu_err(g,
					"Oops addr mismatch! : 0x%08x != 0x%08x",
					addr, chk_addr);
				return -EINVAL;
			}
			break;
		}
	}

	/* Didn't find reg in supported group 1.
	 * so try the second group now */
	g->ops.gr.get_sm_dsm_perf_ctrl_regs(g, &num_sm_dsm_perf_ctrl_regs,
				       &sm_dsm_perf_ctrl_regs,
				       &control_register_stride);

	if (ILLEGAL_ID == sm_dsm_perf_reg_id) {
		for (i = 0; i < num_sm_dsm_perf_ctrl_regs; i++) {
			if ((addr & tpc_gpc_mask) ==
			    (sm_dsm_perf_ctrl_regs[i] & tpc_gpc_mask)) {
				sm_dsm_perf_ctrl_reg_id = i;

				nvgpu_log_info(g, "register match: 0x%08x",
						sm_dsm_perf_ctrl_regs[i]);

				chk_addr = (gpc_base + gpc_stride * gpc_num) +
					   tpc_in_gpc_base +
					   tpc_in_gpc_stride * tpc_num +
					   (sm_dsm_perf_ctrl_regs[sm_dsm_perf_ctrl_reg_id] &
					    tpc_gpc_mask);

				if (chk_addr != addr) {
					nvgpu_err(g,
						"Oops addr mismatch! : 0x%08x != 0x%08x",
						addr, chk_addr);
					return -EINVAL;
				}

				break;
			}
		}
	}

	if ((ILLEGAL_ID == sm_dsm_perf_ctrl_reg_id) &&
	    (ILLEGAL_ID == sm_dsm_perf_reg_id)) {
		return -EINVAL;
	}

	/* Skip the FECS extended header, nothing there for us now. */
	offset_to_segment += buffer_segments_size;

	/* skip through the GPCCS extended headers until we get to the data for
	 * our GPC. The size of each gpc extended segment is enough to hold the
	 * max tpc count for the gpcs,in 256b chunks.
	 */

	max_tpc_count = nvgpu_gr_config_get_max_tpc_per_gpc_count(gr->config);

	num_ext_gpccs_ext_buffer_segments = (u32)((max_tpc_count + 1U) / 2U);

	offset_to_segment += (num_ext_gpccs_ext_buffer_segments *
			      buffer_segments_size * gpc_num);

	/* skip the head marker to start with */
	inter_seg_offset = marker_size;

	if (ILLEGAL_ID != sm_dsm_perf_ctrl_reg_id) {
		/* skip over control regs of TPC's before the one we want.
		 * then skip to the register in this tpc */
		inter_seg_offset = inter_seg_offset +
			(tpc_num * control_register_stride) +
			sm_dsm_perf_ctrl_reg_id;
	} else {
		return -EINVAL;
	}

	/* set the offset to the segment offset plus the inter segment offset to
	 * our register */
	offset_to_segment += (inter_seg_offset * 4U);

	/* last sanity check: did we somehow compute an offset outside the
	 * extended buffer? */
	if (offset_to_segment > offset_to_segment_end) {
		nvgpu_err(g,
			"Overflow ctxsw buffer! 0x%08x > 0x%08x",
			offset_to_segment, offset_to_segment_end);
		return -EINVAL;
	}

	*priv_offset = offset_to_segment;

	return 0;
}

static int
gr_gk20a_process_context_buffer_priv_segment(struct gk20a *g,
					     enum ctxsw_addr_type addr_type,
					     u32 pri_addr,
					     u32 gpc_num, u32 num_tpcs,
					     u32 num_ppcs, u32 ppc_mask,
					     u32 *priv_offset)
{
	u32 i;
	u32 address, base_address;
	u32 sys_offset, gpc_offset, tpc_offset, ppc_offset;
	u32 ppc_num, tpc_num, tpc_addr, gpc_addr, ppc_addr;
	struct netlist_aiv_list *list;
	struct netlist_aiv *reg;
	u32 gpc_base = nvgpu_get_litter_value(g, GPU_LIT_GPC_BASE);
	u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
	u32 ppc_in_gpc_base = nvgpu_get_litter_value(g, GPU_LIT_PPC_IN_GPC_BASE);
	u32 ppc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_PPC_IN_GPC_STRIDE);
	u32 tpc_in_gpc_base = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_BASE);
	u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE);

	nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "pri_addr=0x%x", pri_addr);

	if (!g->netlist_valid) {
		return -EINVAL;
	}

	/* Process the SYS/BE segment. */
	if ((addr_type == CTXSW_ADDR_TYPE_SYS) ||
	    (addr_type == CTXSW_ADDR_TYPE_BE)) {
		list = nvgpu_netlist_get_sys_ctxsw_regs(g);
		for (i = 0; i < list->count; i++) {
			reg = &list->l[i];
			address = reg->addr;
			sys_offset = reg->index;

			if (pri_addr == address) {
				*priv_offset = sys_offset;
				return 0;
			}
		}
	}

	/* Process the TPC segment. */
	if (addr_type == CTXSW_ADDR_TYPE_TPC) {
		for (tpc_num = 0; tpc_num < num_tpcs; tpc_num++) {
			list = nvgpu_netlist_get_tpc_ctxsw_regs(g);
			for (i = 0; i < list->count; i++) {
				reg = &list->l[i];
				address = reg->addr;
				tpc_addr = pri_tpccs_addr_mask(address);
				base_address = gpc_base +
					(gpc_num * gpc_stride) +
					tpc_in_gpc_base +
					(tpc_num * tpc_in_gpc_stride);
				address = base_address + tpc_addr;
				/*
				 * The data for the TPCs is interleaved in the context buffer.
				 * Example with num_tpcs = 2
				 * 0    1    2    3    4    5    6    7    8    9    10   11 ...
				 * 0-0  1-0  0-1  1-1  0-2  1-2  0-3  1-3  0-4  1-4  0-5  1-5 ...
				 */
				tpc_offset = (reg->index * num_tpcs) + (tpc_num * 4U);

				if (pri_addr == address) {
					*priv_offset = tpc_offset;
					return 0;
				}
			}
		}
	} else if ((addr_type == CTXSW_ADDR_TYPE_EGPC) ||
		   (addr_type == CTXSW_ADDR_TYPE_ETPC)) {
		if (g->ops.gr.get_egpc_base == NULL) {
			return -EINVAL;
		}

		for (tpc_num = 0; tpc_num < num_tpcs; tpc_num++) {
			list = nvgpu_netlist_get_etpc_ctxsw_regs(g);
			for (i = 0; i < list->count; i++) {
				reg = &list->l[i];
				address = reg->addr;
				tpc_addr = pri_tpccs_addr_mask(address);
				base_address = g->ops.gr.get_egpc_base(g) +
					(gpc_num * gpc_stride) +
					tpc_in_gpc_base +
					(tpc_num * tpc_in_gpc_stride);
				address = base_address + tpc_addr;
				/*
				 * The data for the TPCs is interleaved in the context buffer.
				 * Example with num_tpcs = 2
				 * 0    1    2    3    4    5    6    7    8    9    10   11 ...
				 * 0-0  1-0  0-1  1-1  0-2  1-2  0-3  1-3  0-4  1-4  0-5  1-5 ...
				 */
				tpc_offset = (reg->index * num_tpcs) + (tpc_num * 4U);

				if (pri_addr == address) {
					*priv_offset = tpc_offset;
					nvgpu_log(g,
						gpu_dbg_fn | gpu_dbg_gpu_dbg,
						"egpc/etpc priv_offset=0x%08x",
						*priv_offset);
					return 0;
				}
			}
		}
	}

	/* Process the PPC segment. */
	if (addr_type == CTXSW_ADDR_TYPE_PPC) {
		for (ppc_num = 0; ppc_num < num_ppcs; ppc_num++) {
			list = nvgpu_netlist_get_ppc_ctxsw_regs(g);
			for (i = 0; i < list->count; i++) {
				reg = &list->l[i];
				address = reg->addr;
				ppc_addr = pri_ppccs_addr_mask(address);
				base_address = gpc_base +
					(gpc_num * gpc_stride) +
					ppc_in_gpc_base +
					(ppc_num * ppc_in_gpc_stride);
				address = base_address + ppc_addr;
				/*
				 * The data for the PPCs is interleaved in the context buffer.
				 * Example with numPpcs = 2
				 * 0    1    2    3    4    5    6    7    8    9    10   11 ...
				 * 0-0  1-0  0-1  1-1  0-2  1-2  0-3  1-3  0-4  1-4  0-5  1-5 ...
				 */
				ppc_offset = (reg->index * num_ppcs) + (ppc_num * 4U);

				if (pri_addr == address) {
					*priv_offset = ppc_offset;
					return 0;
				}
			}
		}
	}

	/* Process the GPC segment. */
	if (addr_type == CTXSW_ADDR_TYPE_GPC) {
		list = nvgpu_netlist_get_gpc_ctxsw_regs(g);
		for (i = 0; i < list->count; i++) {
			reg = &list->l[i];

			address = reg->addr;
			gpc_addr = pri_gpccs_addr_mask(address);
			gpc_offset = reg->index;

			base_address = gpc_base + (gpc_num * gpc_stride);
			address = base_address + gpc_addr;

			if (pri_addr == address) {
				*priv_offset = gpc_offset;
				return 0;
			}
		}
	}
	return -EINVAL;
}

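/*
 * Read the PPC configuration (count and mask) for one GPC from its
 * GPCCS local header, along with the number of PPC registers present
 * in the netlist.
 */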
static int gr_gk20a_determine_ppc_configuration(struct gk20a *g,
						u8 *context,
						u32 *num_ppcs, u32 *ppc_mask,
						u32 *reg_ppc_count)
{
	u32 num_pes_per_gpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_PES_PER_GPC);

	/*
	 * if there is only 1 PES_PER_GPC, then we put the PES registers
	 * in the GPC reglist, so we can't error out if ppc.count == 0
	 */
	if ((!g->netlist_valid) ||
	    ((nvgpu_netlist_get_ppc_ctxsw_regs(g)->count == 0U) &&
	     (num_pes_per_gpc > 1U))) {
		return -EINVAL;
	}

	g->ops.gr.ctxsw_prog.get_ppc_info(context, num_ppcs, ppc_mask);
	*reg_ppc_count = nvgpu_netlist_get_ppc_ctxsw_regs(g)->count;

	return 0;
}

int gr_gk20a_get_offset_in_gpccs_segment(struct gk20a *g,
					 enum ctxsw_addr_type addr_type,
					 u32 num_tpcs,
					 u32 num_ppcs,
					 u32 reg_list_ppc_count,
					 u32 *__offset_in_segment)
{
	u32 offset_in_segment = 0;
	u32 tpc_count = nvgpu_netlist_get_tpc_ctxsw_regs(g)->count;
	u32 etpc_count = nvgpu_netlist_get_etpc_ctxsw_regs(g)->count;

	if (addr_type == CTXSW_ADDR_TYPE_TPC) {
		/*
		 * reg = nvgpu_netlist_get_tpc_ctxsw_regs(g)->l;
		 * offset_in_segment = 0;
		 */
	} else if ((addr_type == CTXSW_ADDR_TYPE_EGPC) ||
		   (addr_type == CTXSW_ADDR_TYPE_ETPC)) {
		offset_in_segment = ((tpc_count * num_tpcs) << 2);

		nvgpu_log(g, gpu_dbg_info | gpu_dbg_gpu_dbg,
			"egpc etpc offset_in_segment 0x%08x",
			offset_in_segment);
	} else if (addr_type == CTXSW_ADDR_TYPE_PPC) {
		/*
		 * The ucode stores TPC data before PPC data.
		 * Advance offset past TPC data to PPC data.
		 */
		offset_in_segment =
			(((tpc_count + etpc_count) * num_tpcs) << 2);
	} else if (addr_type == CTXSW_ADDR_TYPE_GPC) {
		/*
		 * The ucode stores TPC/PPC data before GPC data.
		 * Advance offset past TPC/PPC data to GPC data.
		 *
		 * Note 1 PES_PER_GPC case
		 */
		u32 num_pes_per_gpc = nvgpu_get_litter_value(g,
				GPU_LIT_NUM_PES_PER_GPC);
		if (num_pes_per_gpc > 1U) {
			offset_in_segment =
				((((tpc_count + etpc_count) * num_tpcs) << 2) +
					((reg_list_ppc_count * num_ppcs) << 2));
		} else {
			offset_in_segment =
				(((tpc_count + etpc_count) * num_tpcs) << 2);
		}
	} else {
		nvgpu_log_fn(g, "Unknown address type.");
		return -EINVAL;
	}

	*__offset_in_segment = offset_in_segment;
	return 0;
}

/*
 * This function will return the 32 bit offset for a priv register if it is
 * present in the context buffer. The context buffer is in CPU memory.
 */
static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g,
					       u32 addr,
					       bool is_quad, u32 quad,
					       u32 *context_buffer,
					       u32 context_buffer_size,
					       u32 *priv_offset)
{
	u32 i;
	int err;
	enum ctxsw_addr_type addr_type;
	u32 broadcast_flags;
	u32 gpc_num, tpc_num, ppc_num, be_num;
	u32 num_gpcs, num_tpcs, num_ppcs;
	u32 offset;
	u32 sys_priv_offset, gpc_priv_offset;
	u32 ppc_mask, reg_list_ppc_count;
	u8 *context;
	u32 offset_to_segment, offset_in_segment = 0;

	nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);

	err = g->ops.gr.decode_priv_addr(g, addr, &addr_type,
			&gpc_num, &tpc_num, &ppc_num, &be_num,
			&broadcast_flags);
	nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg,
			"addr_type = %d, broadcast_flags: %08x",
			addr_type, broadcast_flags);
	if (err != 0) {
		return err;
	}

	context = (u8 *)context_buffer;
	if (!g->ops.gr.ctxsw_prog.check_main_image_header_magic(context)) {
		nvgpu_err(g,
			"Invalid main header: magic value");
		return -EINVAL;
	}
	num_gpcs = g->ops.gr.ctxsw_prog.get_num_gpcs(context);

	/* Parse the FECS local header. */
	context += g->ops.gr.ctxsw_prog.hw_get_fecs_header_size();
	if (!g->ops.gr.ctxsw_prog.check_local_header_magic(context)) {
		nvgpu_err(g,
			"Invalid FECS local header: magic value");
		return -EINVAL;
	}

	sys_priv_offset =
		g->ops.gr.ctxsw_prog.get_local_priv_register_ctl_offset(context);
	nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "sys_priv_offset=0x%x", sys_priv_offset);

	/* If found in Ext buffer, ok.
	 * If it failed and we expected to find it there (quad offset)
	 * then return the error. Otherwise continue on.
	 */
	err = gr_gk20a_find_priv_offset_in_ext_buffer(g,
			addr, is_quad, quad, context_buffer,
			context_buffer_size, priv_offset);
	if ((err == 0) || ((err != 0) && is_quad)) {
		nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg,
				"err = %d, is_quad = %s",
				err, is_quad ? "true" : "false");
		return err;
	}

	if ((addr_type == CTXSW_ADDR_TYPE_SYS) ||
	    (addr_type == CTXSW_ADDR_TYPE_BE)) {
		/* Find the offset in the FECS segment. */
		offset_to_segment = sys_priv_offset * 256U;

		err = gr_gk20a_process_context_buffer_priv_segment(g,
				addr_type, addr,
				0, 0, 0, 0,
				&offset);
		if (err != 0) {
			return err;
		}

		*priv_offset = (offset_to_segment + offset);
		return 0;
	}

	if ((gpc_num + 1U) > num_gpcs) {
		nvgpu_err(g,
			"GPC %d not in this context buffer.",
			gpc_num);
		return -EINVAL;
	}

	/* Parse the GPCCS local header(s).*/
	for (i = 0; i < num_gpcs; i++) {
		context += g->ops.gr.ctxsw_prog.hw_get_gpccs_header_size();
		if (!g->ops.gr.ctxsw_prog.check_local_header_magic(context)) {
			nvgpu_err(g,
				"Invalid GPCCS local header: magic value");
			return -EINVAL;
		}
		gpc_priv_offset = g->ops.gr.ctxsw_prog.get_local_priv_register_ctl_offset(context);

		err = gr_gk20a_determine_ppc_configuration(g, context,
				&num_ppcs, &ppc_mask,
				&reg_list_ppc_count);
		if (err != 0) {
			nvgpu_err(g, "determine ppc configuration failed");
			return err;
		}

		num_tpcs = g->ops.gr.ctxsw_prog.get_num_tpcs(context);

		if ((i == gpc_num) && ((tpc_num + 1U) > num_tpcs)) {
			nvgpu_err(g,
				"GPC %d TPC %d not in this context buffer.",
				gpc_num, tpc_num);
			return -EINVAL;
		}

		/* Find the offset in the GPCCS segment.*/
		if (i == gpc_num) {
			nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg,
					"gpc_priv_offset 0x%08x",
					gpc_priv_offset);
			offset_to_segment = gpc_priv_offset * 256U;

			err = g->ops.gr.get_offset_in_gpccs_segment(g,
					addr_type,
					num_tpcs, num_ppcs, reg_list_ppc_count,
					&offset_in_segment);
			if (err != 0) {
				return -EINVAL;
			}

			offset_to_segment += offset_in_segment;
			nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg,
					"offset_to_segment 0x%08x",
					offset_to_segment);

			err = gr_gk20a_process_context_buffer_priv_segment(g,
					addr_type, addr,
					i, num_tpcs,
					num_ppcs, ppc_mask,
					&offset);
			if (err != 0) {
				return -EINVAL;
			}

			*priv_offset = offset_to_segment + offset;
			return 0;
		}
	}

	return -EINVAL;
}

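/*
 * Return true if the context of channel 'ch' (or of its TSG) is the
 * one currently resident on the GR engine.
 */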
bool gk20a_is_channel_ctx_resident(struct channel_gk20a *ch)
{
	u32 curr_gr_ctx;
	u32 curr_gr_tsgid;
	struct gk20a *g = ch->g;
	struct channel_gk20a *curr_ch;
	bool ret = false;
	struct tsg_gk20a *tsg;

	curr_gr_ctx = g->ops.gr.falcon.get_current_ctx(g);

	/* when contexts are unloaded from GR, the valid bit is reset
	 * but the instance pointer information remains intact. So the
	 * valid bit must be checked to be absolutely certain that a
	 * valid context is currently resident.
	 */
	if (gr_fecs_current_ctx_valid_v(curr_gr_ctx) == 0U) {
		return false;
	}

	curr_ch = nvgpu_gr_intr_get_channel_from_ctx(g, curr_gr_ctx,
					      &curr_gr_tsgid);

	nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg,
		  "curr_gr_chid=%d curr_tsgid=%d, ch->tsgid=%d"
		  " ch->chid=%d",
		  (curr_ch != NULL) ? curr_ch->chid : U32_MAX,
		  curr_gr_tsgid,
		  ch->tsgid,
		  ch->chid);

	if (curr_ch == NULL) {
		return false;
	}

	if (ch->chid == curr_ch->chid) {
		ret = true;
	}

	tsg = tsg_gk20a_from_ch(ch);
	if ((tsg != NULL) && (tsg->tsgid == curr_gr_tsgid)) {
		ret = true;
	}

	gk20a_channel_put(curr_ch);
	return ret;
}

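/*
 * Execute a batch of regops against a channel's context. If the
 * context is resident on GR the registers are accessed directly;
 * otherwise the ops are applied to the gr or PM context image in
 * memory, writes on the first pass and reads on the second.
 */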
int __gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch,
|
|
struct nvgpu_dbg_reg_op *ctx_ops, u32 num_ops,
|
|
u32 num_ctx_wr_ops, u32 num_ctx_rd_ops,
|
|
bool ch_is_curr_ctx)
|
|
{
|
|
struct gk20a *g = ch->g;
|
|
struct tsg_gk20a *tsg;
|
|
struct nvgpu_gr_ctx *gr_ctx;
|
|
bool gr_ctx_ready = false;
|
|
bool pm_ctx_ready = false;
|
|
struct nvgpu_mem *current_mem = NULL;
|
|
u32 i, j, offset, v;
|
|
struct nvgpu_gr *gr = g->gr;
|
|
u32 sm_per_tpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_SM_PER_TPC);
|
|
u32 max_offsets = nvgpu_gr_config_get_max_gpc_count(gr->config) *
|
|
nvgpu_gr_config_get_max_tpc_per_gpc_count(gr->config) *
|
|
sm_per_tpc;
|
|
u32 *offsets = NULL;
|
|
u32 *offset_addrs = NULL;
|
|
u32 ctx_op_nr, num_ctx_ops[2] = {num_ctx_wr_ops, num_ctx_rd_ops};
|
|
int err = 0, pass;
|
|
|
|
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "wr_ops=%d rd_ops=%d",
|
|
num_ctx_wr_ops, num_ctx_rd_ops);
|
|
|
|
tsg = tsg_gk20a_from_ch(ch);
|
|
if (tsg == NULL) {
|
|
return -EINVAL;
|
|
}
|
|
|
|
gr_ctx = tsg->gr_ctx;
|
|
|
|
if (ch_is_curr_ctx) {
|
|
for (pass = 0; pass < 2; pass++) {
|
|
ctx_op_nr = 0;
|
|
for (i = 0; (ctx_op_nr < num_ctx_ops[pass]) && (i < num_ops); ++i) {
|
|
/* only do ctx ops and only on the right pass */
|
|
if ((ctx_ops[i].type == REGOP(TYPE_GLOBAL)) ||
|
|
(((pass == 0) && reg_op_is_read(ctx_ops[i].op)) ||
|
|
((pass == 1) && !reg_op_is_read(ctx_ops[i].op)))) {
|
|
continue;
|
|
}
|
|
|
|
/* if this is a quad access, setup for special access*/
|
|
if ((ctx_ops[i].type == REGOP(TYPE_GR_CTX_QUAD))
|
|
&& (g->ops.gr.access_smpc_reg != NULL)) {
|
|
g->ops.gr.access_smpc_reg(g,
|
|
ctx_ops[i].quad,
|
|
ctx_ops[i].offset);
|
|
}
|
|
offset = ctx_ops[i].offset;
|
|
|
|
if (pass == 0) { /* write pass */
|
|
v = gk20a_readl(g, offset);
|
|
v &= ~ctx_ops[i].and_n_mask_lo;
|
|
v |= ctx_ops[i].value_lo;
|
|
gk20a_writel(g, offset, v);
|
|
|
|
nvgpu_log(g, gpu_dbg_gpu_dbg,
|
|
"direct wr: offset=0x%x v=0x%x",
|
|
offset, v);
|
|
|
|
if (ctx_ops[i].op == REGOP(WRITE_64)) {
|
|
v = gk20a_readl(g, offset + 4U);
|
|
v &= ~ctx_ops[i].and_n_mask_hi;
|
|
v |= ctx_ops[i].value_hi;
|
|
gk20a_writel(g, offset + 4U, v);
|
|
|
|
nvgpu_log(g, gpu_dbg_gpu_dbg,
|
|
"direct wr: offset=0x%x v=0x%x",
|
|
offset + 4U, v);
|
|
}
|
|
|
|
} else { /* read pass */
|
|
ctx_ops[i].value_lo =
|
|
gk20a_readl(g, offset);
|
|
|
|
nvgpu_log(g, gpu_dbg_gpu_dbg,
|
|
"direct rd: offset=0x%x v=0x%x",
|
|
offset, ctx_ops[i].value_lo);
|
|
|
|
if (ctx_ops[i].op == REGOP(READ_64)) {
|
|
ctx_ops[i].value_hi =
|
|
gk20a_readl(g, offset + 4U);
|
|
|
|
nvgpu_log(g, gpu_dbg_gpu_dbg,
|
|
"direct rd: offset=0x%x v=0x%x",
|
|
offset, ctx_ops[i].value_lo);
|
|
} else {
|
|
ctx_ops[i].value_hi = 0;
|
|
}
|
|
}
|
|
ctx_op_nr++;
|
|
}
|
|
}
|
|
goto cleanup;
|
|
}
|
|
|
|
/* they're the same size, so just use one alloc for both */
|
|
offsets = nvgpu_kzalloc(g, 2U * sizeof(u32) * max_offsets);
|
|
if (offsets == NULL) {
|
|
err = -ENOMEM;
|
|
goto cleanup;
|
|
}
|
|
offset_addrs = offsets + max_offsets;
|
|
|
|
err = nvgpu_gr_ctx_patch_write_begin(g, gr_ctx, false);
|
|
if (err != 0) {
|
|
goto cleanup;
|
|
}
|
|
|
|
err = g->ops.mm.cache.l2_flush(g, true);
|
|
if (err != 0) {
|
|
nvgpu_err(g, "l2_flush failed");
|
|
goto cleanup;
|
|
}
|
|
|
|
/* write to appropriate place in context image,
|
|
* first have to figure out where that really is */
|
|
|
|
/* first pass is writes, second reads */
|
|
for (pass = 0; pass < 2; pass++) {
|
|
ctx_op_nr = 0;
|
|
for (i = 0; (ctx_op_nr < num_ctx_ops[pass]) && (i < num_ops); ++i) {
|
|
u32 num_offsets;
|
|
|
|
/* only do ctx ops and only on the right pass */
|
|
if ((ctx_ops[i].type == REGOP(TYPE_GLOBAL)) ||
|
|
(((pass == 0) && reg_op_is_read(ctx_ops[i].op)) ||
|
|
((pass == 1) && !reg_op_is_read(ctx_ops[i].op)))) {
|
|
continue;
|
|
}
|
|
|
|
err = gr_gk20a_get_ctx_buffer_offsets(g,
|
|
ctx_ops[i].offset,
|
|
max_offsets,
|
|
offsets, offset_addrs,
|
|
&num_offsets,
|
|
ctx_ops[i].type == REGOP(TYPE_GR_CTX_QUAD),
|
|
ctx_ops[i].quad);
|
|
if (err == 0) {
|
|
if (!gr_ctx_ready) {
|
|
gr_ctx_ready = true;
|
|
}
|
|
current_mem = nvgpu_gr_ctx_get_ctx_mem(gr_ctx);
|
|
} else {
|
|
err = gr_gk20a_get_pm_ctx_buffer_offsets(g,
|
|
ctx_ops[i].offset,
|
|
max_offsets,
|
|
offsets, offset_addrs,
|
|
&num_offsets);
|
|
if (err != 0) {
|
|
nvgpu_log(g, gpu_dbg_gpu_dbg,
|
|
"ctx op invalid offset: offset=0x%x",
|
|
ctx_ops[i].offset);
|
|
ctx_ops[i].status =
|
|
REGOP(STATUS_INVALID_OFFSET);
|
|
continue;
|
|
}
|
|
if (!pm_ctx_ready) {
|
|
/* Make sure ctx buffer was initialized */
|
|
if (!nvgpu_mem_is_valid(nvgpu_gr_ctx_get_pm_ctx_mem(gr_ctx))) {
|
|
nvgpu_err(g,
|
|
"Invalid ctx buffer");
|
|
err = -EINVAL;
|
|
goto cleanup;
|
|
}
|
|
pm_ctx_ready = true;
|
|
}
|
|
current_mem = nvgpu_gr_ctx_get_pm_ctx_mem(gr_ctx);
|
|
}
|
|
|
|
/* if this is a quad access, setup for special access*/
|
|
if ((ctx_ops[i].type == REGOP(TYPE_GR_CTX_QUAD)) &&
|
|
(g->ops.gr.access_smpc_reg != NULL)) {
|
|
g->ops.gr.access_smpc_reg(g, ctx_ops[i].quad,
|
|
ctx_ops[i].offset);
|
|
}
|
|
|
|
			for (j = 0; j < num_offsets; j++) {
				/* sanity check gr ctx offsets: don't write
				 * outside the golden context image
				 */
				if ((current_mem == nvgpu_gr_ctx_get_ctx_mem(gr_ctx)) &&
				    (offsets[j] >=
				     nvgpu_gr_obj_ctx_get_golden_image_size(
						g->gr->golden_image))) {
					continue;
				}
				if (pass == 0) { /* write pass */
					v = nvgpu_mem_rd(g, current_mem, offsets[j]);
					v &= ~ctx_ops[i].and_n_mask_lo;
					v |= ctx_ops[i].value_lo;
					nvgpu_mem_wr(g, current_mem, offsets[j], v);

					nvgpu_log(g, gpu_dbg_gpu_dbg,
						"context wr: offset=0x%x v=0x%x",
						offsets[j], v);

					if (ctx_ops[i].op == REGOP(WRITE_64)) {
						v = nvgpu_mem_rd(g, current_mem, offsets[j] + 4U);
						v &= ~ctx_ops[i].and_n_mask_hi;
						v |= ctx_ops[i].value_hi;
						nvgpu_mem_wr(g, current_mem, offsets[j] + 4U, v);

						nvgpu_log(g, gpu_dbg_gpu_dbg,
							"context wr: offset=0x%x v=0x%x",
							offsets[j] + 4U, v);
					}

					if (current_mem == nvgpu_gr_ctx_get_ctx_mem(gr_ctx)) {
						/* check to see if we need to add a
						 * special WAR for some of the SMPC
						 * perf regs
						 */
						gr_gk20a_ctx_patch_smpc(g, ch,
							offset_addrs[j],
							v, gr_ctx);
					}
				} else { /* read pass */
					ctx_ops[i].value_lo =
						nvgpu_mem_rd(g, current_mem, offsets[0]);

					nvgpu_log(g, gpu_dbg_gpu_dbg, "context rd: offset=0x%x v=0x%x",
						offsets[0], ctx_ops[i].value_lo);

					if (ctx_ops[i].op == REGOP(READ_64)) {
						ctx_ops[i].value_hi =
							nvgpu_mem_rd(g, current_mem, offsets[0] + 4U);

						nvgpu_log(g, gpu_dbg_gpu_dbg,
							"context rd: offset=0x%x v=0x%x",
							offsets[0] + 4U, ctx_ops[i].value_hi);
					} else {
						ctx_ops[i].value_hi = 0;
					}
				}
			}
			ctx_op_nr++;
		}
	}

cleanup:
	if (offsets != NULL) {
		nvgpu_kfree(g, offsets);
	}

	if (nvgpu_gr_ctx_get_patch_ctx_mem(gr_ctx)->cpu_va != NULL) {
		nvgpu_gr_ctx_patch_write_end(g, gr_ctx, gr_ctx_ready);
	}

	return err;
}

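/*
 * Execute a batch of debugger register operations against a channel's GR
 * context. Context switch is disabled first so that the hardware state can
 * be used to decide whether the target context is resident; the ops are then
 * applied either directly to the hardware or to the saved context image, and
 * ctxsw is re-enabled before returning.
 *
 * Illustrative call sequence (a minimal sketch modelled on
 * gr_gk20a_set_sm_debug_mode() below; the local variable names are
 * hypothetical):
 *
 *	struct nvgpu_dbg_reg_op op = {
 *		.op = REGOP(WRITE_32),
 *		.type = REGOP(TYPE_GR_CTX),
 *		.offset = reg_offset,
 *		.and_n_mask_lo = reg_mask,
 *		.value_lo = reg_val,
 *	};
 *	err = gr_gk20a_exec_ctx_ops(ch, &op, 1, 1, 0, NULL);
 */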
int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch,
			  struct nvgpu_dbg_reg_op *ctx_ops, u32 num_ops,
			  u32 num_ctx_wr_ops, u32 num_ctx_rd_ops,
			  bool *is_curr_ctx)
{
	struct gk20a *g = ch->g;
	int err, tmp_err;
	bool ch_is_curr_ctx;

	/*
	 * Disable channel switching. Once ctxsw is stopped, the hardware
	 * state can be inspected to determine if the context we're
	 * interested in is current.
	 */
	err = g->ops.gr.disable_ctxsw(g);
	if (err != 0) {
		nvgpu_err(g, "unable to stop gr ctxsw");
		/* this should probably be ctx-fatal... */
		return err;
	}

	ch_is_curr_ctx = gk20a_is_channel_ctx_resident(ch);
	if (is_curr_ctx != NULL) {
		*is_curr_ctx = ch_is_curr_ctx;
	}
	nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "is curr ctx=%d",
		ch_is_curr_ctx);

	err = __gr_gk20a_exec_ctx_ops(ch, ctx_ops, num_ops, num_ctx_wr_ops,
				      num_ctx_rd_ops, ch_is_curr_ctx);

	tmp_err = g->ops.gr.enable_ctxsw(g);
	if (tmp_err != 0) {
		nvgpu_err(g, "unable to restart ctxsw!");
		err = tmp_err;
	}

	return err;
}

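/*
 * Poll a single SM until it reports locked down, or until no relevant error
 * is pending (when check_errors is set). Returns -EFAULT if an MMU fault is
 * pending while MMU debug mode is disabled (the SM would never lock down),
 * and -ETIMEDOUT with a diagnostic dump if the poll times out.
 */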
int gk20a_gr_wait_for_sm_lock_down(struct gk20a *g, u32 gpc, u32 tpc, u32 sm,
		u32 global_esr_mask, bool check_errors)
{
	bool locked_down;
	bool no_error_pending;
	u32 delay = POLL_DELAY_MIN_US;
	bool mmu_debug_mode_enabled = g->ops.fb.is_debug_mode_enabled(g);
	u32 offset = nvgpu_gr_gpc_offset(g, gpc) + nvgpu_gr_tpc_offset(g, tpc);
	u32 dbgr_status0 = 0, dbgr_control0 = 0;
	u64 warps_valid = 0, warps_paused = 0, warps_trapped = 0;
	struct nvgpu_timeout timeout;
	u32 warp_esr;

	nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
		"GPC%d TPC%d SM%d: locking down SM", gpc, tpc, sm);

	nvgpu_timeout_init(g, &timeout, nvgpu_get_poll_timeout(g),
			   NVGPU_TIMER_CPU_TIMER);

	/* wait for the sm to lock down */
	do {
		u32 global_esr = g->ops.gr.get_sm_hww_global_esr(g,
						gpc, tpc, sm);
		dbgr_status0 = gk20a_readl(g,
				gr_gpc0_tpc0_sm_dbgr_status0_r() + offset);

		warp_esr = g->ops.gr.get_sm_hww_warp_esr(g, gpc, tpc, sm);

		locked_down =
		    (gr_gpc0_tpc0_sm_dbgr_status0_locked_down_v(dbgr_status0) ==
		     gr_gpc0_tpc0_sm_dbgr_status0_locked_down_true_v());
		no_error_pending =
			check_errors &&
			(gr_gpc0_tpc0_sm_hww_warp_esr_error_v(warp_esr) ==
			 gr_gpc0_tpc0_sm_hww_warp_esr_error_none_v()) &&
			((global_esr & ~global_esr_mask) == 0U);

		if (locked_down || no_error_pending) {
			nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
				"GPC%d TPC%d SM%d: locked down SM",
				gpc, tpc, sm);
			return 0;
		}

		/* if an mmu fault is pending and mmu debug mode is not
		 * enabled, the sm will never lock down.
		 */
		if (!mmu_debug_mode_enabled &&
		    (g->ops.mc.is_mmu_fault_pending(g))) {
			nvgpu_err(g,
				"GPC%d TPC%d: mmu fault pending,"
				" SM%d will never lock down!", gpc, tpc, sm);
			return -EFAULT;
		}

		nvgpu_usleep_range(delay, delay * 2U);
		delay = min_t(u32, delay << 1, POLL_DELAY_MAX_US);
	} while (nvgpu_timeout_expired(&timeout) == 0);

	dbgr_control0 = gk20a_readl(g,
			gr_gpc0_tpc0_sm_dbgr_control0_r() + offset);

	/* 64 bit read */
	warps_valid = (u64)gk20a_readl(g, gr_gpc0_tpc0_sm_warp_valid_mask_1_r() + offset) << 32;
	warps_valid |= gk20a_readl(g, gr_gpc0_tpc0_sm_warp_valid_mask_r() + offset);

	/* 64 bit read */
	warps_paused = (u64)gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_bpt_pause_mask_1_r() + offset) << 32;
	warps_paused |= gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_bpt_pause_mask_r() + offset);

	/* 64 bit read */
	warps_trapped = (u64)gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_bpt_trap_mask_1_r() + offset) << 32;
	warps_trapped |= gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_bpt_trap_mask_r() + offset);

	nvgpu_err(g,
		"GPC%d TPC%d: timed out while trying to lock down SM", gpc, tpc);
	nvgpu_err(g,
		"STATUS0(0x%x)=0x%x CONTROL0=0x%x VALID_MASK=0x%llx PAUSE_MASK=0x%llx TRAP_MASK=0x%llx",
		gr_gpc0_tpc0_sm_dbgr_status0_r() + offset, dbgr_status0, dbgr_control0,
		warps_valid, warps_paused, warps_trapped);

	return -ETIMEDOUT;
}

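/* Stop a single SM by asserting its stop trigger and waiting for lock down. */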
void gk20a_gr_suspend_single_sm(struct gk20a *g,
		u32 gpc, u32 tpc, u32 sm,
		u32 global_esr_mask, bool check_errors)
{
	int err;
	u32 dbgr_control0;
	u32 offset = nvgpu_gr_gpc_offset(g, gpc) + nvgpu_gr_tpc_offset(g, tpc);

	/* if an SM debugger isn't attached, skip suspend */
	if (!g->ops.gr.sm_debugger_attached(g)) {
		nvgpu_err(g,
			"SM debugger not attached, skipping suspend!");
		return;
	}

	nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg,
		"suspending gpc:%d, tpc:%d, sm%d", gpc, tpc, sm);

	/* assert stop trigger. */
	dbgr_control0 = gk20a_readl(g,
			gr_gpc0_tpc0_sm_dbgr_control0_r() + offset);
	dbgr_control0 |= gr_gpcs_tpcs_sm_dbgr_control0_stop_trigger_enable_f();
	gk20a_writel(g, gr_gpc0_tpc0_sm_dbgr_control0_r() + offset,
			dbgr_control0);

	err = g->ops.gr.wait_for_sm_lock_down(g, gpc, tpc, sm,
			global_esr_mask, check_errors);
	if (err != 0) {
		nvgpu_err(g,
			"SuspendSm failed");
		return;
	}
}

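/*
 * Stop all SMs with a broadcast stop trigger, then wait for every SM to
 * report lock down.
 */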
void gk20a_gr_suspend_all_sms(struct gk20a *g,
		u32 global_esr_mask, bool check_errors)
{
	struct nvgpu_gr *gr = g->gr;
	u32 gpc, tpc, sm;
	int err;
	u32 dbgr_control0;
	u32 sm_per_tpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_SM_PER_TPC);

	/* if an SM debugger isn't attached, skip suspend */
	if (!g->ops.gr.sm_debugger_attached(g)) {
		nvgpu_err(g,
			"SM debugger not attached, skipping suspend!");
		return;
	}

	nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "suspending all sms");
	/* assert stop trigger. uniformity assumption: all SMs will have
	 * the same state in dbg_control0.
	 */
	dbgr_control0 =
		gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_control0_r());
	dbgr_control0 |= gr_gpcs_tpcs_sm_dbgr_control0_stop_trigger_enable_f();

	/* broadcast write */
	gk20a_writel(g,
		gr_gpcs_tpcs_sm_dbgr_control0_r(), dbgr_control0);

	for (gpc = 0; gpc < nvgpu_gr_config_get_gpc_count(gr->config); gpc++) {
		for (tpc = 0;
		     tpc < nvgpu_gr_config_get_gpc_tpc_count(gr->config, gpc);
		     tpc++) {
			for (sm = 0; sm < sm_per_tpc; sm++) {
				err = g->ops.gr.wait_for_sm_lock_down(g,
					gpc, tpc, sm,
					global_esr_mask, check_errors);
				if (err != 0) {
					nvgpu_err(g, "SuspendAllSms failed");
					return;
				}
			}
		}
	}
}

void gk20a_gr_resume_single_sm(struct gk20a *g,
		u32 gpc, u32 tpc, u32 sm)
{
	u32 dbgr_control0;
	u32 offset;
	/*
	 * The following requires some clarification. Despite the fact that both
	 * RUN_TRIGGER and STOP_TRIGGER have the word "TRIGGER" in their
	 * names, only one is actually a trigger, and that is the STOP_TRIGGER.
	 * Merely writing a 1(_TASK) to the RUN_TRIGGER is not sufficient to
	 * resume the gpu - the _STOP_TRIGGER must explicitly be set to 0
	 * (_DISABLE) as well.
	 *
	 * Advice from the arch group: Disable the stop trigger first, as a
	 * separate operation, in order to ensure that the trigger has taken
	 * effect, before enabling the run trigger.
	 */

	offset = nvgpu_gr_gpc_offset(g, gpc) + nvgpu_gr_tpc_offset(g, tpc);

	/* De-assert stop trigger */
	dbgr_control0 =
		gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_control0_r() + offset);
	dbgr_control0 = set_field(dbgr_control0,
			gr_gpcs_tpcs_sm_dbgr_control0_stop_trigger_m(),
			gr_gpcs_tpcs_sm_dbgr_control0_stop_trigger_disable_f());
	gk20a_writel(g,
		gr_gpc0_tpc0_sm_dbgr_control0_r() + offset, dbgr_control0);

	/* Run trigger */
	dbgr_control0 |= gr_gpcs_tpcs_sm_dbgr_control0_run_trigger_task_f();
	gk20a_writel(g,
		gr_gpc0_tpc0_sm_dbgr_control0_r() + offset, dbgr_control0);
}

void gk20a_gr_resume_all_sms(struct gk20a *g)
{
	u32 dbgr_control0;
	/*
	 * The following requires some clarification. Despite the fact that both
	 * RUN_TRIGGER and STOP_TRIGGER have the word "TRIGGER" in their
	 * names, only one is actually a trigger, and that is the STOP_TRIGGER.
	 * Merely writing a 1(_TASK) to the RUN_TRIGGER is not sufficient to
	 * resume the gpu - the _STOP_TRIGGER must explicitly be set to 0
	 * (_DISABLE) as well.
	 *
	 * Advice from the arch group: Disable the stop trigger first, as a
	 * separate operation, in order to ensure that the trigger has taken
	 * effect, before enabling the run trigger.
	 */

	/* De-assert stop trigger */
	dbgr_control0 =
		gk20a_readl(g, gr_gpcs_tpcs_sm_dbgr_control0_r());
	dbgr_control0 &= ~gr_gpcs_tpcs_sm_dbgr_control0_stop_trigger_enable_f();
	gk20a_writel(g,
		gr_gpcs_tpcs_sm_dbgr_control0_r(), dbgr_control0);

	/* Run trigger */
	dbgr_control0 |= gr_gpcs_tpcs_sm_dbgr_control0_run_trigger_task_f();
	gk20a_writel(g,
		gr_gpcs_tpcs_sm_dbgr_control0_r(), dbgr_control0);
}

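/*
 * Set or clear SM debugger mode for every SM selected in the 'sms' bitmask.
 * One context register op per selected SM is built against
 * gr_gpc0_tpc0_sm_dbgr_control0_r() and the whole batch is applied through
 * gr_gk20a_exec_ctx_ops().
 */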
int gr_gk20a_set_sm_debug_mode(struct gk20a *g,
	struct channel_gk20a *ch, u64 sms, bool enable)
{
	struct nvgpu_dbg_reg_op *ops;
	unsigned int i = 0, sm_id;
	int err;
	u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
	u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE);
	u32 no_of_sm = nvgpu_gr_config_get_no_of_sm(g->gr->config);

	ops = nvgpu_kcalloc(g, no_of_sm, sizeof(*ops));
	if (ops == NULL) {
		return -ENOMEM;
	}

	for (sm_id = 0; sm_id < no_of_sm; sm_id++) {
		u32 gpc, tpc;
		u32 tpc_offset, gpc_offset, reg_offset, reg_mask, reg_val;
		struct sm_info *sm_info;

		if ((sms & BIT64(sm_id)) == 0ULL) {
			continue;
		}
		sm_info = nvgpu_gr_config_get_sm_info(g->gr->config, sm_id);
		gpc = nvgpu_gr_config_get_sm_info_gpc_index(sm_info);
		tpc = nvgpu_gr_config_get_sm_info_tpc_index(sm_info);

		tpc_offset = tpc_in_gpc_stride * tpc;
		gpc_offset = gpc_stride * gpc;
		reg_offset = tpc_offset + gpc_offset;

		ops[i].op = REGOP(WRITE_32);
		ops[i].type = REGOP(TYPE_GR_CTX);
		ops[i].offset = gr_gpc0_tpc0_sm_dbgr_control0_r() + reg_offset;

		reg_mask = 0;
		reg_val = 0;
		if (enable) {
			reg_mask |= gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_m();
			reg_val |= gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_on_f();
			reg_mask |= gr_gpc0_tpc0_sm_dbgr_control0_stop_on_any_warp_m();
			reg_val |= gr_gpc0_tpc0_sm_dbgr_control0_stop_on_any_warp_disable_f();
			reg_mask |= gr_gpc0_tpc0_sm_dbgr_control0_stop_on_any_sm_m();
			reg_val |= gr_gpc0_tpc0_sm_dbgr_control0_stop_on_any_sm_disable_f();
		} else {
			reg_mask |= gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_m();
			reg_val |= gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_off_f();
		}

		ops[i].and_n_mask_lo = reg_mask;
		ops[i].value_lo = reg_val;
		i++;
	}

	err = gr_gk20a_exec_ctx_ops(ch, ops, i, i, 0, NULL);
	if (err != 0) {
		nvgpu_err(g, "Failed to access register");
	}
	nvgpu_kfree(g, ops);
	return err;
}

/*
 * gr_gk20a_suspend_context()
 * This API should be called with the dbg_session lock held
 * and ctxsw disabled.
 * Returns a bool indicating whether the context was resident.
 */
bool gr_gk20a_suspend_context(struct channel_gk20a *ch)
{
	struct gk20a *g = ch->g;
	bool ctx_resident = false;

	if (gk20a_is_channel_ctx_resident(ch)) {
		g->ops.gr.suspend_all_sms(g, 0, false);
		ctx_resident = true;
	} else {
		gk20a_disable_channel_tsg(g, ch);
	}

	return ctx_resident;
}

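/*
 * Counterpart to gr_gk20a_suspend_context(): resume all SMs if the channel's
 * context is resident, otherwise re-enable the channel/TSG.
 */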
bool gr_gk20a_resume_context(struct channel_gk20a *ch)
{
	struct gk20a *g = ch->g;
	bool ctx_resident = false;

	if (gk20a_is_channel_ctx_resident(ch)) {
		g->ops.gr.resume_all_sms(g);
		ctx_resident = true;
	} else {
		gk20a_enable_channel_tsg(g, ch);
	}

	return ctx_resident;
}

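/*
 * Suspend every channel on a debug session's channel list with ctxsw
 * disabled. The fd of the channel whose context was found resident (if any)
 * is reported back through ctx_resident_ch_fd.
 */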
int gr_gk20a_suspend_contexts(struct gk20a *g,
			      struct dbg_session_gk20a *dbg_s,
			      int *ctx_resident_ch_fd)
{
	int local_ctx_resident_ch_fd = -1;
	bool ctx_resident;
	struct channel_gk20a *ch;
	struct dbg_session_channel_data *ch_data;
	int err = 0;

	nvgpu_mutex_acquire(&g->dbg_sessions_lock);

	err = g->ops.gr.disable_ctxsw(g);
	if (err != 0) {
		nvgpu_err(g, "unable to stop gr ctxsw");
		goto clean_up;
	}

	nvgpu_mutex_acquire(&dbg_s->ch_list_lock);

	nvgpu_list_for_each_entry(ch_data, &dbg_s->ch_list,
			dbg_session_channel_data, ch_entry) {
		ch = g->fifo.channel + ch_data->chid;

		ctx_resident = gr_gk20a_suspend_context(ch);
		if (ctx_resident) {
			local_ctx_resident_ch_fd = ch_data->channel_fd;
		}
	}

	nvgpu_mutex_release(&dbg_s->ch_list_lock);

	err = g->ops.gr.enable_ctxsw(g);
	if (err != 0) {
		nvgpu_err(g, "unable to restart ctxsw!");
	}

	*ctx_resident_ch_fd = local_ctx_resident_ch_fd;

clean_up:
	nvgpu_mutex_release(&g->dbg_sessions_lock);

	return err;
}

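/*
 * Resume every channel on a debug session's channel list with ctxsw
 * disabled; mirrors gr_gk20a_suspend_contexts().
 */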
int gr_gk20a_resume_contexts(struct gk20a *g,
			     struct dbg_session_gk20a *dbg_s,
			     int *ctx_resident_ch_fd)
{
	int local_ctx_resident_ch_fd = -1;
	bool ctx_resident;
	struct channel_gk20a *ch;
	int err = 0;
	struct dbg_session_channel_data *ch_data;

	nvgpu_mutex_acquire(&g->dbg_sessions_lock);

	err = g->ops.gr.disable_ctxsw(g);
	if (err != 0) {
		nvgpu_err(g, "unable to stop gr ctxsw");
		goto clean_up;
	}

	nvgpu_list_for_each_entry(ch_data, &dbg_s->ch_list,
			dbg_session_channel_data, ch_entry) {
		ch = g->fifo.channel + ch_data->chid;

		ctx_resident = gr_gk20a_resume_context(ch);
		if (ctx_resident) {
			local_ctx_resident_ch_fd = ch_data->channel_fd;
		}
	}

	err = g->ops.gr.enable_ctxsw(g);
	if (err != 0) {
		nvgpu_err(g, "unable to restart ctxsw!");
	}

	*ctx_resident_ch_fd = local_ctx_resident_ch_fd;

clean_up:
	nvgpu_mutex_release(&g->dbg_sessions_lock);

	return err;
}

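/*
 * Broadcast the stop trigger to all SMs without waiting for them to lock
 * down; typically followed by gr_gk20a_wait_for_pause().
 */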
int gr_gk20a_trigger_suspend(struct gk20a *g)
{
	int err = 0;
	u32 dbgr_control0;

	/* assert stop trigger. uniformity assumption: all SMs will have
	 * the same state in dbg_control0.
	 */
	dbgr_control0 =
		gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_control0_r());
	dbgr_control0 |= gr_gpcs_tpcs_sm_dbgr_control0_stop_trigger_enable_f();

	/* broadcast write */
	gk20a_writel(g,
		gr_gpcs_tpcs_sm_dbgr_control0_r(), dbgr_control0);

	return err;
}

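/*
 * Lock down every SM and then snapshot the warp state (valid/paused/trapped
 * masks) into w_state via the bpt_reg_info HAL.
 */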
int gr_gk20a_wait_for_pause(struct gk20a *g, struct nvgpu_warpstate *w_state)
{
	int err = 0;
	struct nvgpu_gr *gr = g->gr;
	u32 gpc, tpc, sm, sm_id;
	u32 global_mask;
	u32 no_of_sm = nvgpu_gr_config_get_no_of_sm(gr->config);

	/* Wait for the SMs to reach full stop. This condition is:
	 * 1) All SMs with valid warps must be in the trap handler (SM_IN_TRAP_MODE)
	 * 2) All SMs in the trap handler must have equivalent VALID and PAUSED warp
	 *    masks.
	 */
	global_mask = g->ops.gr.get_sm_no_lock_down_hww_global_esr_mask(g);

	/* Lock down all SMs */
	for (sm_id = 0; sm_id < no_of_sm; sm_id++) {
		struct sm_info *sm_info =
			nvgpu_gr_config_get_sm_info(g->gr->config, sm_id);
		gpc = nvgpu_gr_config_get_sm_info_gpc_index(sm_info);
		tpc = nvgpu_gr_config_get_sm_info_tpc_index(sm_info);
		sm = nvgpu_gr_config_get_sm_info_sm_index(sm_info);

		err = g->ops.gr.lock_down_sm(g, gpc, tpc, sm,
				global_mask, false);
		if (err != 0) {
			nvgpu_err(g, "sm did not lock down!");
			return err;
		}
	}

	/* Read the warp status */
	g->ops.gr.bpt_reg_info(g, w_state);

	return 0;
}

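/*
 * Resume execution after a debugger pause: clear the broadcast pause mask,
 * re-enable SM exception forwarding and release all SMs via the run trigger.
 */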
int gr_gk20a_resume_from_pause(struct gk20a *g)
{
	int err = 0;

	/* Clear the pause mask to tell the GPU we want to resume everyone */
	gk20a_writel(g,
		gr_gpcs_tpcs_sm_dbgr_bpt_pause_mask_r(), 0);

	/* explicitly re-enable forwarding of SM interrupts upon any resume */
	g->ops.gr.intr.tpc_exception_sm_enable(g);

	/* Now resume all sms, write a 0 to the stop trigger
	 * then a 1 to the run trigger
	 */
	g->ops.gr.resume_all_sms(g);

	return err;
}

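/*
 * Clear the HWW global error status on every SM; this also clears the
 * corresponding TPC and GPC exception state.
 */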
int gr_gk20a_clear_sm_errors(struct gk20a *g)
{
	int ret = 0;
	u32 gpc, tpc, sm;
	struct nvgpu_gr *gr = g->gr;
	u32 global_esr;
	u32 sm_per_tpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_SM_PER_TPC);

	for (gpc = 0; gpc < nvgpu_gr_config_get_gpc_count(gr->config); gpc++) {

		/* check if any tpc has an exception */
		for (tpc = 0;
		     tpc < nvgpu_gr_config_get_gpc_tpc_count(gr->config, gpc);
		     tpc++) {

			for (sm = 0; sm < sm_per_tpc; sm++) {
				global_esr = g->ops.gr.get_sm_hww_global_esr(g,
						gpc, tpc, sm);

				/* clearing hwws, also causes tpc and gpc
				 * exceptions to be cleared
				 */
				g->ops.gr.clear_sm_hww(g,
					gpc, tpc, sm, global_esr);
			}
		}
	}

	return ret;
}

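/*
 * Build a per-SM bitmask of the TPC exception enable state; bit i reflects
 * the enable bit read for SM i as laid out by the sm_info table.
 */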
u32 gr_gk20a_tpc_enabled_exceptions(struct gk20a *g)
{
	struct nvgpu_gr *gr = g->gr;
	u32 sm_id, tpc_exception_en = 0;
	u32 offset, regval, tpc_offset, gpc_offset;
	u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
	u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE);
	u32 no_of_sm = nvgpu_gr_config_get_no_of_sm(gr->config);

	for (sm_id = 0; sm_id < no_of_sm; sm_id++) {
		struct sm_info *sm_info =
			nvgpu_gr_config_get_sm_info(g->gr->config, sm_id);
		tpc_offset = tpc_in_gpc_stride *
			nvgpu_gr_config_get_sm_info_tpc_index(sm_info);
		gpc_offset = gpc_stride *
			nvgpu_gr_config_get_sm_info_gpc_index(sm_info);
		offset = tpc_offset + gpc_offset;

		regval = gk20a_readl(g, gr_gpc0_tpc0_tpccs_tpc_exception_en_r() +
				offset);
		/* Each bit represents the corresponding enablement state; bit 0 corresponds to SM0 */
		tpc_exception_en |= gr_gpc0_tpc0_tpccs_tpc_exception_en_sm_v(regval) << sm_id;
	}

	return tpc_exception_en;
}

u32 gk20a_gr_get_sm_hww_warp_esr(struct gk20a *g, u32 gpc, u32 tpc, u32 sm)
{
	u32 offset = nvgpu_gr_gpc_offset(g, gpc) + nvgpu_gr_tpc_offset(g, tpc);
	u32 hww_warp_esr = gk20a_readl(g,
			gr_gpc0_tpc0_sm_hww_warp_esr_r() + offset);
	return hww_warp_esr;
}

u32 gk20a_gr_get_sm_hww_global_esr(struct gk20a *g, u32 gpc, u32 tpc, u32 sm)
{
	u32 offset = nvgpu_gr_gpc_offset(g, gpc) + nvgpu_gr_tpc_offset(g, tpc);

	u32 hww_global_esr = gk20a_readl(g,
			gr_gpc0_tpc0_sm_hww_global_esr_r() + offset);

	return hww_global_esr;
}

u32 gk20a_gr_get_sm_no_lock_down_hww_global_esr_mask(struct gk20a *g)
{
	/*
	 * These three interrupts don't require locking down the SM. They can
	 * be handled by usermode clients as they aren't fatal. Additionally,
	 * usermode clients may wish to allow some warps to execute while others
	 * are at breakpoints, as opposed to fatal errors where all warps should
	 * halt.
	 */
	u32 global_esr_mask =
		gr_gpc0_tpc0_sm_hww_global_esr_bpt_int_pending_f() |
		gr_gpc0_tpc0_sm_hww_global_esr_bpt_pause_pending_f() |
		gr_gpc0_tpc0_sm_hww_global_esr_single_step_complete_pending_f();

	return global_esr_mask;
}