Mirror of git://nv-tegra.nvidia.com/linux-nvgpu.git, synced 2025-12-22 17:36:20 +03:00
Dump set_falcon method, class and whitelist register being accessed.

Bug 200594051

Change-Id: Ic7fe014ba917a23b1ca9474bf5bd1d231f7ed60f
Signed-off-by: Seema Khowala <seemaj@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2308857
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
Reviewed-by: svc-mobile-misra <svc-mobile-misra@nvidia.com>
Reviewed-by: svc-mobile-cert <svc-mobile-cert@nvidia.com>
Reviewed-by: Vinod Gopalakrishnakurup <vinodg@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
GVS: Gerrit_Virtual_Submit
/*
 * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include <nvgpu/gk20a.h>
#include <nvgpu/nvgpu_err.h>
#include <nvgpu/io.h>
#include <nvgpu/channel.h>
#include <nvgpu/rc.h>
#include <nvgpu/static_analysis.h>
#include <nvgpu/error_notifier.h>
#include <nvgpu/power_features/pg.h>
#if defined(CONFIG_NVGPU_CYCLESTATS)
#include <nvgpu/cyclestats.h>
#endif

#include <nvgpu/gr/gr.h>
#include <nvgpu/gr/gr_intr.h>
#include <nvgpu/gr/config.h>
#include <nvgpu/gr/gr_falcon.h>
#include <nvgpu/gr/fecs_trace.h>
#include <nvgpu/gr/gr_utils.h>

#include "gr_intr_priv.h"

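/*
 * Collect the current FECS context pointer, the ctxsw status registers and
 * the supplied mailbox value, then forward the record to the ctxsw error
 * reporting interface.
 */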
void gr_intr_report_ctxsw_error(struct gk20a *g, u32 err_type, u32 chid,
		u32 mailbox_value)
{
	struct ctxsw_err_info err_info;

	err_info.curr_ctx = g->ops.gr.falcon.get_current_ctx(g);
	err_info.ctxsw_status0 = g->ops.gr.falcon.read_fecs_ctxsw_status0(g);
	err_info.ctxsw_status1 = g->ops.gr.falcon.read_fecs_ctxsw_status1(g);
	err_info.mailbox_value = mailbox_value;
	err_info.chid = chid;

	nvgpu_report_ctxsw_err(g, NVGPU_ERR_MODULE_FECS,
			err_type, (void *)&err_info);
}

static int gr_intr_handle_pending_tpc_sm_exception(struct gk20a *g, u32 gpc, u32 tpc,
		bool *post_event, struct nvgpu_channel *fault_ch,
		u32 *hww_global_esr)
{
	int tmp_ret, ret = 0;
	u32 esr_sm_sel, sm;
	u32 sm_per_tpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_SM_PER_TPC);

	nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
		"GPC%d TPC%d: SM exception pending", gpc, tpc);

	if (g->ops.gr.intr.handle_tpc_sm_ecc_exception != NULL) {
		g->ops.gr.intr.handle_tpc_sm_ecc_exception(g, gpc, tpc);
	}

	g->ops.gr.intr.get_esr_sm_sel(g, gpc, tpc, &esr_sm_sel);

	for (sm = 0; sm < sm_per_tpc; sm++) {

		if ((esr_sm_sel & BIT32(sm)) == 0U) {
			continue;
		}

		nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
			"GPC%d TPC%d: SM%d exception pending",
			gpc, tpc, sm);

		tmp_ret = g->ops.gr.intr.handle_sm_exception(g,
				gpc, tpc, sm, post_event, fault_ch,
				hww_global_esr);
		ret = (ret != 0) ? ret : tmp_ret;

		/* clear the hwws, also causes tpc and gpc
		 * exceptions to be cleared. Should be cleared
		 * only if SM is locked down or empty.
		 */
		g->ops.gr.intr.clear_sm_hww(g,
			gpc, tpc, sm, *hww_global_esr);
	}

	return ret;
}

static int gr_intr_handle_tpc_exception(struct gk20a *g, u32 gpc, u32 tpc,
		bool *post_event, struct nvgpu_channel *fault_ch,
		u32 *hww_global_esr)
{
	int ret = 0;
	struct nvgpu_gr_tpc_exception pending_tpc;
	u32 offset = nvgpu_safe_add_u32(nvgpu_gr_gpc_offset(g, gpc),
			nvgpu_gr_tpc_offset(g, tpc));
	u32 tpc_exception = g->ops.gr.intr.get_tpc_exception(g, offset,
			&pending_tpc);

	nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
			"GPC%d TPC%d: pending exception 0x%x",
			gpc, tpc, tpc_exception);

	/* check if an sm exception is pending */
	if (pending_tpc.sm_exception) {
		ret = gr_intr_handle_pending_tpc_sm_exception(g, gpc, tpc,
				post_event, fault_ch, hww_global_esr);
	}

	/* check if a tex exception is pending */
	if (pending_tpc.tex_exception) {
		nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
			"GPC%d TPC%d: TEX exception pending", gpc, tpc);
#ifdef CONFIG_NVGPU_HAL_NON_FUSA
		if (g->ops.gr.intr.handle_tex_exception != NULL) {
			g->ops.gr.intr.handle_tex_exception(g, gpc, tpc);
		}
#endif
	}

	/* check if a mpc exception is pending */
	if (pending_tpc.mpc_exception) {
		nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
			"GPC%d TPC%d: MPC exception pending", gpc, tpc);
		if (g->ops.gr.intr.handle_tpc_mpc_exception != NULL) {
			g->ops.gr.intr.handle_tpc_mpc_exception(g, gpc, tpc);
		}
	}

	/* check if a pe exception is pending */
	if (pending_tpc.pe_exception) {
		nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
			"GPC%d TPC%d: PE exception pending", gpc, tpc);
		if (g->ops.gr.intr.handle_tpc_pe_exception != NULL) {
			g->ops.gr.intr.handle_tpc_pe_exception(g, gpc, tpc);
		}
	}

	return ret;
}

#if defined(CONFIG_NVGPU_CHANNEL_TSG_CONTROL) && defined(CONFIG_NVGPU_DEBUGGER)
static void gr_intr_post_bpt_events(struct gk20a *g, struct nvgpu_tsg *tsg,
		u32 global_esr)
{
	if (g->ops.gr.esr_bpt_pending_events(global_esr,
				NVGPU_EVENT_ID_BPT_INT)) {
		g->ops.tsg.post_event_id(tsg, NVGPU_EVENT_ID_BPT_INT);
	}

	if (g->ops.gr.esr_bpt_pending_events(global_esr,
				NVGPU_EVENT_ID_BPT_PAUSE)) {
		g->ops.tsg.post_event_id(tsg, NVGPU_EVENT_ID_BPT_PAUSE);
	}
}
#endif

static int gr_intr_handle_illegal_method(struct gk20a *g,
		struct nvgpu_gr_isr_data *isr_data)
{
	int ret = g->ops.gr.intr.handle_sw_method(g, isr_data->addr,
			isr_data->class_num, isr_data->offset,
			isr_data->data_lo);
	if (ret != 0) {
		nvgpu_gr_intr_set_error_notifier(g, isr_data,
				NVGPU_ERR_NOTIFIER_GR_ILLEGAL_NOTIFY);
		nvgpu_err(g, "invalid method class 0x%08x"
			", offset 0x%08x address 0x%08x",
			isr_data->class_num, isr_data->offset, isr_data->addr);
	}
	return ret;
}

static void gr_intr_handle_class_error(struct gk20a *g,
		struct nvgpu_gr_isr_data *isr_data)
{
	u32 chid = (isr_data->ch != NULL) ?
		isr_data->ch->chid : NVGPU_INVALID_CHANNEL_ID;

	nvgpu_log_fn(g, " ");

	g->ops.gr.intr.handle_class_error(g, chid, isr_data);

	nvgpu_gr_intr_set_error_notifier(g, isr_data,
			NVGPU_ERR_NOTIFIER_GR_ERROR_SW_NOTIFY);
}

static void gr_intr_report_sm_exception(struct gk20a *g, u32 gpc, u32 tpc,
		u32 sm, u32 hww_warp_esr_status, u64 hww_warp_esr_pc)
{
	struct gr_sm_mcerr_info err_info;
	struct nvgpu_channel *ch;
	struct gr_err_info info;
	u32 tsgid, chid, curr_ctx, inst = 0;

	tsgid = NVGPU_INVALID_TSG_ID;
	curr_ctx = g->ops.gr.falcon.get_current_ctx(g);
	if (curr_ctx == 0U) {
		return;
	}

	ch = nvgpu_gr_intr_get_channel_from_ctx(g, curr_ctx, &tsgid);
	chid = (ch != NULL) ? ch->chid : NVGPU_INVALID_CHANNEL_ID;
	if (ch != NULL) {
		nvgpu_channel_put(ch);
	}

	(void) memset(&err_info, 0, sizeof(err_info));
	(void) memset(&info, 0, sizeof(info));
	err_info.curr_ctx = curr_ctx;
	err_info.chid = chid;
	err_info.tsgid = tsgid;
	err_info.hww_warp_esr_pc = hww_warp_esr_pc;
	err_info.hww_warp_esr_status = hww_warp_esr_status;
	err_info.gpc = gpc;
	err_info.tpc = tpc;
	err_info.sm = sm;
	info.sm_mcerr_info = &err_info;
	nvgpu_report_gr_err(g, NVGPU_ERR_MODULE_SM, inst,
			GPU_SM_MACHINE_CHECK_ERROR, &info, 0U);
}

/* Used by sw interrupt thread to translate current ctx to chid.
 * Also used by regops to translate current ctx to chid and tsgid.
 * For performance, we don't want to go through 128 channels every time.
 * curr_ctx should be the value read from gr falcon get_current_ctx op
 * A small tlb is used here to cache translation.
 *
 * Returned channel must be freed with nvgpu_channel_put() */
struct nvgpu_channel *nvgpu_gr_intr_get_channel_from_ctx(struct gk20a *g,
		u32 curr_ctx, u32 *curr_tsgid)
{
	struct nvgpu_fifo *f = &g->fifo;
	struct nvgpu_gr_intr *intr = nvgpu_gr_get_intr_ptr(g);
	u32 chid;
	u32 tsgid = NVGPU_INVALID_TSG_ID;
	u32 i;
	struct nvgpu_channel *ret_ch = NULL;

	/* when contexts are unloaded from GR, the valid bit is reset
	 * but the instance pointer information remains intact.
	 * This might be called from gr_isr where contexts might be
	 * unloaded. No need to check ctx_valid bit
	 */

	nvgpu_spinlock_acquire(&intr->ch_tlb_lock);

	/* check cache first */
	for (i = 0; i < GR_CHANNEL_MAP_TLB_SIZE; i++) {
		if (intr->chid_tlb[i].curr_ctx == curr_ctx) {
			chid = intr->chid_tlb[i].chid;
			tsgid = intr->chid_tlb[i].tsgid;
			ret_ch = nvgpu_channel_from_id(g, chid);
			goto unlock;
		}
	}

	/* slow path */
	for (chid = 0; chid < f->num_channels; chid++) {
		struct nvgpu_channel *ch = nvgpu_channel_from_id(g, chid);

		if (ch == NULL) {
			continue;
		}

		if (nvgpu_inst_block_ptr(g, &ch->inst_block) ==
				g->ops.gr.falcon.get_ctx_ptr(curr_ctx)) {
			tsgid = ch->tsgid;
			/* found it */
			ret_ch = ch;
			break;
		}
		nvgpu_channel_put(ch);
	}

	if (ret_ch == NULL) {
		goto unlock;
	}

	/* add to free tlb entry */
	for (i = 0; i < GR_CHANNEL_MAP_TLB_SIZE; i++) {
		if (intr->chid_tlb[i].curr_ctx == 0U) {
			intr->chid_tlb[i].curr_ctx = curr_ctx;
			intr->chid_tlb[i].chid = chid;
			intr->chid_tlb[i].tsgid = tsgid;
			goto unlock;
		}
	}

	/* no free entry, flush one */
	intr->chid_tlb[intr->channel_tlb_flush_index].curr_ctx = curr_ctx;
	intr->chid_tlb[intr->channel_tlb_flush_index].chid = chid;
	intr->chid_tlb[intr->channel_tlb_flush_index].tsgid = tsgid;

	intr->channel_tlb_flush_index =
		(nvgpu_safe_add_u32(intr->channel_tlb_flush_index, 1U)) &
		(nvgpu_safe_sub_u32(GR_CHANNEL_MAP_TLB_SIZE, 1U));

unlock:
	nvgpu_spinlock_release(&intr->ch_tlb_lock);
	*curr_tsgid = tsgid;
	return ret_ch;
}

void nvgpu_gr_intr_report_exception(struct gk20a *g, u32 inst,
		u32 err_type, u32 status, u32 sub_err_type)
{
	struct nvgpu_channel *ch = NULL;
	struct gr_exception_info err_info;
	struct gr_err_info info;
	u32 tsgid, chid, curr_ctx;

	tsgid = NVGPU_INVALID_TSG_ID;
	curr_ctx = g->ops.gr.falcon.get_current_ctx(g);
	if (curr_ctx != 0U) {
		ch = nvgpu_gr_intr_get_channel_from_ctx(g, curr_ctx, &tsgid);
	}
	chid = (ch != NULL) ? ch->chid : NVGPU_INVALID_CHANNEL_ID;
	if (ch != NULL) {
		nvgpu_channel_put(ch);
	}

	(void) memset(&err_info, 0, sizeof(err_info));
	(void) memset(&info, 0, sizeof(info));
	err_info.curr_ctx = curr_ctx;
	err_info.chid = chid;
	err_info.tsgid = tsgid;
	err_info.status = status;
	info.exception_info = &err_info;
	nvgpu_report_gr_err(g, NVGPU_ERR_MODULE_PGRAPH,
			inst, err_type, &info, sub_err_type);
}

void nvgpu_gr_intr_set_error_notifier(struct gk20a *g,
		struct nvgpu_gr_isr_data *isr_data, u32 error_notifier)
{
	struct nvgpu_channel *ch;
	struct nvgpu_tsg *tsg;

	ch = isr_data->ch;

	if (ch == NULL) {
		return;
	}

	tsg = nvgpu_tsg_from_ch(ch);
	if (tsg != NULL) {
		nvgpu_tsg_set_error_notifier(g, tsg, error_notifier);
	} else {
		nvgpu_err(g, "chid: %d is not bound to tsg", ch->chid);
	}
}

static bool is_global_esr_error(u32 global_esr, u32 global_mask)
{
	return ((global_esr & ~global_mask) != 0U) ? true : false;
}

static void gr_intr_report_warp_error(struct gk20a *g, u32 gpc, u32 tpc,
		u32 sm, u32 global_esr, u32 warp_esr,
		u32 global_mask, u32 offset)
{
	u64 hww_warp_esr_pc = 0;

	if (is_global_esr_error(global_esr, global_mask)) {
		if (g->ops.gr.intr.get_sm_hww_warp_esr_pc != NULL) {
			hww_warp_esr_pc = g->ops.gr.intr.get_sm_hww_warp_esr_pc(g,
					offset);
		}
		gr_intr_report_sm_exception(g, gpc, tpc, sm, warp_esr,
				hww_warp_esr_pc);
	}
}

#ifdef CONFIG_NVGPU_DEBUGGER
static int gr_intr_sm_exception_warp_sync(struct gk20a *g,
		u32 gpc, u32 tpc, u32 sm,
		u32 global_esr, u32 warp_esr, u32 global_mask,
		bool ignore_debugger, bool *post_event)
{
	int ret = 0;
	bool do_warp_sync = false;

	if (!ignore_debugger && ((warp_esr != 0U) ||
			(is_global_esr_error(global_esr, global_mask)))) {
		nvgpu_log(g, gpu_dbg_intr, "warp sync needed");
		do_warp_sync = true;
	}

	if (do_warp_sync) {
		ret = g->ops.gr.lock_down_sm(g, gpc, tpc, sm,
				global_mask, true);
		if (ret != 0) {
			nvgpu_err(g, "sm did not lock down!");
			return ret;
		}
	}

	if (ignore_debugger) {
		nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
			"ignore_debugger set, skipping event posting");
	} else {
		*post_event = true;
	}

	return ret;
}
#endif

int nvgpu_gr_intr_handle_sm_exception(struct gk20a *g, u32 gpc, u32 tpc, u32 sm,
		bool *post_event, struct nvgpu_channel *fault_ch,
		u32 *hww_global_esr)
{
	int ret = 0;
	u32 offset = nvgpu_safe_add_u32(nvgpu_gr_gpc_offset(g, gpc),
			nvgpu_gr_tpc_offset(g, tpc));
	u32 global_esr, warp_esr, global_mask;
#ifdef CONFIG_NVGPU_DEBUGGER
	bool sm_debugger_attached;
	bool early_exit = false, ignore_debugger = false;
	bool disable_sm_exceptions = true;
#endif

	nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, " ");

	global_esr = g->ops.gr.intr.get_sm_hww_global_esr(g, gpc, tpc, sm);
	*hww_global_esr = global_esr;

	warp_esr = g->ops.gr.intr.get_sm_hww_warp_esr(g, gpc, tpc, sm);
	global_mask = g->ops.gr.intr.get_sm_no_lock_down_hww_global_esr_mask(g);

	nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
		"sm hww global 0x%08x warp 0x%08x", global_esr, warp_esr);

	/*
	 * Check and report any fatal warp errors.
	 */
	gr_intr_report_warp_error(g, gpc, tpc, sm, global_esr, warp_esr,
			global_mask, offset);

	(void)nvgpu_pg_elpg_protected_call(g,
		nvgpu_safe_cast_u32_to_s32(
			g->ops.gr.intr.record_sm_error_state(g, gpc, tpc,
				sm, fault_ch)));

#ifdef CONFIG_NVGPU_DEBUGGER
	sm_debugger_attached = g->ops.gr.sm_debugger_attached(g);
	if (!sm_debugger_attached) {
		nvgpu_err(g, "sm hww global 0x%08x warp 0x%08x",
			global_esr, warp_esr);
		return -EFAULT;
	}

	if (g->ops.gr.pre_process_sm_exception != NULL) {
		ret = g->ops.gr.pre_process_sm_exception(g, gpc, tpc, sm,
				global_esr, warp_esr,
				sm_debugger_attached,
				fault_ch,
				&early_exit,
				&ignore_debugger);
		if (ret != 0) {
			nvgpu_err(g, "could not pre-process sm error!");
			return ret;
		}
	}

	if (early_exit) {
		nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
				"returning early");
		return ret;
	}

	/*
	 * Disable forwarding of tpc exceptions,
	 * the debugger will reenable exceptions after servicing them.
	 *
	 * Do not disable exceptions if the only SM exception is BPT_INT
	 */
	if ((g->ops.gr.esr_bpt_pending_events(global_esr,
			NVGPU_EVENT_ID_BPT_INT)) && (warp_esr == 0U)) {
		disable_sm_exceptions = false;
	}

	if (!ignore_debugger && disable_sm_exceptions) {
		g->ops.gr.intr.tpc_exception_sm_disable(g, offset);
		nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
			"SM Exceptions disabled");
	}

	/* if debugger is present and an error has occurred, do a warp sync */
	ret = gr_intr_sm_exception_warp_sync(g, gpc, tpc, sm,
			global_esr, warp_esr, global_mask,
			ignore_debugger, post_event);
#else
	/* Return error so that recovery is triggered */
	ret = -EFAULT;
#endif

	return ret;
}

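/*
 * Decode the FECS host interrupt status and service each source:
 * unimplemented firmware methods, the ctxsw watchdog, ctxsw_intr0 mailbox
 * errors and faults during ctxsw. A non-zero return value indicates a fatal
 * error for which channel recovery should be triggered.
 */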
int nvgpu_gr_intr_handle_fecs_error(struct gk20a *g, struct nvgpu_channel *ch,
		struct nvgpu_gr_isr_data *isr_data)
{
	u32 gr_fecs_intr, mailbox_value;
	int ret = 0;
	struct nvgpu_fecs_host_intr_status fecs_host_intr;
	u32 chid = (isr_data->ch != NULL) ?
		isr_data->ch->chid : NVGPU_INVALID_CHANNEL_ID;
	u32 mailbox_id = NVGPU_GR_FALCON_FECS_CTXSW_MAILBOX6;

	gr_fecs_intr = g->ops.gr.falcon.fecs_host_intr_status(g,
				&fecs_host_intr);
	if (gr_fecs_intr == 0U) {
		return 0;
	}

	if (fecs_host_intr.unimp_fw_method_active) {
		mailbox_value = g->ops.gr.falcon.read_fecs_ctxsw_mailbox(g,
						mailbox_id);
		nvgpu_gr_intr_set_error_notifier(g, isr_data,
				NVGPU_ERR_NOTIFIER_FECS_ERR_UNIMP_FIRMWARE_METHOD);
		nvgpu_err(g, "firmware method error: "
			"mailbox6 0x%08x, trapped_addr_reg 0x%08x "
			"set_falcon_method 0x%08x, class 0x%08x "
			"non-whitelist reg: 0x%08x",
			mailbox_value, isr_data->addr,
			isr_data->offset << 2U, isr_data->class_num,
			isr_data->data_lo);
		ret = -1;
	} else if (fecs_host_intr.watchdog_active) {
		gr_intr_report_ctxsw_error(g,
				GPU_FECS_CTXSW_WATCHDOG_TIMEOUT,
				chid, 0);
		/* currently, recovery is not initiated */
		nvgpu_err(g, "fecs watchdog triggered for channel %u, "
				"cannot ctxsw anymore !!", chid);
		g->ops.gr.falcon.dump_stats(g);
	} else if (fecs_host_intr.ctxsw_intr0 != 0U) {
		mailbox_value = g->ops.gr.falcon.read_fecs_ctxsw_mailbox(g,
						mailbox_id);
#ifdef CONFIG_NVGPU_FECS_TRACE
		if (mailbox_value ==
			g->ops.gr.fecs_trace.get_buffer_full_mailbox_val()) {
			nvgpu_info(g, "ctxsw intr0 set by ucode, "
					"timestamp buffer full");
			nvgpu_gr_fecs_trace_reset_buffer(g);
		} else
#endif
		/*
		 * The mailbox values may vary across chips hence keeping it
		 * as a HAL.
		 */
		if ((g->ops.gr.intr.get_ctxsw_checksum_mismatch_mailbox_val != NULL)
				&& (mailbox_value ==
			g->ops.gr.intr.get_ctxsw_checksum_mismatch_mailbox_val())) {

			gr_intr_report_ctxsw_error(g,
					GPU_FECS_CTXSW_CRC_MISMATCH,
					chid, mailbox_value);
			nvgpu_err(g, "ctxsw intr0 set by ucode, "
					"ctxsw checksum mismatch");
			ret = -1;
		} else {
			/*
			 * Other errors are also treated as fatal and channel
			 * recovery is initiated and error is reported to
			 * 3LSS.
			 */
			gr_intr_report_ctxsw_error(g,
					GPU_FECS_FAULT_DURING_CTXSW,
					chid, mailbox_value);
			nvgpu_err(g,
				"ctxsw intr0 set by ucode, error_code: 0x%08x",
				mailbox_value);
			ret = -1;
		}
	} else if (fecs_host_intr.fault_during_ctxsw_active) {
		gr_intr_report_ctxsw_error(g,
				GPU_FECS_FAULT_DURING_CTXSW,
				chid, 0);
		nvgpu_err(g, "fecs fault during ctxsw for channel %u", chid);
		ret = -1;
	} else {
		nvgpu_err(g,
			"unhandled fecs error interrupt 0x%08x for channel %u",
			gr_fecs_intr, chid);
		g->ops.gr.falcon.dump_stats(g);
	}

	g->ops.gr.falcon.fecs_host_clear_intr(g, gr_fecs_intr);

	return ret;
}

static int gr_intr_check_handle_tpc_exception(struct gk20a *g, u32 gpc,
		u32 tpc_exception, bool *post_event, struct nvgpu_gr_config *gr_config,
		struct nvgpu_channel *fault_ch, u32 *hww_global_esr)
{
	int tmp_ret, ret = 0;
	u32 tpc;

	for (tpc = 0;
	     tpc < nvgpu_gr_config_get_gpc_tpc_count(gr_config, gpc);
	     tpc++) {
		if ((tpc_exception & BIT32(tpc)) == 0U) {
			continue;
		}

		nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
			"GPC%d: TPC%d exception pending", gpc, tpc);

		tmp_ret = gr_intr_handle_tpc_exception(g, gpc, tpc,
				post_event, fault_ch, hww_global_esr);
		ret = (ret != 0) ? ret : tmp_ret;
	}
	return ret;
}

int nvgpu_gr_intr_handle_gpc_exception(struct gk20a *g, bool *post_event,
		struct nvgpu_gr_config *gr_config, struct nvgpu_channel *fault_ch,
		u32 *hww_global_esr)
{
	int ret = 0;
	u32 gpc;
	u32 exception1 = g->ops.gr.intr.read_exception1(g);
	u32 gpc_exception, tpc_exception;

	nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, " ");

	for (gpc = 0; gpc < nvgpu_gr_config_get_gpc_count(gr_config); gpc++) {
		if ((exception1 & BIT32(gpc)) == 0U) {
			continue;
		}

		nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
				"GPC%d exception pending", gpc);
		gpc_exception = g->ops.gr.intr.read_gpc_exception(g, gpc);
		tpc_exception = g->ops.gr.intr.read_gpc_tpc_exception(
							gpc_exception);

		/* check and handle if any tpc has an exception */
		ret = gr_intr_check_handle_tpc_exception(g, gpc, tpc_exception,
				post_event, gr_config, fault_ch, hww_global_esr);

		/* Handle GCC exception */
		if (g->ops.gr.intr.handle_gcc_exception != NULL) {
			g->ops.gr.intr.handle_gcc_exception(g, gpc,
				gpc_exception,
				&g->ecc.gr.gcc_l15_ecc_corrected_err_count[gpc].counter,
				&g->ecc.gr.gcc_l15_ecc_uncorrected_err_count[gpc].counter);
		}

		/* Handle GPCCS exceptions */
		if (g->ops.gr.intr.handle_gpc_gpccs_exception != NULL) {
			g->ops.gr.intr.handle_gpc_gpccs_exception(g, gpc,
				gpc_exception,
				&g->ecc.gr.gpccs_ecc_corrected_err_count[gpc].counter,
				&g->ecc.gr.gpccs_ecc_uncorrected_err_count[gpc].counter);
		}

		/* Handle GPCMMU exceptions */
		if (g->ops.gr.intr.handle_gpc_gpcmmu_exception != NULL) {
			g->ops.gr.intr.handle_gpc_gpcmmu_exception(g, gpc,
				gpc_exception,
				&g->ecc.gr.mmu_l1tlb_ecc_corrected_err_count[gpc].counter,
				&g->ecc.gr.mmu_l1tlb_ecc_uncorrected_err_count[gpc].counter);
		}

		/* Handle PROP exception */
		if (g->ops.gr.intr.handle_gpc_prop_exception != NULL) {
			g->ops.gr.intr.handle_gpc_prop_exception(g, gpc,
				gpc_exception);
		}

		/* Handle ZCULL exception */
		if (g->ops.gr.intr.handle_gpc_zcull_exception != NULL) {
			g->ops.gr.intr.handle_gpc_zcull_exception(g, gpc,
				gpc_exception);
		}

		/* Handle SETUP exception */
		if (g->ops.gr.intr.handle_gpc_setup_exception != NULL) {
			g->ops.gr.intr.handle_gpc_setup_exception(g, gpc,
				gpc_exception);
		}

		/* Handle PES exception */
		if (g->ops.gr.intr.handle_gpc_pes_exception != NULL) {
			g->ops.gr.intr.handle_gpc_pes_exception(g, gpc,
				gpc_exception);
		}
	}

	return ret;
}

void nvgpu_gr_intr_handle_notify_pending(struct gk20a *g,
		struct nvgpu_gr_isr_data *isr_data)
{
	struct nvgpu_channel *ch = isr_data->ch;
	int err;

	if (ch == NULL) {
		return;
	}

	if (nvgpu_tsg_from_ch(ch) == NULL) {
		return;
	}

	nvgpu_log_fn(g, " ");

#if defined(CONFIG_NVGPU_CYCLESTATS)
	nvgpu_cyclestats_exec(g, ch, isr_data->data_lo);
#endif

	err = nvgpu_cond_broadcast_interruptible(&ch->notifier_wq);
	if (err != 0) {
		nvgpu_log(g, gpu_dbg_intr, "failed to broadcast");
	}
}

void nvgpu_gr_intr_handle_semaphore_pending(struct gk20a *g,
		struct nvgpu_gr_isr_data *isr_data)
{
	struct nvgpu_channel *ch = isr_data->ch;
	struct nvgpu_tsg *tsg;

	if (ch == NULL) {
		return;
	}

	tsg = nvgpu_tsg_from_ch(ch);
	if (tsg != NULL) {
		int err;

#ifdef CONFIG_NVGPU_CHANNEL_TSG_CONTROL
		g->ops.tsg.post_event_id(tsg,
			NVGPU_EVENT_ID_GR_SEMAPHORE_WRITE_AWAKEN);
#endif

		err = nvgpu_cond_broadcast(&ch->semaphore_wq);
		if (err != 0) {
			nvgpu_log(g, gpu_dbg_intr, "failed to broadcast");
		}
	} else {
		nvgpu_err(g, "chid: %d is not bound to tsg", ch->chid);
	}
}

#ifdef CONFIG_NVGPU_DEBUGGER
static void gr_intr_signal_exception_event(struct gk20a *g,
		bool post_event,
		struct nvgpu_channel *fault_ch)
{
	if (g->ops.gr.sm_debugger_attached(g) &&
			post_event && (fault_ch != NULL)) {
		g->ops.debugger.post_events(fault_ch);
	}
}
#endif

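/*
 * Service pending engine exceptions. When a GPC exception is reported and no
 * reset is already required, walk the GPCs and TPCs to handle it and, with
 * the debugger enabled, signal clients waiting on exception events. Returns
 * 1U when recovery (reset) is needed, 0U otherwise.
 */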
static u32 gr_intr_handle_exception_interrupts(struct gk20a *g,
		u32 *clear_intr,
		struct nvgpu_tsg *tsg, u32 *global_esr,
		struct nvgpu_gr_intr_info *intr_info,
		struct nvgpu_gr_isr_data *isr_data)
{
	struct nvgpu_channel *fault_ch = NULL;
	struct nvgpu_gr_config *gr_config = nvgpu_gr_get_config_ptr(g);
	bool need_reset = false;

	if (intr_info->exception != 0U) {
		bool is_gpc_exception = false;

		need_reset = g->ops.gr.intr.handle_exceptions(g,
							&is_gpc_exception);

		/* check if a gpc exception has occurred */
		if (is_gpc_exception && !need_reset) {
			bool post_event = false;

			nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
					"GPC exception pending");

			if (tsg != NULL) {
				fault_ch = isr_data->ch;
			}

			/* fault_ch can be NULL */
			/* check if any gpc has an exception */
			if (nvgpu_gr_intr_handle_gpc_exception(g, &post_event,
				gr_config, fault_ch, global_esr) != 0) {
				need_reset = true;
			}

#ifdef CONFIG_NVGPU_DEBUGGER
			/* signal clients waiting on an event */
			gr_intr_signal_exception_event(g,
					post_event, fault_ch);
#endif
		}
		*clear_intr &= ~intr_info->exception;

		if (need_reset) {
			nvgpu_err(g, "set gr exception notifier");
			nvgpu_gr_intr_set_error_notifier(g, isr_data,
					NVGPU_ERR_NOTIFIER_GR_EXCEPTION);
		}
	}

	return (need_reset) ? 1U : 0U;
}

static u32 gr_intr_handle_illegal_interrupts(struct gk20a *g,
		u32 gr_intr, u32 *clear_intr,
		struct nvgpu_gr_intr_info *intr_info,
		struct nvgpu_gr_isr_data *isr_data)
{
	u32 do_reset = 0U;

	if (intr_info->illegal_notify != 0U) {
		nvgpu_err(g, "illegal notify pending");

		nvgpu_gr_intr_report_exception(g, 0U,
				GPU_PGRAPH_ILLEGAL_ERROR, gr_intr,
				GPU_PGRAPH_ILLEGAL_NOTIFY);
		nvgpu_gr_intr_set_error_notifier(g, isr_data,
				NVGPU_ERR_NOTIFIER_GR_ILLEGAL_NOTIFY);
		do_reset = 1U;
		*clear_intr &= ~intr_info->illegal_notify;
	}

	if (intr_info->illegal_method != 0U) {
		if (gr_intr_handle_illegal_method(g, isr_data) != 0) {
			nvgpu_gr_intr_report_exception(g, 0U,
					GPU_PGRAPH_ILLEGAL_ERROR, gr_intr,
					GPU_PGRAPH_ILLEGAL_METHOD);

			do_reset = 1U;
		}
		*clear_intr &= ~intr_info->illegal_method;
	}

	if (intr_info->illegal_class != 0U) {
		nvgpu_gr_intr_report_exception(g, 0U,
				GPU_PGRAPH_ILLEGAL_ERROR, gr_intr,
				GPU_PGRAPH_ILLEGAL_CLASS);
		nvgpu_err(g, "invalid class 0x%08x, offset 0x%08x",
			isr_data->class_num, isr_data->offset);

		nvgpu_gr_intr_set_error_notifier(g, isr_data,
				NVGPU_ERR_NOTIFIER_GR_ERROR_SW_NOTIFY);
		do_reset = 1U;
		*clear_intr &= ~intr_info->illegal_class;
	}
	return do_reset;
}

static u32 gr_intr_handle_error_interrupts(struct gk20a *g,
		u32 gr_intr, u32 *clear_intr,
		struct nvgpu_gr_intr_info *intr_info,
		struct nvgpu_gr_isr_data *isr_data)
{
	u32 do_reset = 0U;

	if (intr_info->fecs_error != 0U) {
		if (g->ops.gr.intr.handle_fecs_error(g,
				isr_data->ch, isr_data) != 0) {
			do_reset = 1U;
		}
		*clear_intr &= ~intr_info->fecs_error;
	}

	if (intr_info->class_error != 0U) {
		nvgpu_gr_intr_report_exception(g, 0U,
				GPU_PGRAPH_ILLEGAL_ERROR, gr_intr,
				GPU_PGRAPH_CLASS_ERROR);
		gr_intr_handle_class_error(g, isr_data);
		do_reset = 1U;
		*clear_intr &= ~intr_info->class_error;
	}

	/* this one happens if someone tries to hit a non-whitelisted
	 * register using set_falcon[4] */
	if (intr_info->fw_method != 0U) {
		u32 ch_id = (isr_data->ch != NULL) ?
			isr_data->ch->chid : NVGPU_INVALID_CHANNEL_ID;
		nvgpu_err(g,
			"firmware method 0x%08x, offset 0x%08x for channel %u",
			isr_data->class_num, isr_data->offset,
			ch_id);

		nvgpu_gr_intr_set_error_notifier(g, isr_data,
				NVGPU_ERR_NOTIFIER_GR_ERROR_SW_NOTIFY);
		do_reset = 1U;
		*clear_intr &= ~intr_info->fw_method;
	}
	return do_reset;
}

static void gr_intr_handle_pending_interrupts(struct gk20a *g,
		u32 *clear_intr,
		struct nvgpu_gr_intr_info *intr_info,
		struct nvgpu_gr_isr_data *isr_data)
{
	if (intr_info->notify != 0U) {
		g->ops.gr.intr.handle_notify_pending(g, isr_data);
		*clear_intr &= ~intr_info->notify;
	}

	if (intr_info->semaphore != 0U) {
		g->ops.gr.intr.handle_semaphore_pending(g, isr_data);
		*clear_intr &= ~intr_info->semaphore;
	}
}

static struct nvgpu_tsg *gr_intr_get_channel_from_ctx(struct gk20a *g,
		u32 gr_intr, u32 *chid,
		struct nvgpu_gr_isr_data *isr_data)
{
	struct nvgpu_channel *ch = NULL;
	u32 tsgid = NVGPU_INVALID_TSG_ID;
	struct nvgpu_tsg *tsg_info = NULL;
	u32 channel_id;

	ch = nvgpu_gr_intr_get_channel_from_ctx(g, isr_data->curr_ctx, &tsgid);
	isr_data->ch = ch;
	channel_id = (ch != NULL) ? ch->chid : NVGPU_INVALID_CHANNEL_ID;

	if (ch == NULL) {
		nvgpu_err(g,
			"pgraph intr: 0x%08x, channel_id: INVALID", gr_intr);
	} else {
		tsg_info = nvgpu_tsg_from_ch(ch);
		if (tsg_info == NULL) {
			nvgpu_err(g, "pgraph intr: 0x%08x, channel_id: %d "
				"not bound to tsg", gr_intr, channel_id);
		}
	}

	nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
		"channel %d: addr 0x%08x, "
		"data 0x%08x 0x%08x,"
		"ctx 0x%08x, offset 0x%08x, "
		"subchannel 0x%08x, class 0x%08x",
		channel_id, isr_data->addr,
		isr_data->data_hi, isr_data->data_lo,
		isr_data->curr_ctx, isr_data->offset,
		isr_data->sub_chan, isr_data->class_num);

	*chid = channel_id;

	return tsg_info;
}

static void gr_clear_intr_status(struct gk20a *g,
		struct nvgpu_gr_isr_data *isr_data,
		u32 clear_intr, u32 gr_intr, u32 chid)
{
	if (clear_intr != 0U) {
		if (isr_data->ch == NULL) {
			/*
			 * This is probably an interrupt during
			 * gk20a_free_channel()
			 */
			nvgpu_err(g, "unhandled gr intr 0x%08x for "
				"unreferenceable channel, clearing",
				gr_intr);
		} else {
			nvgpu_err(g, "unhandled gr intr 0x%08x for chid %u",
				gr_intr, chid);
		}
	}
}

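/*
 * Stalling interrupt service routine for the GR engine: read the pending
 * interrupt status, resolve the faulting channel/TSG from the current
 * context, dispatch notify/semaphore, illegal, error and exception
 * interrupts, trigger recovery when required, then clear the interrupts and
 * re-enable FIFO access.
 */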
int nvgpu_gr_intr_stall_isr(struct gk20a *g)
{
	struct nvgpu_gr_isr_data isr_data;
	struct nvgpu_gr_intr_info intr_info;
	u32 need_reset = 0U;
	struct nvgpu_tsg *tsg = NULL;
	u32 global_esr = 0;
	u32 chid = NVGPU_INVALID_CHANNEL_ID;
	u32 gr_intr = g->ops.gr.intr.read_pending_interrupts(g, &intr_info);
	u32 clear_intr = gr_intr;

	nvgpu_log_fn(g, " ");
	nvgpu_log(g, gpu_dbg_intr, "pgraph intr 0x%08x", gr_intr);

	if (gr_intr == 0U) {
		return 0;
	}

	(void) memset(&isr_data, 0, sizeof(struct nvgpu_gr_isr_data));

	/* Disable fifo access */
	g->ops.gr.init.fifo_access(g, false);

	g->ops.gr.intr.trapped_method_info(g, &isr_data);

	if (isr_data.curr_ctx != 0U) {
		tsg = gr_intr_get_channel_from_ctx(g, gr_intr, &chid,
						   &isr_data);
	}

	gr_intr_handle_pending_interrupts(g, &clear_intr,
					  &intr_info, &isr_data);

	need_reset |= gr_intr_handle_illegal_interrupts(g, gr_intr,
				&clear_intr, &intr_info, &isr_data);

	need_reset |= gr_intr_handle_error_interrupts(g, gr_intr,
				&clear_intr, &intr_info, &isr_data);

	need_reset |= gr_intr_handle_exception_interrupts(g, &clear_intr,
				tsg, &global_esr, &intr_info, &isr_data);

	if (need_reset != 0U) {
		nvgpu_rc_gr_fault(g, tsg, isr_data.ch);
	}

	gr_clear_intr_status(g, &isr_data, clear_intr, gr_intr, chid);

	/* clear handled and unhandled interrupts */
	g->ops.gr.intr.clear_pending_interrupts(g, gr_intr);

	/* Enable fifo access */
	g->ops.gr.init.fifo_access(g, true);

#if defined(CONFIG_NVGPU_CHANNEL_TSG_CONTROL) && defined(CONFIG_NVGPU_DEBUGGER)
	/* Posting of BPT events should be the last thing in this function */
	if ((global_esr != 0U) && (tsg != NULL) && (need_reset == 0U)) {
		gr_intr_post_bpt_events(g, tsg, global_esr);
	}
#endif

	if (isr_data.ch != NULL) {
		nvgpu_channel_put(isr_data.ch);
	}

	return 0;
}

/* invalidate channel lookup tlb */
void nvgpu_gr_intr_flush_channel_tlb(struct gk20a *g)
{
	struct nvgpu_gr_intr *intr = nvgpu_gr_get_intr_ptr(g);

	nvgpu_spinlock_acquire(&intr->ch_tlb_lock);
	(void) memset(intr->chid_tlb, 0,
		sizeof(struct gr_channel_map_tlb_entry) *
		GR_CHANNEL_MAP_TLB_SIZE);
	nvgpu_spinlock_release(&intr->ch_tlb_lock);
}

struct nvgpu_gr_intr *nvgpu_gr_intr_init_support(struct gk20a *g)
{
	struct nvgpu_gr_intr *intr;

	nvgpu_log_fn(g, " ");

	intr = nvgpu_kzalloc(g, sizeof(*intr));
	if (intr == NULL) {
		return intr;
	}

	nvgpu_spinlock_init(&intr->ch_tlb_lock);

	return intr;
}

void nvgpu_gr_intr_remove_support(struct gk20a *g, struct nvgpu_gr_intr *intr)
{
	nvgpu_log_fn(g, " ");

	if (intr == NULL) {
		return;
	}
	nvgpu_kfree(g, intr);
}