gpu: nvgpu: move handle_sm_exception to gr.intr

Move gr_gp10b_handle_sm_exception from gr_gp10b to the hal.gr.intr unit
and rename it gp10b_gr_intr_handle_sm_exception

Move gr_gk20a_handle_sm_exception from gr_gk20a to the common.gr.intr unit
and rename it nvgpu_gr_intr_handle_sm_exception

Move nvgpu_report_gr_sm_exception to common.gr.intr as the static helper
gr_intr_report_sm_exception, and switch callers to the new
g->ops.gr.intr.handle_sm_exception HAL op

JIRA NVGPU-3016

Change-Id: I545ddca052122f87685f35f515831841a246dab3
Signed-off-by: Vinod G <vinodg@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/2103736
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Author: Vinod G <vinodg@nvidia.com> (2019-04-23 14:37:02 -07:00)
Committed by: mobile promotions
Commit: 490ea365d2, parent: 3bd35af767
16 changed files with 331 additions and 332 deletions
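At its core the change is a HAL relocation rather than new behavior: the handle_sm_exception function pointer moves out of the flat g->ops.gr table into the g->ops.gr.intr sub-struct, and callers such as the TPC exception path are updated to match. The stand-alone sketch below uses simplified stand-in types, not the driver's real headers, and common_handle_sm_exception is a hypothetical stand-in for nvgpu_gr_intr_handle_sm_exception; it only illustrates the new call path through gr.intr.

/*
 * Simplified sketch of the relocated HAL op.  The struct layout below is a
 * pared-down stand-in for the real gpu_ops; only the member touched by this
 * commit is modeled.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct gk20a;
struct channel_gk20a;

struct gpu_ops {
	struct {
		struct {
			/* op now lives in gr.intr (was directly in gr) */
			int (*handle_sm_exception)(struct gk20a *g,
				uint32_t gpc, uint32_t tpc, uint32_t sm,
				bool *post_event,
				struct channel_gk20a *fault_ch,
				uint32_t *hww_global_esr);
		} intr;
	} gr;
};

struct gk20a {
	struct gpu_ops ops;
};

/* hypothetical stand-in for nvgpu_gr_intr_handle_sm_exception() */
static int common_handle_sm_exception(struct gk20a *g,
		uint32_t gpc, uint32_t tpc, uint32_t sm,
		bool *post_event, struct channel_gk20a *fault_ch,
		uint32_t *hww_global_esr)
{
	(void)g; (void)fault_ch;
	*hww_global_esr = 0U;
	*post_event = true;	/* let the debugger layer see the event */
	printf("SM exception: GPC%u TPC%u SM%u\n",
		(unsigned)gpc, (unsigned)tpc, (unsigned)sm);
	return 0;
}

int main(void)
{
	struct gk20a g;
	bool post_event = false;
	uint32_t global_esr = 0U;

	/* chip HAL init wires a handler into the intr sub-struct, as the
	 * per-chip ops tables do in this commit */
	g.ops.gr.intr.handle_sm_exception = common_handle_sm_exception;

	/* the TPC exception path now dereferences g->ops.gr.intr */
	return g.ops.gr.intr.handle_sm_exception(&g, 0U, 0U, 0U,
			&post_event, NULL, &global_esr);
}

In the real ops tables this assignment points either at the common handler directly or at a chip-specific wrapper such as gp10b_gr_intr_handle_sm_exception.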

View File

@@ -26,6 +26,7 @@
#include <nvgpu/regops.h>
#include <nvgpu/rc.h>
#include <nvgpu/error_notifier.h>
#include <nvgpu/power_features/pg.h>
#include <nvgpu/gr/gr.h>
#include <nvgpu/gr/gr_intr.h>
@@ -71,7 +72,7 @@ static int gr_intr_handle_tpc_exception(struct gk20a *g, u32 gpc, u32 tpc,
"GPC%d TPC%d: SM%d exception pending",
gpc, tpc, sm);
tmp_ret = g->ops.gr.handle_sm_exception(g,
tmp_ret = g->ops.gr.intr.handle_sm_exception(g,
gpc, tpc, sm, post_event, fault_ch,
hww_global_esr);
ret = (ret != 0) ? ret : tmp_ret;
@@ -153,6 +154,48 @@ static int gr_intr_handle_class_error(struct gk20a *g,
return -EINVAL;
}
static void gr_intr_report_sm_exception(struct gk20a *g, u32 gpc, u32 tpc,
u32 sm, u32 hww_warp_esr_status, u64 hww_warp_esr_pc)
{
int ret;
struct gr_sm_mcerr_info err_info;
struct channel_gk20a *ch;
struct gr_err_info info;
u32 tsgid, chid, curr_ctx, inst = 0;
if (g->ops.gr.err_ops.report_gr_err == NULL) {
return;
}
tsgid = NVGPU_INVALID_TSG_ID;
curr_ctx = g->ops.gr.falcon.get_current_ctx(g);
ch = nvgpu_gr_intr_get_channel_from_ctx(g, curr_ctx, &tsgid);
chid = ch != NULL ? ch->chid : FIFO_INVAL_CHANNEL_ID;
if (ch != NULL) {
gk20a_channel_put(ch);
}
(void) memset(&err_info, 0, sizeof(err_info));
(void) memset(&info, 0, sizeof(info));
err_info.curr_ctx = curr_ctx;
err_info.chid = chid;
err_info.tsgid = tsgid;
err_info.hww_warp_esr_pc = hww_warp_esr_pc;
err_info.hww_warp_esr_status = hww_warp_esr_status;
err_info.gpc = gpc;
err_info.tpc = tpc;
err_info.sm = sm;
info.sm_mcerr_info = &err_info;
ret = g->ops.gr.err_ops.report_gr_err(g,
NVGPU_ERR_MODULE_SM, inst, GPU_SM_MACHINE_CHECK_ERROR,
&info);
if (ret != 0) {
nvgpu_err(g, "failed to report SM_EXCEPTION "
"gpc=%u, tpc=%u, sm=%u, esr_status=%x",
gpc, tpc, sm, hww_warp_esr_status);
}
}
/* Used by sw interrupt thread to translate current ctx to chid.
* Also used by regops to translate current ctx to chid and tsgid.
* For performance, we don't want to go through 128 channels every time.
@@ -295,6 +338,112 @@ void nvgpu_gr_intr_set_error_notifier(struct gk20a *g,
}
}
int nvgpu_gr_intr_handle_sm_exception(struct gk20a *g, u32 gpc, u32 tpc, u32 sm,
bool *post_event, struct channel_gk20a *fault_ch,
u32 *hww_global_esr)
{
int ret = 0;
bool do_warp_sync = false, early_exit = false, ignore_debugger = false;
bool disable_sm_exceptions = true;
u32 offset = nvgpu_gr_gpc_offset(g, gpc) + nvgpu_gr_tpc_offset(g, tpc);
bool sm_debugger_attached;
u32 global_esr, warp_esr, global_mask;
u64 hww_warp_esr_pc = 0;
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, " ");
sm_debugger_attached = g->ops.gr.sm_debugger_attached(g);
global_esr = g->ops.gr.get_sm_hww_global_esr(g, gpc, tpc, sm);
*hww_global_esr = global_esr;
warp_esr = g->ops.gr.get_sm_hww_warp_esr(g, gpc, tpc, sm);
global_mask = g->ops.gr.get_sm_no_lock_down_hww_global_esr_mask(g);
if (!sm_debugger_attached) {
nvgpu_err(g, "sm hww global 0x%08x warp 0x%08x",
global_esr, warp_esr);
return -EFAULT;
}
nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
"sm hww global 0x%08x warp 0x%08x", global_esr, warp_esr);
/*
* Check and report any fatal warp errors.
*/
if ((global_esr & ~global_mask) != 0U) {
if (g->ops.gr.get_sm_hww_warp_esr_pc != NULL) {
hww_warp_esr_pc = g->ops.gr.get_sm_hww_warp_esr_pc(g,
offset);
}
gr_intr_report_sm_exception(g, gpc, tpc, sm, warp_esr,
hww_warp_esr_pc);
}
nvgpu_pg_elpg_protected_call(g,
g->ops.gr.record_sm_error_state(g, gpc, tpc, sm, fault_ch));
if (g->ops.gr.pre_process_sm_exception != NULL) {
ret = g->ops.gr.pre_process_sm_exception(g, gpc, tpc, sm,
global_esr, warp_esr,
sm_debugger_attached,
fault_ch,
&early_exit,
&ignore_debugger);
if (ret != 0) {
nvgpu_err(g, "could not pre-process sm error!");
return ret;
}
}
if (early_exit) {
nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
"returning early");
return ret;
}
/*
* Disable forwarding of tpc exceptions,
* the debugger will reenable exceptions after servicing them.
*
* Do not disable exceptions if the only SM exception is BPT_INT
*/
if ((g->ops.gr.esr_bpt_pending_events(global_esr,
NVGPU_EVENT_ID_BPT_INT)) && (warp_esr == 0U)) {
disable_sm_exceptions = false;
}
if (!ignore_debugger && disable_sm_exceptions) {
g->ops.gr.intr.tpc_exception_sm_disable(g, offset);
nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
"SM Exceptions disabled");
}
/* if a debugger is present and an error has occurred, do a warp sync */
if (!ignore_debugger &&
((warp_esr != 0U) || ((global_esr & ~global_mask) != 0U))) {
nvgpu_log(g, gpu_dbg_intr, "warp sync needed");
do_warp_sync = true;
}
if (do_warp_sync) {
ret = g->ops.gr.lock_down_sm(g, gpc, tpc, sm,
global_mask, true);
if (ret != 0) {
nvgpu_err(g, "sm did not lock down!");
return ret;
}
}
if (ignore_debugger) {
nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
"ignore_debugger set, skipping event posting");
} else {
*post_event = true;
}
return ret;
}
int nvgpu_gr_intr_handle_gpc_exception(struct gk20a *g, bool *post_event,
struct nvgpu_gr_config *gr_config, struct channel_gk20a *fault_ch,
u32 *hww_global_esr)

View File

@@ -139,7 +139,6 @@ static const struct gpu_ops vgpu_gp10b_ops = {
.set_sm_debug_mode = vgpu_gr_set_sm_debug_mode,
.bpt_reg_info = NULL,
.handle_fecs_error = NULL,
.handle_sm_exception = NULL,
.get_lrf_tex_ltc_dram_override = NULL,
.update_smpc_ctxsw_mode = vgpu_gr_update_smpc_ctxsw_mode,
.update_hwpm_ctxsw_mode = vgpu_gr_update_hwpm_ctxsw_mode,

View File

@@ -165,7 +165,6 @@ static const struct gpu_ops vgpu_gv11b_ops = {
.set_sm_debug_mode = vgpu_gr_set_sm_debug_mode,
.bpt_reg_info = NULL,
.handle_fecs_error = NULL,
.handle_sm_exception = NULL,
.get_lrf_tex_ltc_dram_override = NULL,
.update_smpc_ctxsw_mode = vgpu_gr_update_smpc_ctxsw_mode,
.update_hwpm_ctxsw_mode = vgpu_gr_update_hwpm_ctxsw_mode,

View File

@@ -59,7 +59,6 @@
#include <nvgpu/engine_status.h>
#include <nvgpu/nvgpu_err.h>
#include <nvgpu/power_features/cg.h>
#include <nvgpu/power_features/pg.h>
#include <nvgpu/preempt.h>
#include "gr_gk20a.h"
@@ -67,51 +66,8 @@
#include "common/gr/gr_priv.h"
#include <nvgpu/hw/gk20a/hw_fifo_gk20a.h>
#include <nvgpu/hw/gk20a/hw_gr_gk20a.h>
static void nvgpu_report_gr_sm_exception(struct gk20a *g, u32 gpc, u32 tpc,
u32 sm, u32 hww_warp_esr_status, u64 hww_warp_esr_pc)
{
int ret;
struct gr_sm_mcerr_info err_info;
struct channel_gk20a *ch;
struct gr_err_info info;
u32 tsgid, chid, curr_ctx, inst = 0;
if (g->ops.gr.err_ops.report_gr_err == NULL) {
return;
}
tsgid = NVGPU_INVALID_TSG_ID;
curr_ctx = g->ops.gr.falcon.get_current_ctx(g);
ch = nvgpu_gr_intr_get_channel_from_ctx(g, curr_ctx, &tsgid);
chid = ch != NULL ? ch->chid : FIFO_INVAL_CHANNEL_ID;
if (ch != NULL) {
gk20a_channel_put(ch);
}
(void) memset(&err_info, 0, sizeof(err_info));
(void) memset(&info, 0, sizeof(info));
err_info.curr_ctx = curr_ctx;
err_info.chid = chid;
err_info.tsgid = tsgid;
err_info.hww_warp_esr_pc = hww_warp_esr_pc;
err_info.hww_warp_esr_status = hww_warp_esr_status;
err_info.gpc = gpc;
err_info.tpc = tpc;
err_info.sm = sm;
info.sm_mcerr_info = &err_info;
ret = g->ops.gr.err_ops.report_gr_err(g,
NVGPU_ERR_MODULE_SM, inst, GPU_SM_MACHINE_CHECK_ERROR,
&info);
if (ret != 0) {
nvgpu_err(g, "failed to report SM_EXCEPTION "
"gpc=%u, tpc=%u, sm=%u, esr_status=%x",
gpc, tpc, sm, hww_warp_esr_status);
}
}
static void gr_report_ctxsw_error(struct gk20a *g, u32 err_type, u32 chid,
u32 mailbox_value)
{
@@ -373,112 +329,6 @@ bool gk20a_gr_sm_debugger_attached(struct gk20a *g)
return false;
}
int gr_gk20a_handle_sm_exception(struct gk20a *g, u32 gpc, u32 tpc, u32 sm,
bool *post_event, struct channel_gk20a *fault_ch,
u32 *hww_global_esr)
{
int ret = 0;
bool do_warp_sync = false, early_exit = false, ignore_debugger = false;
bool disable_sm_exceptions = true;
u32 offset = nvgpu_gr_gpc_offset(g, gpc) + nvgpu_gr_tpc_offset(g, tpc);
bool sm_debugger_attached;
u32 global_esr, warp_esr, global_mask;
u64 hww_warp_esr_pc = 0;
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, " ");
sm_debugger_attached = g->ops.gr.sm_debugger_attached(g);
global_esr = g->ops.gr.get_sm_hww_global_esr(g, gpc, tpc, sm);
*hww_global_esr = global_esr;
warp_esr = g->ops.gr.get_sm_hww_warp_esr(g, gpc, tpc, sm);
global_mask = g->ops.gr.get_sm_no_lock_down_hww_global_esr_mask(g);
if (!sm_debugger_attached) {
nvgpu_err(g, "sm hww global 0x%08x warp 0x%08x",
global_esr, warp_esr);
return -EFAULT;
}
nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
"sm hww global 0x%08x warp 0x%08x", global_esr, warp_esr);
/*
* Check and report any fatal warp errors.
*/
if ((global_esr & ~global_mask) != 0U) {
if (g->ops.gr.get_sm_hww_warp_esr_pc != NULL) {
hww_warp_esr_pc = g->ops.gr.get_sm_hww_warp_esr_pc(g,
offset);
}
nvgpu_report_gr_sm_exception(g, gpc, tpc, sm, warp_esr,
hww_warp_esr_pc);
}
nvgpu_pg_elpg_protected_call(g,
g->ops.gr.record_sm_error_state(g, gpc, tpc, sm, fault_ch));
if (g->ops.gr.pre_process_sm_exception != NULL) {
ret = g->ops.gr.pre_process_sm_exception(g, gpc, tpc, sm,
global_esr, warp_esr,
sm_debugger_attached,
fault_ch,
&early_exit,
&ignore_debugger);
if (ret != 0) {
nvgpu_err(g, "could not pre-process sm error!");
return ret;
}
}
if (early_exit) {
nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
"returning early");
return ret;
}
/*
* Disable forwarding of tpc exceptions,
* the debugger will reenable exceptions after servicing them.
*
* Do not disable exceptions if the only SM exception is BPT_INT
*/
if ((g->ops.gr.esr_bpt_pending_events(global_esr,
NVGPU_EVENT_ID_BPT_INT)) && (warp_esr == 0U)) {
disable_sm_exceptions = false;
}
if (!ignore_debugger && disable_sm_exceptions) {
g->ops.gr.intr.tpc_exception_sm_disable(g, offset);
nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
"SM Exceptions disabled");
}
/* if a debugger is present and an error has occurred, do a warp sync */
if (!ignore_debugger &&
((warp_esr != 0U) || ((global_esr & ~global_mask) != 0U))) {
nvgpu_log(g, gpu_dbg_intr, "warp sync needed");
do_warp_sync = true;
}
if (do_warp_sync) {
ret = g->ops.gr.lock_down_sm(g, gpc, tpc, sm,
global_mask, true);
if (ret != 0) {
nvgpu_err(g, "sm did not lock down!");
return ret;
}
}
if (ignore_debugger) {
nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg,
"ignore_debugger set, skipping event posting");
} else {
*post_event = true;
}
return ret;
}
void gk20a_gr_get_esr_sm_sel(struct gk20a *g, u32 gpc, u32 tpc,
u32 *esr_sm_sel)
{

View File

@@ -96,9 +96,6 @@ void gk20a_gr_suspend_all_sms(struct gk20a *g,
int gr_gk20a_set_sm_debug_mode(struct gk20a *g,
struct channel_gk20a *ch, u64 sms, bool enable);
bool gk20a_is_channel_ctx_resident(struct channel_gk20a *ch);
int gr_gk20a_handle_sm_exception(struct gk20a *g, u32 gpc, u32 tpc, u32 sm,
bool *post_event, struct channel_gk20a *fault_ch,
u32 *hww_global_esr);
#if defined(CONFIG_GK20A_CYCLE_STATS)
int gr_gk20a_css_attach(struct channel_gk20a *ch, /* in - main hw structure */

View File

@@ -57,164 +57,6 @@
#include <nvgpu/hw/gp10b/hw_gr_gp10b.h>
#include <nvgpu/hw/gp10b/hw_fifo_gp10b.h>
static void gr_gp10b_sm_lrf_ecc_overcount_war(bool single_err,
u32 sed_status,
u32 ded_status,
u32 *count_to_adjust,
u32 opposite_count)
{
u32 over_count = 0;
sed_status >>= gr_pri_gpc0_tpc0_sm_lrf_ecc_status_single_err_detected_qrfdp0_b();
ded_status >>= gr_pri_gpc0_tpc0_sm_lrf_ecc_status_double_err_detected_qrfdp0_b();
/* One overcount for each partition on which a SBE occurred but not a
DBE (or vice-versa) */
if (single_err) {
over_count = (u32)hweight32(sed_status & ~ded_status);
} else {
over_count = (u32)hweight32(ded_status & ~sed_status);
}
/* If both a SBE and a DBE occur on the same partition, then we have an
overcount for the subpartition if the opposite error counts are
zero. */
if (((sed_status & ded_status) != 0U) && (opposite_count == 0U)) {
over_count += (u32)hweight32(sed_status & ded_status);
}
if (*count_to_adjust > over_count) {
*count_to_adjust -= over_count;
} else {
*count_to_adjust = 0;
}
}
int gr_gp10b_handle_sm_exception(struct gk20a *g,
u32 gpc, u32 tpc, u32 sm,
bool *post_event, struct channel_gk20a *fault_ch,
u32 *hww_global_esr)
{
int ret = 0;
u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE);
u32 offset = gpc_stride * gpc + tpc_in_gpc_stride * tpc;
u32 lrf_ecc_status, lrf_ecc_sed_status, lrf_ecc_ded_status;
u32 lrf_single_count_delta, lrf_double_count_delta;
u32 shm_ecc_status;
ret = gr_gk20a_handle_sm_exception(g,
gpc, tpc, sm, post_event, fault_ch, hww_global_esr);
/* Check for LRF ECC errors. */
lrf_ecc_status = gk20a_readl(g,
gr_pri_gpc0_tpc0_sm_lrf_ecc_status_r() + offset);
lrf_ecc_sed_status = lrf_ecc_status &
(gr_pri_gpc0_tpc0_sm_lrf_ecc_status_single_err_detected_qrfdp0_pending_f() |
gr_pri_gpc0_tpc0_sm_lrf_ecc_status_single_err_detected_qrfdp1_pending_f() |
gr_pri_gpc0_tpc0_sm_lrf_ecc_status_single_err_detected_qrfdp2_pending_f() |
gr_pri_gpc0_tpc0_sm_lrf_ecc_status_single_err_detected_qrfdp3_pending_f());
lrf_ecc_ded_status = lrf_ecc_status &
(gr_pri_gpc0_tpc0_sm_lrf_ecc_status_double_err_detected_qrfdp0_pending_f() |
gr_pri_gpc0_tpc0_sm_lrf_ecc_status_double_err_detected_qrfdp1_pending_f() |
gr_pri_gpc0_tpc0_sm_lrf_ecc_status_double_err_detected_qrfdp2_pending_f() |
gr_pri_gpc0_tpc0_sm_lrf_ecc_status_double_err_detected_qrfdp3_pending_f());
lrf_single_count_delta =
gk20a_readl(g,
gr_pri_gpc0_tpc0_sm_lrf_ecc_single_err_count_r() +
offset);
lrf_double_count_delta =
gk20a_readl(g,
gr_pri_gpc0_tpc0_sm_lrf_ecc_double_err_count_r() +
offset);
gk20a_writel(g,
gr_pri_gpc0_tpc0_sm_lrf_ecc_single_err_count_r() + offset,
0);
gk20a_writel(g,
gr_pri_gpc0_tpc0_sm_lrf_ecc_double_err_count_r() + offset,
0);
if (lrf_ecc_sed_status != 0U) {
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr,
"Single bit error detected in SM LRF!");
gr_gp10b_sm_lrf_ecc_overcount_war(true,
lrf_ecc_sed_status,
lrf_ecc_ded_status,
&lrf_single_count_delta,
lrf_double_count_delta);
g->ecc.gr.sm_lrf_ecc_single_err_count[gpc][tpc].counter +=
lrf_single_count_delta;
}
if (lrf_ecc_ded_status != 0U) {
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr,
"Double bit error detected in SM LRF!");
gr_gp10b_sm_lrf_ecc_overcount_war(false,
lrf_ecc_sed_status,
lrf_ecc_ded_status,
&lrf_double_count_delta,
lrf_single_count_delta);
g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter +=
lrf_double_count_delta;
}
gk20a_writel(g, gr_pri_gpc0_tpc0_sm_lrf_ecc_status_r() + offset,
lrf_ecc_status);
/* Check for SHM ECC errors. */
shm_ecc_status = gk20a_readl(g,
gr_pri_gpc0_tpc0_sm_shm_ecc_status_r() + offset);
if ((shm_ecc_status &
gr_pri_gpc0_tpc0_sm_shm_ecc_status_single_err_corrected_shm0_pending_f()) != 0U ||
(shm_ecc_status &
gr_pri_gpc0_tpc0_sm_shm_ecc_status_single_err_corrected_shm1_pending_f()) != 0U ||
(shm_ecc_status &
gr_pri_gpc0_tpc0_sm_shm_ecc_status_single_err_detected_shm0_pending_f()) != 0U ||
(shm_ecc_status &
gr_pri_gpc0_tpc0_sm_shm_ecc_status_single_err_detected_shm1_pending_f()) != 0U ) {
u32 ecc_stats_reg_val;
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr,
"Single bit error detected in SM SHM!");
ecc_stats_reg_val =
gk20a_readl(g,
gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_r() + offset);
g->ecc.gr.sm_shm_ecc_sec_count[gpc][tpc].counter +=
gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_single_corrected_v(ecc_stats_reg_val);
g->ecc.gr.sm_shm_ecc_sed_count[gpc][tpc].counter +=
gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_single_detected_v(ecc_stats_reg_val);
ecc_stats_reg_val &= ~(gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_single_corrected_m() |
gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_single_detected_m());
gk20a_writel(g,
gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_r() + offset,
ecc_stats_reg_val);
}
if ((shm_ecc_status &
gr_pri_gpc0_tpc0_sm_shm_ecc_status_double_err_detected_shm0_pending_f()) != 0U ||
(shm_ecc_status &
gr_pri_gpc0_tpc0_sm_shm_ecc_status_double_err_detected_shm1_pending_f()) != 0U) {
u32 ecc_stats_reg_val;
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr,
"Double bit error detected in SM SHM!");
ecc_stats_reg_val =
gk20a_readl(g,
gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_r() + offset);
g->ecc.gr.sm_shm_ecc_ded_count[gpc][tpc].counter +=
gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_double_detected_v(ecc_stats_reg_val);
ecc_stats_reg_val &= ~(gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_double_detected_m());
gk20a_writel(g,
gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_r() + offset,
ecc_stats_reg_val);
}
gk20a_writel(g, gr_pri_gpc0_tpc0_sm_shm_ecc_status_r() + offset,
shm_ecc_status);
return ret;
}
void gr_gp10b_set_bes_crop_debug3(struct gk20a *g, u32 data)
{
u32 val;

View File

@@ -43,11 +43,6 @@ int gr_gp10b_handle_fecs_error(struct gk20a *g,
struct nvgpu_gr_isr_data *isr_data);
int gr_gp10b_set_cilp_preempt_pending(struct gk20a *g,
struct channel_gk20a *fault_ch);
int gr_gp10b_handle_sm_exception(struct gk20a *g,
u32 gpc, u32 tpc, u32 sm,
bool *post_event, struct channel_gk20a *fault_ch,
u32 *hww_global_esr);
int gr_gp10b_commit_global_cb_manager(struct gk20a *g,
struct nvgpu_gr_ctx *gr_ctx, bool patch);
void gr_gp10b_set_bes_crop_debug3(struct gk20a *g, u32 data);

View File

@@ -25,6 +25,8 @@
#include <nvgpu/class.h>
#include <nvgpu/gr/config.h>
#include <nvgpu/gr/gr.h>
#include <nvgpu/gr/gr_intr.h>
#include "gr_intr_gp10b.h"
@@ -103,12 +105,165 @@ fail:
return -EINVAL;
}
static void gr_gp10b_sm_lrf_ecc_overcount_war(bool single_err,
u32 sed_status,
u32 ded_status,
u32 *count_to_adjust,
u32 opposite_count)
{
u32 over_count = 0;
sed_status >>= gr_pri_gpc0_tpc0_sm_lrf_ecc_status_single_err_detected_qrfdp0_b();
ded_status >>= gr_pri_gpc0_tpc0_sm_lrf_ecc_status_double_err_detected_qrfdp0_b();
/* One overcount for each partition on which a SBE occurred but not a
DBE (or vice-versa) */
if (single_err) {
over_count = (u32)hweight32(sed_status & ~ded_status);
} else {
over_count = (u32)hweight32(ded_status & ~sed_status);
}
/* If both a SBE and a DBE occur on the same partition, then we have an
overcount for the subpartition if the opposite error counts are
zero. */
if (((sed_status & ded_status) != 0U) && (opposite_count == 0U)) {
over_count += (u32)hweight32(sed_status & ded_status);
}
if (*count_to_adjust > over_count) {
*count_to_adjust -= over_count;
} else {
*count_to_adjust = 0;
}
}
int gp10b_gr_intr_handle_sm_exception(struct gk20a *g,
u32 gpc, u32 tpc, u32 sm,
bool *post_event, struct channel_gk20a *fault_ch,
u32 *hww_global_esr)
{
int ret = 0;
u32 offset = nvgpu_gr_gpc_offset(g, gpc) + nvgpu_gr_tpc_offset(g, tpc);
u32 lrf_ecc_status, lrf_ecc_sed_status, lrf_ecc_ded_status;
u32 lrf_single_count_delta, lrf_double_count_delta;
u32 shm_ecc_status;
ret = nvgpu_gr_intr_handle_sm_exception(g,
gpc, tpc, sm, post_event, fault_ch, hww_global_esr);
/* Check for LRF ECC errors. */
lrf_ecc_status = nvgpu_readl(g,
gr_pri_gpc0_tpc0_sm_lrf_ecc_status_r() + offset);
lrf_ecc_sed_status =
lrf_ecc_status &
(gr_pri_gpc0_tpc0_sm_lrf_ecc_status_single_err_detected_qrfdp0_pending_f() |
gr_pri_gpc0_tpc0_sm_lrf_ecc_status_single_err_detected_qrfdp1_pending_f() |
gr_pri_gpc0_tpc0_sm_lrf_ecc_status_single_err_detected_qrfdp2_pending_f() |
gr_pri_gpc0_tpc0_sm_lrf_ecc_status_single_err_detected_qrfdp3_pending_f());
lrf_ecc_ded_status =
lrf_ecc_status &
(gr_pri_gpc0_tpc0_sm_lrf_ecc_status_double_err_detected_qrfdp0_pending_f() |
gr_pri_gpc0_tpc0_sm_lrf_ecc_status_double_err_detected_qrfdp1_pending_f() |
gr_pri_gpc0_tpc0_sm_lrf_ecc_status_double_err_detected_qrfdp2_pending_f() |
gr_pri_gpc0_tpc0_sm_lrf_ecc_status_double_err_detected_qrfdp3_pending_f());
lrf_single_count_delta =
nvgpu_readl(g,
gr_pri_gpc0_tpc0_sm_lrf_ecc_single_err_count_r() +
offset);
lrf_double_count_delta =
nvgpu_readl(g,
gr_pri_gpc0_tpc0_sm_lrf_ecc_double_err_count_r() +
offset);
nvgpu_writel(g,
gr_pri_gpc0_tpc0_sm_lrf_ecc_single_err_count_r() + offset, 0);
nvgpu_writel(g,
gr_pri_gpc0_tpc0_sm_lrf_ecc_double_err_count_r() + offset, 0);
if (lrf_ecc_sed_status != 0U) {
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr,
"Single bit error detected in SM LRF!");
gr_gp10b_sm_lrf_ecc_overcount_war(true,
lrf_ecc_sed_status,
lrf_ecc_ded_status,
&lrf_single_count_delta,
lrf_double_count_delta);
g->ecc.gr.sm_lrf_ecc_single_err_count[gpc][tpc].counter +=
lrf_single_count_delta;
}
if (lrf_ecc_ded_status != 0U) {
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr,
"Double bit error detected in SM LRF!");
gr_gp10b_sm_lrf_ecc_overcount_war(false,
lrf_ecc_sed_status,
lrf_ecc_ded_status,
&lrf_double_count_delta,
lrf_single_count_delta);
g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter +=
lrf_double_count_delta;
}
nvgpu_writel(g, gr_pri_gpc0_tpc0_sm_lrf_ecc_status_r() + offset,
lrf_ecc_status);
/* Check for SHM ECC errors. */
shm_ecc_status = nvgpu_readl(g,
gr_pri_gpc0_tpc0_sm_shm_ecc_status_r() + offset);
if ((shm_ecc_status &
gr_pri_gpc0_tpc0_sm_shm_ecc_status_single_err_corrected_shm0_pending_f()) != 0U ||
(shm_ecc_status &
gr_pri_gpc0_tpc0_sm_shm_ecc_status_single_err_corrected_shm1_pending_f()) != 0U ||
(shm_ecc_status &
gr_pri_gpc0_tpc0_sm_shm_ecc_status_single_err_detected_shm0_pending_f()) != 0U ||
(shm_ecc_status &
gr_pri_gpc0_tpc0_sm_shm_ecc_status_single_err_detected_shm1_pending_f()) != 0U ) {
u32 ecc_stats_reg_val;
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr,
"Single bit error detected in SM SHM!");
ecc_stats_reg_val =
nvgpu_readl(g,
gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_r() + offset);
g->ecc.gr.sm_shm_ecc_sec_count[gpc][tpc].counter +=
gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_single_corrected_v(ecc_stats_reg_val);
g->ecc.gr.sm_shm_ecc_sed_count[gpc][tpc].counter +=
gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_single_detected_v(ecc_stats_reg_val);
ecc_stats_reg_val &= ~(gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_single_corrected_m() |
gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_single_detected_m());
nvgpu_writel(g,
gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_r() + offset,
ecc_stats_reg_val);
}
if ((shm_ecc_status &
gr_pri_gpc0_tpc0_sm_shm_ecc_status_double_err_detected_shm0_pending_f()) != 0U ||
(shm_ecc_status &
gr_pri_gpc0_tpc0_sm_shm_ecc_status_double_err_detected_shm1_pending_f()) != 0U) {
u32 ecc_stats_reg_val;
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_intr,
"Double bit error detected in SM SHM!");
ecc_stats_reg_val =
nvgpu_readl(g,
gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_r() + offset);
g->ecc.gr.sm_shm_ecc_ded_count[gpc][tpc].counter +=
gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_double_detected_v(ecc_stats_reg_val);
ecc_stats_reg_val &= ~(gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_double_detected_m());
nvgpu_writel(g,
gr_pri_gpc0_tpc0_sm_shm_ecc_err_count_r() + offset,
ecc_stats_reg_val);
}
nvgpu_writel(g, gr_pri_gpc0_tpc0_sm_shm_ecc_status_r() + offset,
shm_ecc_status);
return ret;
}
void gp10b_gr_intr_handle_tex_exception(struct gk20a *g, u32 gpc, u32 tpc)
{
u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g,
GPU_LIT_TPC_IN_GPC_STRIDE);
u32 offset = gpc_stride * gpc + tpc_in_gpc_stride * tpc;
u32 offset = nvgpu_gr_gpc_offset(g, gpc) + nvgpu_gr_tpc_offset(g, tpc);
u32 esr;
u32 ecc_stats_reg_val;

View File

@@ -26,6 +26,7 @@
#include <nvgpu/types.h>
struct gk20a;
struct channel_gk20a;
#define NVC097_SET_GO_IDLE_TIMEOUT 0x022cU
#define NVC097_SET_ALPHA_CIRCULAR_BUFFER_SIZE 0x02dcU
@@ -43,4 +44,8 @@ void gp10b_gr_intr_set_go_idle_timeout(struct gk20a *g, u32 data);
void gp10b_gr_intr_handle_tex_exception(struct gk20a *g, u32 gpc, u32 tpc);
int gp10b_gr_intr_handle_sw_method(struct gk20a *g, u32 addr,
u32 class_num, u32 offset, u32 data);
int gp10b_gr_intr_handle_sm_exception(struct gk20a *g,
u32 gpc, u32 tpc, u32 sm,
bool *post_event, struct channel_gk20a *fault_ch,
u32 *hww_global_esr);
#endif /* NVGPU_GR_INTR_GP10B_H */
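The header above declares the gp10b-specific op; the ops tables in the files that follow show the two wiring patterns this commit uses. gm20b, gv100, gv11b and tu104 point intr.handle_sm_exception straight at the common nvgpu_gr_intr_handle_sm_exception, while gp10b installs gp10b_gr_intr_handle_sm_exception, which calls the common handler first and then harvests the per-SM LRF/SHM ECC counters. A compact sketch of that layering follows, with hypothetical names (chip_sm_exception, chip_collect_sm_ecc) and a simplified signature; it is not the driver code itself.

#include <stdbool.h>
#include <stdint.h>
#include <stddef.h>

struct gk20a;
struct channel_gk20a;

/* simplified stand-in for the common nvgpu_gr_intr_handle_sm_exception() */
static int common_sm_exception(struct gk20a *g, uint32_t gpc, uint32_t tpc,
			       uint32_t sm, bool *post_event,
			       struct channel_gk20a *fault_ch,
			       uint32_t *hww_global_esr)
{
	(void)g; (void)gpc; (void)tpc; (void)sm; (void)fault_ch;
	*hww_global_esr = 0U;
	*post_event = true;
	return 0;
}

/* hypothetical stand-in for the chip-specific ECC harvesting, i.e. the
 * LRF/SHM status reads and counter updates that
 * gp10b_gr_intr_handle_sm_exception performs after the common path */
static void chip_collect_sm_ecc(struct gk20a *g, uint32_t gpc, uint32_t tpc)
{
	(void)g; (void)gpc; (void)tpc;
}

/* gp10b-style op: run the common handler first, then do chip ECC work */
static int chip_sm_exception(struct gk20a *g, uint32_t gpc, uint32_t tpc,
			     uint32_t sm, bool *post_event,
			     struct channel_gk20a *fault_ch,
			     uint32_t *hww_global_esr)
{
	int ret = common_sm_exception(g, gpc, tpc, sm, post_event,
				      fault_ch, hww_global_esr);

	chip_collect_sm_ecc(g, gpc, tpc);
	return ret;
}

int main(void)
{
	bool post_event = false;
	uint32_t esr = 0U;

	return chip_sm_exception(NULL, 0U, 0U, 0U, &post_event, NULL, &esr);
}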

View File

@@ -265,7 +265,6 @@ static const struct gpu_ops gm20b_ops = {
.set_sm_debug_mode = gr_gk20a_set_sm_debug_mode,
.bpt_reg_info = gr_gm20b_bpt_reg_info,
.handle_fecs_error = gk20a_gr_handle_fecs_error,
.handle_sm_exception = gr_gk20a_handle_sm_exception,
.get_lrf_tex_ltc_dram_override = NULL,
.update_smpc_ctxsw_mode = gr_gk20a_update_smpc_ctxsw_mode,
.update_hwpm_ctxsw_mode = gr_gk20a_update_hwpm_ctxsw_mode,
@@ -532,6 +531,8 @@ static const struct gpu_ops gm20b_ops = {
gm20ab_gr_intr_tpc_exception_sm_enable,
.tpc_exception_sm_disable =
gm20ab_gr_intr_tpc_exception_sm_disable,
.handle_sm_exception =
nvgpu_gr_intr_handle_sm_exception,
.stall_isr = nvgpu_gr_intr_stall_isr,
},
.falcon = {

View File

@@ -296,7 +296,6 @@ static const struct gpu_ops gp10b_ops = {
.set_sm_debug_mode = gr_gk20a_set_sm_debug_mode,
.bpt_reg_info = gr_gm20b_bpt_reg_info,
.handle_fecs_error = gr_gp10b_handle_fecs_error,
.handle_sm_exception = gr_gp10b_handle_sm_exception,
.get_lrf_tex_ltc_dram_override = get_ecc_override_val,
.update_smpc_ctxsw_mode = gr_gk20a_update_smpc_ctxsw_mode,
.update_hwpm_ctxsw_mode = gr_gk20a_update_hwpm_ctxsw_mode,
@@ -597,6 +596,8 @@ static const struct gpu_ops gp10b_ops = {
gm20ab_gr_intr_tpc_exception_sm_enable,
.tpc_exception_sm_disable =
gm20ab_gr_intr_tpc_exception_sm_disable,
.handle_sm_exception =
gp10b_gr_intr_handle_sm_exception,
.stall_isr = nvgpu_gr_intr_stall_isr,
},
.falcon = {

View File

@@ -398,7 +398,6 @@ static const struct gpu_ops gv100_ops = {
.set_sm_debug_mode = gv11b_gr_set_sm_debug_mode,
.bpt_reg_info = gv11b_gr_bpt_reg_info,
.handle_fecs_error = gr_gv11b_handle_fecs_error,
.handle_sm_exception = gr_gk20a_handle_sm_exception,
.get_lrf_tex_ltc_dram_override = get_ecc_override_val,
.update_smpc_ctxsw_mode = gr_gk20a_update_smpc_ctxsw_mode,
.get_num_hwpm_perfmon = gr_gv100_get_num_hwpm_perfmon,
@@ -739,6 +738,8 @@ static const struct gpu_ops gv100_ops = {
gm20ab_gr_intr_tpc_exception_sm_enable,
.tpc_exception_sm_disable =
gm20ab_gr_intr_tpc_exception_sm_disable,
.handle_sm_exception =
nvgpu_gr_intr_handle_sm_exception,
.stall_isr = nvgpu_gr_intr_stall_isr,
},
.falcon = {

View File

@@ -369,7 +369,6 @@ static const struct gpu_ops gv11b_ops = {
.set_sm_debug_mode = gv11b_gr_set_sm_debug_mode,
.bpt_reg_info = gv11b_gr_bpt_reg_info,
.handle_fecs_error = gr_gv11b_handle_fecs_error,
.handle_sm_exception = gr_gk20a_handle_sm_exception,
.get_lrf_tex_ltc_dram_override = get_ecc_override_val,
.update_smpc_ctxsw_mode = gr_gk20a_update_smpc_ctxsw_mode,
.get_num_hwpm_perfmon = gr_gv100_get_num_hwpm_perfmon,
@@ -716,6 +715,8 @@ static const struct gpu_ops gv11b_ops = {
gm20ab_gr_intr_tpc_exception_sm_enable,
.tpc_exception_sm_disable =
gm20ab_gr_intr_tpc_exception_sm_disable,
.handle_sm_exception =
nvgpu_gr_intr_handle_sm_exception,
.stall_isr = nvgpu_gr_intr_stall_isr,
},
.falcon = {

View File

@@ -418,7 +418,6 @@ static const struct gpu_ops tu104_ops = {
.set_sm_debug_mode = gv11b_gr_set_sm_debug_mode,
.bpt_reg_info = gv11b_gr_bpt_reg_info,
.handle_fecs_error = gr_gv11b_handle_fecs_error,
.handle_sm_exception = gr_gk20a_handle_sm_exception,
.get_lrf_tex_ltc_dram_override = get_ecc_override_val,
.update_smpc_ctxsw_mode = gr_gk20a_update_smpc_ctxsw_mode,
.get_num_hwpm_perfmon = gr_gv100_get_num_hwpm_perfmon,
@@ -769,6 +768,8 @@ static const struct gpu_ops tu104_ops = {
gm20ab_gr_intr_tpc_exception_sm_enable,
.tpc_exception_sm_disable =
gm20ab_gr_intr_tpc_exception_sm_disable,
.handle_sm_exception =
nvgpu_gr_intr_handle_sm_exception,
.stall_isr = nvgpu_gr_intr_stall_isr,
},
.falcon = {

View File

@@ -365,10 +365,6 @@ struct gpu_ops {
u32 gpc, u32 tpc,
bool *post_event, struct channel_gk20a *fault_ch,
u32 *hww_global_esr);
int (*handle_sm_exception)(struct gk20a *g,
u32 gpc, u32 tpc, u32 sm,
bool *post_event, struct channel_gk20a *fault_ch,
u32 *hww_global_esr);
u32 (*get_lrf_tex_ltc_dram_override)(struct gk20a *g);
int (*record_sm_error_state)(struct gk20a *g, u32 gpc, u32 tpc,
u32 sm, struct channel_gk20a *fault_ch);
@@ -845,6 +841,10 @@ struct gpu_ops {
void (*tpc_exception_sm_disable)(struct gk20a *g,
u32 offset);
void (*tpc_exception_sm_enable)(struct gk20a *g);
int (*handle_sm_exception)(struct gk20a *g,
u32 gpc, u32 tpc, u32 sm,
bool *post_event, struct channel_gk20a *fault_ch,
u32 *hww_global_esr);
int (*stall_isr)(struct gk20a *g);
} intr;

View File

@@ -69,5 +69,8 @@ struct channel_gk20a *nvgpu_gr_intr_get_channel_from_ctx(struct gk20a *g,
u32 curr_ctx, u32 *curr_tsgid);
void nvgpu_gr_intr_set_error_notifier(struct gk20a *g,
struct nvgpu_gr_isr_data *isr_data, u32 error_notifier);
int nvgpu_gr_intr_handle_sm_exception(struct gk20a *g, u32 gpc, u32 tpc, u32 sm,
bool *post_event, struct channel_gk20a *fault_ch,
u32 *hww_global_esr);
int nvgpu_gr_intr_stall_isr(struct gk20a *g);
#endif /* NVGPU_GR_INTR_H */