gpu: nvgpu: report class/method related errors

This patch adds support for reporting class/method-related errors to 3LSS.
Specifically, it adds the following service ID:
NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_ILLEGAL_ERROR

JIRA NVGPU-3458
JIRA NVGPU-3461

Change-Id: I9b28ed3074f664254347e059ac699470f95610b3
Signed-off-by: Rajesh Devaraj <rdevaraj@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/2136301
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
Reviewed-by: svc-mobile-misra <svc-mobile-misra@nvidia.com>
Reviewed-by: Automatic_Commit_Validation_User
GVS: Gerrit_Virtual_Submit
Reviewed-by: Raghuram Kothakota <rkothakota@nvidia.com>
Reviewed-by: Ankur Kishore <ankkishore@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
Rajesh Devaraj
2019-06-14 14:15:18 +05:30
committed by mobile promotions
parent b7061a3263
commit ab70c2e80f
7 changed files with 36 additions and 16 deletions

View File

@@ -208,7 +208,7 @@ static void gr_intr_report_sm_exception(struct gk20a *g, u32 gpc, u32 tpc,
err_info.sm = sm;
info.sm_mcerr_info = &err_info;
(void) nvgpu_report_gr_err(g, NVGPU_ERR_MODULE_SM, inst,
GPU_SM_MACHINE_CHECK_ERROR, &info);
GPU_SM_MACHINE_CHECK_ERROR, &info, 0U);
}
/* Used by sw interrupt thread to translate current ctx to chid.
@@ -296,7 +296,7 @@ unlock:
}
void nvgpu_gr_intr_report_exception(struct gk20a *g, u32 inst,
u32 err_type, u32 status)
u32 err_type, u32 status, u32 sub_err_type)
{
struct nvgpu_channel *ch;
struct gr_exception_info err_info;
@@ -319,7 +319,7 @@ void nvgpu_gr_intr_report_exception(struct gk20a *g, u32 inst,
err_info.status = status;
info.exception_info = &err_info;
(void) nvgpu_report_gr_err(g, NVGPU_ERR_MODULE_PGRAPH,
inst, err_type, &info);
inst, err_type, &info, sub_err_type);
}
void nvgpu_gr_intr_set_error_notifier(struct gk20a *g,
@@ -735,6 +735,9 @@ int nvgpu_gr_intr_stall_isr(struct gk20a *g)
if (intr_info.illegal_notify != 0U) {
nvgpu_err(g, "illegal notify pending");
nvgpu_gr_intr_report_exception(g, 0U,
GPU_PGRAPH_ILLEGAL_ERROR, gr_intr,
GPU_PGRAPH_ILLEGAL_NOTIFY);
nvgpu_gr_intr_set_error_notifier(g, &isr_data,
NVGPU_ERR_NOTIFIER_GR_ILLEGAL_NOTIFY);
need_reset = true;
@@ -742,6 +745,9 @@ int nvgpu_gr_intr_stall_isr(struct gk20a *g)
}
if (intr_info.illegal_method != 0U) {
nvgpu_gr_intr_report_exception(g, 0U,
GPU_PGRAPH_ILLEGAL_ERROR, gr_intr,
GPU_PGRAPH_ILLEGAL_METHOD);
if (gr_intr_handle_illegal_method(g, &isr_data) != 0) {
need_reset = true;
}
@@ -749,6 +755,9 @@ int nvgpu_gr_intr_stall_isr(struct gk20a *g)
}
if (intr_info.illegal_class != 0U) {
nvgpu_gr_intr_report_exception(g, 0U,
GPU_PGRAPH_ILLEGAL_ERROR, gr_intr,
GPU_PGRAPH_ILLEGAL_CLASS);
nvgpu_err(g, "invalid class 0x%08x, offset 0x%08x",
isr_data.class_num, isr_data.offset);
@@ -766,6 +775,9 @@ int nvgpu_gr_intr_stall_isr(struct gk20a *g)
}
if (intr_info.class_error != 0U) {
nvgpu_gr_intr_report_exception(g, 0U,
GPU_PGRAPH_ILLEGAL_ERROR, gr_intr,
GPU_PGRAPH_CLASS_ERROR);
if (gr_intr_handle_class_error(g, &isr_data) != 0) {
need_reset = true;
}

View File

@@ -231,7 +231,7 @@ bool gm20b_gr_intr_handle_exceptions(struct gk20a *g, bool *is_gpc_exception)
nvgpu_gr_intr_report_exception(g, 0,
GPU_PGRAPH_FE_EXCEPTION,
fe);
fe, 0);
nvgpu_err(g, "fe exception: esr 0x%08x, info 0x%08x",
fe, info);
nvgpu_writel(g, gr_fe_hww_esr_r(),
@@ -244,7 +244,7 @@ bool gm20b_gr_intr_handle_exceptions(struct gk20a *g, bool *is_gpc_exception)
nvgpu_gr_intr_report_exception(g, 0,
GPU_PGRAPH_MEMFMT_EXCEPTION,
memfmt);
memfmt, 0);
nvgpu_err(g, "memfmt exception: esr %08x", memfmt);
nvgpu_writel(g, gr_memfmt_hww_esr_r(),
gr_memfmt_hww_esr_reset_active_f());
@@ -256,7 +256,7 @@ bool gm20b_gr_intr_handle_exceptions(struct gk20a *g, bool *is_gpc_exception)
nvgpu_gr_intr_report_exception(g, 0,
GPU_PGRAPH_PD_EXCEPTION,
pd);
pd, 0);
nvgpu_err(g, "pd exception: esr 0x%08x", pd);
nvgpu_writel(g, gr_pd_hww_esr_r(),
gr_pd_hww_esr_reset_active_f());
@@ -268,7 +268,7 @@ bool gm20b_gr_intr_handle_exceptions(struct gk20a *g, bool *is_gpc_exception)
nvgpu_gr_intr_report_exception(g, 0,
GPU_PGRAPH_SCC_EXCEPTION,
scc);
scc, 0);
nvgpu_err(g, "scc exception: esr 0x%08x", scc);
nvgpu_writel(g, gr_scc_hww_esr_r(),
gr_scc_hww_esr_reset_active_f());
@@ -280,7 +280,7 @@ bool gm20b_gr_intr_handle_exceptions(struct gk20a *g, bool *is_gpc_exception)
nvgpu_gr_intr_report_exception(g, 0,
GPU_PGRAPH_DS_EXCEPTION,
ds);
ds, 0);
nvgpu_err(g, "ds exception: esr: 0x%08x", ds);
nvgpu_writel(g, gr_ds_hww_esr_r(),
gr_ds_hww_esr_reset_task_f());
@@ -300,7 +300,7 @@ bool gm20b_gr_intr_handle_exceptions(struct gk20a *g, bool *is_gpc_exception)
}
nvgpu_gr_intr_report_exception(g, 0,
GPU_PGRAPH_SSYNC_EXCEPTION,
ssync_esr);
ssync_esr, 0);
}
if ((exception & gr_exception_mme_m()) != 0U) {
@@ -309,7 +309,7 @@ bool gm20b_gr_intr_handle_exceptions(struct gk20a *g, bool *is_gpc_exception)
nvgpu_gr_intr_report_exception(g, 0,
GPU_PGRAPH_MME_EXCEPTION,
mme);
mme, 0);
nvgpu_err(g, "mme exception: esr 0x%08x info:0x%08x",
mme, info);
if (g->ops.gr.intr.log_mme_exception != NULL) {
@@ -326,7 +326,7 @@ bool gm20b_gr_intr_handle_exceptions(struct gk20a *g, bool *is_gpc_exception)
nvgpu_gr_intr_report_exception(g, 0,
GPU_PGRAPH_SKED_EXCEPTION,
sked);
sked, 0);
nvgpu_err(g, "sked exception: esr 0x%08x", sked);
nvgpu_writel(g, gr_sked_hww_esr_r(),
gr_sked_hww_esr_reset_active_f());

View File

@@ -607,7 +607,7 @@ void gv11b_gr_intr_handle_tpc_mpc_exception(struct gk20a *g, u32 gpc, u32 tpc)
nvgpu_gr_intr_report_exception(g, ((gpc << 8U) | tpc),
GPU_PGRAPH_MPC_EXCEPTION,
esr);
esr, 0);
esr = nvgpu_readl(g,
nvgpu_safe_add_u32(gr_gpc0_tpc0_mpc_hww_esr_info_r(),

View File

@@ -42,7 +42,7 @@ void nvgpu_gr_intr_handle_notify_pending(struct gk20a *g,
void nvgpu_gr_intr_handle_semaphore_pending(struct gk20a *g,
struct nvgpu_gr_isr_data *isr_data);
void nvgpu_gr_intr_report_exception(struct gk20a *g, u32 inst,
u32 err_type, u32 status);
u32 err_type, u32 status, u32 sub_err_type);
struct nvgpu_channel *nvgpu_gr_intr_get_channel_from_ctx(struct gk20a *g,
u32 curr_ctx, u32 *curr_tsgid);
void nvgpu_gr_intr_set_error_notifier(struct gk20a *g,

View File

@@ -136,6 +136,14 @@ struct ctxsw_err_info {
#define GPU_PGRAPH_SKED_EXCEPTION (7U)
#define GPU_PGRAPH_BE_EXCEPTION (8U)
#define GPU_PGRAPH_MPC_EXCEPTION (9U)
#define GPU_PGRAPH_ILLEGAL_ERROR (10U)
/* Sub-errors in GPU_PGRAPH_ILLEGAL_ERROR */
#define GPU_PGRAPH_ILLEGAL_NOTIFY (0U)
#define GPU_PGRAPH_ILLEGAL_METHOD (1U)
#define GPU_PGRAPH_ILLEGAL_CLASS (2U)
#define GPU_PGRAPH_CLASS_ERROR (3U)
struct gr_exception_info {
u32 curr_ctx; /* Context which triggered the exception */
u32 chid; /* Channel bound to the context */
@@ -210,7 +218,7 @@ int nvgpu_report_ctxsw_err(struct gk20a *g, u32 hw_unit, u32 err_id,
void *data);
int nvgpu_report_gr_err(struct gk20a *g, u32 hw_unit, u32 inst,
u32 err_type, struct gr_err_info *err_info);
u32 err_type, struct gr_err_info *err_info, u32 sub_err_type);
int nvgpu_report_pmu_err(struct gk20a *g, u32 hw_unit, u32 err_id,
u32 sub_err_type, u32 status);

View File

@@ -37,7 +37,7 @@ int nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst,
}
int nvgpu_report_gr_err(struct gk20a *g, u32 hw_unit, u32 inst,
u32 err_type, struct gr_err_info *err_info)
u32 err_type, struct gr_err_info *err_info, u32 sub_err_type)
{
return 0;
}

View File

@@ -57,7 +57,7 @@ int nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst,
}
int nvgpu_report_gr_err(struct gk20a *g, u32 hw_unit, u32 inst,
u32 err_type, struct gr_err_info *err_info)
u32 err_type, struct gr_err_info *err_info, u32 sub_err_type)
{
return 0;
}