mirror of
git://nv-tegra.nvidia.com/linux-nvgpu.git
synced 2025-12-23 09:57:08 +03:00
gpu: nvgpu: report class/method related errors
This patch adds support to report class/method related errors to 3LSS.
Specifically, it adds the following service ID:
NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_ILLEGAL_ERROR

JIRA NVGPU-3458
JIRA NVGPU-3461

Change-Id: I9b28ed3074f664254347e059ac699470f95610b3
Signed-off-by: Rajesh Devaraj <rdevaraj@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/2136301
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
Reviewed-by: svc-mobile-misra <svc-mobile-misra@nvidia.com>
Reviewed-by: Automatic_Commit_Validation_User
GVS: Gerrit_Virtual_Submit
Reviewed-by: Raghuram Kothakota <rkothakota@nvidia.com>
Reviewed-by: Ankur Kishore <ankkishore@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
committed by
mobile promotions
parent
b7061a3263
commit
ab70c2e80f
@@ -208,7 +208,7 @@ static void gr_intr_report_sm_exception(struct gk20a *g, u32 gpc, u32 tpc,
|
||||
err_info.sm = sm;
|
||||
info.sm_mcerr_info = &err_info;
|
||||
(void) nvgpu_report_gr_err(g, NVGPU_ERR_MODULE_SM, inst,
|
||||
GPU_SM_MACHINE_CHECK_ERROR, &info);
|
||||
GPU_SM_MACHINE_CHECK_ERROR, &info, 0U);
|
||||
}
|
||||
|
||||
/* Used by sw interrupt thread to translate current ctx to chid.
|
||||
@@ -296,7 +296,7 @@ unlock:
|
||||
}
|
||||
|
||||
void nvgpu_gr_intr_report_exception(struct gk20a *g, u32 inst,
|
||||
u32 err_type, u32 status)
|
||||
u32 err_type, u32 status, u32 sub_err_type)
|
||||
{
|
||||
struct nvgpu_channel *ch;
|
||||
struct gr_exception_info err_info;
|
||||
@@ -319,7 +319,7 @@ void nvgpu_gr_intr_report_exception(struct gk20a *g, u32 inst,
|
||||
err_info.status = status;
|
||||
info.exception_info = &err_info;
|
||||
(void) nvgpu_report_gr_err(g, NVGPU_ERR_MODULE_PGRAPH,
|
||||
inst, err_type, &info);
|
||||
inst, err_type, &info, sub_err_type);
|
||||
}
|
||||
|
||||
void nvgpu_gr_intr_set_error_notifier(struct gk20a *g,
|
||||
@@ -735,6 +735,9 @@ int nvgpu_gr_intr_stall_isr(struct gk20a *g)
|
||||
if (intr_info.illegal_notify != 0U) {
|
||||
nvgpu_err(g, "illegal notify pending");
|
||||
|
||||
nvgpu_gr_intr_report_exception(g, 0U,
|
||||
GPU_PGRAPH_ILLEGAL_ERROR, gr_intr,
|
||||
GPU_PGRAPH_ILLEGAL_NOTIFY);
|
||||
nvgpu_gr_intr_set_error_notifier(g, &isr_data,
|
||||
NVGPU_ERR_NOTIFIER_GR_ILLEGAL_NOTIFY);
|
||||
need_reset = true;
|
||||
@@ -742,6 +745,9 @@ int nvgpu_gr_intr_stall_isr(struct gk20a *g)
|
||||
}
|
||||
|
||||
if (intr_info.illegal_method != 0U) {
|
||||
nvgpu_gr_intr_report_exception(g, 0U,
|
||||
GPU_PGRAPH_ILLEGAL_ERROR, gr_intr,
|
||||
GPU_PGRAPH_ILLEGAL_METHOD);
|
||||
if (gr_intr_handle_illegal_method(g, &isr_data) != 0) {
|
||||
need_reset = true;
|
||||
}
|
||||
@@ -749,6 +755,9 @@ int nvgpu_gr_intr_stall_isr(struct gk20a *g)
|
||||
}
|
||||
|
||||
if (intr_info.illegal_class != 0U) {
|
||||
nvgpu_gr_intr_report_exception(g, 0U,
|
||||
GPU_PGRAPH_ILLEGAL_ERROR, gr_intr,
|
||||
GPU_PGRAPH_ILLEGAL_CLASS);
|
||||
nvgpu_err(g, "invalid class 0x%08x, offset 0x%08x",
|
||||
isr_data.class_num, isr_data.offset);
|
||||
|
||||
@@ -766,6 +775,9 @@ int nvgpu_gr_intr_stall_isr(struct gk20a *g)
|
||||
}
|
||||
|
||||
if (intr_info.class_error != 0U) {
|
||||
nvgpu_gr_intr_report_exception(g, 0U,
|
||||
GPU_PGRAPH_ILLEGAL_ERROR, gr_intr,
|
||||
GPU_PGRAPH_CLASS_ERROR);
|
||||
if (gr_intr_handle_class_error(g, &isr_data) != 0) {
|
||||
need_reset = true;
|
||||
}
|
||||
|
||||
@@ -231,7 +231,7 @@ bool gm20b_gr_intr_handle_exceptions(struct gk20a *g, bool *is_gpc_exception)
|
||||
|
||||
nvgpu_gr_intr_report_exception(g, 0,
|
||||
GPU_PGRAPH_FE_EXCEPTION,
|
||||
fe);
|
||||
fe, 0);
|
||||
nvgpu_err(g, "fe exception: esr 0x%08x, info 0x%08x",
|
||||
fe, info);
|
||||
nvgpu_writel(g, gr_fe_hww_esr_r(),
|
||||
@@ -244,7 +244,7 @@ bool gm20b_gr_intr_handle_exceptions(struct gk20a *g, bool *is_gpc_exception)
|
||||
|
||||
nvgpu_gr_intr_report_exception(g, 0,
|
||||
GPU_PGRAPH_MEMFMT_EXCEPTION,
|
||||
memfmt);
|
||||
memfmt, 0);
|
||||
nvgpu_err(g, "memfmt exception: esr %08x", memfmt);
|
||||
nvgpu_writel(g, gr_memfmt_hww_esr_r(),
|
||||
gr_memfmt_hww_esr_reset_active_f());
|
||||
@@ -256,7 +256,7 @@ bool gm20b_gr_intr_handle_exceptions(struct gk20a *g, bool *is_gpc_exception)
|
||||
|
||||
nvgpu_gr_intr_report_exception(g, 0,
|
||||
GPU_PGRAPH_PD_EXCEPTION,
|
||||
pd);
|
||||
pd, 0);
|
||||
nvgpu_err(g, "pd exception: esr 0x%08x", pd);
|
||||
nvgpu_writel(g, gr_pd_hww_esr_r(),
|
||||
gr_pd_hww_esr_reset_active_f());
|
||||
@@ -268,7 +268,7 @@ bool gm20b_gr_intr_handle_exceptions(struct gk20a *g, bool *is_gpc_exception)
|
||||
|
||||
nvgpu_gr_intr_report_exception(g, 0,
|
||||
GPU_PGRAPH_SCC_EXCEPTION,
|
||||
scc);
|
||||
scc, 0);
|
||||
nvgpu_err(g, "scc exception: esr 0x%08x", scc);
|
||||
nvgpu_writel(g, gr_scc_hww_esr_r(),
|
||||
gr_scc_hww_esr_reset_active_f());
|
||||
@@ -280,7 +280,7 @@ bool gm20b_gr_intr_handle_exceptions(struct gk20a *g, bool *is_gpc_exception)
|
||||
|
||||
nvgpu_gr_intr_report_exception(g, 0,
|
||||
GPU_PGRAPH_DS_EXCEPTION,
|
||||
ds);
|
||||
ds, 0);
|
||||
nvgpu_err(g, "ds exception: esr: 0x%08x", ds);
|
||||
nvgpu_writel(g, gr_ds_hww_esr_r(),
|
||||
gr_ds_hww_esr_reset_task_f());
|
||||
@@ -300,7 +300,7 @@ bool gm20b_gr_intr_handle_exceptions(struct gk20a *g, bool *is_gpc_exception)
|
||||
}
|
||||
nvgpu_gr_intr_report_exception(g, 0,
|
||||
GPU_PGRAPH_SSYNC_EXCEPTION,
|
||||
ssync_esr);
|
||||
ssync_esr, 0);
|
||||
}
|
||||
|
||||
if ((exception & gr_exception_mme_m()) != 0U) {
|
||||
@@ -309,7 +309,7 @@ bool gm20b_gr_intr_handle_exceptions(struct gk20a *g, bool *is_gpc_exception)
|
||||
|
||||
nvgpu_gr_intr_report_exception(g, 0,
|
||||
GPU_PGRAPH_MME_EXCEPTION,
|
||||
mme);
|
||||
mme, 0);
|
||||
nvgpu_err(g, "mme exception: esr 0x%08x info:0x%08x",
|
||||
mme, info);
|
||||
if (g->ops.gr.intr.log_mme_exception != NULL) {
|
||||
@@ -326,7 +326,7 @@ bool gm20b_gr_intr_handle_exceptions(struct gk20a *g, bool *is_gpc_exception)
|
||||
|
||||
nvgpu_gr_intr_report_exception(g, 0,
|
||||
GPU_PGRAPH_SKED_EXCEPTION,
|
||||
sked);
|
||||
sked, 0);
|
||||
nvgpu_err(g, "sked exception: esr 0x%08x", sked);
|
||||
nvgpu_writel(g, gr_sked_hww_esr_r(),
|
||||
gr_sked_hww_esr_reset_active_f());
|
||||
|
||||
@@ -607,7 +607,7 @@ void gv11b_gr_intr_handle_tpc_mpc_exception(struct gk20a *g, u32 gpc, u32 tpc)
|
||||
|
||||
nvgpu_gr_intr_report_exception(g, ((gpc << 8U) | tpc),
|
||||
GPU_PGRAPH_MPC_EXCEPTION,
|
||||
esr);
|
||||
esr, 0);
|
||||
|
||||
esr = nvgpu_readl(g,
|
||||
nvgpu_safe_add_u32(gr_gpc0_tpc0_mpc_hww_esr_info_r(),
|
||||
|
||||
@@ -42,7 +42,7 @@ void nvgpu_gr_intr_handle_notify_pending(struct gk20a *g,
|
||||
void nvgpu_gr_intr_handle_semaphore_pending(struct gk20a *g,
|
||||
struct nvgpu_gr_isr_data *isr_data);
|
||||
void nvgpu_gr_intr_report_exception(struct gk20a *g, u32 inst,
|
||||
u32 err_type, u32 status);
|
||||
u32 err_type, u32 status, u32 sub_err_type);
|
||||
struct nvgpu_channel *nvgpu_gr_intr_get_channel_from_ctx(struct gk20a *g,
|
||||
u32 curr_ctx, u32 *curr_tsgid);
|
||||
void nvgpu_gr_intr_set_error_notifier(struct gk20a *g,
|
||||
|
||||
@@ -136,6 +136,14 @@ struct ctxsw_err_info {
|
||||
#define GPU_PGRAPH_SKED_EXCEPTION (7U)
|
||||
#define GPU_PGRAPH_BE_EXCEPTION (8U)
|
||||
#define GPU_PGRAPH_MPC_EXCEPTION (9U)
|
||||
#define GPU_PGRAPH_ILLEGAL_ERROR (10U)
|
||||
|
||||
/* Sub-errors in GPU_PGRAPH_ILLEGAL_ERROR */
|
||||
#define GPU_PGRAPH_ILLEGAL_NOTIFY (0U)
|
||||
#define GPU_PGRAPH_ILLEGAL_METHOD (1U)
|
||||
#define GPU_PGRAPH_ILLEGAL_CLASS (2U)
|
||||
#define GPU_PGRAPH_CLASS_ERROR (3U)
|
||||
|
||||
struct gr_exception_info {
|
||||
u32 curr_ctx; /* Context which triggered the exception */
|
||||
u32 chid; /* Channel bound to the context */
|
||||
@@ -210,7 +218,7 @@ int nvgpu_report_ctxsw_err(struct gk20a *g, u32 hw_unit, u32 err_id,
|
||||
void *data);
|
||||
|
||||
int nvgpu_report_gr_err(struct gk20a *g, u32 hw_unit, u32 inst,
|
||||
u32 err_type, struct gr_err_info *err_info);
|
||||
u32 err_type, struct gr_err_info *err_info, u32 sub_err_type);
|
||||
|
||||
int nvgpu_report_pmu_err(struct gk20a *g, u32 hw_unit, u32 err_id,
|
||||
u32 sub_err_type, u32 status);
|
||||
|
||||
@@ -37,7 +37,7 @@ int nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst,
|
||||
}
|
||||
|
||||
int nvgpu_report_gr_err(struct gk20a *g, u32 hw_unit, u32 inst,
|
||||
u32 err_type, struct gr_err_info *err_info)
|
||||
u32 err_type, struct gr_err_info *err_info, u32 sub_err_type)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -57,7 +57,7 @@ int nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst,
|
||||
}
|
||||
|
||||
int nvgpu_report_gr_err(struct gk20a *g, u32 hw_unit, u32 inst,
|
||||
u32 err_type, struct gr_err_info *err_info)
|
||||
u32 err_type, struct gr_err_info *err_info, u32 sub_err_type)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user