mirror of
git://nv-tegra.nvidia.com/linux-nvgpu.git
synced 2025-12-23 09:57:08 +03:00
gpu: nvgpu: report class/method related errors
This patch adds support to report class/method related errors to 3LSS.
Specifically, it adds the following service ID:
NVGUARD_SERVICE_IGPU_PGRAPH_SWERR_ILLEGAL_ERROR

JIRA NVGPU-3458
JIRA NVGPU-3461

Change-Id: I9b28ed3074f664254347e059ac699470f95610b3
Signed-off-by: Rajesh Devaraj <rdevaraj@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/2136301
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
Reviewed-by: svc-mobile-misra <svc-mobile-misra@nvidia.com>
Reviewed-by: Automatic_Commit_Validation_User
GVS: Gerrit_Virtual_Submit
Reviewed-by: Raghuram Kothakota <rkothakota@nvidia.com>
Reviewed-by: Ankur Kishore <ankkishore@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
committed by
mobile promotions
parent
b7061a3263
commit
ab70c2e80f
@@ -208,7 +208,7 @@ static void gr_intr_report_sm_exception(struct gk20a *g, u32 gpc, u32 tpc,
|
|||||||
err_info.sm = sm;
|
err_info.sm = sm;
|
||||||
info.sm_mcerr_info = &err_info;
|
info.sm_mcerr_info = &err_info;
|
||||||
(void) nvgpu_report_gr_err(g, NVGPU_ERR_MODULE_SM, inst,
|
(void) nvgpu_report_gr_err(g, NVGPU_ERR_MODULE_SM, inst,
|
||||||
GPU_SM_MACHINE_CHECK_ERROR, &info);
|
GPU_SM_MACHINE_CHECK_ERROR, &info, 0U);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Used by sw interrupt thread to translate current ctx to chid.
|
/* Used by sw interrupt thread to translate current ctx to chid.
|
||||||
@@ -296,7 +296,7 @@ unlock:
|
|||||||
}
|
}
|
||||||
|
|
||||||
void nvgpu_gr_intr_report_exception(struct gk20a *g, u32 inst,
|
void nvgpu_gr_intr_report_exception(struct gk20a *g, u32 inst,
|
||||||
u32 err_type, u32 status)
|
u32 err_type, u32 status, u32 sub_err_type)
|
||||||
{
|
{
|
||||||
struct nvgpu_channel *ch;
|
struct nvgpu_channel *ch;
|
||||||
struct gr_exception_info err_info;
|
struct gr_exception_info err_info;
|
||||||
@@ -319,7 +319,7 @@ void nvgpu_gr_intr_report_exception(struct gk20a *g, u32 inst,
|
|||||||
err_info.status = status;
|
err_info.status = status;
|
||||||
info.exception_info = &err_info;
|
info.exception_info = &err_info;
|
||||||
(void) nvgpu_report_gr_err(g, NVGPU_ERR_MODULE_PGRAPH,
|
(void) nvgpu_report_gr_err(g, NVGPU_ERR_MODULE_PGRAPH,
|
||||||
inst, err_type, &info);
|
inst, err_type, &info, sub_err_type);
|
||||||
}
|
}
|
||||||
|
|
||||||
void nvgpu_gr_intr_set_error_notifier(struct gk20a *g,
|
void nvgpu_gr_intr_set_error_notifier(struct gk20a *g,
|
||||||
@@ -735,6 +735,9 @@ int nvgpu_gr_intr_stall_isr(struct gk20a *g)
|
|||||||
if (intr_info.illegal_notify != 0U) {
|
if (intr_info.illegal_notify != 0U) {
|
||||||
nvgpu_err(g, "illegal notify pending");
|
nvgpu_err(g, "illegal notify pending");
|
||||||
|
|
||||||
|
nvgpu_gr_intr_report_exception(g, 0U,
|
||||||
|
GPU_PGRAPH_ILLEGAL_ERROR, gr_intr,
|
||||||
|
GPU_PGRAPH_ILLEGAL_NOTIFY);
|
||||||
nvgpu_gr_intr_set_error_notifier(g, &isr_data,
|
nvgpu_gr_intr_set_error_notifier(g, &isr_data,
|
||||||
NVGPU_ERR_NOTIFIER_GR_ILLEGAL_NOTIFY);
|
NVGPU_ERR_NOTIFIER_GR_ILLEGAL_NOTIFY);
|
||||||
need_reset = true;
|
need_reset = true;
|
||||||
@@ -742,6 +745,9 @@ int nvgpu_gr_intr_stall_isr(struct gk20a *g)
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (intr_info.illegal_method != 0U) {
|
if (intr_info.illegal_method != 0U) {
|
||||||
|
nvgpu_gr_intr_report_exception(g, 0U,
|
||||||
|
GPU_PGRAPH_ILLEGAL_ERROR, gr_intr,
|
||||||
|
GPU_PGRAPH_ILLEGAL_METHOD);
|
||||||
if (gr_intr_handle_illegal_method(g, &isr_data) != 0) {
|
if (gr_intr_handle_illegal_method(g, &isr_data) != 0) {
|
||||||
need_reset = true;
|
need_reset = true;
|
||||||
}
|
}
|
||||||
@@ -749,6 +755,9 @@ int nvgpu_gr_intr_stall_isr(struct gk20a *g)
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (intr_info.illegal_class != 0U) {
|
if (intr_info.illegal_class != 0U) {
|
||||||
|
nvgpu_gr_intr_report_exception(g, 0U,
|
||||||
|
GPU_PGRAPH_ILLEGAL_ERROR, gr_intr,
|
||||||
|
GPU_PGRAPH_ILLEGAL_CLASS);
|
||||||
nvgpu_err(g, "invalid class 0x%08x, offset 0x%08x",
|
nvgpu_err(g, "invalid class 0x%08x, offset 0x%08x",
|
||||||
isr_data.class_num, isr_data.offset);
|
isr_data.class_num, isr_data.offset);
|
||||||
|
|
||||||
@@ -766,6 +775,9 @@ int nvgpu_gr_intr_stall_isr(struct gk20a *g)
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (intr_info.class_error != 0U) {
|
if (intr_info.class_error != 0U) {
|
||||||
|
nvgpu_gr_intr_report_exception(g, 0U,
|
||||||
|
GPU_PGRAPH_ILLEGAL_ERROR, gr_intr,
|
||||||
|
GPU_PGRAPH_CLASS_ERROR);
|
||||||
if (gr_intr_handle_class_error(g, &isr_data) != 0) {
|
if (gr_intr_handle_class_error(g, &isr_data) != 0) {
|
||||||
need_reset = true;
|
need_reset = true;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -231,7 +231,7 @@ bool gm20b_gr_intr_handle_exceptions(struct gk20a *g, bool *is_gpc_exception)
|
|||||||
|
|
||||||
nvgpu_gr_intr_report_exception(g, 0,
|
nvgpu_gr_intr_report_exception(g, 0,
|
||||||
GPU_PGRAPH_FE_EXCEPTION,
|
GPU_PGRAPH_FE_EXCEPTION,
|
||||||
fe);
|
fe, 0);
|
||||||
nvgpu_err(g, "fe exception: esr 0x%08x, info 0x%08x",
|
nvgpu_err(g, "fe exception: esr 0x%08x, info 0x%08x",
|
||||||
fe, info);
|
fe, info);
|
||||||
nvgpu_writel(g, gr_fe_hww_esr_r(),
|
nvgpu_writel(g, gr_fe_hww_esr_r(),
|
||||||
@@ -244,7 +244,7 @@ bool gm20b_gr_intr_handle_exceptions(struct gk20a *g, bool *is_gpc_exception)
|
|||||||
|
|
||||||
nvgpu_gr_intr_report_exception(g, 0,
|
nvgpu_gr_intr_report_exception(g, 0,
|
||||||
GPU_PGRAPH_MEMFMT_EXCEPTION,
|
GPU_PGRAPH_MEMFMT_EXCEPTION,
|
||||||
memfmt);
|
memfmt, 0);
|
||||||
nvgpu_err(g, "memfmt exception: esr %08x", memfmt);
|
nvgpu_err(g, "memfmt exception: esr %08x", memfmt);
|
||||||
nvgpu_writel(g, gr_memfmt_hww_esr_r(),
|
nvgpu_writel(g, gr_memfmt_hww_esr_r(),
|
||||||
gr_memfmt_hww_esr_reset_active_f());
|
gr_memfmt_hww_esr_reset_active_f());
|
||||||
@@ -256,7 +256,7 @@ bool gm20b_gr_intr_handle_exceptions(struct gk20a *g, bool *is_gpc_exception)
|
|||||||
|
|
||||||
nvgpu_gr_intr_report_exception(g, 0,
|
nvgpu_gr_intr_report_exception(g, 0,
|
||||||
GPU_PGRAPH_PD_EXCEPTION,
|
GPU_PGRAPH_PD_EXCEPTION,
|
||||||
pd);
|
pd, 0);
|
||||||
nvgpu_err(g, "pd exception: esr 0x%08x", pd);
|
nvgpu_err(g, "pd exception: esr 0x%08x", pd);
|
||||||
nvgpu_writel(g, gr_pd_hww_esr_r(),
|
nvgpu_writel(g, gr_pd_hww_esr_r(),
|
||||||
gr_pd_hww_esr_reset_active_f());
|
gr_pd_hww_esr_reset_active_f());
|
||||||
@@ -268,7 +268,7 @@ bool gm20b_gr_intr_handle_exceptions(struct gk20a *g, bool *is_gpc_exception)
|
|||||||
|
|
||||||
nvgpu_gr_intr_report_exception(g, 0,
|
nvgpu_gr_intr_report_exception(g, 0,
|
||||||
GPU_PGRAPH_SCC_EXCEPTION,
|
GPU_PGRAPH_SCC_EXCEPTION,
|
||||||
scc);
|
scc, 0);
|
||||||
nvgpu_err(g, "scc exception: esr 0x%08x", scc);
|
nvgpu_err(g, "scc exception: esr 0x%08x", scc);
|
||||||
nvgpu_writel(g, gr_scc_hww_esr_r(),
|
nvgpu_writel(g, gr_scc_hww_esr_r(),
|
||||||
gr_scc_hww_esr_reset_active_f());
|
gr_scc_hww_esr_reset_active_f());
|
||||||
@@ -280,7 +280,7 @@ bool gm20b_gr_intr_handle_exceptions(struct gk20a *g, bool *is_gpc_exception)
|
|||||||
|
|
||||||
nvgpu_gr_intr_report_exception(g, 0,
|
nvgpu_gr_intr_report_exception(g, 0,
|
||||||
GPU_PGRAPH_DS_EXCEPTION,
|
GPU_PGRAPH_DS_EXCEPTION,
|
||||||
ds);
|
ds, 0);
|
||||||
nvgpu_err(g, "ds exception: esr: 0x%08x", ds);
|
nvgpu_err(g, "ds exception: esr: 0x%08x", ds);
|
||||||
nvgpu_writel(g, gr_ds_hww_esr_r(),
|
nvgpu_writel(g, gr_ds_hww_esr_r(),
|
||||||
gr_ds_hww_esr_reset_task_f());
|
gr_ds_hww_esr_reset_task_f());
|
||||||
@@ -300,7 +300,7 @@ bool gm20b_gr_intr_handle_exceptions(struct gk20a *g, bool *is_gpc_exception)
|
|||||||
}
|
}
|
||||||
nvgpu_gr_intr_report_exception(g, 0,
|
nvgpu_gr_intr_report_exception(g, 0,
|
||||||
GPU_PGRAPH_SSYNC_EXCEPTION,
|
GPU_PGRAPH_SSYNC_EXCEPTION,
|
||||||
ssync_esr);
|
ssync_esr, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
if ((exception & gr_exception_mme_m()) != 0U) {
|
if ((exception & gr_exception_mme_m()) != 0U) {
|
||||||
@@ -309,7 +309,7 @@ bool gm20b_gr_intr_handle_exceptions(struct gk20a *g, bool *is_gpc_exception)
|
|||||||
|
|
||||||
nvgpu_gr_intr_report_exception(g, 0,
|
nvgpu_gr_intr_report_exception(g, 0,
|
||||||
GPU_PGRAPH_MME_EXCEPTION,
|
GPU_PGRAPH_MME_EXCEPTION,
|
||||||
mme);
|
mme, 0);
|
||||||
nvgpu_err(g, "mme exception: esr 0x%08x info:0x%08x",
|
nvgpu_err(g, "mme exception: esr 0x%08x info:0x%08x",
|
||||||
mme, info);
|
mme, info);
|
||||||
if (g->ops.gr.intr.log_mme_exception != NULL) {
|
if (g->ops.gr.intr.log_mme_exception != NULL) {
|
||||||
@@ -326,7 +326,7 @@ bool gm20b_gr_intr_handle_exceptions(struct gk20a *g, bool *is_gpc_exception)
|
|||||||
|
|
||||||
nvgpu_gr_intr_report_exception(g, 0,
|
nvgpu_gr_intr_report_exception(g, 0,
|
||||||
GPU_PGRAPH_SKED_EXCEPTION,
|
GPU_PGRAPH_SKED_EXCEPTION,
|
||||||
sked);
|
sked, 0);
|
||||||
nvgpu_err(g, "sked exception: esr 0x%08x", sked);
|
nvgpu_err(g, "sked exception: esr 0x%08x", sked);
|
||||||
nvgpu_writel(g, gr_sked_hww_esr_r(),
|
nvgpu_writel(g, gr_sked_hww_esr_r(),
|
||||||
gr_sked_hww_esr_reset_active_f());
|
gr_sked_hww_esr_reset_active_f());
|
||||||
|
|||||||
@@ -607,7 +607,7 @@ void gv11b_gr_intr_handle_tpc_mpc_exception(struct gk20a *g, u32 gpc, u32 tpc)
|
|||||||
|
|
||||||
nvgpu_gr_intr_report_exception(g, ((gpc << 8U) | tpc),
|
nvgpu_gr_intr_report_exception(g, ((gpc << 8U) | tpc),
|
||||||
GPU_PGRAPH_MPC_EXCEPTION,
|
GPU_PGRAPH_MPC_EXCEPTION,
|
||||||
esr);
|
esr, 0);
|
||||||
|
|
||||||
esr = nvgpu_readl(g,
|
esr = nvgpu_readl(g,
|
||||||
nvgpu_safe_add_u32(gr_gpc0_tpc0_mpc_hww_esr_info_r(),
|
nvgpu_safe_add_u32(gr_gpc0_tpc0_mpc_hww_esr_info_r(),
|
||||||
|
|||||||
@@ -42,7 +42,7 @@ void nvgpu_gr_intr_handle_notify_pending(struct gk20a *g,
|
|||||||
void nvgpu_gr_intr_handle_semaphore_pending(struct gk20a *g,
|
void nvgpu_gr_intr_handle_semaphore_pending(struct gk20a *g,
|
||||||
struct nvgpu_gr_isr_data *isr_data);
|
struct nvgpu_gr_isr_data *isr_data);
|
||||||
void nvgpu_gr_intr_report_exception(struct gk20a *g, u32 inst,
|
void nvgpu_gr_intr_report_exception(struct gk20a *g, u32 inst,
|
||||||
u32 err_type, u32 status);
|
u32 err_type, u32 status, u32 sub_err_type);
|
||||||
struct nvgpu_channel *nvgpu_gr_intr_get_channel_from_ctx(struct gk20a *g,
|
struct nvgpu_channel *nvgpu_gr_intr_get_channel_from_ctx(struct gk20a *g,
|
||||||
u32 curr_ctx, u32 *curr_tsgid);
|
u32 curr_ctx, u32 *curr_tsgid);
|
||||||
void nvgpu_gr_intr_set_error_notifier(struct gk20a *g,
|
void nvgpu_gr_intr_set_error_notifier(struct gk20a *g,
|
||||||
|
|||||||
@@ -136,6 +136,14 @@ struct ctxsw_err_info {
|
|||||||
#define GPU_PGRAPH_SKED_EXCEPTION (7U)
|
#define GPU_PGRAPH_SKED_EXCEPTION (7U)
|
||||||
#define GPU_PGRAPH_BE_EXCEPTION (8U)
|
#define GPU_PGRAPH_BE_EXCEPTION (8U)
|
||||||
#define GPU_PGRAPH_MPC_EXCEPTION (9U)
|
#define GPU_PGRAPH_MPC_EXCEPTION (9U)
|
||||||
|
#define GPU_PGRAPH_ILLEGAL_ERROR (10U)
|
||||||
|
|
||||||
|
/* Sub-errors in GPU_PGRAPH_ILLEGAL_ERROR */
|
||||||
|
#define GPU_PGRAPH_ILLEGAL_NOTIFY (0U)
|
||||||
|
#define GPU_PGRAPH_ILLEGAL_METHOD (1U)
|
||||||
|
#define GPU_PGRAPH_ILLEGAL_CLASS (2U)
|
||||||
|
#define GPU_PGRAPH_CLASS_ERROR (3U)
|
||||||
|
|
||||||
struct gr_exception_info {
|
struct gr_exception_info {
|
||||||
u32 curr_ctx; /* Context which triggered the exception */
|
u32 curr_ctx; /* Context which triggered the exception */
|
||||||
u32 chid; /* Channel bound to the context */
|
u32 chid; /* Channel bound to the context */
|
||||||
@@ -210,7 +218,7 @@ int nvgpu_report_ctxsw_err(struct gk20a *g, u32 hw_unit, u32 err_id,
|
|||||||
void *data);
|
void *data);
|
||||||
|
|
||||||
int nvgpu_report_gr_err(struct gk20a *g, u32 hw_unit, u32 inst,
|
int nvgpu_report_gr_err(struct gk20a *g, u32 hw_unit, u32 inst,
|
||||||
u32 err_type, struct gr_err_info *err_info);
|
u32 err_type, struct gr_err_info *err_info, u32 sub_err_type);
|
||||||
|
|
||||||
int nvgpu_report_pmu_err(struct gk20a *g, u32 hw_unit, u32 err_id,
|
int nvgpu_report_pmu_err(struct gk20a *g, u32 hw_unit, u32 err_id,
|
||||||
u32 sub_err_type, u32 status);
|
u32 sub_err_type, u32 status);
|
||||||
|
|||||||
@@ -37,7 +37,7 @@ int nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst,
|
|||||||
}
|
}
|
||||||
|
|
||||||
int nvgpu_report_gr_err(struct gk20a *g, u32 hw_unit, u32 inst,
|
int nvgpu_report_gr_err(struct gk20a *g, u32 hw_unit, u32 inst,
|
||||||
u32 err_type, struct gr_err_info *err_info)
|
u32 err_type, struct gr_err_info *err_info, u32 sub_err_type)
|
||||||
{
|
{
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -57,7 +57,7 @@ int nvgpu_report_ecc_err(struct gk20a *g, u32 hw_unit, u32 inst,
|
|||||||
}
|
}
|
||||||
|
|
||||||
int nvgpu_report_gr_err(struct gk20a *g, u32 hw_unit, u32 inst,
|
int nvgpu_report_gr_err(struct gk20a *g, u32 hw_unit, u32 inst,
|
||||||
u32 err_type, struct gr_err_info *err_info)
|
u32 err_type, struct gr_err_info *err_info, u32 sub_err_type)
|
||||||
{
|
{
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user