gpu: nvgpu: update reporting of errors to sdl

In Drive 6.0, error reporting is supported only for Orin (ga10b)
in dev-main. To that end, this patch does the following:

- Removes the redundant reporting of the following IDs from gv11b:
  - GPU_HOST_PFIFO_SCHED_ERROR
  - GPU_HOST_PFIFO_CTXSW_TIMEOUT_ERROR
  - GPU_HOST_PBDMA_HCE_ERROR
  - GPU_MMU_L1TLB_SA_DATA_ECC_UNCORRECTED
  - GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED
  - GPU_LTC_CACHE_DSTG_ECC_CORRECTED
  - GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED

- Migrates the reporting of the following IDs from gv11b to ga10b:
  - GPU_SM_L1_TAG_ECC_CORRECTED
  - GPU_SM_L1_TAG_ECC_UNCORRECTED
  - GPU_SM_CBU_ECC_UNCORRECTED
  - GPU_SM_LRF_ECC_UNCORRECTED
  - GPU_SM_L1_DATA_ECC_UNCORRECTED
  - GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED
  - GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED
  - GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED
  - GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED

- Removes the unused ID that doesn't have any HSI related to it:
  - GPU_HOST_PBDMA_PREEMPT_ERROR

In addition to the above, this patch does the following:
- Updates the error IDs related to page fault errors.
- Updates the look-up table to remove unused error IDs.

JIRA NVGPU-8094
Bug 200729736

Change-Id: Ifea76d38ba609c894560e61ff5a6e406290f919e
Signed-off-by: Rajesh Devaraj <rdevaraj@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2685249
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
Reviewed-by: svc-mobile-cert <svc-mobile-cert@nvidia.com>
Reviewed-by: Dinesh T <dt@nvidia.com>
Reviewed-by: Vaibhav Kachore <vkachore@nvidia.com>
GVS: Gerrit_Virtual_Submit
This commit is contained in:
Rajesh Devaraj
2022-03-22 17:11:29 +05:30
committed by mobile promotions
parent 7ff977063b
commit 37c6b8b1c3
10 changed files with 83 additions and 163 deletions

View File

@@ -45,7 +45,7 @@ struct nvgpu_err_hw_module ga10b_err_lut[] = {
.name = "host",
.hw_unit = (u32)NVGPU_ERR_MODULE_HOST,
.num_instances = 1U,
.num_errs = 17U,
.num_errs = 16U,
.errs = (struct nvgpu_err_desc[]) {
GPU_CRITERR("pfifo_bind_error",
GPU_HOST_PFIFO_BIND_ERROR, INJECT_SW,
@@ -113,11 +113,6 @@ struct nvgpu_err_hw_module ga10b_err_lut[] = {
INJECT_SW,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_CRITERR("pbdma_preempt_error",
GPU_HOST_PBDMA_PREEMPT_ERROR,
INJECT_SW,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_NONCRITERR("pfifo_ctxsw_timeout",
GPU_HOST_PFIFO_CTXSW_TIMEOUT_ERROR,
INJECT_SW,
@@ -134,7 +129,7 @@ struct nvgpu_err_hw_module ga10b_err_lut[] = {
.name = "sm",
.hw_unit = (u32)NVGPU_ERR_MODULE_SM,
.num_instances = 8U,
.num_errs = 21U,
.num_errs = 12U,
.errs = (struct nvgpu_err_desc[]) {
GPU_NONCRITERR("l1_tag_ecc_corrected",
GPU_SM_L1_TAG_ECC_CORRECTED,
@@ -146,73 +141,41 @@ struct nvgpu_err_hw_module ga10b_err_lut[] = {
INJECT_TYPE,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_NONCRITERR("cbu_ecc_corrected",
0, INJECT_NONE,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_CRITERR("cbu_ecc_uncorrected",
GPU_SM_CBU_ECC_UNCORRECTED,
INJECT_TYPE,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_NONCRITERR("lrf_ecc_corrected",
0, INJECT_NONE,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_CRITERR("lrf_ecc_uncorrected",
GPU_SM_LRF_ECC_UNCORRECTED,
INJECT_TYPE,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_NONCRITERR("l1_data_ecc_corrected",
0, INJECT_NONE,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_CRITERR("l1_data_ecc_uncorrected",
GPU_SM_L1_DATA_ECC_UNCORRECTED,
INJECT_TYPE,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_NONCRITERR("icache_l0_data_ecc_corrected",
0, INJECT_NONE,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_CRITERR("icache_l0_data_ecc_uncorrected",
GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED,
INJECT_TYPE,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_NONCRITERR("icache_l1_data_ecc_corrected",
0, INJECT_NONE,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_CRITERR("icache_l1_data_ecc_uncorrected",
GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED,
INJECT_SW,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_NONCRITERR("icache_l0_predecode_ecc_corrected",
0, INJECT_NONE,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_CRITERR("icache_l0_predecode_ecc_uncorrected",
GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED,
INJECT_SW,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_NONCRITERR("l1_tag_miss_fifo_ecc_corrected",
0, INJECT_NONE,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_CRITERR("l1_tag_miss_fifo_ecc_uncorrected",
GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED,
INJECT_SW,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_NONCRITERR("l1_tag_s2r_pixprf_ecc_corrected",
0, INJECT_NONE,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_CRITERR("l1_tag_s2r_pixprf_ecc_uncorrected",
GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED,
INJECT_SW,
@@ -223,12 +186,9 @@ struct nvgpu_err_hw_module ga10b_err_lut[] = {
INJECT_SW,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_NONCRITERR("icache_l1_predecode_ecc_corrected",
0, INJECT_NONE,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_CRITERR("icache_l1_predecode_ecc_uncorrected",
0, INJECT_NONE,
GPU_CRITERR("rams_urf_ecc_uncorrected",
GPU_SM_RAMS_URF_ECC_UNCORRECTED,
INJECT_NONE,
NULL, NULL,
NULL, NULL, 0, 0),
},
@@ -237,7 +197,7 @@ struct nvgpu_err_hw_module ga10b_err_lut[] = {
.name = "fecs",
.hw_unit = (u32)NVGPU_ERR_MODULE_FECS,
.num_instances = 1U,
.num_errs = 8U,
.num_errs = 7U,
.errs = (struct nvgpu_err_desc[]) {
GPU_NONCRITERR("falcon_imem_ecc_corrected",
GPU_FECS_FALCON_IMEM_ECC_CORRECTED,
@@ -249,10 +209,6 @@ struct nvgpu_err_hw_module ga10b_err_lut[] = {
INJECT_TYPE,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_NONCRITERR("falcon_dmem_ecc_corrected",
0, INJECT_NONE,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_CRITERR("falcon_dmem_ecc_uncorrected",
GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED,
INJECT_SW,
@@ -284,7 +240,7 @@ struct nvgpu_err_hw_module ga10b_err_lut[] = {
.name = "gpccs",
.hw_unit = (u32)NVGPU_ERR_MODULE_GPCCS,
.num_instances = 1U,
.num_errs = 4U,
.num_errs = 3U,
.errs = (struct nvgpu_err_desc[]) {
GPU_NONCRITERR("falcon_imem_ecc_corrected",
GPU_GPCCS_FALCON_IMEM_ECC_CORRECTED,
@@ -296,10 +252,6 @@ struct nvgpu_err_hw_module ga10b_err_lut[] = {
INJECT_TYPE,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_NONCRITERR("falcon_dmem_ecc_corrected",
0, INJECT_NONE,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_CRITERR("falcon_dmem_ecc_uncorrected",
GPU_GPCCS_FALCON_DMEM_ECC_UNCORRECTED,
INJECT_SW,
@@ -311,21 +263,13 @@ struct nvgpu_err_hw_module ga10b_err_lut[] = {
.name = "mmu",
.hw_unit = (u32)NVGPU_ERR_MODULE_MMU,
.num_instances = 1U,
.num_errs = 4U,
.num_errs = 2U,
.errs = (struct nvgpu_err_desc[]) {
GPU_NONCRITERR("l1tlb_sa_data_ecc_corrected",
0, INJECT_NONE,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_CRITERR("l1tlb_sa_data_ecc_uncorrected",
GPU_MMU_L1TLB_SA_DATA_ECC_UNCORRECTED,
INJECT_TYPE,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_NONCRITERR("l1tlb_fa_data_ecc_corrected",
0, INJECT_NONE,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_CRITERR("l1tlb_fa_data_ecc_uncorrected",
GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED,
INJECT_SW,
@@ -337,12 +281,8 @@ struct nvgpu_err_hw_module ga10b_err_lut[] = {
.name = "gcc",
.hw_unit = (u32)NVGPU_ERR_MODULE_GCC,
.num_instances = 1U,
.num_errs = 2U,
.num_errs = 1U,
.errs = (struct nvgpu_err_desc[]) {
GPU_NONCRITERR("l15_ecc_corrected",
0, INJECT_NONE,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_CRITERR("l15_ecc_uncorrected",
GPU_GCC_L15_ECC_UNCORRECTED,
INJECT_TYPE,
@@ -555,44 +495,48 @@ struct nvgpu_err_hw_module ga10b_err_lut[] = {
.num_instances = 1U,
.num_errs = 9U,
.errs = (struct nvgpu_err_desc[]) {
GPU_NONCRITERR("hubmmu_l2tlb_sa_data_ecc_corrected",
0, INJECT_NONE,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_CRITERR("hubmmu_l2tlb_sa_data_ecc_uncorrected",
GPU_HUBMMU_L2TLB_SA_DATA_ECC_UNCORRECTED,
INJECT_TYPE,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_NONCRITERR("hubmmu_tlb_sa_data_ecc_corrected",
0, INJECT_NONE,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_CRITERR("hubmmu_tlb_sa_data_ecc_uncorrected",
GPU_HUBMMU_TLB_SA_DATA_ECC_UNCORRECTED,
INJECT_TYPE,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_NONCRITERR("hubmmu_pte_data_ecc_corrected",
0, INJECT_NONE,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_CRITERR("hubmmu_pte_data_ecc_uncorrected",
GPU_HUBMMU_PTE_DATA_ECC_UNCORRECTED,
INJECT_TYPE,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_NONCRITERR("hubmmu_pde0_data_ecc_corrected",
0, INJECT_NONE,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_CRITERR("hubmmu_pde0_data_ecc_uncorrected",
GPU_HUBMMU_PDE0_DATA_ECC_UNCORRECTED,
INJECT_SW,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_CRITERR("hubmmu_page_fault_error",
GPU_HUBMMU_PAGE_FAULT_ERROR,
GPU_CRITERR("hubmmu_page_fault_other_fault_notify_error",
GPU_HUBMMU_PAGE_FAULT_OTHER_FAULT_NOTIFY_ERROR,
INJECT_SW,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_CRITERR("hubmmu_page_fault_nonreplayable_fault_overflow_error",
GPU_HUBMMU_PAGE_FAULT_NONREPLAYABLE_FAULT_OVERFLOW_ERROR,
INJECT_SW,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_CRITERR("hubmmu_page_fault_replayable_fault_overflow_error",
GPU_HUBMMU_PAGE_FAULT_REPLAYABLE_FAULT_OVERFLOW_ERROR,
INJECT_SW,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_CRITERR("hubmmu_page_fault_replayable_fault_notify_error",
GPU_HUBMMU_PAGE_FAULT_REPLAYABLE_FAULT_NOTIFY_ERROR,
INJECT_SW,
NULL, NULL,
NULL, NULL, 0, 0),
GPU_CRITERR("hubmmu_page_fault_nonreplayable_fault_notify_error",
GPU_HUBMMU_PAGE_FAULT_NONREPLAYABLE_FAULT_NOTIFY_ERROR,
INJECT_SW,
NULL, NULL,
NULL, NULL, 0, 0),

View File

@@ -513,7 +513,7 @@ void gv11b_fb_handle_mmu_fault(struct gk20a *g, u32 niso_intr)
fb_niso_intr_mmu_other_fault_notify_m()) != 0U) {
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_HUBMMU,
GPU_HUBMMU_PAGE_FAULT_ERROR);
GPU_HUBMMU_PAGE_FAULT_OTHER_FAULT_NOTIFY_ERROR);
nvgpu_err(g, "GPU_HUBMMU_PAGE_FAULT_ERROR. "
"sub-err: OTHER_FAULT_NOTIFY. "
"fault_status(0x%x)", fault_status);
@@ -542,7 +542,7 @@ void gv11b_fb_handle_mmu_fault(struct gk20a *g, u32 niso_intr)
fb_niso_intr_mmu_nonreplayable_fault_overflow_m()) != 0U) {
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_HUBMMU,
GPU_HUBMMU_PAGE_FAULT_ERROR);
GPU_HUBMMU_PAGE_FAULT_NONREPLAYABLE_FAULT_OVERFLOW_ERROR);
nvgpu_err(g, "GPU_HUBMMU_PAGE_FAULT_ERROR. "
"sub-err: NONREPLAYABLE_FAULT_OVERFLOW. "
"fault_status(0x%x)", fault_status);
@@ -567,7 +567,7 @@ void gv11b_fb_handle_mmu_fault(struct gk20a *g, u32 niso_intr)
fb_niso_intr_mmu_replayable_fault_overflow_m()) != 0U) {
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_HUBMMU,
GPU_HUBMMU_PAGE_FAULT_ERROR);
GPU_HUBMMU_PAGE_FAULT_REPLAYABLE_FAULT_OVERFLOW_ERROR);
nvgpu_err(g, "GPU_HUBMMU_PAGE_FAULT_ERROR. "
"sub-err: REPLAYABLE_FAULT_OVERFLOW. "
"fault_status(0x%x)", fault_status);

View File

@@ -215,9 +215,6 @@ bool gv11b_fifo_handle_ctxsw_timeout(struct gk20a *g)
continue;
}
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_HOST,
GPU_HOST_PFIFO_CTXSW_TIMEOUT_ERROR);
#ifdef CONFIG_NVGPU_KERNEL_MODE_SUBMIT
recover = g->ops.tsg.check_ctxsw_timeout(tsg,
&debug_dump, &ms);

View File

@@ -132,9 +132,6 @@ bool gv11b_fifo_handle_sched_error(struct gk20a *g)
nvgpu_err(g, "fifo sched error code not supported");
}
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_HOST,
GPU_HOST_PFIFO_SCHED_ERROR);
if (sched_error == SCHED_ERROR_CODE_BAD_TSG) {
/* id is unknown, preempt all runlists and do recovery */
nvgpu_rc_sched_error_bad_tsg(g);

View File

@@ -195,9 +195,6 @@ bool gv11b_pbdma_handle_intr_1(struct gk20a *g, u32 pbdma_id, u32 pbdma_intr_1,
recover = true;
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_HOST,
GPU_HOST_PBDMA_HCE_ERROR);
if ((pbdma_intr_1 & pbdma_intr_1_ctxnotvalid_pending_f()) != 0U) {
nvgpu_log(g, gpu_dbg_intr, "ctxnotvalid intr on pbdma id %d",
pbdma_id);

View File

@@ -483,6 +483,9 @@ static void ga10b_gr_intr_set_l1_tag_uncorrected_err(struct gk20a *g,
if ((l1_tag_ecc_status &
gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_el1_0_m()) != 0U) {
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM,
GPU_SM_L1_TAG_ECC_UNCORRECTED);
nvgpu_err(g, "sm_l1_tag_ecc_uncorrected");
ecc_status->err_id[ecc_status->err_count] =
GPU_SM_L1_TAG_ECC_UNCORRECTED;
ecc_status->err_count =
@@ -491,6 +494,9 @@ static void ga10b_gr_intr_set_l1_tag_uncorrected_err(struct gk20a *g,
if ((l1_tag_ecc_status &
gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_miss_fifo_m()) != 0U) {
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM,
GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED);
nvgpu_err(g, "sm_l1_tag_miss_fifo_ecc_uncorrected");
ecc_status->err_id[ecc_status->err_count] =
GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED;
ecc_status->err_count =
@@ -499,6 +505,9 @@ static void ga10b_gr_intr_set_l1_tag_uncorrected_err(struct gk20a *g,
if ((l1_tag_ecc_status &
gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_uncorrected_err_pixrpf_m()) != 0U) {
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM,
GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED);
nvgpu_err(g, "sm_l1_tag_s2r_pixprf_ecc_uncorrected");
ecc_status->err_id[ecc_status->err_count] =
GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED;
ecc_status->err_count =
@@ -513,6 +522,9 @@ static void ga10b_gr_intr_set_l1_tag_corrected_err(struct gk20a *g,
if ((l1_tag_ecc_status &
gr_pri_gpc0_tpc0_sm_l1_tag_ecc_status_corrected_err_el1_0_m()) != 0U) {
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM,
GPU_SM_L1_TAG_ECC_CORRECTED);
nvgpu_err(g, "sm_l1_tag_ecc_corrected");
ecc_status->err_id[ecc_status->err_count] =
GPU_SM_L1_TAG_ECC_CORRECTED;
ecc_status->err_count =
@@ -569,6 +581,9 @@ static bool ga10b_gr_intr_sm_lrf_ecc_status_errors(struct gk20a *g,
ecc_status->err_count = 0U;
if (uncorr_err != 0U) {
/* LRF path: report the LRF ID, the same one recorded in ecc_status->err_id. */
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM,
GPU_SM_LRF_ECC_UNCORRECTED);
nvgpu_err(g, "sm_lrf_ecc_uncorrected");
ecc_status->err_id[ecc_status->err_count] =
GPU_SM_LRF_ECC_UNCORRECTED;
ecc_status->err_count =
@@ -604,6 +619,9 @@ static bool ga10b_gr_intr_sm_cbu_ecc_status_errors(struct gk20a *g,
ecc_status->err_count = 0;
if (uncorr_err != 0U) {
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM,
GPU_SM_CBU_ECC_UNCORRECTED);
nvgpu_err(g, "sm_cbu_ecc_uncorrected");
ecc_status->err_id[ecc_status->err_count] =
GPU_SM_CBU_ECC_UNCORRECTED;
ecc_status->err_count =
@@ -636,6 +654,9 @@ static bool ga10b_gr_intr_sm_l1_data_ecc_status_errors(struct gk20a *g,
ecc_status->err_count = 0U;
if (uncorr_err != 0U) {
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM,
GPU_SM_L1_DATA_ECC_UNCORRECTED);
nvgpu_err(g, "sm_l1_data_ecc_uncorrected");
ecc_status->err_id[ecc_status->err_count] =
GPU_SM_L1_DATA_ECC_UNCORRECTED;
ecc_status->err_count =
@@ -727,6 +748,9 @@ static bool ga10b_gr_intr_sm_icache_ecc_status_errors(struct gk20a *g,
ecc_status->err_count = 0U;
if (uncorr_err != 0U) {
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM,
GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED);
nvgpu_err(g, "sm_icache_l1_data_ecc_uncorrected");
ecc_status->err_id[ecc_status->err_count] =
GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED;
ecc_status->err_count =

View File

@@ -371,8 +371,6 @@ static void gv11b_gr_intr_report_gpcmmu_ecc_err(struct gk20a *g,
if ((ecc_status &
gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_sa_data_m()) !=
0U) {
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_MMU,
GPU_MMU_L1TLB_SA_DATA_ECC_UNCORRECTED);
nvgpu_err(g, "uncorrected ecc sa data error. gpc_id(%d)", gpc);
}
if ((ecc_status &
@@ -387,8 +385,6 @@ static void gv11b_gr_intr_report_gpcmmu_ecc_err(struct gk20a *g,
if ((ecc_status &
gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_fa_data_m()) !=
0U) {
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_MMU,
GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED);
nvgpu_err(g, "uncorrected ecc fa data error. gpc_id(%d)", gpc);
}
}
@@ -909,22 +905,16 @@ static void gv11b_gr_intr_report_l1_tag_uncorrected_err(struct gk20a *g,
if (ecc_status->err_id[i] == GPU_SM_L1_TAG_ECC_UNCORRECTED) {
nvgpu_err(g, "sm_l1_tag_ecc_uncorrected "
"gpc_id(%d), tpc_id(%d)", gpc, tpc);
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM,
GPU_SM_L1_TAG_ECC_UNCORRECTED);
}
if (ecc_status->err_id[i] == GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED) {
nvgpu_err(g, "sm_l1_tag_miss_fifo_ecc_uncorrected "
"gpc_id(%d), tpc_id(%d)", gpc, tpc);
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM,
GPU_SM_L1_TAG_MISS_FIFO_ECC_UNCORRECTED);
}
if (ecc_status->err_id[i] == GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED) {
nvgpu_err(g, "sm_l1_tag_s2r_pixprf_ecc_uncorrected "
"gpc_id(%d), tpc_id(%d)", gpc, tpc);
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM,
GPU_SM_L1_TAG_S2R_PIXPRF_ECC_UNCORRECTED);
}
}
}
@@ -944,8 +934,6 @@ static void gv11b_gr_intr_report_l1_tag_corrected_err(struct gk20a *g,
if (ecc_status->err_id[i] == GPU_SM_L1_TAG_ECC_CORRECTED) {
nvgpu_err(g, "sm_l1_tag_ecc_corrected "
"gpc_id(%d), tpc_id(%d)", gpc, tpc);
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM,
GPU_SM_L1_TAG_ECC_CORRECTED);
}
}
}
@@ -1265,8 +1253,6 @@ static void gv11b_gr_intr_handle_lrf_exception(struct gk20a *g, u32 gpc, u32 tpc
nvgpu_safe_add_u32(
g->ecc.gr.sm_lrf_ecc_double_err_count[gpc][tpc].counter,
lrf_uncorrected_err_count_delta);
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM,
GPU_SM_LRF_ECC_UNCORRECTED);
nvgpu_writel(g, nvgpu_safe_add_u32(
gr_pri_gpc0_tpc0_sm_lrf_ecc_uncorrected_err_count_r(), offset),
0U);
@@ -1398,8 +1384,6 @@ static void gv11b_gr_intr_handle_cbu_exception(struct gk20a *g, u32 gpc, u32 tpc
nvgpu_safe_add_u32(
g->ecc.gr.sm_cbu_ecc_uncorrected_err_count[gpc][tpc].counter,
cbu_uncorrected_err_count_delta);
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM,
GPU_SM_CBU_ECC_UNCORRECTED);
nvgpu_writel(g, nvgpu_safe_add_u32(
gr_pri_gpc0_tpc0_sm_cbu_ecc_uncorrected_err_count_r(), offset),
0U);
@@ -1527,8 +1511,6 @@ static void gv11b_gr_intr_handle_l1_data_exception(struct gk20a *g, u32 gpc, u32
nvgpu_safe_add_u32(
g->ecc.gr.sm_l1_data_ecc_uncorrected_err_count[gpc][tpc].counter,
l1_data_uncorrected_err_count_delta);
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM,
GPU_SM_L1_DATA_ECC_UNCORRECTED);
nvgpu_writel(g, nvgpu_safe_add_u32(
gr_pri_gpc0_tpc0_sm_l1_data_ecc_uncorrected_err_count_r(), offset),
0U);
@@ -1553,22 +1535,16 @@ static void gv11b_gr_intr_report_icache_uncorrected_err(struct gk20a *g,
if (ecc_status->err_id[i] == GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED) {
nvgpu_err(g, "sm_icache_l0_data_ecc_uncorrected. "
"gpc_id(%d), tpc_id(%d)", gpc, tpc);
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM,
GPU_SM_ICACHE_L0_DATA_ECC_UNCORRECTED);
}
if (ecc_status->err_id[i] == GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED) {
nvgpu_err(g, "sm_icache_l0_predecode_ecc_uncorrected. "
"gpc_id(%d), tpc_id(%d)", gpc, tpc);
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM,
GPU_SM_ICACHE_L0_PREDECODE_ECC_UNCORRECTED);
}
if (ecc_status->err_id[i] == GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED) {
nvgpu_err(g, "sm_icache_l1_data_ecc_uncorrected. "
"gpc_id(%d), tpc_id(%d)", gpc, tpc);
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_SM,
GPU_SM_ICACHE_L1_DATA_ECC_UNCORRECTED);
}
}
}

View File

@@ -126,8 +126,6 @@ void gv11b_ltc_intr_handle_tstg_ecc_interrupts(struct gk20a *g,
g->ecc.ltc.tstg_ecc_parity_count[ltc][slice].counter,
uncorrected_delta);
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_LTC,
GPU_LTC_CACHE_TSTG_ECC_UNCORRECTED);
nvgpu_err(g, "tstg ecc error uncorrected. "
"ecc_addr(0x%x)", ecc_addr);
}
@@ -281,8 +279,6 @@ static void gv11b_ltc_intr_handle_ecc_sec_ded_interrupts(struct gk20a *g, u32 lt
ltc_ltc0_lts0_dstg_ecc_report_r(), offset),
ecc_stats_reg_val);
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_LTC,
GPU_LTC_CACHE_DSTG_ECC_CORRECTED);
nvgpu_err(g, "dstg ecc error corrected. "
"ecc_addr(0x%x)", dstg_ecc_addr);
@@ -328,8 +324,6 @@ static void gv11b_ltc_intr_handle_ecc_sec_ded_interrupts(struct gk20a *g, u32 lt
ltc_ltc0_lts0_dstg_ecc_report_r(), offset),
ecc_stats_reg_val);
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_LTC,
GPU_LTC_CACHE_DSTG_ECC_UNCORRECTED);
nvgpu_err(g, "dstg ecc error uncorrected. "
"ecc_addr(0x%x)", dstg_ecc_addr);
}

View File

@@ -499,7 +499,7 @@ static void gv11b_mm_mmu_fault_handle_buf_valid_entry(struct gk20a *g,
u32 *invalidate_replay_val_ptr, u32 rd32_val, u32 fault_status,
u32 index, u32 get_indx, u32 offset, u32 entries)
{
u32 sub_err_type = 0U;
u32 err_type = 0U;
#ifdef CONFIG_NVGPU_REPLAYABLE_FAULT
u64 prev_fault_addr = 0ULL;
u64 next_fault_addr = 0ULL;
@@ -513,19 +513,17 @@ static void gv11b_mm_mmu_fault_handle_buf_valid_entry(struct gk20a *g,
#ifdef CONFIG_NVGPU_REPLAYABLE_FAULT
if (index == NVGPU_MMU_FAULT_REPLAY_REG_INDX) {
sub_err_type = GPU_HUBMMU_REPLAYABLE_FAULT_NOTIFY;
err_type = GPU_HUBMMU_PAGE_FAULT_REPLAYABLE_FAULT_NOTIFY_ERROR;
} else {
#endif
sub_err_type = GPU_HUBMMU_NONREPLAYABLE_FAULT_NOTIFY;
err_type = GPU_HUBMMU_PAGE_FAULT_NONREPLAYABLE_FAULT_NOTIFY_ERROR;
#ifdef CONFIG_NVGPU_REPLAYABLE_FAULT
}
#endif
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_HUBMMU,
GPU_HUBMMU_PAGE_FAULT_ERROR);
nvgpu_err(g, "page fault error: sub_er_type = 0x%x, "
"fault_status = 0x%x",
sub_err_type, fault_status);
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_HUBMMU, err_type);
nvgpu_err(g, "page fault error: err_type = 0x%x, "
"fault_status = 0x%x", err_type, fault_status);
nvgpu_assert(get_indx < U32_MAX);
nvgpu_assert(entries != 0U);

View File

@@ -78,10 +78,9 @@ struct mmu_fault_info;
#define GPU_HOST_PBDMA_METHOD_ERROR (11U)
#define GPU_HOST_PBDMA_SIGNATURE_ERROR (12U)
#define GPU_HOST_PBDMA_HCE_ERROR (13U)
#define GPU_HOST_PBDMA_PREEMPT_ERROR (14U)
#define GPU_HOST_PFIFO_CTXSW_TIMEOUT_ERROR (15U)
#define GPU_HOST_PFIFO_FB_FLUSH_TIMEOUT_ERROR (16U)
#define GPU_HOST_INVALID_ERROR (17U)
#define GPU_HOST_PFIFO_CTXSW_TIMEOUT_ERROR (14U)
#define GPU_HOST_PFIFO_FB_FLUSH_TIMEOUT_ERROR (15U)
#define GPU_HOST_INVALID_ERROR (16U)
/**
* @}
*/
@@ -116,11 +115,11 @@ struct mmu_fault_info;
*/
#define GPU_FECS_FALCON_IMEM_ECC_CORRECTED (0U)
#define GPU_FECS_FALCON_IMEM_ECC_UNCORRECTED (1U)
#define GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED (3U)
#define GPU_FECS_CTXSW_WATCHDOG_TIMEOUT (4U)
#define GPU_FECS_CTXSW_CRC_MISMATCH (5U)
#define GPU_FECS_FAULT_DURING_CTXSW (6U)
#define GPU_FECS_CTXSW_INIT_ERROR (7U)
#define GPU_FECS_FALCON_DMEM_ECC_UNCORRECTED (2U)
#define GPU_FECS_CTXSW_WATCHDOG_TIMEOUT (3U)
#define GPU_FECS_CTXSW_CRC_MISMATCH (4U)
#define GPU_FECS_FAULT_DURING_CTXSW (5U)
#define GPU_FECS_CTXSW_INIT_ERROR (6U)
/**
* @}
*/
@@ -132,7 +131,7 @@ struct mmu_fault_info;
*/
#define GPU_GPCCS_FALCON_IMEM_ECC_CORRECTED (0U)
#define GPU_GPCCS_FALCON_IMEM_ECC_UNCORRECTED (1U)
#define GPU_GPCCS_FALCON_DMEM_ECC_UNCORRECTED (3U)
#define GPU_GPCCS_FALCON_DMEM_ECC_UNCORRECTED (2U)
/**
* @}
*/
@@ -153,7 +152,7 @@ struct mmu_fault_info;
* Macros used to assign unique index to errors reported from the GCC unit.
* @{
*/
#define GPU_GCC_L15_ECC_UNCORRECTED (1U)
#define GPU_GCC_L15_ECC_UNCORRECTED (0U)
/**
* @}
*/
@@ -264,25 +263,19 @@ struct mmu_fault_info;
* Macros used to assign unique index to errors reported from the HUBMMU unit.
* @{
*/
#define GPU_HUBMMU_L2TLB_SA_DATA_ECC_UNCORRECTED (0U)
#define GPU_HUBMMU_TLB_SA_DATA_ECC_UNCORRECTED (1U)
#define GPU_HUBMMU_PTE_DATA_ECC_UNCORRECTED (2U)
#define GPU_HUBMMU_PDE0_DATA_ECC_UNCORRECTED (3U)
#define GPU_HUBMMU_PAGE_FAULT_ERROR (4U)
#define GPU_HUBMMU_L2TLB_SA_DATA_ECC_UNCORRECTED (0U)
#define GPU_HUBMMU_TLB_SA_DATA_ECC_UNCORRECTED (1U)
#define GPU_HUBMMU_PTE_DATA_ECC_UNCORRECTED (2U)
#define GPU_HUBMMU_PDE0_DATA_ECC_UNCORRECTED (3U)
#define GPU_HUBMMU_PAGE_FAULT_OTHER_FAULT_NOTIFY_ERROR (4U)
#define GPU_HUBMMU_PAGE_FAULT_NONREPLAYABLE_FAULT_OVERFLOW_ERROR (5U)
#define GPU_HUBMMU_PAGE_FAULT_REPLAYABLE_FAULT_OVERFLOW_ERROR (6U)
#define GPU_HUBMMU_PAGE_FAULT_REPLAYABLE_FAULT_NOTIFY_ERROR (7U)
#define GPU_HUBMMU_PAGE_FAULT_NONREPLAYABLE_FAULT_NOTIFY_ERROR (8U)
/**
* @}
*/
/**
* This assigns an unique index for sub-errors
* in GPU_HUBMMU_PAGE_FAULT_ERROR.
*/
#define GPU_HUBMMU_REPLAYABLE_FAULT_OVERFLOW (0U)
#define GPU_HUBMMU_REPLAYABLE_FAULT_NOTIFY (1U)
#define GPU_HUBMMU_NONREPLAYABLE_FAULT_OVERFLOW (2U)
#define GPU_HUBMMU_NONREPLAYABLE_FAULT_NOTIFY (3U)
#define GPU_HUBMMU_OTHER_FAULT_NOTIFY (4U)
/**
* @defgroup LIST_OF_ERRORS_REPORTED_FROM_PRI
* Macros used to assign unique index to errors reported from the PRI unit.