diff --git a/drivers/gpu/nvgpu/common/acr/acr_bootstrap.c b/drivers/gpu/nvgpu/common/acr/acr_bootstrap.c index a8efd01b7..550f5f21c 100644 --- a/drivers/gpu/nvgpu/common/acr/acr_bootstrap.c +++ b/drivers/gpu/nvgpu/common/acr/acr_bootstrap.c @@ -38,6 +38,8 @@ static int acr_wait_for_completion(struct gk20a *g, u32 sctl, cpuctl; int completion = 0; u32 data = 0; + u32 bar0_status = 0; + u32 error_type; nvgpu_log_fn(g, " "); @@ -45,9 +47,19 @@ static int acr_wait_for_completion(struct gk20a *g, if (completion != 0) { nvgpu_err(g, "flcn-%d: HS ucode boot timed out", flcn_id); nvgpu_falcon_dump_stats(flcn); + error_type = ACR_BOOT_TIMEDOUT; goto exit; } + if (g->acr.acr.acr_engine_bus_err_status != NULL) { + completion = g->acr.acr.acr_engine_bus_err_status(g, + &bar0_status, &error_type); + if (completion != 0) { + nvgpu_err(g, "flcn-%d: ACR engine bus error", flcn_id); + goto exit; + } + } + nvgpu_acr_dbg(g, "flcn-%d: HS ucode capabilities %x", flcn_id, nvgpu_falcon_mailbox_read(flcn, FALCON_MAILBOX_1)); @@ -56,6 +68,7 @@ static int acr_wait_for_completion(struct gk20a *g, nvgpu_err(g, "flcn-%d: HS ucode boot failed, err %x", flcn_id, data); completion = -EAGAIN; + error_type = ACR_BOOT_FAILED; goto exit; } @@ -65,6 +78,12 @@ static int acr_wait_for_completion(struct gk20a *g, flcn_id, sctl, cpuctl); exit: + if (completion != 0) { + if (g->acr.acr.report_acr_engine_bus_err_status != NULL) { + g->acr.acr.report_acr_engine_bus_err_status(g, + bar0_status, error_type); + } + } return completion; } diff --git a/drivers/gpu/nvgpu/common/acr/acr_gm20b.c b/drivers/gpu/nvgpu/common/acr/acr_gm20b.c index ce4afc4b4..66672a815 100644 --- a/drivers/gpu/nvgpu/common/acr/acr_gm20b.c +++ b/drivers/gpu/nvgpu/common/acr/acr_gm20b.c @@ -170,6 +170,8 @@ static void gm20b_acr_default_sw_init(struct gk20a *g, struct hs_acr *hs_acr) hs_acr->acr_flcn = g->pmu.flcn; hs_acr->acr_flcn_setup_hw_and_bl_bootstrap = gm20b_pmu_setup_hw_and_bl_bootstrap; + hs_acr->acr_engine_bus_err_status = + 
gk20a_pmu_bar0_error_status; } void gm20b_remove_acr_support(struct nvgpu_acr *acr) diff --git a/drivers/gpu/nvgpu/common/acr/acr_gv11b.c b/drivers/gpu/nvgpu/common/acr/acr_gv11b.c index 210396bac..0c7502c32 100644 --- a/drivers/gpu/nvgpu/common/acr/acr_gv11b.c +++ b/drivers/gpu/nvgpu/common/acr/acr_gv11b.c @@ -163,6 +163,10 @@ static void gv11b_acr_default_sw_init(struct gk20a *g, struct hs_acr *hs_acr) hs_acr->acr_flcn = g->pmu.flcn; hs_acr->acr_flcn_setup_hw_and_bl_bootstrap = gm20b_pmu_setup_hw_and_bl_bootstrap; + hs_acr->report_acr_engine_bus_err_status = + nvgpu_pmu_report_bar0_pri_err_status; + hs_acr->acr_engine_bus_err_status = + gk20a_pmu_bar0_error_status; } void nvgpu_gv11b_acr_sw_init(struct gk20a *g, struct nvgpu_acr *acr) diff --git a/drivers/gpu/nvgpu/common/pmu/pmu.c b/drivers/gpu/nvgpu/common/pmu/pmu.c index 035d358c6..769668cee 100644 --- a/drivers/gpu/nvgpu/common/pmu/pmu.c +++ b/drivers/gpu/nvgpu/common/pmu/pmu.c @@ -35,6 +35,22 @@ #include #include #include +#include + +static void pmu_report_error(struct gk20a *g, u32 err_type, + u32 status, u32 pmu_err_type) +{ + int ret = 0; + + if (g->ops.pmu.err_ops.report_pmu_err != NULL) { + ret = g->ops.pmu.err_ops.report_pmu_err(g, + NVGPU_ERR_MODULE_PWR, err_type, status, pmu_err_type); + if (ret != 0) { + nvgpu_err(g, "Failed to report PMU error: %d", + err_type); + } + } +} static int pmu_enable_hw(struct nvgpu_pmu *pmu, bool enable) { @@ -650,3 +666,11 @@ void nvgpu_pmu_get_cmd_line_args_offset(struct gk20a *g, *args_offset = dmem_size - g->ops.pmu_ver.get_pmu_cmdline_args_size(pmu); } + +void nvgpu_pmu_report_bar0_pri_err_status(struct gk20a *g, u32 bar0_status, + u32 error_type) +{ + pmu_report_error(g, + GPU_PMU_BAR0_ERROR_TIMEOUT, bar0_status, error_type); + return; +} diff --git a/drivers/gpu/nvgpu/common/pmu/pmu_gk20a.c b/drivers/gpu/nvgpu/common/pmu/pmu_gk20a.c index 68c45f5f0..5f8fd7f5a 100644 --- a/drivers/gpu/nvgpu/common/pmu/pmu_gk20a.c +++ b/drivers/gpu/nvgpu/common/pmu/pmu_gk20a.c @@ 
-891,3 +891,58 @@ u32 gk20a_pmu_falcon_base_addr(void) { return pwr_falcon_irqsset_r(); } + +int gk20a_pmu_bar0_error_status(struct gk20a *g, u32 *bar0_status, + u32 *etype) +{ + u32 val = 0; + u32 err_status = 0; + + val = gk20a_readl(g, pwr_pmu_bar0_error_status_r()); + *bar0_status = val; + if (val == 0U) { + return 0; + } + if ((val & pwr_pmu_bar0_error_status_timeout_host_m()) != 0U) { + *etype = ((val & pwr_pmu_bar0_error_status_err_cmd_m()) != 0U) + ? PMU_BAR0_HOST_WRITE_TOUT : PMU_BAR0_HOST_READ_TOUT; + } else if ((val & pwr_pmu_bar0_error_status_timeout_fecs_m()) != 0U) { + *etype = ((val & pwr_pmu_bar0_error_status_err_cmd_m()) != 0U) + ? PMU_BAR0_FECS_WRITE_TOUT : PMU_BAR0_FECS_READ_TOUT; + } else if ((val & pwr_pmu_bar0_error_status_cmd_hwerr_m()) != 0U) { + *etype = ((val & pwr_pmu_bar0_error_status_err_cmd_m()) != 0U) + ? PMU_BAR0_CMD_WRITE_HWERR : PMU_BAR0_CMD_READ_HWERR; + } else if ((val & pwr_pmu_bar0_error_status_fecserr_m()) != 0U) { + *etype = ((val & pwr_pmu_bar0_error_status_err_cmd_m()) != 0U) + ? PMU_BAR0_WRITE_FECSERR : PMU_BAR0_READ_FECSERR; + err_status = gk20a_readl(g, pwr_pmu_bar0_fecs_error_r()); + /* + * BAR0_FECS_ERROR would only record the first error code if + * multiple FECS errors happen. Once BAR0_FECS_ERROR is cleared, + * BAR0_FECS_ERROR can record the error code from FECS again. + * Writing status register to clear the FECS Hardware state. + */ + gk20a_writel(g, pwr_pmu_bar0_fecs_error_r(), err_status); + } else if ((val & pwr_pmu_bar0_error_status_hosterr_m()) != 0U) { + *etype = ((val & pwr_pmu_bar0_error_status_err_cmd_m()) != 0U) + ? PMU_BAR0_WRITE_HOSTERR : PMU_BAR0_READ_HOSTERR; + /* + * BAR0_HOST_ERROR would only record the first error code if + * multiple HOST errors happen. Once BAR0_HOST_ERROR is cleared, + * BAR0_HOST_ERROR can record the error code from HOST again. + * Writing status register to clear the HOST Hardware state. + * + * Defining clear ops for host err as gk20a does not have + * status register for this.
+ */ + if (g->ops.pmu.pmu_clear_bar0_host_err_status != NULL) { + g->ops.pmu.pmu_clear_bar0_host_err_status(g); + } + } else { + nvgpu_err(g, "PMU bar0 status type is not found"); + } + + /* Writing Bar0 status register to clear the Hardware state */ + gk20a_writel(g, pwr_pmu_bar0_error_status_r(), val); + return (-EIO); +} diff --git a/drivers/gpu/nvgpu/common/pmu/pmu_gm20b.c b/drivers/gpu/nvgpu/common/pmu/pmu_gm20b.c index b63505ab6..6fe6eeea7 100644 --- a/drivers/gpu/nvgpu/common/pmu/pmu_gm20b.c +++ b/drivers/gpu/nvgpu/common/pmu/pmu_gm20b.c @@ -414,3 +414,11 @@ bool gm20b_is_pmu_supported(struct gk20a *g) { return true; } + +void gm20b_clear_pmu_bar0_host_err_status(struct gk20a *g) +{ + u32 status; + + status = gk20a_readl(g, pwr_pmu_bar0_host_error_r()); + gk20a_writel(g, pwr_pmu_bar0_host_error_r(), status); +} diff --git a/drivers/gpu/nvgpu/common/pmu/pmu_gm20b.h b/drivers/gpu/nvgpu/common/pmu/pmu_gm20b.h index 186983863..78820afb5 100644 --- a/drivers/gpu/nvgpu/common/pmu/pmu_gm20b.h +++ b/drivers/gpu/nvgpu/common/pmu/pmu_gm20b.h @@ -1,7 +1,7 @@ /* * GM20B PMU * - * Copyright (c) 2014-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2014-2019, NVIDIA CORPORATION. All rights reserved.
* * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -41,4 +41,5 @@ int gm20b_pmu_setup_hw_and_bl_bootstrap(struct gk20a *g, struct nvgpu_falcon_bl_info *bl_info); void gm20b_secured_pmu_start(struct gk20a *g); bool gm20b_is_pmu_supported(struct gk20a *g); +void gm20b_clear_pmu_bar0_host_err_status(struct gk20a *g); #endif /*NVGPU_GM20B_PMU_GM20B_H*/ diff --git a/drivers/gpu/nvgpu/gm20b/hal_gm20b.c b/drivers/gpu/nvgpu/gm20b/hal_gm20b.c index 65ca17c4c..a2adef41d 100644 --- a/drivers/gpu/nvgpu/gm20b/hal_gm20b.c +++ b/drivers/gpu/nvgpu/gm20b/hal_gm20b.c @@ -688,6 +688,8 @@ static const struct gpu_ops gm20b_ops = { .get_irqdest = gk20a_pmu_get_irqdest, .is_debug_mode_enabled = gm20b_pmu_is_debug_mode_en, .save_zbc = gk20a_pmu_save_zbc, + .pmu_clear_bar0_host_err_status = + gm20b_clear_pmu_bar0_host_err_status, }, .clk = { .init_clk_support = gm20b_init_clk_support, diff --git a/drivers/gpu/nvgpu/gp10b/hal_gp10b.c b/drivers/gpu/nvgpu/gp10b/hal_gp10b.c index 46f88898b..abcaaee09 100644 --- a/drivers/gpu/nvgpu/gp10b/hal_gp10b.c +++ b/drivers/gpu/nvgpu/gp10b/hal_gp10b.c @@ -762,6 +762,8 @@ static const struct gpu_ops gp10b_ops = { .get_irqdest = gk20a_pmu_get_irqdest, .is_debug_mode_enabled = gm20b_pmu_is_debug_mode_en, .save_zbc = gk20a_pmu_save_zbc, + .pmu_clear_bar0_host_err_status = + gm20b_clear_pmu_bar0_host_err_status, }, .clk_arb = { .check_clk_arb_support = gp10b_check_clk_arb_support, diff --git a/drivers/gpu/nvgpu/gv100/hal_gv100.c b/drivers/gpu/nvgpu/gv100/hal_gv100.c index d94f3af38..3f679e32e 100644 --- a/drivers/gpu/nvgpu/gv100/hal_gv100.c +++ b/drivers/gpu/nvgpu/gv100/hal_gv100.c @@ -952,6 +952,8 @@ static const struct gpu_ops gv100_ops = { .secured_pmu_start = gm20b_secured_pmu_start, .create_ssmd_lookup_table = nvgpu_pmu_create_ssmd_lookup_table, .save_zbc = gk20a_pmu_save_zbc, + .pmu_clear_bar0_host_err_status = + gm20b_clear_pmu_bar0_host_err_status, 
}, .clk = { .init_clk_support = gv100_init_clk_support, diff --git a/drivers/gpu/nvgpu/gv11b/hal_gv11b.c b/drivers/gpu/nvgpu/gv11b/hal_gv11b.c index 5ccb018ef..cfb3465a4 100644 --- a/drivers/gpu/nvgpu/gv11b/hal_gv11b.c +++ b/drivers/gpu/nvgpu/gv11b/hal_gv11b.c @@ -914,6 +914,8 @@ static const struct gpu_ops gv11b_ops = { .init_wpr_region = gm20b_pmu_init_acr, .load_lsfalcon_ucode = gp10b_load_falcon_ucode, .save_zbc = gk20a_pmu_save_zbc, + .pmu_clear_bar0_host_err_status = + gm20b_clear_pmu_bar0_host_err_status, #endif }, .clk_arb = { diff --git a/drivers/gpu/nvgpu/include/nvgpu/acr/nvgpu_acr.h b/drivers/gpu/nvgpu/include/nvgpu/acr/nvgpu_acr.h index 563f3bd91..7bb15b75b 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/acr/nvgpu_acr.h +++ b/drivers/gpu/nvgpu/include/nvgpu/acr/nvgpu_acr.h @@ -144,6 +144,10 @@ struct hs_acr { int (*acr_flcn_setup_hw_and_bl_bootstrap)(struct gk20a *g, struct nvgpu_falcon_bl_info *bl_info); + void (*report_acr_engine_bus_err_status)(struct gk20a *g, + u32 bar0_status, u32 error_type); + int (*acr_engine_bus_err_status)(struct gk20a *g, u32 *bar0_status, + u32 *error_type); }; #define ACR_DEFAULT 0U diff --git a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h index 64ea8f359..5258c6405 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h +++ b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h @@ -1283,9 +1283,13 @@ struct gpu_ops { u32 hw_id, u32 inst, u32 err_id, u64 err_addr, u64 err_cnt); + int (*report_pmu_err)(struct gk20a *g, + u32 hw_id, u32 err_id, u32 status, + u32 pmu_err_type); } err_ops; void (*create_ssmd_lookup_table)(struct nvgpu_pmu *pmu); void (*save_zbc)(struct gk20a *g, u32 entries); + void (*pmu_clear_bar0_host_err_status)(struct gk20a *g); } pmu; struct { int (*init_debugfs)(struct gk20a *g); diff --git a/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h index a37716f6a..11a4d1ece 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h +++ 
b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h @@ -101,6 +101,8 @@ #define GPU_PMU_FALCON_IMEM_ECC_UNCORRECTED 1U #define GPU_PMU_FALCON_DMEM_ECC_CORRECTED 2U #define GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED 3U +#define GPU_PMU_BAR0_ERROR_TIMEOUT 4U +#define GPU_PMU_INVALID_ERROR 5U #define GPU_PGRAPH_FE_EXCEPTION 0U #define GPU_PGRAPH_MEMFMT_EXCEPTION 1U diff --git a/drivers/gpu/nvgpu/include/nvgpu/pmu.h b/drivers/gpu/nvgpu/include/nvgpu/pmu.h index 7047fc3e0..c3f0bc40e 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/pmu.h +++ b/drivers/gpu/nvgpu/include/nvgpu/pmu.h @@ -122,6 +122,20 @@ enum pmu_seq_state { PMU_SEQ_STATE_CANCELLED }; +#define PMU_BAR0_SUCCESS 0U +#define PMU_BAR0_HOST_READ_TOUT 1U +#define PMU_BAR0_HOST_WRITE_TOUT 2U +#define PMU_BAR0_FECS_READ_TOUT 3U +#define PMU_BAR0_FECS_WRITE_TOUT 4U +#define PMU_BAR0_CMD_READ_HWERR 5U +#define PMU_BAR0_CMD_WRITE_HWERR 6U +#define PMU_BAR0_READ_HOSTERR 7U +#define PMU_BAR0_WRITE_HOSTERR 8U +#define PMU_BAR0_READ_FECSERR 9U +#define PMU_BAR0_WRITE_FECSERR 10U +#define ACR_BOOT_TIMEDOUT 11U +#define ACR_BOOT_FAILED 12U + /*PG defines used by nvpgu-pmu*/ #define PMU_PG_IDLE_THRESHOLD_SIM 1000U #define PMU_PG_POST_POWERUP_IDLE_THRESHOLD_SIM 4000000U @@ -533,5 +547,9 @@ u32 nvgpu_pmu_get_ss_member_set_size(struct nvgpu_pmu *pmu, u32 member_id); u32 nvgpu_pmu_get_ss_member_get_status_size(struct nvgpu_pmu *pmu, u32 member_id); +void nvgpu_pmu_report_bar0_pri_err_status(struct gk20a *g, u32 bar0_status, + u32 error_type); +int gk20a_pmu_bar0_error_status(struct gk20a *g, u32 *bar0_status, + u32 *etype); #endif /* NVGPU_PMU_H */ diff --git a/drivers/gpu/nvgpu/tu104/hal_tu104.c b/drivers/gpu/nvgpu/tu104/hal_tu104.c index 506467c10..7bee7bd29 100644 --- a/drivers/gpu/nvgpu/tu104/hal_tu104.c +++ b/drivers/gpu/nvgpu/tu104/hal_tu104.c @@ -986,6 +986,8 @@ static const struct gpu_ops tu104_ops = { .secured_pmu_start = gm20b_secured_pmu_start, .create_ssmd_lookup_table = nvgpu_pmu_create_ssmd_lookup_table, .save_zbc = 
gk20a_pmu_save_zbc, + .pmu_clear_bar0_host_err_status = + gm20b_clear_pmu_bar0_host_err_status, }, .clk = { .init_clk_support = gv100_init_clk_support,