gpu: nvgpu: report PMU falcon bar0 errors

Introduce hooks for reporting BAR0 PRI timeout.

Jira NVGPU-1858

Change-Id: I917a7cb2e24b6d4025305e965c00c5551222c00a
Signed-off-by: Prateek sethi <prsethi@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/2024488
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
Prateek sethi
2019-02-21 11:07:13 +05:30
committed by mobile promotions
parent 154ffd9dad
commit 3859725ea1
16 changed files with 152 additions and 1 deletions

View File

@@ -38,6 +38,8 @@ static int acr_wait_for_completion(struct gk20a *g,
u32 sctl, cpuctl;
int completion = 0;
u32 data = 0;
u32 bar0_status = 0;
u32 error_type;
nvgpu_log_fn(g, " ");
@@ -45,9 +47,19 @@ static int acr_wait_for_completion(struct gk20a *g,
if (completion != 0) {
nvgpu_err(g, "flcn-%d: HS ucode boot timed out", flcn_id);
nvgpu_falcon_dump_stats(flcn);
error_type = ACR_BOOT_TIMEDOUT;
goto exit;
}
if (g->acr.acr.acr_engine_bus_err_status != NULL) {
completion = g->acr.acr.acr_engine_bus_err_status(g,
&bar0_status, &error_type);
if (completion != 0) {
nvgpu_err(g, "flcn-%d: ACR engine bus error", flcn_id);
goto exit;
}
}
nvgpu_acr_dbg(g, "flcn-%d: HS ucode capabilities %x", flcn_id,
nvgpu_falcon_mailbox_read(flcn, FALCON_MAILBOX_1));
@@ -56,6 +68,7 @@ static int acr_wait_for_completion(struct gk20a *g,
nvgpu_err(g, "flcn-%d: HS ucode boot failed, err %x", flcn_id,
data);
completion = -EAGAIN;
error_type = ACR_BOOT_FAILED;
goto exit;
}
@@ -65,6 +78,12 @@ static int acr_wait_for_completion(struct gk20a *g,
flcn_id, sctl, cpuctl);
exit:
if (completion != 0) {
if (g->acr.acr.report_acr_engine_bus_err_status != NULL) {
g->acr.acr.report_acr_engine_bus_err_status(g,
bar0_status, error_type);
}
}
return completion;
}

View File

@@ -170,6 +170,8 @@ static void gm20b_acr_default_sw_init(struct gk20a *g, struct hs_acr *hs_acr)
hs_acr->acr_flcn = g->pmu.flcn;
hs_acr->acr_flcn_setup_hw_and_bl_bootstrap =
gm20b_pmu_setup_hw_and_bl_bootstrap;
hs_acr->acr_engine_bus_err_status =
gk20a_pmu_bar0_error_status;
}
void gm20b_remove_acr_support(struct nvgpu_acr *acr)

View File

@@ -163,6 +163,10 @@ static void gv11b_acr_default_sw_init(struct gk20a *g, struct hs_acr *hs_acr)
hs_acr->acr_flcn = g->pmu.flcn;
hs_acr->acr_flcn_setup_hw_and_bl_bootstrap =
gm20b_pmu_setup_hw_and_bl_bootstrap;
hs_acr->report_acr_engine_bus_err_status =
nvgpu_pmu_report_bar0_pri_err_status;
hs_acr->acr_engine_bus_err_status =
gk20a_pmu_bar0_error_status;
}
void nvgpu_gv11b_acr_sw_init(struct gk20a *g, struct nvgpu_acr *acr)

View File

@@ -35,6 +35,22 @@
#include <nvgpu/gk20a.h>
#include <nvgpu/string.h>
#include <nvgpu/power_features/cg.h>
#include <nvgpu/nvgpu_err.h>
/*
 * Forward a PMU error to the registered error-reporting op, if any.
 *
 * g:            GPU context; g->ops.pmu.err_ops.report_pmu_err is optional.
 * err_type:     error id handed to the op (also printed if reporting fails).
 * status:       status word passed through unchanged.
 * pmu_err_type: PMU-specific error sub-type passed through unchanged.
 *
 * The op is always invoked with NVGPU_ERR_MODULE_PWR as the module id. A
 * non-zero return from the op is only logged; nothing is propagated to the
 * caller.
 */
static void pmu_report_error(struct gk20a *g, u32 err_type,
		u32 status, u32 pmu_err_type)
{
	int report_status;

	/* Nothing to do when no reporting op has been installed. */
	if (g->ops.pmu.err_ops.report_pmu_err == NULL) {
		return;
	}

	report_status = g->ops.pmu.err_ops.report_pmu_err(g,
			NVGPU_ERR_MODULE_PWR, err_type, status, pmu_err_type);
	if (report_status != 0) {
		nvgpu_err(g, "Failed to report PMU error: %d",
				err_type);
	}
}
static int pmu_enable_hw(struct nvgpu_pmu *pmu, bool enable)
{
@@ -650,3 +666,11 @@ void nvgpu_pmu_get_cmd_line_args_offset(struct gk20a *g,
*args_offset = dmem_size - g->ops.pmu_ver.get_pmu_cmdline_args_size(pmu);
}
/*
 * Report a PMU BAR0 error via pmu_report_error().
 *
 * bar0_status is forwarded as the status word and error_type as the
 * PMU-specific sub-type; the error id is always GPU_PMU_BAR0_ERROR_TIMEOUT.
 */
void nvgpu_pmu_report_bar0_pri_err_status(struct gk20a *g, u32 bar0_status,
		u32 error_type)
{
	pmu_report_error(g, GPU_PMU_BAR0_ERROR_TIMEOUT,
			bar0_status, error_type);
}

View File

@@ -891,3 +891,58 @@ u32 gk20a_pmu_falcon_base_addr(void)
{
return pwr_falcon_irqsset_r();
}
/*
 * Read, decode and clear the PMU BAR0 error status register.
 *
 * g:           GPU context used for register access.
 * bar0_status: out - raw value read from pwr_pmu_bar0_error_status_r().
 * etype:       out - PMU_BAR0_* code describing the error. Written on every
 *              -EIO return; set to PMU_BAR0_SUCCESS when the status is
 *              non-zero but none of the known error bits is set, so that
 *              callers never forward an uninitialized value.
 *
 * Returns 0 when the status register reads zero (*etype is left untouched in
 * that case), -EIO otherwise.
 *
 * The error bits are tested in a fixed priority order (host timeout, FECS
 * timeout, command HW error, FECS error, host error); only the first match
 * is reported.
 */
int gk20a_pmu_bar0_error_status(struct gk20a *g, u32 *bar0_status,
	u32 *etype)
{
	u32 val;
	u32 err_cmd;
	u32 err_status;

	val = gk20a_readl(g, pwr_pmu_bar0_error_status_r());
	*bar0_status = val;
	if (val == 0U) {
		return 0;
	}

	/* Non-zero selects the WRITE variant of each code, zero the READ one. */
	err_cmd = val & pwr_pmu_bar0_error_status_err_cmd_m();

	if ((val & pwr_pmu_bar0_error_status_timeout_host_m()) != 0U) {
		*etype = (err_cmd != 0U)
			? PMU_BAR0_HOST_WRITE_TOUT : PMU_BAR0_HOST_READ_TOUT;
	} else if ((val & pwr_pmu_bar0_error_status_timeout_fecs_m()) != 0U) {
		*etype = (err_cmd != 0U)
			? PMU_BAR0_FECS_WRITE_TOUT : PMU_BAR0_FECS_READ_TOUT;
	} else if ((val & pwr_pmu_bar0_error_status_cmd_hwerr_m()) != 0U) {
		*etype = (err_cmd != 0U)
			? PMU_BAR0_CMD_WRITE_HWERR : PMU_BAR0_CMD_READ_HWERR;
	} else if ((val & pwr_pmu_bar0_error_status_fecserr_m()) != 0U) {
		*etype = (err_cmd != 0U)
			? PMU_BAR0_WRITE_FECSERR : PMU_BAR0_READ_FECSERR;
		err_status = gk20a_readl(g, pwr_pmu_bar0_fecs_error_r());
		/*
		 * BAR0_FECS_ERROR would only record the first error code if
		 * multiple FECS errors happen. Once BAR0_FECS_ERROR is cleared,
		 * BAR0_FECS_ERROR can record the error code from FECS again.
		 * Writing the status register to clear the FECS hardware state.
		 */
		gk20a_writel(g, pwr_pmu_bar0_fecs_error_r(), err_status);
	} else if ((val & pwr_pmu_bar0_error_status_hosterr_m()) != 0U) {
		*etype = (err_cmd != 0U)
			? PMU_BAR0_WRITE_HOSTERR : PMU_BAR0_READ_HOSTERR;
		/*
		 * BAR0_HOST_ERROR would only record the first error code if
		 * multiple HOST errors happen. Once BAR0_HOST_ERROR is cleared,
		 * BAR0_HOST_ERROR can record the error code from HOST again.
		 * Writing the status register to clear the HOST hardware state.
		 *
		 * Defining clear ops for host err as gk20a does not have
		 * status register for this.
		 */
		if (g->ops.pmu.pmu_clear_bar0_host_err_status != NULL) {
			g->ops.pmu.pmu_clear_bar0_host_err_status(g);
		}
	} else {
		/*
		 * Unrecognized status bits: still an error (-EIO below), but
		 * give the out-parameter a defined value so the caller does
		 * not report whatever happened to be in its local variable.
		 */
		*etype = PMU_BAR0_SUCCESS;
		nvgpu_err(g, "PMU bar0 status type is not found");
	}

	/* Writing the BAR0 status register to clear the hardware state. */
	gk20a_writel(g, pwr_pmu_bar0_error_status_r(), val);

	return -EIO;
}

View File

@@ -414,3 +414,11 @@ bool gm20b_is_pmu_supported(struct gk20a *g)
{
return true;
}
/*
 * Clear the PMU BAR0 host error register.
 *
 * Reads pwr_pmu_bar0_host_error_r() and writes the same value back to it.
 * Installed as the pmu_clear_bar0_host_err_status op on chips that have this
 * register.
 */
void gm20b_clear_pmu_bar0_host_err_status(struct gk20a *g)
{
	u32 host_err = gk20a_readl(g, pwr_pmu_bar0_host_error_r());

	gk20a_writel(g, pwr_pmu_bar0_host_error_r(), host_err);
}

View File

@@ -1,7 +1,7 @@
/*
* GM20B PMU
*
* Copyright (c) 2014-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2014-2019, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -41,4 +41,5 @@ int gm20b_pmu_setup_hw_and_bl_bootstrap(struct gk20a *g,
struct nvgpu_falcon_bl_info *bl_info);
void gm20b_secured_pmu_start(struct gk20a *g);
bool gm20b_is_pmu_supported(struct gk20a *g);
/* Clear the PMU BAR0 host error register by writing back the value read. */
void gm20b_clear_pmu_bar0_host_err_status(struct gk20a *g);
#endif /*NVGPU_GM20B_PMU_GM20B_H*/

View File

@@ -688,6 +688,8 @@ static const struct gpu_ops gm20b_ops = {
.get_irqdest = gk20a_pmu_get_irqdest,
.is_debug_mode_enabled = gm20b_pmu_is_debug_mode_en,
.save_zbc = gk20a_pmu_save_zbc,
.pmu_clear_bar0_host_err_status =
gm20b_clear_pmu_bar0_host_err_status,
},
.clk = {
.init_clk_support = gm20b_init_clk_support,

View File

@@ -762,6 +762,8 @@ static const struct gpu_ops gp10b_ops = {
.get_irqdest = gk20a_pmu_get_irqdest,
.is_debug_mode_enabled = gm20b_pmu_is_debug_mode_en,
.save_zbc = gk20a_pmu_save_zbc,
.pmu_clear_bar0_host_err_status =
gm20b_clear_pmu_bar0_host_err_status,
},
.clk_arb = {
.check_clk_arb_support = gp10b_check_clk_arb_support,

View File

@@ -952,6 +952,8 @@ static const struct gpu_ops gv100_ops = {
.secured_pmu_start = gm20b_secured_pmu_start,
.create_ssmd_lookup_table = nvgpu_pmu_create_ssmd_lookup_table,
.save_zbc = gk20a_pmu_save_zbc,
.pmu_clear_bar0_host_err_status =
gm20b_clear_pmu_bar0_host_err_status,
},
.clk = {
.init_clk_support = gv100_init_clk_support,

View File

@@ -914,6 +914,8 @@ static const struct gpu_ops gv11b_ops = {
.init_wpr_region = gm20b_pmu_init_acr,
.load_lsfalcon_ucode = gp10b_load_falcon_ucode,
.save_zbc = gk20a_pmu_save_zbc,
.pmu_clear_bar0_host_err_status =
gm20b_clear_pmu_bar0_host_err_status,
#endif
},
.clk_arb = {

View File

@@ -144,6 +144,10 @@ struct hs_acr {
int (*acr_flcn_setup_hw_and_bl_bootstrap)(struct gk20a *g,
struct nvgpu_falcon_bl_info *bl_info);
void (*report_acr_engine_bus_err_status)(struct gk20a *g,
u32 bar0_status, u32 error_type);
int (*acr_engine_bus_err_status)(struct gk20a *g, u32 *bar0_status,
u32 *error_type);
};
#define ACR_DEFAULT 0U

View File

@@ -1283,9 +1283,13 @@ struct gpu_ops {
u32 hw_id, u32 inst,
u32 err_id, u64 err_addr,
u64 err_cnt);
int (*report_pmu_err)(struct gk20a *g,
u32 hw_id, u32 err_id, u32 status,
u32 pmu_err_type);
} err_ops;
void (*create_ssmd_lookup_table)(struct nvgpu_pmu *pmu);
void (*save_zbc)(struct gk20a *g, u32 entries);
void (*pmu_clear_bar0_host_err_status)(struct gk20a *g);
} pmu;
struct {
int (*init_debugfs)(struct gk20a *g);

View File

@@ -101,6 +101,8 @@
#define GPU_PMU_FALCON_IMEM_ECC_UNCORRECTED 1U
#define GPU_PMU_FALCON_DMEM_ECC_CORRECTED 2U
#define GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED 3U
#define GPU_PMU_BAR0_ERROR_TIMEOUT 4U
#define GPU_PMU_INVALID_ERROR 5U
#define GPU_PGRAPH_FE_EXCEPTION 0U
#define GPU_PGRAPH_MEMFMT_EXCEPTION 1U

View File

@@ -122,6 +122,20 @@ enum pmu_seq_state {
PMU_SEQ_STATE_CANCELLED
};
/*
 * Error-type codes used when reporting PMU BAR0 and ACR boot failures.
 *
 * The PMU_BAR0_*_TOUT / *_HWERR / *_HOSTERR / *_FECSERR codes are stored in
 * the etype out-parameter of gk20a_pmu_bar0_error_status(); the READ or
 * WRITE variant is chosen from the err_cmd field of the BAR0 error status.
 * The ACR_BOOT_* codes are set by acr_wait_for_completion() when the HS ucode
 * boot times out or fails. Both groups travel in the same error_type
 * variable, so their values must not overlap.
 */
#define PMU_BAR0_SUCCESS 0U
#define PMU_BAR0_HOST_READ_TOUT 1U
#define PMU_BAR0_HOST_WRITE_TOUT 2U
#define PMU_BAR0_FECS_READ_TOUT 3U
#define PMU_BAR0_FECS_WRITE_TOUT 4U
#define PMU_BAR0_CMD_READ_HWERR 5U
#define PMU_BAR0_CMD_WRITE_HWERR 6U
#define PMU_BAR0_READ_HOSTERR 7U
#define PMU_BAR0_WRITE_HOSTERR 8U
#define PMU_BAR0_READ_FECSERR 9U
#define PMU_BAR0_WRITE_FECSERR 10U
#define ACR_BOOT_TIMEDOUT 11U
#define ACR_BOOT_FAILED 12U
/* PG defines used by nvgpu-pmu */
#define PMU_PG_IDLE_THRESHOLD_SIM 1000U
#define PMU_PG_POST_POWERUP_IDLE_THRESHOLD_SIM 4000000U
@@ -533,5 +547,9 @@ u32 nvgpu_pmu_get_ss_member_set_size(struct nvgpu_pmu *pmu, u32 member_id);
u32 nvgpu_pmu_get_ss_member_get_status_size(struct nvgpu_pmu *pmu,
u32 member_id);
/*
 * Report a PMU BAR0 error: forwards bar0_status and error_type to the PMU
 * error-reporting op under error id GPU_PMU_BAR0_ERROR_TIMEOUT.
 */
void nvgpu_pmu_report_bar0_pri_err_status(struct gk20a *g, u32 bar0_status,
u32 error_type);
/*
 * Read the PMU BAR0 error status register into *bar0_status. Returns 0 if it
 * reads zero; otherwise stores a PMU_BAR0_* code in *etype when a known error
 * bit is set, clears the status register and returns -EIO.
 */
int gk20a_pmu_bar0_error_status(struct gk20a *g, u32 *bar0_status,
u32 *etype);
#endif /* NVGPU_PMU_H */

View File

@@ -986,6 +986,8 @@ static const struct gpu_ops tu104_ops = {
.secured_pmu_start = gm20b_secured_pmu_start,
.create_ssmd_lookup_table = nvgpu_pmu_create_ssmd_lookup_table,
.save_zbc = gk20a_pmu_save_zbc,
.pmu_clear_bar0_host_err_status =
gm20b_clear_pmu_bar0_host_err_status,
},
.clk = {
.init_clk_support = gv100_init_clk_support,