mirror of
git://nv-tegra.nvidia.com/linux-nvgpu.git
synced 2025-12-24 10:34:43 +03:00
gpu: nvgpu: report PMU falcon bar0 errors
Introduce hooks for reporting BAR0 PRI timeout. Jira NVGPU-1858 Change-Id: I917a7cb2e24b6d4025305e965c00c5551222c00a Signed-off-by: Prateek sethi <prsethi@nvidia.com> Reviewed-on: https://git-master.nvidia.com/r/2024488 Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com> Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
committed by
mobile promotions
parent
154ffd9dad
commit
3859725ea1
@@ -38,6 +38,8 @@ static int acr_wait_for_completion(struct gk20a *g,
|
||||
u32 sctl, cpuctl;
|
||||
int completion = 0;
|
||||
u32 data = 0;
|
||||
u32 bar0_status = 0;
|
||||
u32 error_type;
|
||||
|
||||
nvgpu_log_fn(g, " ");
|
||||
|
||||
@@ -45,9 +47,19 @@ static int acr_wait_for_completion(struct gk20a *g,
|
||||
if (completion != 0) {
|
||||
nvgpu_err(g, "flcn-%d: HS ucode boot timed out", flcn_id);
|
||||
nvgpu_falcon_dump_stats(flcn);
|
||||
error_type = ACR_BOOT_TIMEDOUT;
|
||||
goto exit;
|
||||
}
|
||||
|
||||
if (g->acr.acr.acr_engine_bus_err_status != NULL) {
|
||||
completion = g->acr.acr.acr_engine_bus_err_status(g,
|
||||
&bar0_status, &error_type);
|
||||
if (completion != 0) {
|
||||
nvgpu_err(g, "flcn-%d: ACR engine bus error", flcn_id);
|
||||
goto exit;
|
||||
}
|
||||
}
|
||||
|
||||
nvgpu_acr_dbg(g, "flcn-%d: HS ucode capabilities %x", flcn_id,
|
||||
nvgpu_falcon_mailbox_read(flcn, FALCON_MAILBOX_1));
|
||||
|
||||
@@ -56,6 +68,7 @@ static int acr_wait_for_completion(struct gk20a *g,
|
||||
nvgpu_err(g, "flcn-%d: HS ucode boot failed, err %x", flcn_id,
|
||||
data);
|
||||
completion = -EAGAIN;
|
||||
error_type = ACR_BOOT_FAILED;
|
||||
goto exit;
|
||||
}
|
||||
|
||||
@@ -65,6 +78,12 @@ static int acr_wait_for_completion(struct gk20a *g,
|
||||
flcn_id, sctl, cpuctl);
|
||||
|
||||
exit:
|
||||
if (completion != 0) {
|
||||
if (g->acr.acr.report_acr_engine_bus_err_status != NULL) {
|
||||
g->acr.acr.report_acr_engine_bus_err_status(g,
|
||||
bar0_status, error_type);
|
||||
}
|
||||
}
|
||||
return completion;
|
||||
}
|
||||
|
||||
|
||||
@@ -170,6 +170,8 @@ static void gm20b_acr_default_sw_init(struct gk20a *g, struct hs_acr *hs_acr)
|
||||
hs_acr->acr_flcn = g->pmu.flcn;
|
||||
hs_acr->acr_flcn_setup_hw_and_bl_bootstrap =
|
||||
gm20b_pmu_setup_hw_and_bl_bootstrap;
|
||||
hs_acr->acr_engine_bus_err_status =
|
||||
gk20a_pmu_bar0_error_status;
|
||||
}
|
||||
|
||||
void gm20b_remove_acr_support(struct nvgpu_acr *acr)
|
||||
|
||||
@@ -163,6 +163,10 @@ static void gv11b_acr_default_sw_init(struct gk20a *g, struct hs_acr *hs_acr)
|
||||
hs_acr->acr_flcn = g->pmu.flcn;
|
||||
hs_acr->acr_flcn_setup_hw_and_bl_bootstrap =
|
||||
gm20b_pmu_setup_hw_and_bl_bootstrap;
|
||||
hs_acr->report_acr_engine_bus_err_status =
|
||||
nvgpu_pmu_report_bar0_pri_err_status;
|
||||
hs_acr->acr_engine_bus_err_status =
|
||||
gk20a_pmu_bar0_error_status;
|
||||
}
|
||||
|
||||
void nvgpu_gv11b_acr_sw_init(struct gk20a *g, struct nvgpu_acr *acr)
|
||||
|
||||
@@ -35,6 +35,22 @@
|
||||
#include <nvgpu/gk20a.h>
|
||||
#include <nvgpu/string.h>
|
||||
#include <nvgpu/power_features/cg.h>
|
||||
#include <nvgpu/nvgpu_err.h>
|
||||
|
||||
/*
 * Forward a PMU error to the optional g->ops.pmu.err_ops.report_pmu_err hook.
 *
 * @g:            GPU instance whose ops table is consulted.
 * @err_type:     error id handed to the hook (e.g. GPU_PMU_BAR0_ERROR_TIMEOUT).
 * @status:       raw status word accompanying the error.
 * @pmu_err_type: PMU-specific sub-type handed to the hook.
 *
 * Nothing happens when the hook is not installed. A non-zero return from the
 * hook is only logged; it is not propagated to the caller.
 */
static void pmu_report_error(struct gk20a *g, u32 err_type,
		u32 status, u32 pmu_err_type)
{
	int ret;

	if (g->ops.pmu.err_ops.report_pmu_err != NULL) {
		ret = g->ops.pmu.err_ops.report_pmu_err(g,
				NVGPU_ERR_MODULE_PWR, err_type, status,
				pmu_err_type);
		if (ret != 0) {
			/*
			 * err_type is a u32, so print it with %u; also log
			 * the hook's own failure code instead of dropping it.
			 */
			nvgpu_err(g, "Failed to report PMU error: %u, ret %d",
					err_type, ret);
		}
	}
}
|
||||
|
||||
static int pmu_enable_hw(struct nvgpu_pmu *pmu, bool enable)
|
||||
{
|
||||
@@ -650,3 +666,11 @@ void nvgpu_pmu_get_cmd_line_args_offset(struct gk20a *g,
|
||||
|
||||
*args_offset = dmem_size - g->ops.pmu_ver.get_pmu_cmdline_args_size(pmu);
|
||||
}
|
||||
|
||||
/*
 * Report a PMU BAR0 error through pmu_report_error().
 *
 * The error is always filed under the GPU_PMU_BAR0_ERROR_TIMEOUT id;
 * @bar0_status is passed on as the status word and @error_type as the
 * PMU-specific sub-type.
 */
void nvgpu_pmu_report_bar0_pri_err_status(struct gk20a *g, u32 bar0_status,
	u32 error_type)
{
	pmu_report_error(g, GPU_PMU_BAR0_ERROR_TIMEOUT,
			bar0_status, error_type);
}
|
||||
|
||||
@@ -891,3 +891,58 @@ u32 gk20a_pmu_falcon_base_addr(void)
|
||||
{
|
||||
return pwr_falcon_irqsset_r();
|
||||
}
|
||||
|
||||
/*
 * Check for, decode and clear a pending PMU BAR0 error.
 *
 * @g:           GPU instance.
 * @bar0_status: out; always receives the raw value read from
 *               pwr_pmu_bar0_error_status_r().
 * @etype:       out; written only when an error is pending. Receives the
 *               PMU_BAR0_* code of the first matching cause, or 0U when no
 *               known cause bit is set.
 *
 * Returns 0 when the status register reads zero (no error), -EIO otherwise.
 * On the error path the status register is written back with the value that
 * was read, after any cause-specific FECS/HOST error state has been handled.
 */
int gk20a_pmu_bar0_error_status(struct gk20a *g, u32 *bar0_status,
	u32 *etype)
{
	u32 val = 0;
	u32 err_status = 0;
	u32 err_cmd = 0;

	val = gk20a_readl(g, pwr_pmu_bar0_error_status_r());
	*bar0_status = val;
	if (val == 0U) {
		return 0;
	}

	/* Non-zero selects the *_WRITE_* code below, zero the *_READ_* one. */
	err_cmd = val & pwr_pmu_bar0_error_status_err_cmd_m();

	if ((val & pwr_pmu_bar0_error_status_timeout_host_m()) != 0U) {
		*etype = (err_cmd != 0U)
			? PMU_BAR0_HOST_WRITE_TOUT : PMU_BAR0_HOST_READ_TOUT;
	} else if ((val & pwr_pmu_bar0_error_status_timeout_fecs_m()) != 0U) {
		*etype = (err_cmd != 0U)
			? PMU_BAR0_FECS_WRITE_TOUT : PMU_BAR0_FECS_READ_TOUT;
	} else if ((val & pwr_pmu_bar0_error_status_cmd_hwerr_m()) != 0U) {
		*etype = (err_cmd != 0U)
			? PMU_BAR0_CMD_WRITE_HWERR : PMU_BAR0_CMD_READ_HWERR;
	} else if ((val & pwr_pmu_bar0_error_status_fecserr_m()) != 0U) {
		*etype = (err_cmd != 0U)
			? PMU_BAR0_WRITE_FECSERR : PMU_BAR0_READ_FECSERR;
		err_status = gk20a_readl(g, pwr_pmu_bar0_fecs_error_r());
		/*
		 * BAR0_FECS_ERROR would only record the first error code if
		 * multiple FECS errors happen. Once BAR0_FECS_ERROR is cleared,
		 * BAR0_FECS_ERROR can record the error code from FECS again.
		 * Writing the status register to clear the FECS hardware state.
		 */
		gk20a_writel(g, pwr_pmu_bar0_fecs_error_r(), err_status);
	} else if ((val & pwr_pmu_bar0_error_status_hosterr_m()) != 0U) {
		*etype = (err_cmd != 0U)
			? PMU_BAR0_WRITE_HOSTERR : PMU_BAR0_READ_HOSTERR;
		/*
		 * BAR0_HOST_ERROR would only record the first error code if
		 * multiple HOST errors happen. Once BAR0_HOST_ERROR is cleared,
		 * BAR0_HOST_ERROR can record the error code from HOST again.
		 *
		 * The HOST hardware state is cleared through an optional
		 * per-chip op, as gk20a does not have a status register
		 * for this.
		 */
		if (g->ops.pmu.pmu_clear_bar0_host_err_status != NULL) {
			g->ops.pmu.pmu_clear_bar0_host_err_status(g);
		}
	} else {
		nvgpu_err(g, "PMU bar0 status type is not found");
		/*
		 * No known cause bit is set, but -EIO is still returned, so
		 * a caller may forward *etype to an error-reporting hook.
		 * Give it a defined value rather than leaving the caller's
		 * (possibly uninitialized) variable untouched. 0U does not
		 * collide with any PMU_BAR0_* error code.
		 */
		*etype = 0U;
	}

	/* Writing the BAR0 status register to clear the hardware state */
	gk20a_writel(g, pwr_pmu_bar0_error_status_r(), val);
	return (-EIO);
}
|
||||
|
||||
@@ -414,3 +414,11 @@ bool gm20b_is_pmu_supported(struct gk20a *g)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
void gm20b_clear_pmu_bar0_host_err_status(struct gk20a *g)
|
||||
{
|
||||
u32 status;
|
||||
|
||||
status = gk20a_readl(g, pwr_pmu_bar0_host_error_r());
|
||||
gk20a_writel(g, pwr_pmu_bar0_host_error_r(), status);
|
||||
}
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
/*
|
||||
* GM20B PMU
|
||||
*
|
||||
* Copyright (c) 2014-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2014-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
@@ -41,4 +41,5 @@ int gm20b_pmu_setup_hw_and_bl_bootstrap(struct gk20a *g,
|
||||
struct nvgpu_falcon_bl_info *bl_info);
|
||||
void gm20b_secured_pmu_start(struct gk20a *g);
|
||||
bool gm20b_is_pmu_supported(struct gk20a *g);
|
||||
void gm20b_clear_pmu_bar0_host_err_status(struct gk20a *g);
|
||||
#endif /*NVGPU_GM20B_PMU_GM20B_H*/
|
||||
|
||||
@@ -688,6 +688,8 @@ static const struct gpu_ops gm20b_ops = {
|
||||
.get_irqdest = gk20a_pmu_get_irqdest,
|
||||
.is_debug_mode_enabled = gm20b_pmu_is_debug_mode_en,
|
||||
.save_zbc = gk20a_pmu_save_zbc,
|
||||
.pmu_clear_bar0_host_err_status =
|
||||
gm20b_clear_pmu_bar0_host_err_status,
|
||||
},
|
||||
.clk = {
|
||||
.init_clk_support = gm20b_init_clk_support,
|
||||
|
||||
@@ -762,6 +762,8 @@ static const struct gpu_ops gp10b_ops = {
|
||||
.get_irqdest = gk20a_pmu_get_irqdest,
|
||||
.is_debug_mode_enabled = gm20b_pmu_is_debug_mode_en,
|
||||
.save_zbc = gk20a_pmu_save_zbc,
|
||||
.pmu_clear_bar0_host_err_status =
|
||||
gm20b_clear_pmu_bar0_host_err_status,
|
||||
},
|
||||
.clk_arb = {
|
||||
.check_clk_arb_support = gp10b_check_clk_arb_support,
|
||||
|
||||
@@ -952,6 +952,8 @@ static const struct gpu_ops gv100_ops = {
|
||||
.secured_pmu_start = gm20b_secured_pmu_start,
|
||||
.create_ssmd_lookup_table = nvgpu_pmu_create_ssmd_lookup_table,
|
||||
.save_zbc = gk20a_pmu_save_zbc,
|
||||
.pmu_clear_bar0_host_err_status =
|
||||
gm20b_clear_pmu_bar0_host_err_status,
|
||||
},
|
||||
.clk = {
|
||||
.init_clk_support = gv100_init_clk_support,
|
||||
|
||||
@@ -914,6 +914,8 @@ static const struct gpu_ops gv11b_ops = {
|
||||
.init_wpr_region = gm20b_pmu_init_acr,
|
||||
.load_lsfalcon_ucode = gp10b_load_falcon_ucode,
|
||||
.save_zbc = gk20a_pmu_save_zbc,
|
||||
.pmu_clear_bar0_host_err_status =
|
||||
gm20b_clear_pmu_bar0_host_err_status,
|
||||
#endif
|
||||
},
|
||||
.clk_arb = {
|
||||
|
||||
@@ -144,6 +144,10 @@ struct hs_acr {
|
||||
|
||||
int (*acr_flcn_setup_hw_and_bl_bootstrap)(struct gk20a *g,
|
||||
struct nvgpu_falcon_bl_info *bl_info);
|
||||
void (*report_acr_engine_bus_err_status)(struct gk20a *g,
|
||||
u32 bar0_status, u32 error_type);
|
||||
int (*acr_engine_bus_err_status)(struct gk20a *g, u32 *bar0_status,
|
||||
u32 *error_type);
|
||||
};
|
||||
|
||||
#define ACR_DEFAULT 0U
|
||||
|
||||
@@ -1283,9 +1283,13 @@ struct gpu_ops {
|
||||
u32 hw_id, u32 inst,
|
||||
u32 err_id, u64 err_addr,
|
||||
u64 err_cnt);
|
||||
int (*report_pmu_err)(struct gk20a *g,
|
||||
u32 hw_id, u32 err_id, u32 status,
|
||||
u32 pmu_err_type);
|
||||
} err_ops;
|
||||
void (*create_ssmd_lookup_table)(struct nvgpu_pmu *pmu);
|
||||
void (*save_zbc)(struct gk20a *g, u32 entries);
|
||||
void (*pmu_clear_bar0_host_err_status)(struct gk20a *g);
|
||||
} pmu;
|
||||
struct {
|
||||
int (*init_debugfs)(struct gk20a *g);
|
||||
|
||||
@@ -101,6 +101,8 @@
|
||||
#define GPU_PMU_FALCON_IMEM_ECC_UNCORRECTED 1U
|
||||
#define GPU_PMU_FALCON_DMEM_ECC_CORRECTED 2U
|
||||
#define GPU_PMU_FALCON_DMEM_ECC_UNCORRECTED 3U
|
||||
#define GPU_PMU_BAR0_ERROR_TIMEOUT 4U
|
||||
#define GPU_PMU_INVALID_ERROR 5U
|
||||
|
||||
#define GPU_PGRAPH_FE_EXCEPTION 0U
|
||||
#define GPU_PGRAPH_MEMFMT_EXCEPTION 1U
|
||||
|
||||
@@ -122,6 +122,20 @@ enum pmu_seq_state {
|
||||
PMU_SEQ_STATE_CANCELLED
|
||||
};
|
||||
|
||||
/*
 * PMU error sub-type codes.
 *
 * The PMU_BAR0_* values 1..10 are what gk20a_pmu_bar0_error_status() stores
 * in its etype out-parameter; each cause has a READ and a WRITE variant.
 */
#define PMU_BAR0_SUCCESS 0U
|
||||
#define PMU_BAR0_HOST_READ_TOUT 1U
|
||||
#define PMU_BAR0_HOST_WRITE_TOUT 2U
|
||||
#define PMU_BAR0_FECS_READ_TOUT 3U
|
||||
#define PMU_BAR0_FECS_WRITE_TOUT 4U
|
||||
#define PMU_BAR0_CMD_READ_HWERR 5U
|
||||
#define PMU_BAR0_CMD_WRITE_HWERR 6U
|
||||
#define PMU_BAR0_READ_HOSTERR 7U
|
||||
#define PMU_BAR0_WRITE_HOSTERR 8U
|
||||
#define PMU_BAR0_READ_FECSERR 9U
|
||||
#define PMU_BAR0_WRITE_FECSERR 10U
|
||||
/*
 * ACR boot outcomes; these share the numbering space of the PMU_BAR0_* codes
 * above and are assigned to error_type by acr_wait_for_completion().
 */
#define ACR_BOOT_TIMEDOUT 11U
|
||||
#define ACR_BOOT_FAILED 12U
|
||||
|
||||
/* PG defines used by nvgpu-pmu */
|
||||
#define PMU_PG_IDLE_THRESHOLD_SIM 1000U
|
||||
#define PMU_PG_POST_POWERUP_IDLE_THRESHOLD_SIM 4000000U
|
||||
@@ -533,5 +547,9 @@ u32 nvgpu_pmu_get_ss_member_set_size(struct nvgpu_pmu *pmu, u32 member_id);
|
||||
u32 nvgpu_pmu_get_ss_member_get_status_size(struct nvgpu_pmu *pmu,
|
||||
u32 member_id);
|
||||
|
||||
/*
 * Report a PMU BAR0 error: bar0_status is passed on as the status word and
 * error_type as the PMU-specific sub-type, under GPU_PMU_BAR0_ERROR_TIMEOUT.
 */
void nvgpu_pmu_report_bar0_pri_err_status(struct gk20a *g, u32 bar0_status,
|
||||
u32 error_type);
|
||||
/*
 * Read the PMU BAR0 error status register into *bar0_status. Returns 0 when
 * it reads zero; otherwise decodes the cause into *etype (a PMU_BAR0_* code),
 * writes the status register back and returns -EIO.
 */
int gk20a_pmu_bar0_error_status(struct gk20a *g, u32 *bar0_status,
|
||||
u32 *etype);
|
||||
#endif /* NVGPU_PMU_H */
|
||||
|
||||
|
||||
@@ -986,6 +986,8 @@ static const struct gpu_ops tu104_ops = {
|
||||
.secured_pmu_start = gm20b_secured_pmu_start,
|
||||
.create_ssmd_lookup_table = nvgpu_pmu_create_ssmd_lookup_table,
|
||||
.save_zbc = gk20a_pmu_save_zbc,
|
||||
.pmu_clear_bar0_host_err_status =
|
||||
gm20b_clear_pmu_bar0_host_err_status,
|
||||
},
|
||||
.clk = {
|
||||
.init_clk_support = gv100_init_clk_support,
|
||||
|
||||
Reference in New Issue
Block a user