gpu: nvgpu: move handle_gpc_gpccs_exception hal

Move the handle_gpc_gpccs_exception hal to hal.gr.intr.
Pass the addresses of g->ecc.gr.gpccs_ecc_corrected_err_count[gpc].counter
and g->ecc.gr.gpccs_ecc_uncorrected_err_count[gpc].counter as parameters
to the function, to avoid dereferencing the g->ecc variable inside the
hal function.

Update the g->ops.gr.handle_gpc_gpccs_exception call to
g->ops.gr.intr.handle_gpc_gpccs_exception.

JIRA NVGPU-3016

Change-Id: I6cab6428eb6785261f34ca21f2ce055a9995b408
Signed-off-by: Vinod G <vinodg@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/2087197
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
Vinod G
2019-04-01 14:32:56 -07:00
committed by mobile promotions
parent 5f8aa39fd9
commit 22fb278755
10 changed files with 123 additions and 130 deletions

View File

@@ -213,8 +213,6 @@ static const struct gpu_ops vgpu_gv11b_ops = {
gr_gv11b_handle_gpc_gpcmmu_exception,
.get_egpc_base = gv11b_gr_get_egpc_base,
.get_egpc_etpc_num = gv11b_gr_get_egpc_etpc_num,
.handle_gpc_gpccs_exception =
gr_gv11b_handle_gpc_gpccs_exception,
.access_smpc_reg = gv11b_gr_access_smpc_reg,
.is_egpc_addr = gv11b_gr_pri_is_egpc_addr,
.handle_gcc_exception = gr_gv11b_handle_gcc_exception,
@@ -416,6 +414,8 @@ static const struct gpu_ops vgpu_gv11b_ops = {
gv11b_gr_init_commit_gfxp_wfi_timeout,
},
.intr = {
.handle_gpc_gpccs_exception =
gv11b_gr_intr_handle_gpc_gpccs_exception,
.get_tpc_exception = gm20b_gr_intr_get_tpc_exception,
.handle_tpc_mpc_exception =
gv11b_gr_intr_handle_tpc_mpc_exception,

View File

@@ -2086,10 +2086,11 @@ static int gk20a_gr_handle_gpc_exception(struct gk20a *g, bool *post_event,
}
/* Handle GPCCS exceptions */
if (g->ops.gr.handle_gpc_gpccs_exception != NULL) {
tmp_ret = g->ops.gr.handle_gpc_gpccs_exception(g, gpc,
gpc_exception);
ret = (ret != 0) ? ret : tmp_ret;
if (g->ops.gr.intr.handle_gpc_gpccs_exception != NULL) {
g->ops.gr.intr.handle_gpc_gpccs_exception(g, gpc,
gpc_exception,
&g->ecc.gr.gpccs_ecc_corrected_err_count[gpc].counter,
&g->ecc.gr.gpccs_ecc_uncorrected_err_count[gpc].counter);
}
/* Handle GPCMMU exceptions */

View File

@@ -458,8 +458,6 @@ static const struct gpu_ops gv100_ops = {
gr_gv11b_handle_gpc_gpcmmu_exception,
.get_egpc_base = gv11b_gr_get_egpc_base,
.get_egpc_etpc_num = gv11b_gr_get_egpc_etpc_num,
.handle_gpc_gpccs_exception =
gr_gv11b_handle_gpc_gpccs_exception,
.access_smpc_reg = gv11b_gr_access_smpc_reg,
.is_egpc_addr = gv11b_gr_pri_is_egpc_addr,
.handle_gcc_exception = gr_gv11b_handle_gcc_exception,
@@ -706,6 +704,8 @@ static const struct gpu_ops gv100_ops = {
gv11b_gr_init_commit_gfxp_wfi_timeout,
},
.intr = {
.handle_gpc_gpccs_exception =
gv11b_gr_intr_handle_gpc_gpccs_exception,
.get_tpc_exception = gm20b_gr_intr_get_tpc_exception,
.handle_tpc_mpc_exception =
gv11b_gr_intr_handle_tpc_mpc_exception,

View File

@@ -930,108 +930,6 @@ static int gr_gv11b_handle_gpcmmu_ecc_exception(struct gk20a *g, u32 gpc,
return ret;
}
/*
 * Service a GPCCS falcon ECC exception for GPC index @gpc.
 *
 * Reads the GPCCS HWW ESR for the GPC and does nothing unless it flags a
 * corrected or uncorrected ECC error. Otherwise the falcon ECC status,
 * address and error counters are read, the hardware counters are cleared,
 * the ECC status register is reset, the per-GPC software counters in
 * g->ecc.gr are advanced by the counts read from hardware, and each
 * IMEM/DMEM error flagged in the status register is reported.
 *
 * The @exception argument is not used by the body. The function returns 0
 * on every path (ret is never modified).
 */
static int gr_gv11b_handle_gpccs_ecc_exception(struct gk20a *g, u32 gpc,
u32 exception)
{
int ret = 0;
u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
/* Register offset of this GPC relative to the gpc0 register addresses. */
u32 offset = gpc_stride * gpc;
u32 ecc_status, ecc_addr, corrected_cnt, uncorrected_cnt;
u32 corrected_delta, uncorrected_delta;
u32 corrected_overflow, uncorrected_overflow;
u32 hww_esr;
hww_esr = gk20a_readl(g, gr_gpc0_gpccs_hww_esr_r() + offset);
/* Nothing to do when neither ECC bit is set in the ESR. */
if ((hww_esr & (gr_gpc0_gpccs_hww_esr_ecc_uncorrected_m() |
gr_gpc0_gpccs_hww_esr_ecc_corrected_m())) == 0U) {
return ret;
}
/* Snapshot ECC status, faulting address and both error counters. */
ecc_status = gk20a_readl(g,
gr_gpc0_gpccs_falcon_ecc_status_r() + offset);
ecc_addr = gk20a_readl(g,
gr_gpc0_gpccs_falcon_ecc_address_r() + offset);
corrected_cnt = gk20a_readl(g,
gr_gpc0_gpccs_falcon_ecc_corrected_err_count_r() + offset);
uncorrected_cnt = gk20a_readl(g,
gr_gpc0_gpccs_falcon_ecc_uncorrected_err_count_r() + offset);
/* Extract the "total" field of each counter register. */
corrected_delta = gr_gpc0_gpccs_falcon_ecc_corrected_err_count_total_v(
corrected_cnt);
uncorrected_delta = gr_gpc0_gpccs_falcon_ecc_uncorrected_err_count_total_v(
uncorrected_cnt);
/* Non-zero when the status register flags a total-counter overflow. */
corrected_overflow = ecc_status &
gr_gpc0_gpccs_falcon_ecc_status_corrected_err_total_counter_overflow_m();
uncorrected_overflow = ecc_status &
gr_gpc0_gpccs_falcon_ecc_status_uncorrected_err_total_counter_overflow_m();
/* clear the interrupt */
if ((corrected_delta > 0U) || (corrected_overflow != 0U)) {
gk20a_writel(g,
gr_gpc0_gpccs_falcon_ecc_corrected_err_count_r() +
offset, 0);
}
if ((uncorrected_delta > 0U) || (uncorrected_overflow != 0U)) {
gk20a_writel(g,
gr_gpc0_gpccs_falcon_ecc_uncorrected_err_count_r() +
offset, 0);
}
/* Write the reset_task value to the ECC status register. */
gk20a_writel(g, gr_gpc0_gpccs_falcon_ecc_status_r() + offset,
gr_gpc0_gpccs_falcon_ecc_status_reset_task_f());
/* Accumulate the hardware counts into the per-GPC software counters. */
g->ecc.gr.gpccs_ecc_corrected_err_count[gpc].counter +=
corrected_delta;
g->ecc.gr.gpccs_ecc_uncorrected_err_count[gpc].counter +=
uncorrected_delta;
nvgpu_log(g, gpu_dbg_intr,
"gppcs gpc:%d ecc interrupt intr: 0x%x", gpc, hww_esr);
/* Report each IMEM/DMEM error kind flagged in the ECC status. */
if ((ecc_status &
gr_gpc0_gpccs_falcon_ecc_status_corrected_err_imem_m()) != 0U) {
nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_GPCCS, gpc, 0,
GPU_GPCCS_FALCON_IMEM_ECC_CORRECTED,
ecc_addr, g->ecc.gr.gpccs_ecc_corrected_err_count[gpc].counter);
nvgpu_log(g, gpu_dbg_intr, "imem ecc error corrected");
}
if ((ecc_status &
gr_gpc0_gpccs_falcon_ecc_status_uncorrected_err_imem_m()) != 0U) {
nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_GPCCS, gpc, 0,
GPU_GPCCS_FALCON_IMEM_ECC_UNCORRECTED,
ecc_addr, g->ecc.gr.gpccs_ecc_uncorrected_err_count[gpc].counter);
nvgpu_log(g, gpu_dbg_intr, "imem ecc error uncorrected");
}
if ((ecc_status &
gr_gpc0_gpccs_falcon_ecc_status_corrected_err_dmem_m()) != 0U) {
nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_GPCCS, gpc, 0,
GPU_GPCCS_FALCON_DMEM_ECC_CORRECTED,
ecc_addr, g->ecc.gr.gpccs_ecc_corrected_err_count[gpc].counter);
nvgpu_log(g, gpu_dbg_intr, "dmem ecc error corrected");
}
if ((ecc_status &
gr_gpc0_gpccs_falcon_ecc_status_uncorrected_err_dmem_m()) != 0U) {
nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_GPCCS, gpc, 0,
GPU_GPCCS_FALCON_DMEM_ECC_UNCORRECTED,
ecc_addr, g->ecc.gr.gpccs_ecc_uncorrected_err_count[gpc].counter);
nvgpu_log(g, gpu_dbg_intr, "dmem ecc error uncorrected");
}
if ((corrected_overflow != 0U) || (uncorrected_overflow != 0U)) {
nvgpu_info(g, "gpccs ecc counter overflow!");
}
nvgpu_log(g, gpu_dbg_intr,
"ecc error row address: 0x%x",
gr_gpc0_gpccs_falcon_ecc_address_row_address_v(ecc_addr));
nvgpu_log(g, gpu_dbg_intr,
"ecc error count corrected: %d, uncorrected %d",
g->ecc.gr.gpccs_ecc_corrected_err_count[gpc].counter,
g->ecc.gr.gpccs_ecc_uncorrected_err_count[gpc].counter);
return ret;
}
int gr_gv11b_handle_gpc_gpcmmu_exception(struct gk20a *g, u32 gpc,
u32 gpc_exception)
@@ -1043,17 +941,6 @@ int gr_gv11b_handle_gpc_gpcmmu_exception(struct gk20a *g, u32 gpc,
return 0;
}
/*
 * Dispatch a GPC exception to the GPCCS ECC handler.
 *
 * Returns 0 when the GPCCS bit is not set in @gpc_exception; otherwise
 * returns whatever gr_gv11b_handle_gpccs_ecc_exception() returns.
 */
int gr_gv11b_handle_gpc_gpccs_exception(struct gk20a *g, u32 gpc,
		u32 gpc_exception)
{
	u32 gpccs_pending =
		gpc_exception & gr_gpc0_gpccs_gpc_exception_gpccs_m();

	/* Guard: this exception does not involve GPCCS. */
	if (gpccs_pending == 0U) {
		return 0;
	}

	return gr_gv11b_handle_gpccs_ecc_exception(g, gpc, gpc_exception);
}
void gr_gv11b_set_go_idle_timeout(struct gk20a *g, u32 data)
{
gk20a_writel(g, gr_fe_go_idle_timeout_r(), data);

View File

@@ -86,8 +86,6 @@ int gr_gv11b_handle_gcc_exception(struct gk20a *g, u32 gpc, u32 tpc,
u32 *hww_global_esr);
int gr_gv11b_handle_gpc_gpcmmu_exception(struct gk20a *g, u32 gpc,
u32 gpc_exception);
int gr_gv11b_handle_gpc_gpccs_exception(struct gk20a *g, u32 gpc,
u32 gpc_exception);
void gr_gv11b_enable_gpc_exceptions(struct gk20a *g);
int gr_gv11b_handle_sw_method(struct gk20a *g, u32 addr,
u32 class_num, u32 offset, u32 data);

View File

@@ -411,8 +411,6 @@ static const struct gpu_ops gv11b_ops = {
gr_gv11b_handle_gpc_gpcmmu_exception,
.get_egpc_base = gv11b_gr_get_egpc_base,
.get_egpc_etpc_num = gv11b_gr_get_egpc_etpc_num,
.handle_gpc_gpccs_exception =
gr_gv11b_handle_gpc_gpccs_exception,
.access_smpc_reg = gv11b_gr_access_smpc_reg,
.is_egpc_addr = gv11b_gr_pri_is_egpc_addr,
.handle_gcc_exception = gr_gv11b_handle_gcc_exception,
@@ -665,6 +663,8 @@ static const struct gpu_ops gv11b_ops = {
gv11b_gr_init_commit_gfxp_wfi_timeout,
},
.intr = {
.handle_gpc_gpccs_exception =
gv11b_gr_intr_handle_gpc_gpccs_exception,
.get_tpc_exception = gm20b_gr_intr_get_tpc_exception,
.handle_tpc_mpc_exception =
gv11b_gr_intr_handle_tpc_mpc_exception,

View File

@@ -31,6 +31,110 @@
#include <nvgpu/hw/gv11b/hw_gr_gv11b.h>
/*
 * Handle a GPCCS exception for GPC index @gpc.
 *
 * Returns without touching hardware unless the GPCCS bit is set in
 * @gpc_exception, and returns after reading the GPCCS HWW ESR unless that
 * register flags a corrected or uncorrected ECC error. Otherwise the falcon
 * ECC status, address and error counters are read, the hardware counters are
 * cleared, the ECC status register is reset, and each IMEM/DMEM error
 * flagged in the status register is reported.
 *
 * @g:               GPU driver struct.
 * @gpc:             index of the GPC that raised the exception.
 * @gpc_exception:   GPC exception bits to test for a GPCCS exception.
 * @corrected_err:   running total of corrected errors; advanced by the
 *                   corrected count read from hardware.
 * @uncorrected_err: running total of uncorrected errors; advanced by the
 *                   uncorrected count read from hardware.
 */
void gv11b_gr_intr_handle_gpc_gpccs_exception(struct gk20a *g, u32 gpc,
		u32 gpc_exception, u32 *corrected_err, u32 *uncorrected_err)
{
	u32 offset = nvgpu_gr_gpc_offset(g, gpc);
	u32 ecc_status, ecc_addr, corrected_cnt, uncorrected_cnt;
	u32 corrected_delta, uncorrected_delta;
	u32 corrected_overflow, uncorrected_overflow;
	u32 hww_esr;

	if ((gpc_exception & gr_gpc0_gpccs_gpc_exception_gpccs_m()) == 0U) {
		return;
	}

	hww_esr = nvgpu_readl(g, gr_gpc0_gpccs_hww_esr_r() + offset);

	/* Nothing to do when neither ECC bit is set in the ESR. */
	if ((hww_esr & (gr_gpc0_gpccs_hww_esr_ecc_uncorrected_m() |
			gr_gpc0_gpccs_hww_esr_ecc_corrected_m())) == 0U) {
		return;
	}

	/* Snapshot ECC status, faulting address and both error counters. */
	ecc_status = nvgpu_readl(g,
		gr_gpc0_gpccs_falcon_ecc_status_r() + offset);
	ecc_addr = nvgpu_readl(g,
		gr_gpc0_gpccs_falcon_ecc_address_r() + offset);
	corrected_cnt = nvgpu_readl(g,
		gr_gpc0_gpccs_falcon_ecc_corrected_err_count_r() + offset);
	uncorrected_cnt = nvgpu_readl(g,
		gr_gpc0_gpccs_falcon_ecc_uncorrected_err_count_r() + offset);

	/* Extract the "total" field of each counter register. */
	corrected_delta =
		gr_gpc0_gpccs_falcon_ecc_corrected_err_count_total_v(
							corrected_cnt);
	uncorrected_delta =
		gr_gpc0_gpccs_falcon_ecc_uncorrected_err_count_total_v(
							uncorrected_cnt);

	/* Non-zero when the status register flags a total-counter overflow. */
	corrected_overflow = ecc_status &
		gr_gpc0_gpccs_falcon_ecc_status_corrected_err_total_counter_overflow_m();
	uncorrected_overflow = ecc_status &
		gr_gpc0_gpccs_falcon_ecc_status_uncorrected_err_total_counter_overflow_m();

	/* clear the interrupt */
	if ((corrected_delta > 0U) || (corrected_overflow != 0U)) {
		nvgpu_writel(g,
			gr_gpc0_gpccs_falcon_ecc_corrected_err_count_r() +
			offset, 0);
	}
	if ((uncorrected_delta > 0U) || (uncorrected_overflow != 0U)) {
		nvgpu_writel(g,
			gr_gpc0_gpccs_falcon_ecc_uncorrected_err_count_r() +
			offset, 0);
	}

	/* Write the reset_task value to the ECC status register. */
	nvgpu_writel(g, gr_gpc0_gpccs_falcon_ecc_status_r() + offset,
			gr_gpc0_gpccs_falcon_ecc_status_reset_task_f());

	/*
	 * Each delta goes to its own counter: corrected errors to
	 * *corrected_err, uncorrected errors to *uncorrected_err. The values
	 * reported below depend on both being up to date.
	 */
	*corrected_err += corrected_delta;
	*uncorrected_err += uncorrected_delta;

	nvgpu_log(g, gpu_dbg_intr,
			"gppcs gpc:%d ecc interrupt intr: 0x%x", gpc, hww_esr);

	/* Report each IMEM/DMEM error kind flagged in the ECC status. */
	if ((ecc_status &
	     gr_gpc0_gpccs_falcon_ecc_status_corrected_err_imem_m()) != 0U) {
		nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_GPCCS, gpc, 0,
				GPU_GPCCS_FALCON_IMEM_ECC_CORRECTED,
				ecc_addr, *corrected_err);
		nvgpu_log(g, gpu_dbg_intr, "imem ecc error corrected");
	}
	if ((ecc_status &
	     gr_gpc0_gpccs_falcon_ecc_status_uncorrected_err_imem_m()) != 0U) {
		nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_GPCCS, gpc, 0,
				GPU_GPCCS_FALCON_IMEM_ECC_UNCORRECTED,
				ecc_addr, *uncorrected_err);
		nvgpu_log(g, gpu_dbg_intr, "imem ecc error uncorrected");
	}
	if ((ecc_status &
	     gr_gpc0_gpccs_falcon_ecc_status_corrected_err_dmem_m()) != 0U) {
		nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_GPCCS, gpc, 0,
				GPU_GPCCS_FALCON_DMEM_ECC_CORRECTED,
				ecc_addr, *corrected_err);
		nvgpu_log(g, gpu_dbg_intr, "dmem ecc error corrected");
	}
	if ((ecc_status &
	     gr_gpc0_gpccs_falcon_ecc_status_uncorrected_err_dmem_m()) != 0U) {
		nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_GPCCS, gpc, 0,
				GPU_GPCCS_FALCON_DMEM_ECC_UNCORRECTED,
				ecc_addr, *uncorrected_err);
		nvgpu_log(g, gpu_dbg_intr, "dmem ecc error uncorrected");
	}

	if ((corrected_overflow != 0U) || (uncorrected_overflow != 0U)) {
		nvgpu_info(g, "gpccs ecc counter overflow!");
	}

	nvgpu_log(g, gpu_dbg_intr,
		"ecc error row address: 0x%x",
		gr_gpc0_gpccs_falcon_ecc_address_row_address_v(ecc_addr));

	nvgpu_log(g, gpu_dbg_intr,
		"ecc error count corrected: %d, uncorrected %d",
		*corrected_err, *uncorrected_err);
}
void gv11b_gr_intr_handle_tpc_mpc_exception(struct gk20a *g, u32 gpc, u32 tpc)
{
u32 esr;
@@ -80,7 +184,7 @@ void gv11b_gr_intr_enable_hww_exceptions(struct gk20a *g)
/* For now leave POR values */
nvgpu_log(g, gpu_dbg_info, "gr_sked_hww_esr_en_r 0x%08x",
gk20a_readl(g, gr_sked_hww_esr_en_r()));
nvgpu_readl(g, gr_sked_hww_esr_en_r()));
}
void gv11b_gr_intr_enable_exceptions(struct gk20a *g,

View File

@@ -28,6 +28,8 @@
struct gk20a;
struct nvgpu_gr_config;
/*
 * Handle a GPCCS exception for GPC index gpc, given the GPC exception bits
 * in gpc_exception. corrected_err and uncorrected_err point to the
 * caller-owned running totals of corrected and uncorrected GPCCS ECC errors;
 * passing them in keeps the HAL from dereferencing g->ecc itself.
 */
void gv11b_gr_intr_handle_gpc_gpccs_exception(struct gk20a *g, u32 gpc,
u32 gpc_exception, u32 *corrected_err, u32 *uncorrected_err);
void gv11b_gr_intr_handle_tpc_mpc_exception(struct gk20a *g, u32 gpc, u32 tpc);
void gv11b_gr_intr_enable_hww_exceptions(struct gk20a *g);
void gv11b_gr_intr_enable_exceptions(struct gk20a *g,

View File

@@ -374,8 +374,6 @@ struct gpu_ops {
int (*handle_gcc_exception)(struct gk20a *g, u32 gpc, u32 tpc,
bool *post_event, struct channel_gk20a *fault_ch,
u32 *hww_global_esr);
int (*handle_gpc_gpccs_exception)(struct gk20a *g, u32 gpc,
u32 gpc_exception);
int (*handle_gpc_gpcmmu_exception)(struct gk20a *g, u32 gpc,
u32 gpc_exception);
int (*init_ecc)(struct gk20a *g);
@@ -782,6 +780,9 @@ struct gpu_ops {
} init;
struct {
void (*handle_gpc_gpccs_exception)(struct gk20a *g,
u32 gpc, u32 gpc_exception,
u32 *corrected_err, u32 *uncorrected_err);
u32 (*get_tpc_exception)(struct gk20a *g, u32 offset,
struct nvgpu_gr_tpc_exception *pending_tpc);
void (*handle_tpc_mpc_exception)(struct gk20a *g,

View File

@@ -480,8 +480,6 @@ static const struct gpu_ops tu104_ops = {
gr_gv11b_handle_gpc_gpcmmu_exception,
.get_egpc_base = gv11b_gr_get_egpc_base,
.get_egpc_etpc_num = gv11b_gr_get_egpc_etpc_num,
.handle_gpc_gpccs_exception =
gr_gv11b_handle_gpc_gpccs_exception,
.access_smpc_reg = gv11b_gr_access_smpc_reg,
.is_egpc_addr = gv11b_gr_pri_is_egpc_addr,
.handle_gcc_exception = gr_gv11b_handle_gcc_exception,
@@ -739,6 +737,8 @@ static const struct gpu_ops tu104_ops = {
gv11b_gr_init_commit_gfxp_wfi_timeout,
},
.intr = {
.handle_gpc_gpccs_exception =
gv11b_gr_intr_handle_gpc_gpccs_exception,
.get_tpc_exception = gm20b_gr_intr_get_tpc_exception,
.handle_tpc_mpc_exception =
gv11b_gr_intr_handle_tpc_mpc_exception,