gpu: nvgpu: move handle_gpc_gpcmmu_exception to hal

Move handle_gpc_gpcmmu_exception to hal.gr.intr
Pass g->ecc.gr.mmu_l1tlb_ecc_corrected_err_count[gpc].counter
and g->ecc.gr.mmu_l1tlb_ecc_uncorrected_err_count[gpc].counter pointers
as function parameters to avoid dereferencing g->ecc inside the HAL function

Update g->ops.gr.handle_gpc_gpcmmu_exception to
g->ops.gr.intr.handle_gpc_gpcmmu_exception

JIRA NVGPU-3016

Change-Id: I9698cf71b568caf8e259996f84b4f26aded865f5
Signed-off-by: Vinod G <vinodg@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/2087198
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
Vinod G
2019-04-01 16:43:58 -07:00
committed by mobile promotions
parent 93fd6644f4
commit 4431de48f8
10 changed files with 132 additions and 139 deletions

View File

@@ -213,8 +213,6 @@ static const struct gpu_ops vgpu_gv11b_ops = {
.set_ctxsw_preemption_mode = vgpu_gr_set_ctxsw_preemption_mode,
.is_etpc_addr = gv11b_gr_pri_is_etpc_addr,
.egpc_etpc_priv_addr_table = gv11b_gr_egpc_etpc_priv_addr_table,
.handle_gpc_gpcmmu_exception =
gr_gv11b_handle_gpc_gpcmmu_exception,
.get_egpc_base = gv11b_gr_get_egpc_base,
.get_egpc_etpc_num = gv11b_gr_get_egpc_etpc_num,
.access_smpc_reg = gv11b_gr_access_smpc_reg,
@@ -419,6 +417,8 @@ static const struct gpu_ops vgpu_gv11b_ops = {
gv11b_gr_init_commit_gfxp_wfi_timeout,
},
.intr = {
.handle_gpc_gpcmmu_exception =
gv11b_gr_intr_handle_gpc_gpcmmu_exception,
.handle_gpc_gpccs_exception =
gv11b_gr_intr_handle_gpc_gpccs_exception,
.get_tpc_exception = gm20b_gr_intr_get_tpc_exception,

View File

@@ -1921,10 +1921,11 @@ static int gk20a_gr_handle_gpc_exception(struct gk20a *g, bool *post_event,
}
/* Handle GPCMMU exceptions */
if (g->ops.gr.handle_gpc_gpcmmu_exception != NULL) {
tmp_ret = g->ops.gr.handle_gpc_gpcmmu_exception(g, gpc,
gpc_exception);
ret = (ret != 0) ? ret : tmp_ret;
if (g->ops.gr.intr.handle_gpc_gpcmmu_exception != NULL) {
g->ops.gr.intr.handle_gpc_gpcmmu_exception(g, gpc,
gpc_exception,
&g->ecc.gr.mmu_l1tlb_ecc_corrected_err_count[gpc].counter,
&g->ecc.gr.mmu_l1tlb_ecc_uncorrected_err_count[gpc].counter);
}
}

View File

@@ -457,8 +457,6 @@ static const struct gpu_ops gv100_ops = {
.set_ctxsw_preemption_mode = gr_gp10b_set_ctxsw_preemption_mode,
.is_etpc_addr = gv11b_gr_pri_is_etpc_addr,
.egpc_etpc_priv_addr_table = gv11b_gr_egpc_etpc_priv_addr_table,
.handle_gpc_gpcmmu_exception =
gr_gv11b_handle_gpc_gpcmmu_exception,
.get_egpc_base = gv11b_gr_get_egpc_base,
.get_egpc_etpc_num = gv11b_gr_get_egpc_etpc_num,
.access_smpc_reg = gv11b_gr_access_smpc_reg,
@@ -705,6 +703,8 @@ static const struct gpu_ops gv100_ops = {
gv11b_gr_init_commit_gfxp_wfi_timeout,
},
.intr = {
.handle_gpc_gpcmmu_exception =
gv11b_gr_intr_handle_gpc_gpcmmu_exception,
.handle_gpc_gpccs_exception =
gv11b_gr_intr_handle_gpc_gpccs_exception,
.get_tpc_exception = gm20b_gr_intr_get_tpc_exception,

View File

@@ -819,129 +819,6 @@ int gr_gv11b_handle_gcc_exception(struct gk20a *g, u32 gpc, u32 tpc,
return 0;
}
/*
 * Service a pending GPCMMU (L1TLB) ECC error for one GPC.
 *
 * Returns 0 unconditionally; exits early (still 0) when neither the
 * corrected nor the uncorrected ECC bit is set in the GPCMMU global ESR.
 *
 * Side effects: clears the per-GPC L1TLB ECC error counters and status in
 * hardware, accumulates the counter deltas (with overflow compensation)
 * into g->ecc.gr.mmu_l1tlb_ecc_*_err_count[gpc].counter, and reports each
 * flagged SA/FA data error via nvgpu_gr_report_ecc_error().
 */
static int gr_gv11b_handle_gpcmmu_ecc_exception(struct gk20a *g, u32 gpc,
		u32 exception)
{
	int ret = 0;
	u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
	/* Per-GPC register offset: gpc0 register base + gpc * stride. */
	u32 offset = gpc_stride * gpc;
	u32 ecc_status, ecc_addr, corrected_cnt, uncorrected_cnt;
	u32 corrected_delta, uncorrected_delta;
	u32 corrected_overflow, uncorrected_overflow;
	u32 hww_esr;

	hww_esr = gk20a_readl(g, gr_gpc0_mmu_gpcmmu_global_esr_r() + offset);

	/* Nothing to do unless a corrected or uncorrected ECC error is
	 * actually pending in the global ESR.
	 */
	if ((hww_esr & (gr_gpc0_mmu_gpcmmu_global_esr_ecc_corrected_m() |
			gr_gpc0_mmu_gpcmmu_global_esr_ecc_uncorrected_m())) == 0U) {
		return ret;
	}

	ecc_status = gk20a_readl(g,
		gr_gpc0_mmu_l1tlb_ecc_status_r() + offset);
	ecc_addr = gk20a_readl(g,
		gr_gpc0_mmu_l1tlb_ecc_address_r() + offset);
	corrected_cnt = gk20a_readl(g,
		gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_r() + offset);
	uncorrected_cnt = gk20a_readl(g,
		gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_r() + offset);

	corrected_delta = gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_total_v(
		corrected_cnt);
	uncorrected_delta = gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_total_v(
		uncorrected_cnt);
	corrected_overflow = ecc_status &
		gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_total_counter_overflow_m();
	uncorrected_overflow = ecc_status &
		gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_total_counter_overflow_m();

	/* clear the interrupt: zero a HW counter only when it advanced or
	 * overflowed, so errors arriving in between are not discarded.
	 */
	if ((corrected_delta > 0U) || (corrected_overflow != 0U)) {
		gk20a_writel(g,
			gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_r() +
			offset, 0);
	}
	if ((uncorrected_delta > 0U) || (uncorrected_overflow != 0U)) {
		gk20a_writel(g,
			gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_r() +
			offset, 0);
	}

	gk20a_writel(g, gr_gpc0_mmu_l1tlb_ecc_status_r() + offset,
		gr_gpc0_mmu_l1tlb_ecc_status_reset_task_f());

	/* Handle overflow: add one full counter wrap per overflow bit so the
	 * accumulated totals stay monotonic.
	 */
	if (corrected_overflow != 0U) {
		corrected_delta +=
			BIT32(gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_total_s());
	}
	if (uncorrected_overflow != 0U) {
		uncorrected_delta +=
			BIT32(gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_total_s());
	}

	g->ecc.gr.mmu_l1tlb_ecc_corrected_err_count[gpc].counter +=
		corrected_delta;
	g->ecc.gr.mmu_l1tlb_ecc_uncorrected_err_count[gpc].counter +=
		uncorrected_delta;
	nvgpu_log(g, gpu_dbg_intr,
		"mmu l1tlb gpc:%d ecc interrupt intr: 0x%x", gpc, hww_esr);

	/* Report every error source (SA and FA data rams, corrected and
	 * uncorrected) flagged in the ECC status register.
	 */
	if ((ecc_status &
	     gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_l1tlb_sa_data_m()) != 0U) {
		nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_MMU, gpc, 0,
				GPU_MMU_L1TLB_SA_DATA_ECC_CORRECTED,
				0, g->ecc.gr.mmu_l1tlb_ecc_corrected_err_count[gpc].counter);
		nvgpu_log(g, gpu_dbg_intr, "corrected ecc sa data error");
	}
	if ((ecc_status &
	     gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_sa_data_m()) != 0U) {
		nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_MMU, gpc, 0,
				GPU_MMU_L1TLB_SA_DATA_ECC_UNCORRECTED,
				0, g->ecc.gr.mmu_l1tlb_ecc_uncorrected_err_count[gpc].counter);
		nvgpu_log(g, gpu_dbg_intr, "uncorrected ecc sa data error");
	}
	if ((ecc_status &
	     gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_l1tlb_fa_data_m()) != 0U) {
		nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_MMU, gpc, 0,
				GPU_MMU_L1TLB_FA_DATA_ECC_CORRECTED,
				0, g->ecc.gr.mmu_l1tlb_ecc_corrected_err_count[gpc].counter);
		nvgpu_log(g, gpu_dbg_intr, "corrected ecc fa data error");
	}
	if ((ecc_status &
	     gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_fa_data_m()) != 0U) {
		nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_MMU, gpc, 0,
				GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED,
				0, g->ecc.gr.mmu_l1tlb_ecc_uncorrected_err_count[gpc].counter);
		nvgpu_log(g, gpu_dbg_intr, "uncorrected ecc fa data error");
	}

	if ((corrected_overflow != 0U) || (uncorrected_overflow != 0U)) {
		nvgpu_info(g, "mmu l1tlb ecc counter overflow!");
	}

	nvgpu_log(g, gpu_dbg_intr,
		"ecc error address: 0x%x", ecc_addr);
	nvgpu_log(g, gpu_dbg_intr,
		"ecc error count corrected: %d, uncorrected %d",
		g->ecc.gr.mmu_l1tlb_ecc_corrected_err_count[gpc].counter,
		g->ecc.gr.mmu_l1tlb_ecc_uncorrected_err_count[gpc].counter);

	return ret;
}
/*
 * Dispatch a GPC exception to the GPCMMU ECC handler when the GPCMMU
 * pending bit is set in @gpc_exception; otherwise return 0 without
 * touching any hardware state.
 */
int gr_gv11b_handle_gpc_gpcmmu_exception(struct gk20a *g, u32 gpc,
		u32 gpc_exception)
{
	u32 gpcmmu_pending = gpc_exception &
			gr_gpc0_gpccs_gpc_exception_gpcmmu_m();

	if (gpcmmu_pending == 0U) {
		return 0;
	}

	return gr_gv11b_handle_gpcmmu_ecc_exception(g, gpc, gpc_exception);
}
void gr_gv11b_set_go_idle_timeout(struct gk20a *g, u32 data)
{
gk20a_writel(g, gr_fe_go_idle_timeout_r(), data);

View File

@@ -84,8 +84,6 @@ int gr_gv11b_handle_tpc_sm_ecc_exception(struct gk20a *g,
int gr_gv11b_handle_gcc_exception(struct gk20a *g, u32 gpc, u32 tpc,
bool *post_event, struct channel_gk20a *fault_ch,
u32 *hww_global_esr);
int gr_gv11b_handle_gpc_gpcmmu_exception(struct gk20a *g, u32 gpc,
u32 gpc_exception);
void gr_gv11b_enable_gpc_exceptions(struct gk20a *g);
int gr_gv11b_handle_sw_method(struct gk20a *g, u32 addr,
u32 class_num, u32 offset, u32 data);

View File

@@ -410,8 +410,6 @@ static const struct gpu_ops gv11b_ops = {
.set_ctxsw_preemption_mode = gr_gp10b_set_ctxsw_preemption_mode,
.is_etpc_addr = gv11b_gr_pri_is_etpc_addr,
.egpc_etpc_priv_addr_table = gv11b_gr_egpc_etpc_priv_addr_table,
.handle_gpc_gpcmmu_exception =
gr_gv11b_handle_gpc_gpcmmu_exception,
.get_egpc_base = gv11b_gr_get_egpc_base,
.get_egpc_etpc_num = gv11b_gr_get_egpc_etpc_num,
.access_smpc_reg = gv11b_gr_access_smpc_reg,
@@ -664,6 +662,8 @@ static const struct gpu_ops gv11b_ops = {
gv11b_gr_init_commit_gfxp_wfi_timeout,
},
.intr = {
.handle_gpc_gpcmmu_exception =
gv11b_gr_intr_handle_gpc_gpcmmu_exception,
.handle_gpc_gpccs_exception =
gv11b_gr_intr_handle_gpc_gpccs_exception,
.get_tpc_exception = gm20b_gr_intr_get_tpc_exception,

View File

@@ -31,6 +31,120 @@
#include <nvgpu/hw/gv11b/hw_gr_gv11b.h>
/*
 * gv11b_gr_intr_handle_gpc_gpcmmu_exception - service a GPCMMU (L1TLB)
 * ECC exception for one GPC.
 *
 * @g:               gk20a device pointer
 * @gpc:             index of the GPC whose exception is being handled
 * @gpc_exception:   GPC exception status word; acted on only when the
 *                   GPCMMU pending bit is set
 * @corrected_err:   caller-owned accumulator for the corrected ECC error
 *                   total (passed in so this HAL function does not
 *                   dereference g->ecc itself)
 * @uncorrected_err: caller-owned accumulator for the uncorrected ECC
 *                   error total
 *
 * Reads the per-GPC L1TLB ECC status, address, and error counters, clears
 * them in hardware, folds the counter deltas (with overflow compensation)
 * into the caller-supplied accumulators, and reports each flagged SA/FA
 * data error via nvgpu_gr_report_ecc_error().
 */
void gv11b_gr_intr_handle_gpc_gpcmmu_exception(struct gk20a *g, u32 gpc,
		u32 gpc_exception, u32 *corrected_err, u32 *uncorrected_err)
{
	/* Per-GPC register offset for the broadcast gpc0 register names. */
	u32 offset = nvgpu_gr_gpc_offset(g, gpc);
	u32 ecc_status, ecc_addr, corrected_cnt, uncorrected_cnt;
	u32 corrected_delta, uncorrected_delta;
	u32 corrected_overflow, uncorrected_overflow;
	u32 hww_esr;

	/* Only handle the exception when the GPCMMU pending bit is set. */
	if ((gpc_exception & gr_gpc0_gpccs_gpc_exception_gpcmmu_m()) == 0U) {
		return;
	}

	hww_esr = nvgpu_readl(g, gr_gpc0_mmu_gpcmmu_global_esr_r() + offset);

	/* Nothing to do unless a corrected or uncorrected ECC error is
	 * actually pending in the global ESR.
	 */
	if ((hww_esr & (gr_gpc0_mmu_gpcmmu_global_esr_ecc_corrected_m() |
			gr_gpc0_mmu_gpcmmu_global_esr_ecc_uncorrected_m())) == 0U) {
		return;
	}

	ecc_status = nvgpu_readl(g,
		gr_gpc0_mmu_l1tlb_ecc_status_r() + offset);
	ecc_addr = nvgpu_readl(g,
		gr_gpc0_mmu_l1tlb_ecc_address_r() + offset);
	corrected_cnt = nvgpu_readl(g,
		gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_r() + offset);
	uncorrected_cnt = nvgpu_readl(g,
		gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_r() + offset);

	corrected_delta = gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_total_v(
		corrected_cnt);
	uncorrected_delta =
		gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_total_v(
			uncorrected_cnt);
	corrected_overflow = ecc_status &
		gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_total_counter_overflow_m();
	uncorrected_overflow = ecc_status &
		gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_total_counter_overflow_m();

	/* clear the interrupt: zero a HW counter only when it advanced or
	 * overflowed, so errors arriving in between are not discarded.
	 */
	if ((corrected_delta > 0U) || (corrected_overflow != 0U)) {
		nvgpu_writel(g,
			gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_r() +
			offset, 0);
	}
	if ((uncorrected_delta > 0U) || (uncorrected_overflow != 0U)) {
		nvgpu_writel(g,
			gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_r() +
			offset, 0);
	}

	nvgpu_writel(g, gr_gpc0_mmu_l1tlb_ecc_status_r() + offset,
		gr_gpc0_mmu_l1tlb_ecc_status_reset_task_f());

	/* Handle overflow: add one full counter wrap per overflow bit so the
	 * accumulated totals stay monotonic.
	 */
	if (corrected_overflow != 0U) {
		corrected_delta +=
			BIT32(gr_gpc0_mmu_l1tlb_ecc_corrected_err_count_total_s());
	}
	if (uncorrected_overflow != 0U) {
		uncorrected_delta +=
			BIT32(gr_gpc0_mmu_l1tlb_ecc_uncorrected_err_count_total_s());
	}

	*corrected_err += corrected_delta;
	*uncorrected_err += uncorrected_delta;
	nvgpu_log(g, gpu_dbg_intr,
		"mmu l1tlb gpc:%d ecc interrupt intr: 0x%x", gpc, hww_esr);

	/* Report every error source (SA and FA data rams, corrected and
	 * uncorrected) flagged in the ECC status register.
	 */
	if ((ecc_status &
	     gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_l1tlb_sa_data_m()) !=
									0U) {
		nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_MMU, gpc, 0,
				GPU_MMU_L1TLB_SA_DATA_ECC_CORRECTED,
				0, (u32)*corrected_err);
		nvgpu_log(g, gpu_dbg_intr, "corrected ecc sa data error");
	}
	if ((ecc_status &
	     gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_sa_data_m()) !=
									0U) {
		nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_MMU, gpc, 0,
				GPU_MMU_L1TLB_SA_DATA_ECC_UNCORRECTED,
				0, (u32)*uncorrected_err);
		nvgpu_log(g, gpu_dbg_intr, "uncorrected ecc sa data error");
	}
	if ((ecc_status &
	     gr_gpc0_mmu_l1tlb_ecc_status_corrected_err_l1tlb_fa_data_m()) !=
									0U) {
		nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_MMU, gpc, 0,
				GPU_MMU_L1TLB_FA_DATA_ECC_CORRECTED,
				0, (u32)*corrected_err);
		nvgpu_log(g, gpu_dbg_intr, "corrected ecc fa data error");
	}
	if ((ecc_status &
	     gr_gpc0_mmu_l1tlb_ecc_status_uncorrected_err_l1tlb_fa_data_m()) !=
									0U) {
		nvgpu_gr_report_ecc_error(g, NVGPU_ERR_MODULE_MMU, gpc, 0,
				GPU_MMU_L1TLB_FA_DATA_ECC_UNCORRECTED,
				0, (u32)*uncorrected_err);
		nvgpu_log(g, gpu_dbg_intr, "uncorrected ecc fa data error");
	}

	if ((corrected_overflow != 0U) || (uncorrected_overflow != 0U)) {
		nvgpu_info(g, "mmu l1tlb ecc counter overflow!");
	}

	nvgpu_log(g, gpu_dbg_intr,
		"ecc error address: 0x%x", ecc_addr);
	nvgpu_log(g, gpu_dbg_intr,
		"ecc error count corrected: %d, uncorrected %d",
		(u32)*corrected_err, (u32)*uncorrected_err);
}
void gv11b_gr_intr_handle_gpc_gpccs_exception(struct gk20a *g, u32 gpc,
u32 gpc_exception, u32 *corrected_err, u32 *uncorrected_err)
{

View File

@@ -28,6 +28,8 @@
struct gk20a;
struct nvgpu_gr_config;
void gv11b_gr_intr_handle_gpc_gpcmmu_exception(struct gk20a *g, u32 gpc,
u32 gpc_exception, u32 *corrected_err, u32 *uncorrected_err);
void gv11b_gr_intr_handle_gpc_gpccs_exception(struct gk20a *g, u32 gpc,
u32 gpc_exception, u32 *corrected_err, u32 *uncorrected_err);
void gv11b_gr_intr_handle_tpc_mpc_exception(struct gk20a *g, u32 gpc, u32 tpc);

View File

@@ -374,8 +374,6 @@ struct gpu_ops {
int (*handle_gcc_exception)(struct gk20a *g, u32 gpc, u32 tpc,
bool *post_event, struct channel_gk20a *fault_ch,
u32 *hww_global_esr);
int (*handle_gpc_gpcmmu_exception)(struct gk20a *g, u32 gpc,
u32 gpc_exception);
int (*init_ecc)(struct gk20a *g);
u32 (*get_lrf_tex_ltc_dram_override)(struct gk20a *g);
int (*record_sm_error_state)(struct gk20a *g, u32 gpc, u32 tpc,
@@ -782,6 +780,9 @@ struct gpu_ops {
} init;
struct {
void (*handle_gpc_gpcmmu_exception)(struct gk20a *g,
u32 gpc, u32 gpc_exception,
u32 *corrected_err, u32 *uncorrected_err);
void (*handle_gpc_gpccs_exception)(struct gk20a *g,
u32 gpc, u32 gpc_exception,
u32 *corrected_err, u32 *uncorrected_err);

View File

@@ -479,8 +479,6 @@ static const struct gpu_ops tu104_ops = {
.set_ctxsw_preemption_mode = gr_gp10b_set_ctxsw_preemption_mode,
.is_etpc_addr = gv11b_gr_pri_is_etpc_addr,
.egpc_etpc_priv_addr_table = gv11b_gr_egpc_etpc_priv_addr_table,
.handle_gpc_gpcmmu_exception =
gr_gv11b_handle_gpc_gpcmmu_exception,
.get_egpc_base = gv11b_gr_get_egpc_base,
.get_egpc_etpc_num = gv11b_gr_get_egpc_etpc_num,
.access_smpc_reg = gv11b_gr_access_smpc_reg,
@@ -738,6 +736,8 @@ static const struct gpu_ops tu104_ops = {
gv11b_gr_init_commit_gfxp_wfi_timeout,
},
.intr = {
.handle_gpc_gpcmmu_exception =
gv11b_gr_intr_handle_gpc_gpcmmu_exception,
.handle_gpc_gpccs_exception =
gv11b_gr_intr_handle_gpc_gpccs_exception,
.get_tpc_exception = gm20b_gr_intr_get_tpc_exception,