gpu: nvgpu: Enable the reporting of errors for CE

Enable error reporting for the Copy Engine (CE) hardware module. These
errors are reported to the underlying safety service.

Jira NVGPU-1866

Change-Id: Ie183b01f288653978e156cfcfcf231cfcb5426c3
Signed-off-by: Rajesh Devaraj <rdevaraj@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/2022766
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
Rajesh Devaraj
2019-02-19 14:57:05 +05:30
committed by mobile promotions
parent 9ff9fec887
commit 9c10f2d595
4 changed files with 52 additions and 9 deletions

View File

@@ -1,7 +1,7 @@
/*
* Pascal GPU series Copy Engine.
*
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2011-2019, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -24,6 +24,7 @@
#include <nvgpu/io.h>
#include <nvgpu/gk20a.h>
#include <nvgpu/nvgpu_err.h>
#include "ce_gp10b.h"
@@ -52,10 +53,14 @@ void gp10b_ce_isr(struct gk20a *g, u32 inst_id, u32 pri_base)
/* clear blocking interrupts: they exhibit broken behavior */
if ((ce_intr & ce_intr_status_blockpipe_pending_f()) != 0U) {
nvgpu_report_ce_error(g, inst_id,
GPU_CE_BLOCK_PIPE, ce_intr);
clear_intr |= ce_blockpipe_isr(g, ce_intr);
}
if ((ce_intr & ce_intr_status_launcherr_pending_f()) != 0U) {
nvgpu_report_ce_error(g, inst_id,
GPU_CE_LAUNCH_ERROR, ce_intr);
clear_intr |= ce_launcherr_isr(g, ce_intr);
}
@@ -71,6 +76,8 @@ u32 gp10b_ce_nonstall_isr(struct gk20a *g, u32 inst_id, u32 pri_base)
nvgpu_log(g, gpu_dbg_intr, "ce nonstall isr %08x %08x\n", ce_intr, inst_id);
if ((ce_intr & ce_intr_status_nonblockpipe_pending_f()) != 0U) {
nvgpu_report_ce_error(g, inst_id,
GPU_CE_NONBLOCK_PIPE, ce_intr);
gk20a_writel(g, ce_intr_status_r(inst_id),
ce_intr_status_nonblockpipe_pending_f());
ops |= (GK20A_NONSTALL_OPS_WAKEUP_SEMAPHORE |

View File

@@ -1,7 +1,7 @@
/*
* Volta GPU series Copy Engine.
*
* Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -26,6 +26,7 @@
#include "nvgpu/log.h"
#include "nvgpu/bitops.h"
#include <nvgpu/gk20a.h>
#include <nvgpu/nvgpu_err.h>
#include "gp10b/ce_gp10b.h"
@@ -34,6 +35,23 @@
#include <nvgpu/hw/gv11b/hw_ce_gv11b.h>
#include <nvgpu/hw/gv11b/hw_top_gv11b.h>
/*
 * Forward a Copy Engine error to the chip-specific reporting hook.
 *
 * @g:        GPU driver context holding the ops table.
 * @inst:     CE instance the error was raised on.
 * @err_type: one of the GPU_CE_* error identifiers.
 * @status:   value passed through to the hook as the error status.
 *
 * Does nothing when no report_ce_err hook is installed. A non-zero
 * return from the hook is logged with nvgpu_err(); it is not propagated
 * to the caller.
 */
void nvgpu_report_ce_error(struct gk20a *g, u32 inst,
		u32 err_type, u32 status)
{
	int err;

	if (g->ops.ce2.err_ops.report_ce_err != NULL) {
		err = g->ops.ce2.err_ops.report_ce_err(g,
				NVGPU_ERR_MODULE_CE, inst, err_type, status);
		if (err != 0) {
			nvgpu_err(g, "Failed to report CE error: "
					"inst=%u, err_type=%u, status=%u",
					inst, err_type, status);
		}
	}
}
u32 gv11b_ce_get_num_pce(struct gk20a *g)
{
/* register contains a bitmask indicating which physical copy
@@ -60,6 +78,8 @@ void gv11b_ce_isr(struct gk20a *g, u32 inst_id, u32 pri_base)
* reset to get back to a working state.
*/
if ((ce_intr & ce_intr_status_invalid_config_pending_f()) != 0U) {
nvgpu_report_ce_error(g, inst_id,
GPU_CE_INVALID_CONFIG, ce_intr);
nvgpu_log(g, gpu_dbg_intr,
"ce: inst %d: invalid config", inst_id);
clear_intr |= ce_intr_status_invalid_config_reset_f();
@@ -71,6 +91,8 @@ void gv11b_ce_isr(struct gk20a *g, u32 inst_id, u32 pri_base)
* reset before operations can start again, if not the entire GPU.
*/
if ((ce_intr & ce_intr_status_mthd_buffer_fault_pending_f()) != 0U) {
nvgpu_report_ce_error(g, inst_id,
GPU_CE_METHOD_BUFFER_FAULT, ce_intr);
nvgpu_log(g, gpu_dbg_intr,
"ce: inst %d: mthd buffer fault", inst_id);
clear_intr |= ce_intr_status_mthd_buffer_fault_reset_f();

View File

@@ -245,6 +245,11 @@ struct gpu_ops {
void (*isr_stall)(struct gk20a *g, u32 inst_id, u32 pri_base);
u32 (*isr_nonstall)(struct gk20a *g, u32 inst_id, u32 pri_base);
u32 (*get_num_pce)(struct gk20a *g);
/*
 * Error-reporting hooks for the copy engine.
 *
 * report_ce_err is optional: nvgpu_report_ce_error() checks it against
 * NULL before calling. It is invoked with hw_id set to
 * NVGPU_ERR_MODULE_CE, and a non-zero return value is treated as a
 * failure to report the error.
 */
struct {
int (*report_ce_err)(struct gk20a *g,
u32 hw_id, u32 inst, u32 err_id,
u32 status);
} err_ops;
} ce2;
struct {
u32 (*get_patch_slots)(struct gk20a *g);
@@ -2055,10 +2060,4 @@ void gk20a_put(struct gk20a *g);
bool nvgpu_has_syncpoints(struct gk20a *g);
void nvgpu_report_host_error(struct gk20a *g,
u32 inst, u32 err_id, u32 intr_info);
void nvgpu_report_gr_exception(struct gk20a *g, u32 inst,
u32 err_type, u32 status);
#endif /* GK20A_H */

View File

@@ -33,7 +33,8 @@
#define NVGPU_ERR_MODULE_PGRAPH 7U
#define NVGPU_ERR_MODULE_LTC 8U
#define NVGPU_ERR_MODULE_HUBMMU 9U
#define NVGPU_ERR_MODULE_INVALID 10U
#define NVGPU_ERR_MODULE_CE 11U
#define NVGPU_ERR_MODULE_INVALID 12U
#define GPU_HOST_PFIFO_BIND_ERROR 0U
#define GPU_HOST_PFIFO_SCHED_ERROR 1U
@@ -136,4 +137,18 @@
#define GPU_HUBMMU_PDE0_DATA_ECC_UNCORRECTED 7U
#define GPU_HUBMMU_INVALID_ERROR 8U
/*
 * Copy Engine error IDs, passed as the err_type argument of
 * nvgpu_report_ce_error() by the CE interrupt handlers.
 */
#define GPU_CE_LAUNCH_ERROR 0U
#define GPU_CE_BLOCK_PIPE 1U
#define GPU_CE_NONBLOCK_PIPE 2U
#define GPU_CE_INVALID_CONFIG 3U
#define GPU_CE_METHOD_BUFFER_FAULT 4U
void nvgpu_report_host_error(struct gk20a *g,
u32 inst, u32 err_id, u32 intr_info);
void nvgpu_report_gr_exception(struct gk20a *g, u32 inst,
u32 err_type, u32 status);
/*
 * Report a Copy Engine error through g->ops.ce2.err_ops.report_ce_err.
 * inst is the CE instance, err_type one of the GPU_CE_* IDs, and status
 * is passed through to the hook. No-op when the hook is not set.
 */
void nvgpu_report_ce_error(struct gk20a *g, u32 inst,
u32 err_type, u32 status);
#endif