From 9c10f2d59593db7adb2c05a4b3a87ed5dfb55011 Mon Sep 17 00:00:00 2001 From: Rajesh Devaraj Date: Tue, 19 Feb 2019 14:57:05 +0530 Subject: [PATCH] gpu: nvgpu: Enable the reporting of errors for CE Enable the reporting of errors on hw module Copy Engine. These errors will be notified to the underlying safety service. Jira NVGPU-1866 Change-Id: Ie183b01f288653978e156cfcfcf231cfcb5426c3 Signed-off-by: Rajesh Devaraj Reviewed-on: https://git-master.nvidia.com/r/2022766 Reviewed-by: mobile promotions Tested-by: mobile promotions --- drivers/gpu/nvgpu/gp10b/ce_gp10b.c | 9 +++++++- drivers/gpu/nvgpu/gv11b/ce_gv11b.c | 24 ++++++++++++++++++++- drivers/gpu/nvgpu/include/nvgpu/gk20a.h | 11 +++++----- drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h | 17 ++++++++++++++- 4 files changed, 52 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/nvgpu/gp10b/ce_gp10b.c b/drivers/gpu/nvgpu/gp10b/ce_gp10b.c index 14c9eb2dd..d2993e487 100644 --- a/drivers/gpu/nvgpu/gp10b/ce_gp10b.c +++ b/drivers/gpu/nvgpu/gp10b/ce_gp10b.c @@ -1,7 +1,7 @@ /* * Pascal GPU series Copy Engine. * - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2019, NVIDIA CORPORATION. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -24,6 +24,7 @@ #include #include +#include #include "ce_gp10b.h" @@ -52,10 +53,14 @@ void gp10b_ce_isr(struct gk20a *g, u32 inst_id, u32 pri_base) /* clear blocking interrupts: they exibit broken behavior */ if ((ce_intr & ce_intr_status_blockpipe_pending_f()) != 0U) { + nvgpu_report_ce_error(g, inst_id, + GPU_CE_BLOCK_PIPE, ce_intr); clear_intr |= ce_blockpipe_isr(g, ce_intr); } if ((ce_intr & ce_intr_status_launcherr_pending_f()) != 0U) { + nvgpu_report_ce_error(g, inst_id, + GPU_CE_LAUNCH_ERROR, ce_intr); clear_intr |= ce_launcherr_isr(g, ce_intr); } @@ -71,6 +76,8 @@ u32 gp10b_ce_nonstall_isr(struct gk20a *g, u32 inst_id, u32 pri_base) nvgpu_log(g, gpu_dbg_intr, "ce nonstall isr %08x %08x\n", ce_intr, inst_id); if ((ce_intr & ce_intr_status_nonblockpipe_pending_f()) != 0U) { + nvgpu_report_ce_error(g, inst_id, + GPU_CE_NONBLOCK_PIPE, ce_intr); gk20a_writel(g, ce_intr_status_r(inst_id), ce_intr_status_nonblockpipe_pending_f()); ops |= (GK20A_NONSTALL_OPS_WAKEUP_SEMAPHORE | diff --git a/drivers/gpu/nvgpu/gv11b/ce_gv11b.c b/drivers/gpu/nvgpu/gv11b/ce_gv11b.c index a960beea2..f12100a8f 100644 --- a/drivers/gpu/nvgpu/gv11b/ce_gv11b.c +++ b/drivers/gpu/nvgpu/gv11b/ce_gv11b.c @@ -1,7 +1,7 @@ /* * Volta GPU series Copy Engine. * - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -26,6 +26,7 @@ #include "nvgpu/log.h" #include "nvgpu/bitops.h" #include +#include #include "gp10b/ce_gp10b.h" @@ -34,6 +35,23 @@ #include #include +void nvgpu_report_ce_error(struct gk20a *g, u32 inst, + u32 err_type, u32 status) +{ + int ret = 0; + + if (g->ops.ce2.err_ops.report_ce_err == NULL) { + return; + } + ret = g->ops.ce2.err_ops.report_ce_err(g, + NVGPU_ERR_MODULE_CE, inst, err_type, status); + if (ret != 0) { + nvgpu_err(g, "Failed to report CE error: " + "inst=%u, err_type=%u, status=%u", + inst, err_type, status); + } +} + u32 gv11b_ce_get_num_pce(struct gk20a *g) { /* register contains a bitmask indicating which physical copy @@ -60,6 +78,8 @@ void gv11b_ce_isr(struct gk20a *g, u32 inst_id, u32 pri_base) * reset to get back to a working state. */ if ((ce_intr & ce_intr_status_invalid_config_pending_f()) != 0U) { + nvgpu_report_ce_error(g, inst_id, + GPU_CE_INVALID_CONFIG, ce_intr); nvgpu_log(g, gpu_dbg_intr, "ce: inst %d: invalid config", inst_id); clear_intr |= ce_intr_status_invalid_config_reset_f(); @@ -71,6 +91,8 @@ void gv11b_ce_isr(struct gk20a *g, u32 inst_id, u32 pri_base) * reset before operations can start again, if not the entire GPU. */ if ((ce_intr & ce_intr_status_mthd_buffer_fault_pending_f()) != 0U) { + nvgpu_report_ce_error(g, inst_id, + GPU_CE_METHOD_BUFFER_FAULT, ce_intr); nvgpu_log(g, gpu_dbg_intr, "ce: inst %d: mthd buffer fault", inst_id); clear_intr |= ce_intr_status_mthd_buffer_fault_reset_f(); diff --git a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h index f2dc2518f..57e59ff66 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h +++ b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h @@ -245,6 +245,11 @@ struct gpu_ops { void (*isr_stall)(struct gk20a *g, u32 inst_id, u32 pri_base); u32 (*isr_nonstall)(struct gk20a *g, u32 inst_id, u32 pri_base); u32 (*get_num_pce)(struct gk20a *g); + struct { + int (*report_ce_err)(struct gk20a *g, + u32 hw_id, u32 inst, u32 err_id, + u32 status); + } err_ops; } ce2; struct { u32 (*get_patch_slots)(struct gk20a *g); @@ -2055,10 +2060,4 @@ void gk20a_put(struct gk20a *g); bool nvgpu_has_syncpoints(struct gk20a *g); -void nvgpu_report_host_error(struct gk20a *g, - u32 inst, u32 err_id, u32 intr_info); - -void nvgpu_report_gr_exception(struct gk20a *g, u32 inst, - u32 err_type, u32 status); - #endif /* GK20A_H */ diff --git a/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h index 234db44b1..81f72d80c 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h +++ b/drivers/gpu/nvgpu/include/nvgpu/nvgpu_err.h @@ -33,7 +33,8 @@ #define NVGPU_ERR_MODULE_PGRAPH 7U #define NVGPU_ERR_MODULE_LTC 8U #define NVGPU_ERR_MODULE_HUBMMU 9U -#define NVGPU_ERR_MODULE_INVALID 10U +#define NVGPU_ERR_MODULE_CE 11U +#define NVGPU_ERR_MODULE_INVALID 12U #define GPU_HOST_PFIFO_BIND_ERROR 0U #define GPU_HOST_PFIFO_SCHED_ERROR 1U @@ -136,4 +137,18 @@ #define GPU_HUBMMU_PDE0_DATA_ECC_UNCORRECTED 7U #define GPU_HUBMMU_INVALID_ERROR 8U +#define GPU_CE_LAUNCH_ERROR 0U +#define GPU_CE_BLOCK_PIPE 1U +#define GPU_CE_NONBLOCK_PIPE 2U +#define GPU_CE_INVALID_CONFIG 3U +#define GPU_CE_METHOD_BUFFER_FAULT 4U + +void nvgpu_report_host_error(struct gk20a *g, + u32 inst, u32 err_id, u32 intr_info); + +void nvgpu_report_gr_exception(struct gk20a *g, u32 inst, + u32 err_type, u32 status); + +void nvgpu_report_ce_error(struct gk20a *g, u32 inst, + u32 err_type, u32 status); #endif