gpu: nvgpu: Add CE interrupt handling

a. LAUNCH_ERR
    - Userspace error.
    - Triggered due to faulty launch.
    - Handle using recovery to reset CE engine and teardown the
      faulty channel.

b. An INVALID_CONFIG -
    - Triggered when LCE is mapped to floorswept PCE.
    - On iGPU, we use the default PCE-to-LCE HW mapping.
      The default mapping can be read from the NV_CE_PCE2LCE_CONFIG
      INIT value in the CE reference manual.
    - NvGPU driver configures the mapping on dGPUs (currently only on
      Turing).
    - So, this interrupt can only be triggered by a
      kernel or HW error.
    - Recovery (which is killing the context + engine reset) will
      not help resolve this error.
    - Trigger Quiesce as part of handling.

c. A MTHD_BUFFER_FAULT -
    - NvGPU driver allocates fault buffers for all TSGs or contexts,
      maps them in BAR2 VA space and writes the VA into channel
      instance block.
    - Can be triggered only due to kernel bug
    - Recovery will not help, need quiesce

d. FBUF_CRC_FAIL
    - Triggered when the CRC entry read from the method fault buffer
      does not match the computed CRC from the methods contained in
      the buffer.
    - This indicates memory corruption and is a fatal interrupt which
      at least requires the LCE to be reset before operations can
      start again, if not the entire GPU.
    - Better to quiesce on memory corruption
      CE Engine reset (via recovery) will not help.

e. FBUF_MAGIC_CHK_FAIL
    - Triggered when the MAGIC_NUM entry read from the method fault
      buf does not match NV_CE_MTHD_BUFFER_GLOBAL_HDR_MAGIC_NUM_VAL
    - This indicates memory corruption and is a fatal interrupt
    - Better to quiesce on memory corruption

f. STALLING_DEBUG
    - Only triggered with SW write for debug purposes
    - Debug interrupt, currently ignored

Move launch error handling from GP10b to GV11b HAL as -
1. LAUNCHERR_REPORT errcode METHOD_BUFFER_ACCESS_FAULT is not
   defined on Pascal
2. We do not support GP10b on dev-main ToT

JIRA NVGPU-8102

Change-Id: Idc84119bc23b5e85f3479fe62cc8720e98b627a5
Signed-off-by: Tejal Kudav <tkudav@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2678893
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
Tejal Kudav
2022-03-09 12:40:14 +00:00
committed by mobile promotions
parent 15739c52e9
commit b80b2bdab8
35 changed files with 246 additions and 144 deletions

View File

@@ -28,6 +28,8 @@
#include <nvgpu/power_features/cg.h>
#include <nvgpu/cic_mon.h>
#include <nvgpu/mc.h>
#include <nvgpu/rc.h>
#include <nvgpu/nvgpu_init.h>
int nvgpu_ce_init_support(struct gk20a *g)
{
@@ -83,3 +85,24 @@ int nvgpu_ce_init_support(struct gk20a *g)
return 0;
}
/*
 * Common entry point for CE stalling interrupts.
 *
 * The chip-specific HAL decodes and clears the pending interrupt bits and
 * reports back which error-handling policy applies: SW quiesce for fatal
 * kernel/HW errors, recovery for userspace-triggered faults.
 */
void nvgpu_ce_stall_isr(struct gk20a *g, u32 inst_id, u32 pri_base)
{
bool do_recovery = false;
bool do_quiesce = false;

if (g->ops.ce.isr_stall != NULL) {
g->ops.ce.isr_stall(g, inst_id, pri_base,
&do_recovery, &do_quiesce);
}

/* Fatal errors: put the driver into SW quiesce first. */
if (do_quiesce) {
nvgpu_sw_quiesce(g);
}

/* Userspace faults: reset the CE engine and tear down the channel. */
if (do_recovery) {
nvgpu_log(g, gpu_dbg_intr,
"Recovery needed to handle CE interrupt.");
nvgpu_rc_ce_fault(g, inst_id);
}
}

View File

@@ -224,6 +224,42 @@ void nvgpu_rc_gr_fault(struct gk20a *g, struct nvgpu_tsg *tsg,
nvgpu_log(g, gpu_dbg_gr, "done");
}
/*
 * Recover from a CE fault on LCE instance @inst_id.
 *
 * Looks up the channel bound to the faulting LCE via its instance pointer,
 * resolves the owning TSG, posts an error notifier to it, and (when
 * CONFIG_NVGPU_RECOVERY is enabled) triggers TSG + engine recovery.
 * Returns silently if no channel is bound to the LCE.
 */
void nvgpu_rc_ce_fault(struct gk20a *g, u32 inst_id)
{
struct nvgpu_channel *ch = NULL;
struct nvgpu_tsg *tsg = NULL;
u32 chid = NVGPU_INVALID_CHANNEL_ID;
u64 inst_ptr = 0U;
/* Read the context instance pointer from the LCE bind status. */
if (g->ops.ce.get_inst_ptr_from_lce != NULL) {
inst_ptr = g->ops.ce.get_inst_ptr_from_lce(g,
inst_id);
}
/* refch will be put back before recovery */
ch = nvgpu_channel_refch_from_inst_ptr(g, inst_ptr);
if (ch == NULL) {
/* No channel bound to this LCE; nothing to recover. */
return;
} else {
chid = ch->chid;
/* Drop the ref now; recovery takes its own references.
 * NOTE(review): ch is still read below via nvgpu_tsg_from_ch()
 * after the put — presumably safe because channel structs are
 * not freed here; confirm against channel lifetime rules.
 */
nvgpu_channel_put(ch);
tsg = nvgpu_tsg_from_ch(ch);
if (tsg == NULL) {
nvgpu_err(g, "channel_id: %d not bound to tsg",
chid);
/* ToDo: Trigger Quiesce? */
return;
}
/* Let userspace know the channel hit a CE error. */
nvgpu_tsg_set_error_notifier(g, tsg, NVGPU_ERR_NOTIFIER_CE_ERROR);
}
#ifdef CONFIG_NVGPU_RECOVERY
nvgpu_rc_tsg_and_related_engines(g, tsg, true,
RC_TYPE_CE_FAULT);
#else
/* Without recovery support, a quiesce should already be pending. */
WARN_ON(!g->sw_quiesce_pending);
(void)tsg;
#endif
}
void nvgpu_rc_sched_error_bad_tsg(struct gk20a *g)
{
#ifdef CONFIG_NVGPU_RECOVERY

View File

@@ -38,7 +38,8 @@
#include <nvgpu/hw/gk20a/hw_ce2_gk20a.h>
void gk20a_ce2_stall_isr(struct gk20a *g, u32 inst_id, u32 pri_base)
void gk20a_ce2_stall_isr(struct gk20a *g, u32 inst_id, u32 pri_base,
bool *needs_rc, bool *needs_quiesce)
{
u32 ce2_intr = nvgpu_readl(g, ce2_intr_status_r());
u32 clear_intr = 0U;
@@ -55,9 +56,11 @@ void gk20a_ce2_stall_isr(struct gk20a *g, u32 inst_id, u32 pri_base)
}
if ((ce2_intr & ce2_intr_status_launcherr_pending_f()) != 0U) {
nvgpu_log(g, gpu_dbg_intr, "ce2 launch error interrupt");
*needs_rc |= true;
clear_intr |= ce2_intr_status_launcherr_pending_f();
}
*needs_quiesce |= false;
nvgpu_writel(g, ce2_intr_status_r(), clear_intr);
}

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2011-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -26,7 +26,8 @@
struct gk20a;
void gk20a_ce2_stall_isr(struct gk20a *g, u32 inst_id, u32 pri_base);
void gk20a_ce2_stall_isr(struct gk20a *g, u32 inst_id, u32 pri_base,
bool *needs_rc, bool *needs_quiesce);
u32 gk20a_ce2_nonstall_isr(struct gk20a *g, u32 inst_id, u32 pri_base);
#endif /*NVGPU_CE2_GK20A_H*/

View File

@@ -29,7 +29,8 @@ struct gk20a;
void ga10b_ce_init_hw(struct gk20a *g);
#endif
void ga10b_ce_intr_enable(struct gk20a *g, bool enable);
void ga10b_ce_stall_isr(struct gk20a *g, u32 inst_id, u32 pri_base);
void ga10b_ce_stall_isr(struct gk20a *g, u32 inst_id, u32 pri_base,
bool *needs_rc, bool *needs_quiesce);
void ga10b_ce_intr_retrigger(struct gk20a *g, u32 inst_id);
void ga10b_ce_request_idle(struct gk20a *g);

View File

@@ -184,7 +184,8 @@ void ga10b_ce_intr_enable(struct gk20a *g, bool enable)
}
}
void ga10b_ce_stall_isr(struct gk20a *g, u32 inst_id, u32 pri_base)
void ga10b_ce_stall_isr(struct gk20a *g, u32 inst_id, u32 pri_base,
bool *needs_rc, bool *needs_quiesce)
{
u32 ce_intr = nvgpu_readl(g, ce_intr_status_r(inst_id));
u32 clear_intr = 0U;
@@ -199,6 +200,7 @@ void ga10b_ce_stall_isr(struct gk20a *g, u32 inst_id, u32 pri_base)
*/
if ((ce_intr & ce_intr_status_fbuf_crc_fail_pending_f()) != 0U) {
nvgpu_err(g, "ce: inst %d, fault buffer crc mismatch", inst_id);
*needs_quiesce |= true;
clear_intr |= ce_intr_status_fbuf_crc_fail_reset_f();
}
@@ -210,6 +212,7 @@ void ga10b_ce_stall_isr(struct gk20a *g, u32 inst_id, u32 pri_base)
if ((ce_intr & ce_intr_status_fbuf_magic_chk_fail_pending_f()) != 0U) {
nvgpu_err(g, "ce: inst %d, fault buffer magic check fail",
inst_id);
*needs_quiesce |= true;
clear_intr |= ce_intr_status_fbuf_magic_chk_fail_reset_f();
}
@@ -229,7 +232,7 @@ void ga10b_ce_stall_isr(struct gk20a *g, u32 inst_id, u32 pri_base)
* The remaining legacy interrupts are handled by legacy interrupt
* handler.
*/
gv11b_ce_stall_isr(g, inst_id, pri_base);
gv11b_ce_stall_isr(g, inst_id, pri_base, needs_rc, needs_quiesce);
}
void ga10b_ce_intr_retrigger(struct gk20a *g, u32 inst_id)

View File

@@ -28,7 +28,8 @@
struct gk20a;
void gp10b_ce_stall_isr(struct gk20a *g, u32 inst_id, u32 pri_base);
void gp10b_ce_stall_isr(struct gk20a *g, u32 inst_id, u32 pri_base,
bool *needs_rc, bool *needs_quiesce);
#ifdef CONFIG_NVGPU_NONSTALL_INTR
u32 gp10b_ce_nonstall_isr(struct gk20a *g, u32 inst_id, u32 pri_base);
#endif /* NVGPU_HAL_NON_FUSA */

View File

@@ -32,7 +32,8 @@
#include <nvgpu/hw/gp10b/hw_ce_gp10b.h>
void gp10b_ce_stall_isr(struct gk20a *g, u32 inst_id, u32 pri_base)
void gp10b_ce_stall_isr(struct gk20a *g, u32 inst_id, u32 pri_base,
bool *needs_rc, bool *needs_quiesce)
{
u32 ce_intr = nvgpu_readl(g, ce_intr_status_r(inst_id));
u32 clear_intr = 0U;
@@ -47,13 +48,8 @@ void gp10b_ce_stall_isr(struct gk20a *g, u32 inst_id, u32 pri_base)
clear_intr |= ce_intr_status_blockpipe_pending_f();
}
if ((ce_intr & ce_intr_status_launcherr_pending_f()) != 0U) {
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_CE,
GPU_CE_LAUNCH_ERROR);
nvgpu_err(g, "ce launch error interrupt");
clear_intr |= ce_intr_status_launcherr_pending_f();
}
*needs_quiesce |= false;
*needs_rc |= false;
nvgpu_writel(g, ce_intr_status_r(inst_id), clear_intr);
return;
}

View File

@@ -28,9 +28,13 @@
struct gk20a;
struct nvgpu_device;
#ifdef CONFIG_NVGPU_HAL_NON_FUSA
void gv11b_ce_mthd_buffer_fault_in_bar2_fault(struct gk20a *g);
#endif
u32 gv11b_ce_get_num_pce(struct gk20a *g);
void gv11b_ce_stall_isr(struct gk20a *g, u32 inst_id, u32 pri_base);
void gv11b_ce_stall_isr(struct gk20a *g, u32 inst_id, u32 pri_base,
bool *needs_rc, bool *needs_quiesce);
void gv11b_ce_init_prod_values(struct gk20a *g);
void gv11b_ce_halt_engine(struct gk20a *g, const struct nvgpu_device *dev);
u64 gv11b_ce_get_inst_ptr_from_lce(struct gk20a *g, u32 inst_id);
#endif /* NVGPU_CE_GV11B_H */

View File

@@ -28,6 +28,7 @@
#include <nvgpu/device.h>
#include <nvgpu/gk20a.h>
#include <nvgpu/nvgpu_err.h>
#include <nvgpu/nvgpu_init.h>
#include "ce_gp10b.h"
#include "ce_gv11b.h"
@@ -48,13 +49,40 @@ u32 gv11b_ce_get_num_pce(struct gk20a *g)
return num_pce;
}
void gv11b_ce_stall_isr(struct gk20a *g, u32 inst_id, u32 pri_base)
void gv11b_ce_stall_isr(struct gk20a *g, u32 inst_id, u32 pri_base,
bool *needs_rc, bool *needs_quiesce)
{
u32 ce_intr = nvgpu_readl(g, ce_intr_status_r(inst_id));
u32 clear_intr = 0U;
u32 reg_val;
u32 err_code;
nvgpu_log(g, gpu_dbg_intr, "ce isr 0x%08x 0x%08x", ce_intr, inst_id);
if ((ce_intr & ce_intr_status_launcherr_pending_f()) != 0U) {
nvgpu_err(g, "ce launch error interrupt");
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_CE,
GPU_CE_LAUNCH_ERROR);
/* INVALID_CONFIG and METHOD_BUFFER_FAULT may still be
* reported via LAUNCHERR bit, but using different
* error code. Check the LAUNCHERR errorcode to
* check if above two interrupts are routed to
* LAUNCHERR bit and handle as per error handling
* policy.
*/
reg_val = nvgpu_readl(g, ce_lce_launcherr_r(inst_id));
err_code = ce_lce_launcherr_report_v(reg_val);
nvgpu_err(g, "ce launch error interrupt with errcode:0x%x", err_code);
if ((err_code == ce_lce_launcherr_report_method_buffer_access_fault_v()) ||
(err_code == ce_lce_launcherr_report_invalid_config_v())) {
*needs_quiesce |= true;
} else {
*needs_rc |= true;
}
clear_intr |= ce_intr_status_launcherr_pending_f();
}
#ifdef CONFIG_NVGPU_HAL_NON_FUSA
/*
* An INVALID_CONFIG interrupt will be generated if a floorswept
@@ -66,6 +94,7 @@ void gv11b_ce_stall_isr(struct gk20a *g, u32 inst_id, u32 pri_base)
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_CE,
GPU_CE_INVALID_CONFIG);
nvgpu_err(g, "ce: inst %d: invalid config", inst_id);
*needs_quiesce |= true;
clear_intr |= ce_intr_status_invalid_config_reset_f();
}
@@ -79,15 +108,17 @@ void gv11b_ce_stall_isr(struct gk20a *g, u32 inst_id, u32 pri_base)
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_CE,
GPU_CE_METHOD_BUFFER_FAULT);
nvgpu_err(g, "ce: inst %d: mthd buffer fault", inst_id);
*needs_quiesce |= true;
clear_intr |= ce_intr_status_mthd_buffer_fault_reset_f();
}
#endif
nvgpu_writel(g, ce_intr_status_r(inst_id), clear_intr);
gp10b_ce_stall_isr(g, inst_id, pri_base);
gp10b_ce_stall_isr(g, inst_id, pri_base, needs_rc, needs_quiesce);
}
#ifdef CONFIG_NVGPU_HAL_NON_FUSA
void gv11b_ce_mthd_buffer_fault_in_bar2_fault(struct gk20a *g)
{
u32 reg_val, num_lce, lce, clear_intr;
@@ -98,13 +129,19 @@ void gv11b_ce_mthd_buffer_fault_in_bar2_fault(struct gk20a *g)
reg_val = nvgpu_readl(g, ce_intr_status_r(lce));
if ((reg_val &
ce_intr_status_mthd_buffer_fault_pending_f()) != 0U) {
nvgpu_log(g, gpu_dbg_intr,
"ce: lce %d: mthd buffer fault", lce);
nvgpu_err(g, "ce: lce %d: mthd buffer fault", lce);
nvgpu_report_err_to_sdl(g, NVGPU_ERR_MODULE_CE,
GPU_CE_METHOD_BUFFER_FAULT);
/* This is a fatal interrupt as it implies a kernel bug.
* Needs quiesce.
*/
nvgpu_sw_quiesce(g);
clear_intr = ce_intr_status_mthd_buffer_fault_reset_f();
nvgpu_writel(g, ce_intr_status_r(lce), clear_intr);
}
}
}
#endif
void gv11b_ce_init_prod_values(struct gk20a *g)
{
@@ -133,4 +170,20 @@ void gv11b_ce_halt_engine(struct gk20a *g, const struct nvgpu_device *dev)
nvgpu_err(g, "The CE engine %u is not idle"
"while reset", dev->inst_id);
}
}
/*
 * Return the context instance pointer currently bound to LCE @inst_id,
 * or 0 if the LCE has never been bound.
 */
u64 gv11b_ce_get_inst_ptr_from_lce(struct gk20a *g, u32 inst_id)
{
u32 bind_status = nvgpu_readl(g, ce_lce_bind_status_r(inst_id));

if (ce_lce_bind_status_bound_v(bind_status) ==
ce_lce_bind_status_bound_false_v()) {
/* CE appears to have never been bound -- ignore */
return 0U;
}

return ((u64)ce_lce_bind_status_ctx_ptr_v(bind_status)) <<
g->ops.ramin.base_shift();
}

View File

@@ -469,8 +469,9 @@ void gv11b_fb_handle_bar2_fault(struct gk20a *g,
}
}
#endif
#ifdef CONFIG_NVGPU_HAL_NON_FUSA
g->ops.ce.mthd_buffer_fault_in_bar2_fault(g);
#endif
err = g->ops.bus.bar2_bind(g, &g->mm.bar2.inst_block);
if (err != 0) {
nvgpu_err(g, "bar2_bind failed!");

View File

@@ -427,10 +427,13 @@ static const struct gops_ce ga100_ops_ce = {
.init_hw = NULL,
#endif
.get_num_pce = gv11b_ce_get_num_pce,
#ifdef CONFIG_NVGPU_HAL_NON_FUSA
.mthd_buffer_fault_in_bar2_fault = gv11b_ce_mthd_buffer_fault_in_bar2_fault,
#endif
.init_prod_values = gv11b_ce_init_prod_values,
.halt_engine = gv11b_ce_halt_engine,
.request_idle = NULL,
.get_inst_ptr_from_lce = gv11b_ce_get_inst_ptr_from_lce,
};
static const struct gops_gr_ecc ga100_ops_gr_ecc = {

View File

@@ -401,10 +401,13 @@ static const struct gops_ce ga10b_ops_ce = {
.init_hw = ga10b_ce_init_hw,
#endif
.get_num_pce = gv11b_ce_get_num_pce,
#ifdef CONFIG_NVGPU_HAL_NON_FUSA
.mthd_buffer_fault_in_bar2_fault = gv11b_ce_mthd_buffer_fault_in_bar2_fault,
#endif
.init_prod_values = gv11b_ce_init_prod_values,
.halt_engine = gv11b_ce_halt_engine,
.request_idle = ga10b_ce_request_idle,
.get_inst_ptr_from_lce = gv11b_ce_get_inst_ptr_from_lce,
};
static const struct gops_gr_ecc ga10b_ops_gr_ecc = {

View File

@@ -314,10 +314,13 @@ static const struct gops_ce gv11b_ops_ce = {
.isr_nonstall = gp10b_ce_nonstall_isr,
#endif
.get_num_pce = gv11b_ce_get_num_pce,
#ifdef CONFIG_NVGPU_HAL_NON_FUSA
.mthd_buffer_fault_in_bar2_fault = gv11b_ce_mthd_buffer_fault_in_bar2_fault,
#endif
.init_prod_values = gv11b_ce_init_prod_values,
.halt_engine = gv11b_ce_halt_engine,
.request_idle = NULL,
.get_inst_ptr_from_lce = gv11b_ce_get_inst_ptr_from_lce,
};
static const struct gops_gr_ecc gv11b_ops_gr_ecc = {

View File

@@ -363,10 +363,13 @@ static const struct gops_ce tu104_ops_ce = {
.isr_nonstall = NULL,
#endif
.get_num_pce = gv11b_ce_get_num_pce,
#ifdef CONFIG_NVGPU_HAL_NON_FUSA
.mthd_buffer_fault_in_bar2_fault = gv11b_ce_mthd_buffer_fault_in_bar2_fault,
#endif
.init_prod_values = gv11b_ce_init_prod_values,
.halt_engine = gv11b_ce_halt_engine,
.request_idle = NULL,
.get_inst_ptr_from_lce = gv11b_ce_get_inst_ptr_from_lce,
};
static const struct gops_gr_ecc tu104_ops_gr_ecc = {

View File

@@ -1,7 +1,7 @@
/*
* GM20B Master Control
*
* Copyright (c) 2014-2021, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2014-2022, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -33,6 +33,7 @@
#include <nvgpu/engines.h>
#include <nvgpu/device.h>
#include <nvgpu/power_features/pg.h>
#include <nvgpu/ce.h>
#include "mc_gm20b.h"
@@ -62,9 +63,8 @@ void gm20b_mc_isr_stall(struct gk20a *g)
}
/* CE Engine */
if (nvgpu_device_is_ce(g, dev) &&
(g->ops.ce.isr_stall != NULL)) {
g->ops.ce.isr_stall(g, dev->inst_id, dev->pri_base);
if (nvgpu_device_is_ce(g, dev)) {
nvgpu_ce_stall_isr(g, dev->inst_id, dev->pri_base);
}
}

View File

@@ -29,6 +29,7 @@
#include <nvgpu/engines.h>
#include <nvgpu/device.h>
#include <nvgpu/power_features/pg.h>
#include <nvgpu/ce.h>
#include "mc_gp10b.h"
@@ -135,9 +136,8 @@ void mc_gp10b_isr_stall_engine(struct gk20a *g,
}
/* CE Engine */
if (nvgpu_device_is_ce(g, dev) &&
(g->ops.ce.isr_stall != NULL)) {
g->ops.ce.isr_stall(g, dev->inst_id, dev->pri_base);
if (nvgpu_device_is_ce(g, dev)) {
nvgpu_ce_stall_isr(g, dev->inst_id, dev->pri_base);
}
}

View File

@@ -30,6 +30,7 @@
#include <nvgpu/cic_mon.h>
#include <nvgpu/power_features/pg.h>
#include <nvgpu/gr/gr_instances.h>
#include <nvgpu/ce.h>
#ifdef CONFIG_NVGPU_GSP_SCHEDULER
#include <nvgpu/gsp.h>
#endif
@@ -869,13 +870,8 @@ static void ga10b_intr_isr_stall_host2soc_3(struct gk20a *g)
if ((unit_subtree_mask & engine_intr_mask) == 0ULL) {
continue;
}
if (g->ops.ce.isr_stall != NULL) {
g->ops.ce.isr_stall(g,
dev->inst_id,
dev->pri_base);
} else {
nvgpu_err(g, "unhandled intr_unit_ce_stall");
}
nvgpu_ce_stall_isr(g, dev->inst_id, dev->pri_base);
g->ops.ce.intr_retrigger(g, dev->inst_id);
}

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -83,4 +83,5 @@ struct gk20a;
*/
int nvgpu_ce_init_support(struct gk20a *g);
void nvgpu_ce_stall_isr(struct gk20a *g, u32 inst_id, u32 pri_base);
#endif /*NVGPU_CE_H*/

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -39,6 +39,7 @@ enum {
NVGPU_ERR_NOTIFIER_FECS_ERR_UNIMP_FIRMWARE_METHOD,
NVGPU_ERR_NOTIFIER_RESETCHANNEL_VERIF_ERROR,
NVGPU_ERR_NOTIFIER_PBDMA_PUSHBUFFER_CRC_MISMATCH,
NVGPU_ERR_NOTIFIER_CE_ERROR,
};
void nvgpu_set_err_notifier_locked(struct nvgpu_channel *ch, u32 error);

View File

@@ -40,9 +40,13 @@ struct gops_ce {
/**
* @brief Handler for CE stalling interrupts.
*
* @param g [in] The GPU driver struct.
* @param inst_id [in] Copy engine instance id.
* @param pri_base [in] Start of h/w register address space.
* @param g [in] The GPU driver struct.
* @param inst_id [in] Copy engine instance id.
* @param pri_base [in] Start of h/w register address space.
* @param needs_rc [out] Flag indicating if recovery should be
* triggered as part of CE error handling.
* @param needs_quiesce [out] Flag indicating if SW quiesce should be
* triggered as part of CE error handling.
*
* This function is invoked by MC stalling isr handler to handle
* the CE stalling interrupt.
@@ -56,9 +60,11 @@ struct gops_ce {
* - Method buffer fault interrupt.
* - Blocking pipe interrupt.
* - Launch error interrupt.
* - Sets needs_rc / needs_quiesce based on error handling policy.
* - Clear the handled interrupts by writing to ce_intr_status_r.
*/
void (*isr_stall)(struct gk20a *g, u32 inst_id, u32 pri_base);
void (*isr_stall)(struct gk20a *g, u32 inst_id, u32 pri_base,
bool *needs_rc, bool *needs_quiesce);
#ifdef CONFIG_NVGPU_NONSTALL_INTR
/**
@@ -113,6 +119,7 @@ struct gops_ce {
*/
u32 (*get_num_pce)(struct gk20a *g);
#ifdef CONFIG_NVGPU_HAL_NON_FUSA
/**
* @brief Handler for method buffer fault in BAR2.
*
@@ -126,6 +133,7 @@ struct gops_ce {
* clear if pending.
*/
void (*mthd_buffer_fault_in_bar2_fault)(struct gk20a *g);
#endif
/** @cond DOXYGEN_SHOULD_SKIP_THIS */
@@ -147,6 +155,7 @@ struct gops_ce {
void (*intr_retrigger)(struct gk20a *g, u32 inst_id);
u64 (*get_inst_ptr_from_lce)(struct gk20a *g, u32 inst_id);
#ifdef CONFIG_NVGPU_DGPU
int (*ce_app_init_support)(struct gk20a *g);
void (*ce_app_suspend)(struct gk20a *g);

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -98,7 +98,7 @@ struct gops_mc {
* - For the FIFO engines with pending interrupt invoke corresponding
* handlers.
* - Invoke g->ops.gr.intr.stall_isr if GR interrupt is pending.
* - Invoke g->ops.ce.isr_stall if CE interrupt is pending.
* - Invoke nvgpu_ce_stall_isr if CE interrupt is pending.
* - For other units with pending interrupt invoke corresponding
* handlers.
* - Invoke g->ops.fb.intr.isr if HUB interrupt is pending, determined

View File

@@ -72,6 +72,16 @@
(nvgpu_safe_add_u32(0x00104434U, nvgpu_safe_mult_u32((i), 128U)))
#define ce_intr_retrigger_trigger_true_f() (0x1U)
#define ce_pce_map_r() (0x00104028U)
#define ce_lce_bind_status_r(i)\
(nvgpu_safe_add_u32(0x00104404U, nvgpu_safe_mult_u32((i), 128U)))
#define ce_lce_bind_status_bound_v(r) (((r) >> 0U) & 0x1U)
#define ce_lce_bind_status_bound_false_v() (0x00000000U)
#define ce_lce_bind_status_ctx_ptr_v(r) (((r) >> 1U) & 0xfffffffU)
#define ce_lce_launcherr_r(i)\
(nvgpu_safe_add_u32(0x00104418U, nvgpu_safe_mult_u32((i), 128U)))
#define ce_lce_launcherr_report_v(r) (((r) >> 0U) & 0xfU)
#define ce_lce_launcherr_report_invalid_config_v() (0x0000000dU)
#define ce_lce_launcherr_report_method_buffer_access_fault_v() (0x0000000eU)
#define ce_lce_opt_r(i)\
(nvgpu_safe_add_u32(0x00104414U, nvgpu_safe_mult_u32((i), 128U)))
#define ce_lce_opt_force_barriers_npl__prod_f() (0x8U)

View File

@@ -101,6 +101,16 @@
#define ce_lce_intr_notify_ctrl_cpu_m() (U32(0x1U) << 31U)
#define ce_lce_intr_notify_ctrl_cpu_enable_f() (0x80000000U)
#define ce_lce_intr_notify_ctrl_cpu_disable_f() (0x0U)
#define ce_lce_bind_status_r(i)\
(nvgpu_safe_add_u32(0x00104404U, nvgpu_safe_mult_u32((i), 128U)))
#define ce_lce_bind_status_bound_v(r) (((r) >> 0U) & 0x1U)
#define ce_lce_bind_status_bound_false_v() (0x00000000U)
#define ce_lce_bind_status_ctx_ptr_v(r) (((r) >> 1U) & 0xfffffffU)
#define ce_lce_launcherr_r(i)\
(nvgpu_safe_add_u32(0x00104418U, nvgpu_safe_mult_u32((i), 128U)))
#define ce_lce_launcherr_report_v(r) (((r) >> 0U) & 0xfU)
#define ce_lce_launcherr_report_invalid_config_v() (0x0000000dU)
#define ce_lce_launcherr_report_method_buffer_access_fault_v() (0x0000000eU)
#define ce_lce_intr_ctrl_r(i)\
(nvgpu_safe_add_u32(0x0010442cU, nvgpu_safe_mult_u32((i), 128U)))
#define ce_lce_intr_ctrl_gsp_m() (U32(0x1U) << 30U)

View File

@@ -72,6 +72,16 @@
#define ce_intr_status_mthd_buffer_fault_pending_f() (0x10U)
#define ce_intr_status_mthd_buffer_fault_reset_f() (0x10U)
#define ce_pce_map_r() (0x00104028U)
#define ce_lce_bind_status_r(i)\
(nvgpu_safe_add_u32(0x00104404U, nvgpu_safe_mult_u32((i), 128U)))
#define ce_lce_bind_status_bound_v(r) (((r) >> 0U) & 0x1U)
#define ce_lce_bind_status_bound_false_v() (0x00000000U)
#define ce_lce_bind_status_ctx_ptr_v(r) (((r) >> 1U) & 0xfffffffU)
#define ce_lce_launcherr_r(i)\
(nvgpu_safe_add_u32(0x00104418U, nvgpu_safe_mult_u32((i), 128U)))
#define ce_lce_launcherr_report_v(r) (((r) >> 0U) & 0xfU)
#define ce_lce_launcherr_report_invalid_config_v() (0x0000000dU)
#define ce_lce_launcherr_report_method_buffer_access_fault_v() (0x0000000eU)
#define ce_lce_opt_r(i)\
(nvgpu_safe_add_u32(0x00104414U, nvgpu_safe_mult_u32((i), 128U)))
#define ce_lce_opt_force_barriers_npl__prod_f() (0x8U)

View File

@@ -87,6 +87,16 @@
#define ce_grce_config_shared_f(v) ((U32(v) & 0x1U) << 30U)
#define ce_grce_config_timeslice_short_f() (0x0U)
#define ce_grce_config_timeslice_long_f() (0x80000000U)
#define ce_lce_bind_status_r(i)\
(nvgpu_safe_add_u32(0x00104404U, nvgpu_safe_mult_u32((i), 128U)))
#define ce_lce_bind_status_bound_v(r) (((r) >> 0U) & 0x1U)
#define ce_lce_bind_status_bound_false_v() (0x00000000U)
#define ce_lce_bind_status_ctx_ptr_v(r) (((r) >> 1U) & 0xfffffffU)
#define ce_lce_launcherr_r(i)\
(nvgpu_safe_add_u32(0x00104418U, nvgpu_safe_mult_u32((i), 128U)))
#define ce_lce_launcherr_report_v(r) (((r) >> 0U) & 0xfU)
#define ce_lce_launcherr_report_invalid_config_v() (0x0000000dU)
#define ce_lce_launcherr_report_method_buffer_access_fault_v() (0x0000000eU)
#define ce_pce2lce_config_r(i)\
(nvgpu_safe_add_u32(0x00104040U, nvgpu_safe_mult_u32((i), 4U)))
#define ce_pce2lce_config__size_1_v() (0x00000004U)

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -76,6 +76,10 @@
* Scheduler error recovery.
*/
#define RC_TYPE_SCHED_ERR 8U
/**
* Copy-engine error recovery.
*/
#define RC_TYPE_CE_FAULT 9U
/**
* Invalid recovery id.
@@ -132,6 +136,9 @@ static inline const char *nvgpu_rc_type_to_str(unsigned int rc_type)
case RC_TYPE_SCHED_ERR:
str = "Sched err";
break;
case RC_TYPE_CE_FAULT:
str = "Copy engine err";
break;
default:
str = "Unknown";
break;
@@ -272,4 +279,6 @@ void nvgpu_rc_fifo_recover(struct gk20a *g,
bool id_is_tsg, /* ignored if hw_id == ~0 */
bool id_is_known, bool debug_dump, u32 rc_type);
void nvgpu_rc_ce_fault(struct gk20a *g, u32 inst_id);
#endif /* NVGPU_RC_H */

View File

@@ -98,8 +98,8 @@ gv100_dump_engine_status
gv100_read_engine_status_info
gv11b_ce_get_num_pce
gv11b_ce_init_prod_values
gv11b_ce_mthd_buffer_fault_in_bar2_fault
gv11b_ce_stall_isr
gv11b_ce_get_inst_ptr_from_lce
gv11b_channel_count
gv11b_channel_read_state
gv11b_channel_reset_faulted
@@ -275,6 +275,7 @@ nvgpu_bug_unregister_cb
nvgpu_can_busy
nvgpu_ce_engine_interrupt_mask
nvgpu_ce_init_support
nvgpu_ce_stall_isr
nvgpu_cg_blcg_fb_load_enable
nvgpu_cg_blcg_ltc_load_enable
nvgpu_cg_blcg_fifo_load_enable
@@ -792,6 +793,7 @@ nvgpu_rc_gr_fault
nvgpu_rc_sched_error_bad_tsg
nvgpu_rc_tsg_and_related_engines
nvgpu_rc_mmu_fault
nvgpu_rc_ce_fault
nvgpu_init_pramin
gk20a_bus_set_bar0_window
nvgpu_pramin_ops_init

View File

@@ -98,8 +98,8 @@ gv100_dump_engine_status
gv100_read_engine_status_info
gv11b_ce_get_num_pce
gv11b_ce_init_prod_values
gv11b_ce_mthd_buffer_fault_in_bar2_fault
gv11b_ce_stall_isr
gv11b_ce_get_inst_ptr_from_lce
gv11b_channel_count
gv11b_channel_read_state
gv11b_channel_reset_faulted
@@ -283,6 +283,7 @@ nvgpu_bug_unregister_cb
nvgpu_can_busy
nvgpu_ce_engine_interrupt_mask
nvgpu_ce_init_support
nvgpu_ce_stall_isr
nvgpu_cg_blcg_fb_load_enable
nvgpu_cg_blcg_ltc_load_enable
nvgpu_cg_blcg_fifo_load_enable
@@ -811,6 +812,7 @@ nvgpu_rc_gr_fault
nvgpu_rc_sched_error_bad_tsg
nvgpu_rc_tsg_and_related_engines
nvgpu_rc_mmu_fault
nvgpu_rc_ce_fault
gp10b_priv_ring_isr_handle_0
gp10b_priv_ring_isr_handle_1
nvgpu_cic_mon_setup

View File

@@ -102,7 +102,6 @@ test_ce_setup_env.ce_setup_env=0
test_ce_stall_isr.ce_stall_isr=0
test_get_num_pce.ce_get_num_pce=0
test_init_prod_values.ce_init_prod_values=0
test_mthd_buffer_fault_in_bar2_fault.mthd_buffer_fault_in_bar2_fault=0
[cg]
init_test_env.init=0

View File

@@ -129,6 +129,7 @@ int test_ce_setup_env(struct unit_module *m,
nvgpu_spinlock_init(&g->mc.intr_lock);
g->ops.cic_mon.init = ga10b_cic_mon_init;
g->ops.ce.get_inst_ptr_from_lce = gv11b_ce_get_inst_ptr_from_lce;
if (nvgpu_cic_mon_setup(g) != 0) {
unit_err(m, "%s: failed to initialize CIC\n",
@@ -211,7 +212,7 @@ int test_ce_stall_isr(struct unit_module *m, struct gk20a *g, void *args)
intr_val = 0x4;
nvgpu_posix_io_writel_reg_space(g, ce_intr_status_r(inst_id),
intr_val);
g->ops.ce.isr_stall(g, inst_id, 0);
nvgpu_ce_stall_isr(g, inst_id, 0);
if (intr_status_written[inst_id] != (intr_val &
~ce_intr_status_nonblockpipe_pending_f())) {
ret = UNIT_FAIL;
@@ -224,7 +225,7 @@ int test_ce_stall_isr(struct unit_module *m, struct gk20a *g, void *args)
intr_val = 0x0;
nvgpu_posix_io_writel_reg_space(g, ce_intr_status_r(inst_id),
intr_val);
g->ops.ce.isr_stall(g, inst_id, 0);
nvgpu_ce_stall_isr(g, inst_id, 0);
if (intr_status_written[inst_id] != intr_val) {
ret = UNIT_FAIL;
unit_err(m, "intr_status not cleared, only 0x%08x\n",
@@ -237,59 +238,6 @@ done:
return ret;
}
static u32 mock_get_num_lce(struct gk20a *g)
{
return NUM_INST;
}
int test_mthd_buffer_fault_in_bar2_fault(struct unit_module *m, struct gk20a *g,
void *args)
{
int ret = UNIT_SUCCESS;
int inst_id;
u32 intr_val;
g->ops.ce.mthd_buffer_fault_in_bar2_fault =
gv11b_ce_mthd_buffer_fault_in_bar2_fault;
g->ops.top.get_num_lce = mock_get_num_lce;
intr_val = 0x1f; /* all intr sources */
for (inst_id = 0; inst_id < NUM_INST; inst_id++) {
intr_status_written[inst_id] = 0;
nvgpu_posix_io_writel_reg_space(g, ce_intr_status_r(inst_id),
intr_val);
}
g->ops.ce.mthd_buffer_fault_in_bar2_fault(g);
for (inst_id = 0; inst_id < NUM_INST; inst_id++) {
if (intr_status_written[inst_id] !=
ce_intr_status_mthd_buffer_fault_pending_f()) {
ret = UNIT_FAIL;
unit_err(m, "intr_status not cleared properly, only 0x%08x\n",
intr_status_written[inst_id]);
goto done;
}
}
intr_val = 0x0;
for (inst_id = 0; inst_id < NUM_INST; inst_id++) {
intr_status_written[inst_id] = 0;
nvgpu_posix_io_writel_reg_space(g, ce_intr_status_r(inst_id),
intr_val);
}
g->ops.ce.mthd_buffer_fault_in_bar2_fault(g);
for (inst_id = 0; inst_id < NUM_INST; inst_id++) {
if (intr_status_written[inst_id] != 0) {
ret = UNIT_FAIL;
unit_err(m, "intr_status not cleared properly, only 0x%08x\n",
intr_status_written[inst_id]);
goto done;
}
}
done:
return ret;
}
int test_get_num_pce(struct unit_module *m, struct gk20a *g, void *args)
{
u32 pce_map_val; /* 16 bit bitmap */
@@ -334,7 +282,6 @@ struct unit_module_test ce_tests[] = {
UNIT_TEST(ce_setup_env, test_ce_setup_env, NULL, 0),
UNIT_TEST(ce_init_support, test_ce_init_support, NULL, 0),
UNIT_TEST(ce_stall_isr, test_ce_stall_isr, NULL, 0),
UNIT_TEST(mthd_buffer_fault_in_bar2_fault, test_mthd_buffer_fault_in_bar2_fault, NULL, 0),
UNIT_TEST(ce_get_num_pce, test_get_num_pce, NULL, 0),
UNIT_TEST(ce_init_prod_values, test_init_prod_values, NULL, 0),
UNIT_TEST(ce_free_env, test_ce_free_env, NULL, 0),

View File

@@ -115,33 +115,6 @@ int test_ce_init_support(struct unit_module *m, struct gk20a *g, void *args);
*/
int test_ce_stall_isr(struct unit_module *m, struct gk20a *g, void *args);
/**
* Test specification for: test_mthd_buffer_fault_in_bar2_fault
*
* Description: Validate method buffer interrupt functionality.
*
* Test Type: Feature
*
* Targets: gops_ce.mthd_buffer_fault_in_bar2_fault,
* gv11b_ce_mthd_buffer_fault_in_bar2_fault
*
* Input: test_ce_setup_env must have been run.
*
* Steps:
* - Set all CE interrupt sources pending in the interrupt status reg for each
* instance.
* - Call gops_ce.mthd_buffer_fault_in_bar2_fault.
* - Verify only the correct interrupt is cleared.
* - Set no CE interrupt sources pending in the interrupt status reg for each
* instance.
* - Call gops_ce.mthd_buffer_fault_in_bar2_fault.
* - Verify no interrupts are cleared.
*
* Output: Returns PASS if expected result is met, FAIL otherwise.
*/
int test_mthd_buffer_fault_in_bar2_fault(struct unit_module *m, struct gk20a *g,
void *args);
/**
* Test specification for: test_get_num_pce
*

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -49,11 +49,6 @@ static u32 hal_channel_count(struct gk20a *g)
return 0x00000200U;
}
static void hal_bar2_fault_nop(struct gk20a *g)
{
/* no-op */
}
static int hal_bar2_bind_nop(struct gk20a *g, struct nvgpu_mem *bar2_inst)
{
/* no-op */
@@ -99,7 +94,6 @@ int fb_mmu_fault_gv11b_init_test(struct unit_module *m, struct gk20a *g,
/* Other HALs that are needed */
g->ops.channel.count = hal_channel_count;
g->ops.ce.mthd_buffer_fault_in_bar2_fault = hal_bar2_fault_nop;
g->ops.bus.bar2_bind = hal_bar2_bind_nop;
g->ops.fifo.mmu_fault_id_to_pbdma_id =
hal_fifo_mmu_fault_id_to_pbdma_id;

View File

@@ -171,7 +171,8 @@ static void mock_bus_isr(struct gk20a *g)
u.bus_isr = true;
}
static void mock_ce_stall_isr(struct gk20a *g, u32 inst_id, u32 pri_base)
/* Test stub: records that the CE stall ISR HAL was invoked. */
static void mock_ce_stall_isr(struct gk20a *g, u32 inst_id, u32 pri_base,
bool *needs_rc, bool *needs_quiesce)
{
u.ce_isr = true;
}

View File

@@ -398,10 +398,6 @@ static const char *f_mmu_fault_notify[] = {
"mmu_fault_notify_eng_id_physical",
};
static void stub_ce_mthd_buffer_fault_in_bar2_fault(struct gk20a *g)
{
}
static int stub_bus_bar2_bind(struct gk20a *g, struct nvgpu_mem *bar2_inst)
{
return 0;
@@ -427,8 +423,6 @@ int test_gv11b_mm_mmu_fault_handle_other_fault_notify(struct unit_module *m,
gv11b_fb_read_mmu_fault_addr_lo_hi;
g->ops.fb.read_mmu_fault_info = gv11b_fb_read_mmu_fault_info;
g->ops.fb.write_mmu_fault_status = gv11b_fb_write_mmu_fault_status;
g->ops.ce.mthd_buffer_fault_in_bar2_fault =
stub_ce_mthd_buffer_fault_in_bar2_fault;
g->ops.bus.bar2_bind = stub_bus_bar2_bind;
g->ops.fifo.mmu_fault_id_to_pbdma_id =
stub_fifo_mmu_fault_id_to_pbdma_id;