gpu: nvgpu: Add CE interrupt handling

a. LAUNCH_ERR
    - Userspace error.
    - Triggered due to faulty launch.
    - Handle using recovery to reset CE engine and teardown the
      faulty channel.

b. INVALID_CONFIG
    - Triggered when LCE is mapped to floorswept PCE.
    - On iGPU, we use the default PCE2LCE HW mapping.
      The default mapping can be read from NV_CE_PCE2LCE_CONFIG
      INIT value in CE refmanual.
    - NvGPU driver configures the mapping on dGPUs (currently only on
      Turing).
    - So, this interrupt can only be triggered if there is a
      kernel or HW error.
    - Recovery (which kills the context and resets the engine) will
      not help resolve this error.
    - Trigger Quiesce as part of handling.

c. MTHD_BUFFER_FAULT
    - NvGPU driver allocates fault buffers for all TSGs or contexts,
      maps them in BAR2 VA space and writes the VA into channel
      instance block.
    - Can be triggered only due to a kernel bug.
    - Recovery will not help; quiesce is needed.

d. FBUF_CRC_FAIL
    - Triggered when the CRC entry read from the method fault buffer
      does not match the computed CRC from the methods contained in
      the buffer.
    - This indicates memory corruption and is a fatal interrupt which
      at least requires the LCE to be reset before operations can
      start again, if not the entire GPU.
    - Better to quiesce on memory corruption; a CE engine reset (via
      recovery) will not help.

e. FBUF_MAGIC_CHK_FAIL
    - Triggered when the MAGIC_NUM entry read from the method fault
      buf does not match NV_CE_MTHD_BUFFER_GLOBAL_HDR_MAGIC_NUM_VAL
    - This indicates memory corruption and is a fatal interrupt
    - Better to quiesce on memory corruption

f. STALLING_DEBUG
    - Only triggered with SW write for debug purposes
    - Debug interrupt, currently ignored

Move launch error handling from GP10b to GV11b HAL as -
1. LAUNCHERR_REPORT errcode METHOD_BUFFER_ACCESS_FAULT is not
   defined on Pascal
2. We do not support GP10b on dev-main ToT

JIRA NVGPU-8102

Change-Id: Idc84119bc23b5e85f3479fe62cc8720e98b627a5
Signed-off-by: Tejal Kudav <tkudav@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2678893
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
Tejal Kudav
2022-03-09 12:40:14 +00:00
committed by mobile promotions
parent 15739c52e9
commit b80b2bdab8
35 changed files with 246 additions and 144 deletions

View File

@@ -102,7 +102,6 @@ test_ce_setup_env.ce_setup_env=0
test_ce_stall_isr.ce_stall_isr=0
test_get_num_pce.ce_get_num_pce=0
test_init_prod_values.ce_init_prod_values=0
test_mthd_buffer_fault_in_bar2_fault.mthd_buffer_fault_in_bar2_fault=0
[cg]
init_test_env.init=0

View File

@@ -129,6 +129,7 @@ int test_ce_setup_env(struct unit_module *m,
nvgpu_spinlock_init(&g->mc.intr_lock);
g->ops.cic_mon.init = ga10b_cic_mon_init;
g->ops.ce.get_inst_ptr_from_lce = gv11b_ce_get_inst_ptr_from_lce;
if (nvgpu_cic_mon_setup(g) != 0) {
unit_err(m, "%s: failed to initialize CIC\n",
@@ -211,7 +212,7 @@ int test_ce_stall_isr(struct unit_module *m, struct gk20a *g, void *args)
intr_val = 0x4;
nvgpu_posix_io_writel_reg_space(g, ce_intr_status_r(inst_id),
intr_val);
g->ops.ce.isr_stall(g, inst_id, 0);
nvgpu_ce_stall_isr(g, inst_id, 0);
if (intr_status_written[inst_id] != (intr_val &
~ce_intr_status_nonblockpipe_pending_f())) {
ret = UNIT_FAIL;
@@ -224,7 +225,7 @@ int test_ce_stall_isr(struct unit_module *m, struct gk20a *g, void *args)
intr_val = 0x0;
nvgpu_posix_io_writel_reg_space(g, ce_intr_status_r(inst_id),
intr_val);
g->ops.ce.isr_stall(g, inst_id, 0);
nvgpu_ce_stall_isr(g, inst_id, 0);
if (intr_status_written[inst_id] != intr_val) {
ret = UNIT_FAIL;
unit_err(m, "intr_status not cleared, only 0x%08x\n",
@@ -237,59 +238,6 @@ done:
return ret;
}
/*
 * Mock LCE-count query: always reports NUM_INST instances, regardless of
 * the GPU state passed in.
 */
static u32 mock_get_num_lce(struct gk20a *g)
{
	(void)g;	/* the mock does not consult the device */

	return NUM_INST;
}
/*
 * Unit test for gv11b_ce_mthd_buffer_fault_in_bar2_fault().
 *
 * The HAL is invoked twice. Before the first call every instance's
 * interrupt status register is loaded with 0x1f; before the second call it
 * is loaded with 0. After each call, intr_status_written[] must hold exactly
 * the method-buffer-fault pending field (first pass) or 0 (second pass) for
 * every instance.
 *
 * NOTE(review): intr_status_written[] is defined outside this view;
 * presumably it is filled by a register-write callback - confirm.
 *
 * Returns UNIT_SUCCESS when both passes match, UNIT_FAIL on the first
 * mismatching instance.
 */
int test_mthd_buffer_fault_in_bar2_fault(struct unit_module *m, struct gk20a *g,
		void *args)
{
	u32 pass;

	/* Route the HALs under test: real gv11b handler, mocked LCE count. */
	g->ops.ce.mthd_buffer_fault_in_bar2_fault =
			gv11b_ce_mthd_buffer_fault_in_bar2_fault;
	g->ops.top.get_num_lce = mock_get_num_lce;

	/* Pass 0: all intr sources pending. Pass 1: nothing pending. */
	for (pass = 0U; pass < 2U; pass++) {
		const u32 status = (pass == 0U) ? 0x1f : 0x0;
		const u32 expected = (pass == 0U) ?
			ce_intr_status_mthd_buffer_fault_pending_f() : 0U;
		int inst;

		/* Reset the write log and seed each instance's status reg. */
		for (inst = 0; inst < NUM_INST; inst++) {
			intr_status_written[inst] = 0;
			nvgpu_posix_io_writel_reg_space(g,
					ce_intr_status_r(inst), status);
		}

		g->ops.ce.mthd_buffer_fault_in_bar2_fault(g);

		/* Each instance must have seen exactly the expected write. */
		for (inst = 0; inst < NUM_INST; inst++) {
			if (intr_status_written[inst] == expected) {
				continue;
			}
			unit_err(m, "intr_status not cleared properly, only 0x%08x\n",
					intr_status_written[inst]);
			return UNIT_FAIL;
		}
	}

	return UNIT_SUCCESS;
}
int test_get_num_pce(struct unit_module *m, struct gk20a *g, void *args)
{
u32 pce_map_val; /* 16 bit bitmap */
@@ -334,7 +282,6 @@ struct unit_module_test ce_tests[] = {
UNIT_TEST(ce_setup_env, test_ce_setup_env, NULL, 0),
UNIT_TEST(ce_init_support, test_ce_init_support, NULL, 0),
UNIT_TEST(ce_stall_isr, test_ce_stall_isr, NULL, 0),
UNIT_TEST(mthd_buffer_fault_in_bar2_fault, test_mthd_buffer_fault_in_bar2_fault, NULL, 0),
UNIT_TEST(ce_get_num_pce, test_get_num_pce, NULL, 0),
UNIT_TEST(ce_init_prod_values, test_init_prod_values, NULL, 0),
UNIT_TEST(ce_free_env, test_ce_free_env, NULL, 0),

View File

@@ -115,33 +115,6 @@ int test_ce_init_support(struct unit_module *m, struct gk20a *g, void *args);
*/
int test_ce_stall_isr(struct unit_module *m, struct gk20a *g, void *args);
/**
* Test specification for: test_mthd_buffer_fault_in_bar2_fault
*
* Description: Validate method buffer interrupt functionality.
*
* Test Type: Feature
*
* Targets: gops_ce.mthd_buffer_fault_in_bar2_fault,
* gv11b_ce_mthd_buffer_fault_in_bar2_fault
*
* Input: test_ce_setup_env must have been run.
*
* Steps:
* - Set all CE interrupt sources pending in the interrupt status reg for each
* instance.
* - Call gops_ce.mthd_buffer_fault_in_bar2_fault.
* - Verify only the correct interrupt is cleared.
* - Set no CE interrupt sources pending in the interrupt status reg for each
* instance.
* - Call gops_ce.mthd_buffer_fault_in_bar2_fault.
* - Verify no interrupts are cleared.
*
* Output: Returns PASS if expected result is met, FAIL otherwise.
*/
int test_mthd_buffer_fault_in_bar2_fault(struct unit_module *m, struct gk20a *g,
void *args);
/**
* Test specification for: test_get_num_pce
*

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -49,11 +49,6 @@ static u32 hal_channel_count(struct gk20a *g)
return 0x00000200U;
}
/*
 * Placeholder with the signature of the CE mthd_buffer_fault_in_bar2_fault
 * HAL; deliberately does nothing.
 */
static void hal_bar2_fault_nop(struct gk20a *g)
{
	(void)g;	/* no-op */
}
static int hal_bar2_bind_nop(struct gk20a *g, struct nvgpu_mem *bar2_inst)
{
/* no-op */
@@ -99,7 +94,6 @@ int fb_mmu_fault_gv11b_init_test(struct unit_module *m, struct gk20a *g,
/* Other HALs that are needed */
g->ops.channel.count = hal_channel_count;
g->ops.ce.mthd_buffer_fault_in_bar2_fault = hal_bar2_fault_nop;
g->ops.bus.bar2_bind = hal_bar2_bind_nop;
g->ops.fifo.mmu_fault_id_to_pbdma_id =
hal_fifo_mmu_fault_id_to_pbdma_id;

View File

@@ -171,7 +171,8 @@ static void mock_bus_isr(struct gk20a *g)
u.bus_isr = true;
}
static void mock_ce_stall_isr(struct gk20a *g, u32 inst_id, u32 pri_base)
static void mock_ce_stall_isr(struct gk20a *g, u32 inst_id, u32 pri_base,
bool *needs_rc, bool *needs_quiesce)
{
u.ce_isr = true;
}

View File

@@ -398,10 +398,6 @@ static const char *f_mmu_fault_notify[] = {
"mmu_fault_notify_eng_id_physical",
};
/*
 * Empty stand-in for the CE mthd_buffer_fault_in_bar2_fault HAL; performs
 * no work.
 */
static void stub_ce_mthd_buffer_fault_in_bar2_fault(struct gk20a *g)
{
	(void)g;	/* intentionally empty */
}
static int stub_bus_bar2_bind(struct gk20a *g, struct nvgpu_mem *bar2_inst)
{
return 0;
@@ -427,8 +423,6 @@ int test_gv11b_mm_mmu_fault_handle_other_fault_notify(struct unit_module *m,
gv11b_fb_read_mmu_fault_addr_lo_hi;
g->ops.fb.read_mmu_fault_info = gv11b_fb_read_mmu_fault_info;
g->ops.fb.write_mmu_fault_status = gv11b_fb_write_mmu_fault_status;
g->ops.ce.mthd_buffer_fault_in_bar2_fault =
stub_ce_mthd_buffer_fault_in_bar2_fault;
g->ops.bus.bar2_bind = stub_bus_bar2_bind;
g->ops.fifo.mmu_fault_id_to_pbdma_id =
stub_fifo_mmu_fault_id_to_pbdma_id;