mirror of
git://nv-tegra.nvidia.com/linux-nvgpu.git
synced 2025-12-22 09:12:24 +03:00
gpu: nvgpu: Add CE interrupt handling
a. LAUNCH_ERR
- Userspace error.
- Triggered due to faulty launch.
- Handle using recovery to reset CE engine and teardown the
faulty channel.
b. INVALID_CONFIG
- Triggered when an LCE is mapped to a floorswept PCE.
- On iGPU, we use the default PCE-to-LCE (PCE2LCE) HW mapping.
The default mapping can be read from the NV_CE_PCE2LCE_CONFIG
INIT value in the CE refmanual.
- NvGPU driver configures the mapping on dGPUs (currently only on
Turing).
- So, this interrupt can only be triggered if there is a
kernel or HW error.
- Recovery (which kills the context and resets the engine) will
not help resolve this error.
- Trigger quiesce as part of handling.
c. MTHD_BUFFER_FAULT
- NvGPU driver allocates fault buffers for all TSGs or contexts,
maps them in BAR2 VA space and writes the VA into the channel
instance block.
- Can be triggered only due to a kernel bug.
- Recovery will not help; quiesce is needed.
d. FBUF_CRC_FAIL
- Triggered when the CRC entry read from the method fault buffer
does not match the computed CRC from the methods contained in
the buffer.
- This indicates memory corruption and is a fatal interrupt which
at least requires the LCE to be reset before operations can
start again, if not the entire GPU.
- Better to quiesce on memory corruption; a CE engine reset
(via recovery) will not help.
e. FBUF_MAGIC_CHK_FAIL
- Triggered when the MAGIC_NUM entry read from the method fault
buffer does not match NV_CE_MTHD_BUFFER_GLOBAL_HDR_MAGIC_NUM_VAL.
- This indicates memory corruption and is a fatal interrupt
- Better to quiesce on memory corruption
f. STALLING_DEBUG
- Only triggered with SW write for debug purposes
- Debug interrupt, currently ignored
Move launch error handling from the GP10b HAL to the GV11b HAL because:
1. The LAUNCHERR_REPORT errcode METHOD_BUFFER_ACCESS_FAULT is not
defined on Pascal.
2. We do not support GP10b on dev-main ToT.
JIRA NVGPU-8102
Change-Id: Idc84119bc23b5e85f3479fe62cc8720e98b627a5
Signed-off-by: Tejal Kudav <tkudav@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2678893
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
committed by
mobile promotions
parent
15739c52e9
commit
b80b2bdab8
@@ -102,7 +102,6 @@ test_ce_setup_env.ce_setup_env=0
|
||||
test_ce_stall_isr.ce_stall_isr=0
|
||||
test_get_num_pce.ce_get_num_pce=0
|
||||
test_init_prod_values.ce_init_prod_values=0
|
||||
test_mthd_buffer_fault_in_bar2_fault.mthd_buffer_fault_in_bar2_fault=0
|
||||
|
||||
[cg]
|
||||
init_test_env.init=0
|
||||
|
||||
@@ -129,6 +129,7 @@ int test_ce_setup_env(struct unit_module *m,
|
||||
nvgpu_spinlock_init(&g->mc.intr_lock);
|
||||
|
||||
g->ops.cic_mon.init = ga10b_cic_mon_init;
|
||||
g->ops.ce.get_inst_ptr_from_lce = gv11b_ce_get_inst_ptr_from_lce;
|
||||
|
||||
if (nvgpu_cic_mon_setup(g) != 0) {
|
||||
unit_err(m, "%s: failed to initialize CIC\n",
|
||||
@@ -211,7 +212,7 @@ int test_ce_stall_isr(struct unit_module *m, struct gk20a *g, void *args)
|
||||
intr_val = 0x4;
|
||||
nvgpu_posix_io_writel_reg_space(g, ce_intr_status_r(inst_id),
|
||||
intr_val);
|
||||
g->ops.ce.isr_stall(g, inst_id, 0);
|
||||
nvgpu_ce_stall_isr(g, inst_id, 0);
|
||||
if (intr_status_written[inst_id] != (intr_val &
|
||||
~ce_intr_status_nonblockpipe_pending_f())) {
|
||||
ret = UNIT_FAIL;
|
||||
@@ -224,7 +225,7 @@ int test_ce_stall_isr(struct unit_module *m, struct gk20a *g, void *args)
|
||||
intr_val = 0x0;
|
||||
nvgpu_posix_io_writel_reg_space(g, ce_intr_status_r(inst_id),
|
||||
intr_val);
|
||||
g->ops.ce.isr_stall(g, inst_id, 0);
|
||||
nvgpu_ce_stall_isr(g, inst_id, 0);
|
||||
if (intr_status_written[inst_id] != intr_val) {
|
||||
ret = UNIT_FAIL;
|
||||
unit_err(m, "intr_status not cleared, only 0x%08x\n",
|
||||
@@ -237,59 +238,6 @@ done:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static u32 mock_get_num_lce(struct gk20a *g)
|
||||
{
|
||||
return NUM_INST;
|
||||
}
|
||||
|
||||
int test_mthd_buffer_fault_in_bar2_fault(struct unit_module *m, struct gk20a *g,
|
||||
void *args)
|
||||
{
|
||||
int ret = UNIT_SUCCESS;
|
||||
int inst_id;
|
||||
u32 intr_val;
|
||||
|
||||
g->ops.ce.mthd_buffer_fault_in_bar2_fault =
|
||||
gv11b_ce_mthd_buffer_fault_in_bar2_fault;
|
||||
g->ops.top.get_num_lce = mock_get_num_lce;
|
||||
|
||||
intr_val = 0x1f; /* all intr sources */
|
||||
for (inst_id = 0; inst_id < NUM_INST; inst_id++) {
|
||||
intr_status_written[inst_id] = 0;
|
||||
nvgpu_posix_io_writel_reg_space(g, ce_intr_status_r(inst_id),
|
||||
intr_val);
|
||||
}
|
||||
g->ops.ce.mthd_buffer_fault_in_bar2_fault(g);
|
||||
for (inst_id = 0; inst_id < NUM_INST; inst_id++) {
|
||||
if (intr_status_written[inst_id] !=
|
||||
ce_intr_status_mthd_buffer_fault_pending_f()) {
|
||||
ret = UNIT_FAIL;
|
||||
unit_err(m, "intr_status not cleared properly, only 0x%08x\n",
|
||||
intr_status_written[inst_id]);
|
||||
goto done;
|
||||
}
|
||||
}
|
||||
|
||||
intr_val = 0x0;
|
||||
for (inst_id = 0; inst_id < NUM_INST; inst_id++) {
|
||||
intr_status_written[inst_id] = 0;
|
||||
nvgpu_posix_io_writel_reg_space(g, ce_intr_status_r(inst_id),
|
||||
intr_val);
|
||||
}
|
||||
g->ops.ce.mthd_buffer_fault_in_bar2_fault(g);
|
||||
for (inst_id = 0; inst_id < NUM_INST; inst_id++) {
|
||||
if (intr_status_written[inst_id] != 0) {
|
||||
ret = UNIT_FAIL;
|
||||
unit_err(m, "intr_status not cleared properly, only 0x%08x\n",
|
||||
intr_status_written[inst_id]);
|
||||
goto done;
|
||||
}
|
||||
}
|
||||
|
||||
done:
|
||||
return ret;
|
||||
}
|
||||
|
||||
int test_get_num_pce(struct unit_module *m, struct gk20a *g, void *args)
|
||||
{
|
||||
u32 pce_map_val; /* 16 bit bitmap */
|
||||
@@ -334,7 +282,6 @@ struct unit_module_test ce_tests[] = {
|
||||
UNIT_TEST(ce_setup_env, test_ce_setup_env, NULL, 0),
|
||||
UNIT_TEST(ce_init_support, test_ce_init_support, NULL, 0),
|
||||
UNIT_TEST(ce_stall_isr, test_ce_stall_isr, NULL, 0),
|
||||
UNIT_TEST(mthd_buffer_fault_in_bar2_fault, test_mthd_buffer_fault_in_bar2_fault, NULL, 0),
|
||||
UNIT_TEST(ce_get_num_pce, test_get_num_pce, NULL, 0),
|
||||
UNIT_TEST(ce_init_prod_values, test_init_prod_values, NULL, 0),
|
||||
UNIT_TEST(ce_free_env, test_ce_free_env, NULL, 0),
|
||||
|
||||
@@ -115,33 +115,6 @@ int test_ce_init_support(struct unit_module *m, struct gk20a *g, void *args);
|
||||
*/
|
||||
int test_ce_stall_isr(struct unit_module *m, struct gk20a *g, void *args);
|
||||
|
||||
/**
|
||||
* Test specification for: test_mthd_buffer_fault_in_bar2_fault
|
||||
*
|
||||
* Description: Validate method buffer interrupt functionality.
|
||||
*
|
||||
* Test Type: Feature
|
||||
*
|
||||
* Targets: gops_ce.mthd_buffer_fault_in_bar2_fault,
|
||||
* gv11b_ce_mthd_buffer_fault_in_bar2_fault
|
||||
*
|
||||
* Input: test_ce_setup_env must have been run.
|
||||
*
|
||||
* Steps:
|
||||
* - Set all CE interrupt sources pending in the interrupt status reg for each
|
||||
* instance.
|
||||
* - Call gops_ce.mthd_buffer_fault_in_bar2_fault.
|
||||
* - Verify only the correct interrupt is cleared.
|
||||
* - Set no CE interrupt sources pending in the interrupt status reg for each
|
||||
* instance.
|
||||
* - Call gops_ce.mthd_buffer_fault_in_bar2_fault.
|
||||
* - Verify no interrupts are cleared.
|
||||
*
|
||||
* Output: Returns PASS if expected result is met, FAIL otherwise.
|
||||
*/
|
||||
int test_mthd_buffer_fault_in_bar2_fault(struct unit_module *m, struct gk20a *g,
|
||||
void *args);
|
||||
|
||||
/**
|
||||
* Test specification for: test_get_num_pce
|
||||
*
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
@@ -49,11 +49,6 @@ static u32 hal_channel_count(struct gk20a *g)
|
||||
return 0x00000200U;
|
||||
}
|
||||
|
||||
static void hal_bar2_fault_nop(struct gk20a *g)
|
||||
{
|
||||
/* no-op */
|
||||
}
|
||||
|
||||
static int hal_bar2_bind_nop(struct gk20a *g, struct nvgpu_mem *bar2_inst)
|
||||
{
|
||||
/* no-op */
|
||||
@@ -99,7 +94,6 @@ int fb_mmu_fault_gv11b_init_test(struct unit_module *m, struct gk20a *g,
|
||||
|
||||
/* Other HALs that are needed */
|
||||
g->ops.channel.count = hal_channel_count;
|
||||
g->ops.ce.mthd_buffer_fault_in_bar2_fault = hal_bar2_fault_nop;
|
||||
g->ops.bus.bar2_bind = hal_bar2_bind_nop;
|
||||
g->ops.fifo.mmu_fault_id_to_pbdma_id =
|
||||
hal_fifo_mmu_fault_id_to_pbdma_id;
|
||||
|
||||
@@ -171,7 +171,8 @@ static void mock_bus_isr(struct gk20a *g)
|
||||
u.bus_isr = true;
|
||||
}
|
||||
|
||||
static void mock_ce_stall_isr(struct gk20a *g, u32 inst_id, u32 pri_base)
|
||||
static void mock_ce_stall_isr(struct gk20a *g, u32 inst_id, u32 pri_base,
|
||||
bool *needs_rc, bool *needs_quiesce)
|
||||
{
|
||||
u.ce_isr = true;
|
||||
}
|
||||
|
||||
@@ -398,10 +398,6 @@ static const char *f_mmu_fault_notify[] = {
|
||||
"mmu_fault_notify_eng_id_physical",
|
||||
};
|
||||
|
||||
static void stub_ce_mthd_buffer_fault_in_bar2_fault(struct gk20a *g)
|
||||
{
|
||||
}
|
||||
|
||||
static int stub_bus_bar2_bind(struct gk20a *g, struct nvgpu_mem *bar2_inst)
|
||||
{
|
||||
return 0;
|
||||
@@ -427,8 +423,6 @@ int test_gv11b_mm_mmu_fault_handle_other_fault_notify(struct unit_module *m,
|
||||
gv11b_fb_read_mmu_fault_addr_lo_hi;
|
||||
g->ops.fb.read_mmu_fault_info = gv11b_fb_read_mmu_fault_info;
|
||||
g->ops.fb.write_mmu_fault_status = gv11b_fb_write_mmu_fault_status;
|
||||
g->ops.ce.mthd_buffer_fault_in_bar2_fault =
|
||||
stub_ce_mthd_buffer_fault_in_bar2_fault;
|
||||
g->ops.bus.bar2_bind = stub_bus_bar2_bind;
|
||||
g->ops.fifo.mmu_fault_id_to_pbdma_id =
|
||||
stub_fifo_mmu_fault_id_to_pbdma_id;
|
||||
|
||||
Reference in New Issue
Block a user