gpu: nvgpu: recover pbdma errors before ack

When a pbdma fault needs a channel teardown, do the recovery/teardown
process before acking the pbdma interrupt status back. Acking it causes
the hardware to proceed, which could release fences too early, before
the involved channel(s) have been found to be broken.

With these host copyengine interrupts, the teardown sequence is light
and proceeds even with the pbdma intr flag still set; there are no
engines to reset when these pbdma launch check interrupts happen. The
bad tsg is just disabled and the channels in it aborted.

A few unit tests are so heavily affected by this refactor that they
would need to be rewritten. They're not strictly needed at the moment,
so do only half of the rewrite: just delete them.

Bug 200611198

Change-Id: Id126fb158b6d05e46ba124cd426389046eedc053
Signed-off-by: Konsta Hölttä <kholtta@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2392669
Reviewed-by: automaticguardword <automaticguardword@nvidia.com>
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
Reviewed-by: svc-mobile-cert <svc-mobile-cert@nvidia.com>
Reviewed-by: svc-mobile-misra <svc-mobile-misra@nvidia.com>
Reviewed-by: Alex Waterman <alexw@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
GVS: Gerrit_Virtual_Submit
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
Konsta Hölttä
2020-08-14 15:20:31 +03:00
committed by Alex Waterman
parent 370ac6cc98
commit dfd9feace6
9 changed files with 34 additions and 313 deletions

View File

@@ -30,8 +30,6 @@
#include <nvgpu/tsg.h>
#include <nvgpu/rc.h>
#include <nvgpu/nvgpu_err.h>
#include <nvgpu/error_notifier.h>
#include <nvgpu/pbdma_status.h>
#include <nvgpu/engines.h>
#include <hal/fifo/fifo_intr_gk20a.h>
@@ -94,20 +92,12 @@ u32 gk20a_fifo_pbdma_isr(struct gk20a *g)
u32 pbdma_id;
u32 num_pbdma = nvgpu_get_litter_value(g, GPU_LIT_HOST_NUM_PBDMA);
u32 pbdma_pending_bitmask = nvgpu_readl(g, fifo_intr_pbdma_id_r());
u32 error_notifier;
bool recover;
struct nvgpu_pbdma_status_info pbdma_status;
for (pbdma_id = 0; pbdma_id < num_pbdma; pbdma_id++) {
if (fifo_intr_pbdma_id_status_v(pbdma_pending_bitmask, pbdma_id) != 0U) {
nvgpu_log(g, gpu_dbg_intr, "pbdma id %d intr pending",
pbdma_id);
recover = g->ops.pbdma.handle_intr(g, pbdma_id,
&error_notifier, &pbdma_status);
if (recover) {
nvgpu_rc_pbdma_fault(g, pbdma_id,
error_notifier, &pbdma_status);
}
g->ops.pbdma.handle_intr(g, pbdma_id, true);
}
}
return fifo_intr_0_pbdma_intr_pending_f();

View File

@@ -33,9 +33,7 @@ struct nvgpu_pbdma_status_info;
bool gm20b_pbdma_handle_intr_0(struct gk20a *g, u32 pbdma_id,
u32 pbdma_intr_0, u32 *error_notifier);
bool gm20b_pbdma_handle_intr(struct gk20a *g, u32 pbdma_id,
u32 *error_notifier,
struct nvgpu_pbdma_status_info *pbdma_status);
void gm20b_pbdma_handle_intr(struct gk20a *g, u32 pbdma_id, bool recover);
u32 gm20b_pbdma_read_data(struct gk20a *g, u32 pbdma_id);
void gm20b_pbdma_reset_header(struct gk20a *g, u32 pbdma_id);

View File

@@ -34,6 +34,7 @@
#include <nvgpu/gk20a.h>
#include <nvgpu/pbdma_status.h>
#include <nvgpu/static_analysis.h>
#include <nvgpu/rc.h>
#include <nvgpu/hw/gm20b/hw_pbdma_gm20b.h>
@@ -318,17 +319,14 @@ u32 gm20b_pbdma_restartable_0_intr_descs(void)
return restartable_0_intr_descs;
}
bool gm20b_pbdma_handle_intr(struct gk20a *g, u32 pbdma_id,
u32 *error_notifier,
struct nvgpu_pbdma_status_info *pbdma_status)
void gm20b_pbdma_handle_intr(struct gk20a *g, u32 pbdma_id, bool recover)
{
struct nvgpu_pbdma_status_info pbdma_status;
u32 intr_error_notifier = NVGPU_ERR_NOTIFIER_PBDMA_ERROR;
u32 pbdma_intr_0 = nvgpu_readl(g, pbdma_intr_0_r(pbdma_id));
u32 pbdma_intr_1 = nvgpu_readl(g, pbdma_intr_1_r(pbdma_id));
bool recover = false;
if (pbdma_intr_0 != 0U) {
nvgpu_log(g, gpu_dbg_info | gpu_dbg_intr,
"pbdma id %d intr_0 0x%08x pending",
@@ -337,8 +335,12 @@ bool gm20b_pbdma_handle_intr(struct gk20a *g, u32 pbdma_id,
if (g->ops.pbdma.handle_intr_0(g, pbdma_id, pbdma_intr_0,
&intr_error_notifier)) {
g->ops.pbdma_status.read_pbdma_status_info(g,
pbdma_id, pbdma_status);
recover = true;
pbdma_id, &pbdma_status);
if (recover) {
nvgpu_rc_pbdma_fault(g, pbdma_id,
intr_error_notifier,
&pbdma_status);
}
}
nvgpu_writel(g, pbdma_intr_0_r(pbdma_id), pbdma_intr_0);
}
@@ -351,17 +353,16 @@ bool gm20b_pbdma_handle_intr(struct gk20a *g, u32 pbdma_id,
if (g->ops.pbdma.handle_intr_1(g, pbdma_id, pbdma_intr_1,
&intr_error_notifier)) {
g->ops.pbdma_status.read_pbdma_status_info(g,
pbdma_id, pbdma_status);
recover = true;
pbdma_id, &pbdma_status);
if (recover) {
nvgpu_rc_pbdma_fault(g, pbdma_id,
intr_error_notifier,
&pbdma_status);
}
}
nvgpu_writel(g, pbdma_intr_1_r(pbdma_id), pbdma_intr_1);
}
if (error_notifier != NULL) {
*error_notifier = intr_error_notifier;
}
return recover;
}
u32 gm20b_pbdma_get_gp_base(u64 gpfifo_base)

View File

@@ -127,9 +127,6 @@ int gv11b_fifo_preempt_poll_pbdma(struct gk20a *g, u32 tsgid,
loop_count++;
}
g->ops.pbdma_status.read_pbdma_status_info(g,
pbdma_id, &pbdma_status);
/*
* If the PBDMA has a stalling interrupt and receives a NACK,
* the PBDMA won't save out until the STALLING interrupt is
@@ -142,8 +139,10 @@ int gv11b_fifo_preempt_poll_pbdma(struct gk20a *g, u32 tsgid,
* reported to SW.
*/
/* Ignore un-needed return value "recover" */
(void)g->ops.pbdma.handle_intr(g, pbdma_id, NULL, &pbdma_status);
g->ops.pbdma.handle_intr(g, pbdma_id, false);
g->ops.pbdma_status.read_pbdma_status_info(g,
pbdma_id, &pbdma_status);
ret = fifo_preempt_check_tsg_on_pbdma(tsgid, &pbdma_status);
if (ret == 0) {

View File

@@ -50,9 +50,7 @@ struct gops_pbdma {
bool (*handle_intr_1)(struct gk20a *g,
u32 pbdma_id, u32 pbdma_intr_1,
u32 *error_notifier);
bool (*handle_intr)(struct gk20a *g, u32 pbdma_id,
u32 *error_notifier,
struct nvgpu_pbdma_status_info *pbdma_status);
void (*handle_intr)(struct gk20a *g, u32 pbdma_id, bool recover);
u32 (*set_clear_intr_offsets) (struct gk20a *g,
u32 set_clear_size);
u32 (*get_signature)(struct gk20a *g);

View File

@@ -701,7 +701,6 @@ test_gm20b_pbdma_get_ctrl_hce_priv_mode_yes.pbdma_get_ctrl_hce_priv_mode_yes=0
test_gm20b_pbdma_get_fc_subdevice.pbdma_get_fc_subdevice=0
test_gm20b_pbdma_get_gp_base.pbdma_get_gp_base=0
test_gm20b_pbdma_get_userd.pbdma_get_userd=0
test_gm20b_pbdma_handle_intr.pbdma_handle_intr=0
test_gm20b_pbdma_handle_intr_0.pbdma_handle_intr_0=0
test_gm20b_pbdma_intr_descs.pbdma_intr_descs=0
test_gm20b_pbdma_read_data.pbdma_read_data=0
@@ -738,7 +737,6 @@ test_fifo_init_support.init_support=0
test_fifo_remove_support.remove_support=0
test_gv11b_fifo_is_preempt_pending.is_preempt_pending=0
test_gv11b_fifo_preempt_channel.preempt_channel=0
test_gv11b_fifo_preempt_poll_pbdma.preempt_poll_pbdma=0
test_gv11b_fifo_preempt_runlists_for_rc.preempt_runlists_for_rc=0
test_gv11b_fifo_preempt_trigger.preempt_trigger=0
test_gv11b_fifo_preempt_tsg.preempt_tsg=0

View File

@@ -171,17 +171,13 @@ done:
return ret;
}
static bool stub_pbdma_handle_intr(struct gk20a *g, u32 pbdma_id,
u32 *error_notifier, struct nvgpu_pbdma_status_info *pbdma_status)
static void stub_pbdma_handle_intr(struct gk20a *g, u32 pbdma_id, bool recover)
{
if (nvgpu_readl(g, fifo_intr_pbdma_id_r()) != BIT(pbdma_id)) {
u.fail = true;
}
pbdma_status->chsw_status = NVGPU_PBDMA_CHSW_STATUS_INVALID;
u.count++;
return u.recover;
}
int test_gk20a_fifo_pbdma_isr(struct unit_module *m,
@@ -189,7 +185,6 @@ int test_gk20a_fifo_pbdma_isr(struct unit_module *m,
{
int ret = UNIT_FAIL;
u32 pending;
int i;
u32 pbdma_id;
u32 num_pbdma = nvgpu_get_litter_value(g, GPU_LIT_HOST_NUM_PBDMA);
struct gpu_ops gops = g->ops;
@@ -199,8 +194,6 @@ int test_gk20a_fifo_pbdma_isr(struct unit_module *m,
g->ops.pbdma.handle_intr = stub_pbdma_handle_intr;
u.fail = false;
for (i = 0; i < 2; i++) {
u.recover = (i > 0);
for (pbdma_id = 0; pbdma_id < num_pbdma; pbdma_id++) {
nvgpu_writel(g, fifo_intr_pbdma_id_r(), BIT(pbdma_id));
u.count = 0;
@@ -210,7 +203,6 @@ int test_gk20a_fifo_pbdma_isr(struct unit_module *m,
unit_assert(!u.fail, goto done);
unit_assert(u.count == 1, goto done);
}
}
ret = UNIT_SUCCESS;
done:

View File

@@ -163,114 +163,8 @@ done:
return ret;
}
#define F_PBDMA_HANDLE_INTR_0_PENDING BIT(0)
#define F_PBDMA_HANDLE_INTR_0_RECOVER BIT(1)
#define F_PBDMA_HANDLE_INTR_1_PENDING BIT(2)
#define F_PBDMA_HANDLE_INTR_1_RECOVER BIT(3)
#define F_PBDMA_HANDLE_INTR_ERR_NOTIFIER BIT(4)
#define F_PBDMA_HANDLE_INTR_LAST BIT(5)
#define INVALID_ERR_NOTIFIER U32_MAX
static bool stub_pbdma_handle_intr_0(struct gk20a *g,
u32 pbdma_id, u32 pbdma_intr_0, u32 *error_notifier)
{
u.stubs.pbdma_handle_intr_0.count++;
if (u.branches & F_PBDMA_HANDLE_INTR_0_RECOVER) {
return true;
}
return false;
}
static bool stub_pbdma_handle_intr_1(struct gk20a *g,
u32 pbdma_id, u32 pbdma_intr_1, u32 *error_notifier)
{
u.stubs.pbdma_handle_intr_1.count++;
if (u.branches & F_PBDMA_HANDLE_INTR_1_RECOVER) {
return true;
}
return false;
}
int test_gm20b_pbdma_handle_intr(struct unit_module *m,
struct gk20a *g, void *args)
{
int ret = UNIT_FAIL;
struct gpu_ops gops = g->ops;
u32 branches;
const char *labels[] = {
"intr_0_pending",
"intr_0_recover",
"intr_1_pending",
"intr_1_recover",
"err_notifier",
};
u32 pbdma_id = 0;
u32 _err_notifier;
u32 *err_notifier;
struct nvgpu_pbdma_status_info pbdma_status;
bool recover;
g->ops.pbdma.handle_intr_0 = stub_pbdma_handle_intr_0;
g->ops.pbdma.handle_intr_1 = stub_pbdma_handle_intr_1;
for (branches = 0; branches < F_PBDMA_HANDLE_INTR_LAST; branches++) {
subtest_setup(m, branches);
unit_verbose(m, "%s branches=%s\n", __func__,
branches_str(branches, labels));
err_notifier = branches & F_PBDMA_HANDLE_INTR_ERR_NOTIFIER ?
&_err_notifier : NULL;
_err_notifier = INVALID_ERR_NOTIFIER;
nvgpu_writel(g, pbdma_intr_0_r(pbdma_id), 0);
nvgpu_writel(g, pbdma_intr_1_r(pbdma_id), 0);
if (branches & F_PBDMA_HANDLE_INTR_0_PENDING) {
nvgpu_writel(g, pbdma_intr_0_r(pbdma_id), BIT(0));
}
if (branches & F_PBDMA_HANDLE_INTR_1_PENDING) {
nvgpu_writel(g, pbdma_intr_1_r(pbdma_id), BIT(0));
}
recover = gm20b_pbdma_handle_intr(g, pbdma_id, err_notifier, &pbdma_status);
if (branches & F_PBDMA_HANDLE_INTR_0_PENDING) {
unit_assert(u.stubs.pbdma_handle_intr_0.count > 0,
goto done);
if (branches & F_PBDMA_HANDLE_INTR_0_RECOVER) {
unit_assert(recover, goto done);
}
}
if (branches & F_PBDMA_HANDLE_INTR_1_PENDING) {
unit_assert(u.stubs.pbdma_handle_intr_1.count > 0,
goto done);
if (branches & F_PBDMA_HANDLE_INTR_1_RECOVER) {
unit_assert(recover, goto done);
}
}
if (branches & F_PBDMA_HANDLE_INTR_ERR_NOTIFIER) {
unit_assert(*err_notifier != INVALID_ERR_NOTIFIER,
goto done);
}
}
ret = UNIT_SUCCESS;
done:
if (ret != UNIT_SUCCESS) {
unit_err(m, "%s branches=%s\n", __func__,
branches_str(branches, labels));
}
g->ops = gops;
return ret;
}
#define PBDMA_NUM_INTRS 6
#define METHOD_NO_SUBCH 0
@@ -585,7 +479,6 @@ done:
struct unit_module_test nvgpu_pbdma_gm20b_tests[] = {
UNIT_TEST(init_support, test_fifo_init_support, NULL, 0),
UNIT_TEST(pbdma_acquire_val, test_gm20b_pbdma_acquire_val, NULL, 0),
UNIT_TEST(pbdma_handle_intr, test_gm20b_pbdma_handle_intr, NULL, 0),
UNIT_TEST(pbdma_handle_intr_0, test_gm20b_pbdma_handle_intr_0, NULL, 0),
UNIT_TEST(pbdma_read_data, test_gm20b_pbdma_read_data, NULL, 0),
UNIT_TEST(pbdma_intr_descs, test_gm20b_pbdma_intr_descs, NULL, 0),

View File

@@ -75,12 +75,6 @@ struct preempt_gv11b_unit_ctx {
static struct preempt_gv11b_unit_ctx unit_ctx;
static void subtest_setup(u32 branches)
{
unit_ctx.branches = branches;
}
#define F_PREEMPT_TRIGGER_TSG BIT(0)
#define F_PREEMPT_TRIGGER_LAST BIT(1)
@@ -150,149 +144,8 @@ done:
return ret;
}
static bool stub_pbdma_handle_intr(struct gk20a *g, u32 pbdma_id,
u32 *error_notifier,
struct nvgpu_pbdma_status_info *pbdma_status)
static void stub_pbdma_handle_intr(struct gk20a *g, u32 pbdma_id, bool recover)
{
pbdma_status->chsw_status = stub.pbdma_st.chsw_status;
pbdma_status->id = stub.pbdma_st.id;
pbdma_status->next_id = stub.pbdma_st.next_id;
return false;
}
#define F_PREEMPT_POLL_PBDMA_TIMEOUT_INIT_FAIL BIT(0)
#define F_PREEMPT_POLL_PBDMA_PLATFORM_SILICON BIT(1)
#define F_PREEMPT_POLL_PBDMA_CHSW_IS_VALID BIT(2)
#define F_PREEMPT_POLL_PBDMA_CHSW_IS_SAVE BIT(3)
#define F_PREEMPT_POLL_PBDMA_CHSW_IS_LOAD BIT(4)
#define F_PREEMPT_POLL_PBDMA_CHSW_IS_SWITCH BIT(5)
#define F_PREEMPT_POLL_PBDMA_STATUS_ID_IS_TSGID BIT(6)
#define F_PREEMPT_POLL_PBDMA_STATUS_NEXT_ID_IS_TSGID BIT(7)
#define F_PREEMPT_POLL_PRE_SI_RETRIES BIT(8)
#define F_PREEMPT_POLL_PBDMA_LAST BIT(9)
static const char *f_preempt_poll_pbdma[] = {
"timeout_init_fail",
"platform_silicon",
"chsw_is_valid",
"chsw_is_save",
"chsw_is_load",
"chsw_is_switch",
"status_id_is_tsgid",
"status_next_id_is_tsgid",
};
int test_gv11b_fifo_preempt_poll_pbdma(struct unit_module *m, struct gk20a *g,
void *args)
{
u32 tsgid = 0U;
int ret = UNIT_FAIL;
int err;
u32 branches = 0U;
u32 prune = F_PREEMPT_POLL_PBDMA_TIMEOUT_INIT_FAIL |
F_PREEMPT_POLL_PRE_SI_RETRIES;
struct gpu_ops gops = g->ops;
struct nvgpu_posix_fault_inj *timers_fi;
struct nvgpu_os_posix *p = nvgpu_os_posix_from_gk20a(g);
timers_fi = nvgpu_timers_get_fault_injection();
g->ops.pbdma.handle_intr = stub_pbdma_handle_intr;
for (branches = 0U; branches < F_PREEMPT_POLL_PBDMA_LAST; branches++) {
if (pruned(branches, prune)) {
unit_verbose(m, "%s branches=%s (pruned)\n", __func__,
branches_str(branches,
f_preempt_poll_pbdma));
continue;
}
subtest_setup(branches);
unit_verbose(m, "%s branches=%s\n",
__func__, branches_str(branches, f_preempt_poll_pbdma));
nvgpu_posix_enable_fault_injection(timers_fi,
branches & F_PREEMPT_POLL_PBDMA_TIMEOUT_INIT_FAIL ?
true : false, 0);
if (branches & F_PREEMPT_POLL_PRE_SI_RETRIES) {
/* Timeout should not expire */
nvgpu_posix_enable_fault_injection(timers_fi, true,
PREEMPT_PENDING_POLL_PRE_SI_RETRIES + 4U);
/* Force pbdma status = chsw_valid */
branches |= F_PREEMPT_POLL_PBDMA_CHSW_IS_VALID;
/* Force tsgid = pbdma_status id */
branches |= F_PREEMPT_POLL_PBDMA_STATUS_ID_IS_TSGID;
}
p->is_silicon =
branches & F_PREEMPT_POLL_PBDMA_PLATFORM_SILICON ?
true : false;
if (branches & F_PREEMPT_POLL_PBDMA_CHSW_IS_VALID) {
stub.pbdma_st.chsw_status =
NVGPU_PBDMA_CHSW_STATUS_VALID;
} else if (branches & F_PREEMPT_POLL_PBDMA_CHSW_IS_SAVE) {
stub.pbdma_st.chsw_status =
NVGPU_PBDMA_CHSW_STATUS_SAVE;
} else if (branches & F_PREEMPT_POLL_PBDMA_CHSW_IS_LOAD) {
stub.pbdma_st.chsw_status =
NVGPU_PBDMA_CHSW_STATUS_LOAD;
} else if (branches & F_PREEMPT_POLL_PBDMA_CHSW_IS_SWITCH) {
stub.pbdma_st.chsw_status =
NVGPU_PBDMA_CHSW_STATUS_SWITCH;
} else {
stub.pbdma_st.chsw_status =
NVGPU_PBDMA_CHSW_STATUS_INVALID;
}
stub.pbdma_st.id =
branches & F_PREEMPT_POLL_PBDMA_STATUS_ID_IS_TSGID ?
tsgid : tsgid + 1U;
stub.pbdma_st.next_id = branches &
F_PREEMPT_POLL_PBDMA_STATUS_NEXT_ID_IS_TSGID ?
tsgid : tsgid + 1U;
err = gv11b_fifo_preempt_poll_pbdma(g, tsgid, 0U);
if (branches & F_PREEMPT_POLL_PBDMA_TIMEOUT_INIT_FAIL) {
unit_assert(err == -ETIMEDOUT, goto done);
} else if ((branches & F_PREEMPT_POLL_PBDMA_CHSW_IS_VALID) ||
branches & F_PREEMPT_POLL_PBDMA_CHSW_IS_SAVE) {
if (branches &
F_PREEMPT_POLL_PBDMA_STATUS_ID_IS_TSGID) {
unit_assert(err == -EBUSY, goto done);
} else {
unit_assert(err == 0, goto done);
}
} else if (branches & F_PREEMPT_POLL_PBDMA_CHSW_IS_LOAD) {
if (branches &
F_PREEMPT_POLL_PBDMA_STATUS_NEXT_ID_IS_TSGID) {
unit_assert(err == -EBUSY, goto done);
} else {
unit_assert(err == 0, goto done);
}
} else if (branches & F_PREEMPT_POLL_PBDMA_CHSW_IS_SWITCH) {
if ((branches &
F_PREEMPT_POLL_PBDMA_STATUS_ID_IS_TSGID) ||
(branches &
F_PREEMPT_POLL_PBDMA_STATUS_NEXT_ID_IS_TSGID)) {
unit_assert(err == -EBUSY, goto done);
} else {
unit_assert(err == 0, goto done);
}
}
nvgpu_posix_enable_fault_injection(timers_fi, false, 0);
}
ret = UNIT_SUCCESS;
done:
if (ret != UNIT_SUCCESS) {
unit_err(m, "%s branches=%s\n", __func__,
branches_str(branches, f_preempt_poll_pbdma));
}
g->ops = gops;
return ret;
}
static int stub_fifo_preempt_tsg(struct gk20a *g, struct nvgpu_tsg *tsg)
@@ -672,7 +525,6 @@ struct unit_module_test nvgpu_preempt_gv11b_tests[] = {
UNIT_TEST(init_support, test_fifo_init_support, &unit_ctx, 0),
UNIT_TEST(preempt_trigger, test_gv11b_fifo_preempt_trigger, NULL, 0),
UNIT_TEST(preempt_runlists_for_rc, test_gv11b_fifo_preempt_runlists_for_rc, NULL, 0),
UNIT_TEST(preempt_poll_pbdma, test_gv11b_fifo_preempt_poll_pbdma, NULL, 0),
UNIT_TEST(preempt_channel, test_gv11b_fifo_preempt_channel, NULL, 0),
UNIT_TEST(preempt_tsg, test_gv11b_fifo_preempt_tsg, NULL, 0),
UNIT_TEST(is_preempt_pending, test_gv11b_fifo_is_preempt_pending, NULL, 0),