diff --git a/drivers/gpu/nvgpu/common/fifo/pbdma_status.c b/drivers/gpu/nvgpu/common/fifo/pbdma_status.c
index 0c214d273..4b434ac21 100644
--- a/drivers/gpu/nvgpu/common/fifo/pbdma_status.c
+++ b/drivers/gpu/nvgpu/common/fifo/pbdma_status.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -42,6 +42,11 @@ bool nvgpu_pbdma_status_is_chsw_valid(struct nvgpu_pbdma_status_info
 {
 	return pbdma_status->chsw_status == NVGPU_PBDMA_CHSW_STATUS_VALID;
 }
+bool nvgpu_pbdma_status_ch_not_loaded(struct nvgpu_pbdma_status_info
+		*pbdma_status)
+{
+	return pbdma_status->chsw_status == NVGPU_PBDMA_CHSW_STATUS_INVALID;
+}
 bool nvgpu_pbdma_status_is_id_type_tsg(struct nvgpu_pbdma_status_info
 	*pbdma_status)
 {
diff --git a/drivers/gpu/nvgpu/common/fifo/tsg.c b/drivers/gpu/nvgpu/common/fifo/tsg.c
index 8e22be557..df68830f9 100644
--- a/drivers/gpu/nvgpu/common/fifo/tsg.c
+++ b/drivers/gpu/nvgpu/common/fifo/tsg.c
@@ -621,9 +621,8 @@ void nvgpu_tsg_set_error_notifier(struct gk20a *g, struct nvgpu_tsg *tsg,
 		u32 error_notifier)
 {
 	struct nvgpu_channel *ch = NULL;
-	u32 max_error_notifier_id = NVGPU_ERR_NOTIFIER_PBDMA_PUSHBUFFER_CRC_MISMATCH;
 
-	if (error_notifier > max_error_notifier_id) {
+	if (error_notifier >= NVGPU_ERR_NOTIFIER_INVAL) {
 		return;
 	}
 
diff --git a/drivers/gpu/nvgpu/common/rc/rc.c b/drivers/gpu/nvgpu/common/rc/rc.c
index ddab7014e..23023f9b3 100644
--- a/drivers/gpu/nvgpu/common/rc/rc.c
+++ b/drivers/gpu/nvgpu/common/rc/rc.c
@@ -35,6 +35,7 @@
 #include
 #include
 #include
+#include
 
 void nvgpu_rc_fifo_recover(struct gk20a *g,
 		u32 eng_bitmask, u32 hw_id, bool id_is_tsg,
@@ -94,11 +95,18 @@ void nvgpu_rc_ctxsw_timeout(struct gk20a *g, u32 eng_bitmask,
 #endif
 }
 
-void nvgpu_rc_pbdma_fault(struct gk20a *g, u32 pbdma_id, u32 error_notifier,
+int nvgpu_rc_pbdma_fault(struct gk20a *g, u32 pbdma_id, enum nvgpu_err_notif error_notifier,
 		struct nvgpu_pbdma_status_info *pbdma_status)
 {
-	u32 id;
 	u32 id_type = PBDMA_STATUS_ID_TYPE_INVALID;
+	int err = 0;
+	u32 id;
+
+	if (error_notifier >= NVGPU_ERR_NOTIFIER_INVAL) {
+		nvgpu_err(g, "Invalid error notifier %u", error_notifier);
+		err = -EINVAL;
+		goto out;
+	}
 
 	nvgpu_log(g, gpu_dbg_info, "pbdma id %d error notifier %d",
 			pbdma_id, error_notifier);
@@ -111,10 +119,14 @@ void nvgpu_rc_pbdma_fault(struct gk20a *g, u32 pbdma_id, u32 error_notifier,
 		   nvgpu_pbdma_status_is_chsw_switch(pbdma_status)) {
 		id = pbdma_status->next_id;
 		id_type = pbdma_status->next_id_type;
-	} else {
+	} else if (nvgpu_pbdma_status_ch_not_loaded(pbdma_status)) {
 		/* Nothing to do here */
-		nvgpu_err(g, "Invalid pbdma_status.id");
-		return;
+		nvgpu_log_info(g, "no channel loaded on pbdma.");
+		goto out;
+	} else {
+		nvgpu_err(g, "pbdma status not valid");
+		err = -EINVAL;
+		goto out;
 	}
 
 	if (id_type == PBDMA_STATUS_ID_TYPE_TSGID) {
@@ -128,7 +140,8 @@ void nvgpu_rc_pbdma_fault(struct gk20a *g, u32 pbdma_id, u32 error_notifier,
 		struct nvgpu_tsg *tsg;
 		if (ch == NULL) {
 			nvgpu_err(g, "channel is not referenceable");
-			return;
+			err = -EINVAL;
+			goto out;
 		}
 
 		tsg = nvgpu_tsg_from_ch(ch);
@@ -138,12 +151,21 @@ void nvgpu_rc_pbdma_fault(struct gk20a *g, u32 pbdma_id, u32 error_notifier,
 				RC_TYPE_PBDMA_FAULT);
 		} else {
 			nvgpu_err(g, "chid: %d is not bound to tsg", ch->chid);
+			err = -EINVAL;
 		}
 
 		nvgpu_channel_put(ch);
 	} else {
-		nvgpu_err(g, "Invalid pbdma_status.id_type");
+		nvgpu_err(g, "Invalid pbdma_status id_type or next_id_type");
+		err = -EINVAL;
 	}
+
+out:
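+	/* On any failure the fault was not recovered; quiesce to a safe state. */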
pbdma_status.id_type"); + nvgpu_err(g, "Invalid pbdma_status id_type or next_id_type"); + err = -EINVAL; } + +out: + if (err != 0) { + nvgpu_sw_quiesce(g); + } + + return err; } void nvgpu_rc_runlist_update(struct gk20a *g, u32 runlist_id) diff --git a/drivers/gpu/nvgpu/hal/fifo/fifo_intr_ga10b.h b/drivers/gpu/nvgpu/hal/fifo/fifo_intr_ga10b.h index fad9272d2..cda6624f3 100644 --- a/drivers/gpu/nvgpu/hal/fifo/fifo_intr_ga10b.h +++ b/drivers/gpu/nvgpu/hal/fifo/fifo_intr_ga10b.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -36,7 +36,6 @@ void ga10b_fifo_intr_0_isr(struct gk20a *g); void ga10b_fifo_intr_set_recover_mask(struct gk20a *g); void ga10b_fifo_intr_unset_recover_mask(struct gk20a *g); -void ga10b_fifo_pbdma_isr(struct gk20a *g, struct nvgpu_runlist *runlist, u32 pbdma_idx); void ga10b_fifo_runlist_intr_retrigger(struct gk20a *g, u32 intr_tree); #endif /* NVGPU_FIFO_INTR_GA10B_H */ diff --git a/drivers/gpu/nvgpu/hal/fifo/fifo_intr_ga10b_fusa.c b/drivers/gpu/nvgpu/hal/fifo/fifo_intr_ga10b_fusa.c index cfa0deb3d..df67ce015 100644 --- a/drivers/gpu/nvgpu/hal/fifo/fifo_intr_ga10b_fusa.c +++ b/drivers/gpu/nvgpu/hal/fifo/fifo_intr_ga10b_fusa.c @@ -317,6 +317,36 @@ static void ga10b_fifo_runlist_intr_clear(struct gk20a *g) } } +static int ga10b_fifo_pbdma_isr(struct gk20a *g, struct nvgpu_runlist *runlist, + u32 pbdma_idx) +{ + u32 pbdma_id; + const struct nvgpu_pbdma_info *pbdma_info; + int err; + + if (pbdma_idx >= PBDMA_PER_RUNLIST_SIZE) { + nvgpu_err(g, "pbdma_idx(%d) >= max_pbdmas_per_runlist(%d)", + pbdma_idx, PBDMA_PER_RUNLIST_SIZE); + return -EINVAL; + } + pbdma_info = runlist->pbdma_info; + pbdma_id = pbdma_info->pbdma_id[pbdma_idx]; + if (pbdma_id == PBDMA_ID_INVALID) { + nvgpu_err(g, "runlist_id(%d), pbdma_idx(%d): invalid PBDMA", + runlist->id, pbdma_idx); + return -EINVAL; + } + + err = g->ops.pbdma.handle_intr(g, pbdma_id, true); + if (err != 0) { + nvgpu_err(g, "pbdma intr failed id: %u", pbdma_idx); + return err; + } + + return err; +} + + void ga10b_fifo_intr_0_isr(struct gk20a *g) { u32 i, intr_0, handled_intr_0 = 0U; @@ -324,6 +354,7 @@ void ga10b_fifo_intr_0_isr(struct gk20a *g) u32 pbdma_idx = 0U; u32 intr_tree_0 = 0U, intr_tree_1 = 1U; struct nvgpu_runlist *runlist; + int err = 0; /* TODO: sw_ready is needed only for recovery part */ if (!g->fifo.sw_ready) { @@ -349,7 +380,17 @@ void ga10b_fifo_intr_0_isr(struct gk20a *g) pbdma_idx++) { if (intr_0 & runlist_intr_0_pbdmai_intr_tree_j_pending_f(pbdma_idx, intr_tree_0)) { - ga10b_fifo_pbdma_isr(g, runlist, pbdma_idx); + /** + * Quiesce is triggered as part of nvgpu_rc_pbdma_fault + * failure case, so - + * 1. Avoid looping through the rest of the PBDMAs by + * adding a return statement here. + * 2. Avoid re-triggering the PBDMA ISR by returning + * pbdma_intr field value here in handled_intr_0. 
 		if (g->ops.pbdma.handle_intr_1(g, pbdma_id, pbdma_intr_1,
-				&intr_error_notifier)) {
+				&intr_error_notifier) && (err == 0)) {
 			g->ops.pbdma_status.read_pbdma_status_info(g, pbdma_id,
 				&pbdma_status);
 			if (recover) {
-				nvgpu_rc_pbdma_fault(g, pbdma_id,
+				err = nvgpu_rc_pbdma_fault(g, pbdma_id,
 					intr_error_notifier, &pbdma_status);
+				if (err != 0) {
+					nvgpu_err(g, "recovery failed");
+				}
 			}
 		}
 		nvgpu_writel(g, pbdma_intr_1_r(pbdma_id), pbdma_intr_1);
 	}
+
+	return err;
 }
 
 static bool ga10b_pbdma_handle_intr_0_legacy(struct gk20a *g, u32 pbdma_id,
diff --git a/drivers/gpu/nvgpu/hal/fifo/pbdma_gm20b.h b/drivers/gpu/nvgpu/hal/fifo/pbdma_gm20b.h
index 0a99a7c4a..191bc6434 100644
--- a/drivers/gpu/nvgpu/hal/fifo/pbdma_gm20b.h
+++ b/drivers/gpu/nvgpu/hal/fifo/pbdma_gm20b.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2014-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2014-2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -34,7 +34,7 @@ struct nvgpu_device;
 
 bool gm20b_pbdma_handle_intr_0(struct gk20a *g, u32 pbdma_id,
 	u32 pbdma_intr_0, u32 *error_notifier);
-void gm20b_pbdma_handle_intr(struct gk20a *g, u32 pbdma_id, bool recover);
+int gm20b_pbdma_handle_intr(struct gk20a *g, u32 pbdma_id, bool recover);
 u32 gm20b_pbdma_read_data(struct gk20a *g, u32 pbdma_id);
 void gm20b_pbdma_reset_header(struct gk20a *g, u32 pbdma_id);
diff --git a/drivers/gpu/nvgpu/hal/fifo/pbdma_gm20b_fusa.c b/drivers/gpu/nvgpu/hal/fifo/pbdma_gm20b_fusa.c
index 695c03a73..dd0b056b5 100644
--- a/drivers/gpu/nvgpu/hal/fifo/pbdma_gm20b_fusa.c
+++ b/drivers/gpu/nvgpu/hal/fifo/pbdma_gm20b_fusa.c
@@ -323,10 +323,11 @@ u32 gm20b_pbdma_restartable_0_intr_descs(void)
 	return restartable_0_intr_descs;
 }
 
-void gm20b_pbdma_handle_intr(struct gk20a *g, u32 pbdma_id, bool recover)
+int gm20b_pbdma_handle_intr(struct gk20a *g, u32 pbdma_id, bool recover)
 {
 	struct nvgpu_pbdma_status_info pbdma_status;
 	u32 intr_error_notifier = NVGPU_ERR_NOTIFIER_PBDMA_ERROR;
+	int err = 0;
 
 	u32 pbdma_intr_0 = nvgpu_readl(g, pbdma_intr_0_r(pbdma_id));
 	u32 pbdma_intr_1 = nvgpu_readl(g, pbdma_intr_1_r(pbdma_id));
@@ -341,9 +342,12 @@ void gm20b_pbdma_handle_intr(struct gk20a *g, u32 pbdma_id, bool recover)
 		g->ops.pbdma_status.read_pbdma_status_info(g, pbdma_id,
 			&pbdma_status);
 		if (recover) {
-			nvgpu_rc_pbdma_fault(g, pbdma_id,
+			err = nvgpu_rc_pbdma_fault(g, pbdma_id,
 				intr_error_notifier, &pbdma_status);
+			if (err != 0) {
+				nvgpu_err(g, "recovery failed");
+			}
 		}
 	}
 	nvgpu_writel(g, pbdma_intr_0_r(pbdma_id), pbdma_intr_0);
@@ -355,18 +359,23 @@ void gm20b_pbdma_handle_intr(struct gk20a *g, u32 pbdma_id, bool recover)
 			pbdma_id, pbdma_intr_1);
 
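+		/*
+		 * Skip intr_1 recovery when intr_0 recovery already failed;
+		 * err preserves the first failure and quiesce is under way.
+		 */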
 		if (g->ops.pbdma.handle_intr_1(g, pbdma_id, pbdma_intr_1,
-				&intr_error_notifier)) {
+				&intr_error_notifier) && (err == 0)) {
 			g->ops.pbdma_status.read_pbdma_status_info(g, pbdma_id,
 				&pbdma_status);
 			if (recover) {
-				nvgpu_rc_pbdma_fault(g, pbdma_id,
+				err = nvgpu_rc_pbdma_fault(g, pbdma_id,
 					intr_error_notifier, &pbdma_status);
+				if (err != 0) {
+					nvgpu_err(g, "recovery failed");
+				}
 			}
 		}
 		nvgpu_writel(g, pbdma_intr_1_r(pbdma_id), pbdma_intr_1);
 	}
+
+	return err;
 }
 
 u32 gm20b_pbdma_get_gp_base(u64 gpfifo_base)
diff --git a/drivers/gpu/nvgpu/hal/fifo/preempt_gv11b_fusa.c b/drivers/gpu/nvgpu/hal/fifo/preempt_gv11b_fusa.c
index ff10839e4..38d202a5f 100644
--- a/drivers/gpu/nvgpu/hal/fifo/preempt_gv11b_fusa.c
+++ b/drivers/gpu/nvgpu/hal/fifo/preempt_gv11b_fusa.c
@@ -131,7 +131,11 @@ int gv11b_fifo_preempt_poll_pbdma(struct gk20a *g, u32 tsgid,
		 * reported to SW.
		 */
 
-		g->ops.pbdma.handle_intr(g, pbdma_id, false);
+		ret = g->ops.pbdma.handle_intr(g, pbdma_id, false);
+		if (ret != 0) {
+			nvgpu_err(g, "pbdma intr failed id: %u %d", pbdma_id, ret);
+			break;
+		}
 
 		g->ops.pbdma_status.read_pbdma_status_info(g, pbdma_id,
 			&pbdma_status);
diff --git a/drivers/gpu/nvgpu/include/nvgpu/error_notifier.h b/drivers/gpu/nvgpu/include/nvgpu/error_notifier.h
index d97c67002..d01cdacf2 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/error_notifier.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/error_notifier.h
@@ -27,7 +27,7 @@
 
 struct nvgpu_channel;
 
-enum {
+enum nvgpu_err_notif {
	NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT = 0,
	NVGPU_ERR_NOTIFIER_GR_ERROR_SW_METHOD,
	NVGPU_ERR_NOTIFIER_GR_ERROR_SW_NOTIFY,
@@ -40,6 +40,7 @@ enum {
	NVGPU_ERR_NOTIFIER_RESETCHANNEL_VERIF_ERROR,
	NVGPU_ERR_NOTIFIER_PBDMA_PUSHBUFFER_CRC_MISMATCH,
	NVGPU_ERR_NOTIFIER_CE_ERROR,
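+	/* Sentinel; must stay last, used as the bound in validity checks. */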
+	NVGPU_ERR_NOTIFIER_INVAL,
 };
 
 void nvgpu_set_err_notifier_locked(struct nvgpu_channel *ch, u32 error);
diff --git a/drivers/gpu/nvgpu/include/nvgpu/gops/pbdma.h b/drivers/gpu/nvgpu/include/nvgpu/gops/pbdma.h
index d903afc5c..722d4b508 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/gops/pbdma.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/gops/pbdma.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -51,7 +51,7 @@ struct gops_pbdma {
	bool (*handle_intr_1)(struct gk20a *g,
			u32 pbdma_id, u32 pbdma_intr_1,
			u32 *error_notifier);
-	void (*handle_intr)(struct gk20a *g, u32 pbdma_id, bool recover);
+	int (*handle_intr)(struct gk20a *g, u32 pbdma_id, bool recover);
	u32 (*set_clear_intr_offsets) (struct gk20a *g,
			u32 set_clear_size);
	u32 (*get_signature)(struct gk20a *g);
diff --git a/drivers/gpu/nvgpu/include/nvgpu/pbdma_status.h b/drivers/gpu/nvgpu/include/nvgpu/pbdma_status.h
index 95c9e78ae..c0ca97756 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/pbdma_status.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/pbdma_status.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -150,6 +150,17 @@ bool nvgpu_pbdma_status_is_chsw_save(struct nvgpu_pbdma_status_info
 */
 bool nvgpu_pbdma_status_is_chsw_valid(struct nvgpu_pbdma_status_info
		*pbdma_status);
+/**
+ * @brief Check if chsw_status is set to invalid.
+ *
+ * @param pbdma_status [in]	Pointer to struct containing pbdma_status h/w
+ *				reg/field value.
+ *
+ * @return True if chsw_status is #NVGPU_PBDMA_CHSW_STATUS_INVALID (no channel
+ *         loaded on the PBDMA), false otherwise.
+ */
+bool nvgpu_pbdma_status_ch_not_loaded(struct nvgpu_pbdma_status_info
+		*pbdma_status);
 /**
 * @brief Check if id_type is tsg.
 *
diff --git a/drivers/gpu/nvgpu/include/nvgpu/rc.h b/drivers/gpu/nvgpu/include/nvgpu/rc.h
index 08cbb7e14..a75ed9f13 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/rc.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/rc.h
@@ -103,6 +103,7 @@ struct nvgpu_tsg;
 struct nvgpu_channel;
 struct nvgpu_pbdma_status_info;
 struct mmu_fault_info;
+enum nvgpu_err_notif;
 
 static inline const char *nvgpu_rc_type_to_str(unsigned int rc_type)
 {
@@ -172,8 +173,16 @@ void nvgpu_rc_ctxsw_timeout(struct gk20a *g, u32 eng_bitmask,
 *
 * Do PBDMA fault recovery. Set error notifier as per \a error_notifier and call
 * \a nvgpu_rc_tsg_and_related_engines to do the recovery.
+ *
+ * @return 0 in case of success, < 0 in case of failure.
+ * @retval -EINVAL in the following cases:
+ * 1. the error_notifier is invalid.
+ * 2. the pbdma status is invalid.
+ * 3. the channel is not referenceable.
+ * 4. the channel is not bound to a TSG.
+ * 5. the id_type or next_id_type does not indicate a channel or TSG id.
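+ *
+ * @note On any failure, SW quiesce (nvgpu_sw_quiesce) is initiated before the
+ * error is returned.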
 */
-void nvgpu_rc_pbdma_fault(struct gk20a *g, u32 pbdma_id, u32 error_notifier,
+int nvgpu_rc_pbdma_fault(struct gk20a *g, u32 pbdma_id, enum nvgpu_err_notif error_notifier,
		struct nvgpu_pbdma_status_info *pbdma_status);
 
 /**
diff --git a/libs/dgpu/libnvgpu-drv-dgpu_safe.export b/libs/dgpu/libnvgpu-drv-dgpu_safe.export
index 0be8774aa..2a7e0b6d5 100644
--- a/libs/dgpu/libnvgpu-drv-dgpu_safe.export
+++ b/libs/dgpu/libnvgpu-drv-dgpu_safe.export
@@ -594,6 +594,7 @@ nvgpu_pbdma_status_is_chsw_load
 nvgpu_pbdma_status_is_chsw_save
 nvgpu_pbdma_status_is_chsw_switch
 nvgpu_pbdma_status_is_chsw_valid
+nvgpu_pbdma_status_ch_not_loaded
 nvgpu_pbdma_status_is_id_type_tsg
 nvgpu_pbdma_status_is_next_id_type_tsg
 nvgpu_pbdma_setup_sw
diff --git a/libs/igpu/libnvgpu-drv-igpu_safe.export b/libs/igpu/libnvgpu-drv-igpu_safe.export
index 041be4cde..be292eb22 100644
--- a/libs/igpu/libnvgpu-drv-igpu_safe.export
+++ b/libs/igpu/libnvgpu-drv-igpu_safe.export
@@ -614,6 +614,7 @@ nvgpu_pbdma_status_is_chsw_switch
 nvgpu_pbdma_status_is_chsw_valid
 nvgpu_pbdma_status_is_id_type_tsg
 nvgpu_pbdma_status_is_next_id_type_tsg
+nvgpu_pbdma_status_ch_not_loaded
 nvgpu_pbdma_setup_sw
 nvgpu_pd_alloc
 nvgpu_pd_cache_fini
diff --git a/userspace/units/fifo/fifo/gk20a/nvgpu-fifo-intr-gk20a.c b/userspace/units/fifo/fifo/gk20a/nvgpu-fifo-intr-gk20a.c
index 0d2a1d1ac..024c81da9 100644
--- a/userspace/units/fifo/fifo/gk20a/nvgpu-fifo-intr-gk20a.c
+++ b/userspace/units/fifo/fifo/gk20a/nvgpu-fifo-intr-gk20a.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -172,13 +172,15 @@ done:
	return ret;
 }
 
-static void stub_pbdma_handle_intr(struct gk20a *g, u32 pbdma_id, bool recover)
+static int stub_pbdma_handle_intr(struct gk20a *g, u32 pbdma_id, bool recover)
 {
	if (nvgpu_readl(g, fifo_intr_pbdma_id_r()) != BIT(pbdma_id)) {
		u.fail = true;
	}
	u.count++;
+
+	return 0;
 }
 
 int test_gk20a_fifo_pbdma_isr(struct unit_module *m,
diff --git a/userspace/units/fifo/pbdma/nvgpu-pbdma.c b/userspace/units/fifo/pbdma/nvgpu-pbdma.c
index c10d3f09f..14ab9a62e 100644
--- a/userspace/units/fifo/pbdma/nvgpu-pbdma.c
+++ b/userspace/units/fifo/pbdma/nvgpu-pbdma.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2018-2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -180,6 +180,9 @@ int test_pbdma_status(struct unit_module *m,
		unit_assert(nvgpu_pbdma_status_is_chsw_valid(&pbdma_status) ==
			(pbdma_status.chsw_status ==
				NVGPU_PBDMA_CHSW_STATUS_VALID), goto done);
+		unit_assert(nvgpu_pbdma_status_ch_not_loaded(&pbdma_status) ==
+			(pbdma_status.chsw_status ==
+				NVGPU_PBDMA_CHSW_STATUS_INVALID), goto done);
	}
 
	pbdma_status.id_type = PBDMA_STATUS_ID_TYPE_CHID;
diff --git a/userspace/units/fifo/preempt/gv11b/nvgpu-preempt-gv11b.c b/userspace/units/fifo/preempt/gv11b/nvgpu-preempt-gv11b.c
index e86b4e56d..ecea5b68f 100644
--- a/userspace/units/fifo/preempt/gv11b/nvgpu-preempt-gv11b.c
+++ b/userspace/units/fifo/preempt/gv11b/nvgpu-preempt-gv11b.c
@@ -144,8 +144,9 @@ done:
	return ret;
 }
 
-static void stub_pbdma_handle_intr(struct gk20a *g, u32 pbdma_id, bool recover)
+static int stub_pbdma_handle_intr(struct gk20a *g, u32 pbdma_id, bool recover)
 {
+	return 0;
 }
 
 static int stub_fifo_preempt_tsg(struct gk20a *g, struct nvgpu_tsg *tsg)
diff --git a/userspace/units/fifo/tsg/nvgpu-tsg.c b/userspace/units/fifo/tsg/nvgpu-tsg.c
index 3ad96acd5..a7d8e4adf 100644
--- a/userspace/units/fifo/tsg/nvgpu-tsg.c
+++ b/userspace/units/fifo/tsg/nvgpu-tsg.c
@@ -1579,7 +1579,7 @@ int test_nvgpu_tsg_set_error_notifier_bvec(struct unit_module *m,
	int ret = 0;
	u32 valid_error_notifier_ids[][2] = {{NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT,
		NVGPU_ERR_NOTIFIER_PBDMA_PUSHBUFFER_CRC_MISMATCH}};
-	u32 invalid_error_notifier_ids[][2] = {{NVGPU_ERR_NOTIFIER_PBDMA_PUSHBUFFER_CRC_MISMATCH + 1, U32_MAX}};
+	u32 invalid_error_notifier_ids[][2] = {{NVGPU_ERR_NOTIFIER_INVAL, U32_MAX}};
	u32 (*working_list)[2];
	u32 error_code, error_notifier_range_len;
	/*
diff --git a/userspace/units/fifo/tsg/nvgpu-tsg.h b/userspace/units/fifo/tsg/nvgpu-tsg.h
index 161cfbfb8..4cbebfc09 100644
--- a/userspace/units/fifo/tsg/nvgpu-tsg.h
+++ b/userspace/units/fifo/tsg/nvgpu-tsg.h
@@ -448,7 +448,7 @@ int test_tsg_mark_error(struct unit_module *m,
 * Input: None
 * Equivalence classes:
 * error_notifier
- * - Invalid : { NVGPU_ERR_NOTIFIER_PBDMA_PUSHBUFFER_CRC_MISMATCH + 1, U32_MAX }
+ * - Invalid : { NVGPU_ERR_NOTIFIER_INVAL, U32_MAX }
 * - Valid : { NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT, NVGPU_ERR_NOTIFIER_PBDMA_PUSHBUFFER_CRC_MISMATCH }
 *
 * Steps:
diff --git a/userspace/units/rc/nvgpu-rc.c b/userspace/units/rc/nvgpu-rc.c
index 457006142..cb59283ca 100644
--- a/userspace/units/rc/nvgpu-rc.c
+++ b/userspace/units/rc/nvgpu-rc.c
@@ -23,6 +23,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -146,6 +147,9 @@ int test_rc_init(struct unit_module *m, struct gk20a *g, void *args)
		goto clear_posix_channel;
	}
 
+	/* initialize the seed for random number generation needed in BVEC tests */
+	srand(time(0));
+
	return UNIT_SUCCESS;
 
 clear_posix_channel:
@@ -294,13 +298,18 @@ int test_rc_mmu_fault(struct unit_module *m, struct gk20a *g, void *args)
	return UNIT_SUCCESS;
 }
 
-#define F_RC_IS_CHSW_VALID_OR_SAVE	0U
-#define F_RC_IS_CHSW_LOAD_OR_SWITCH	1U
-#define F_RC_IS_CHSW_INVALID		2U
+#define F_RC_IS_CHSW_VALID_OR_SAVE		0U
+#define F_RC_IS_CHSW_LOAD_OR_SWITCH		1U
+#define F_RC_IS_CHSW_INVALID			2U
+#define F_RC_IS_CHSW_INVALID_STATE_MIN		3U
+#define F_RC_IS_CHSW_INVALID_STATE_RANDOM	4U
+#define F_RC_IS_CHSW_INVALID_STATE_MAX		5U
 
 #define F_RC_ID_TYPE_TSG		0U
 #define F_RC_ID_TYPE_CH			1U
-#define F_RC_ID_TYPE_INVALID		2U
+#define F_RC_ID_TYPE_INVALID_MIN	2U
+#define F_RC_ID_TYPE_INVALID_RANDOM	3U
+#define F_RC_ID_TYPE_INVALID_MAX	4U
 
 #define F_RC_ID_TYPE_CH_NULL_CHANNEL	0U
 #define F_RC_ID_TYPE_CH_NULL_TSG	1U
@@ -309,13 +318,18 @@ int test_rc_mmu_fault(struct unit_module *m, struct gk20a *g, void *args)
 static const char *f_rc_chsw[] = {
	"is_chsw_valid_or_save",
	"is_chsw_load_or_switch",
-	"is_chsw_invalid",
+	"is_chsw_invalid (no channel loaded)",
+	"is_chsw_inval_min",
+	"is_chsw_inval_random",
+	"is_chsw_inval_max",
 };
 
 static const char *f_rc_id_type[] = {
	"id_type_tsg",
	"id_type_ch",
-	"id_type_invalid",
+	"id_type_invalid_min",
+	"id_type_invalid_random",
+	"id_type_invalid_max",
 };
 
 static const char *f_rc_id_ch_subbranch[] = {
@@ -366,7 +380,13 @@ static void set_pbdma_info_id_type(u32 chsw_branches,
			info->next_id_type = (chsw_branches == F_RC_IS_CHSW_LOAD_OR_SWITCH) ?
				PBDMA_STATUS_NEXT_ID_TYPE_CHID : PBDMA_STATUS_NEXT_ID_TYPE_INVALID;
		}
-	} else {
+	} else if (id_type_branches == F_RC_ID_TYPE_INVALID_MIN) {
+		info->id_type = PBDMA_STATUS_ID_TYPE_TSGID + 1;
+		info->next_id_type = PBDMA_STATUS_ID_TYPE_TSGID + 1;
+	} else if (id_type_branches == F_RC_ID_TYPE_INVALID_RANDOM) {
+		info->id_type = PBDMA_STATUS_ID_TYPE_TSGID + 2 + get_random_u32(PBDMA_STATUS_ID_TYPE_TSGID + 1, U32_MAX);
+		info->next_id_type = PBDMA_STATUS_ID_TYPE_TSGID + 2 + get_random_u32(PBDMA_STATUS_ID_TYPE_TSGID + 1, U32_MAX);
+	} else if (id_type_branches == F_RC_ID_TYPE_INVALID_MAX) {
		info->id_type = PBDMA_STATUS_ID_INVALID;
		info->next_id_type = PBDMA_STATUS_ID_INVALID;
	}
@@ -374,7 +394,13 @@
 
 int test_rc_pbdma_fault(struct unit_module *m, struct gk20a *g, void *args)
 {
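+	/* Two valid notifiers, then NVGPU_ERR_NOTIFIER_INVAL and two larger invalid values. */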
* diff --git a/drivers/gpu/nvgpu/include/nvgpu/rc.h b/drivers/gpu/nvgpu/include/nvgpu/rc.h index 08cbb7e14..a75ed9f13 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/rc.h +++ b/drivers/gpu/nvgpu/include/nvgpu/rc.h @@ -103,6 +103,7 @@ struct nvgpu_tsg; struct nvgpu_channel; struct nvgpu_pbdma_status_info; struct mmu_fault_info; +enum nvgpu_err_notif; static inline const char *nvgpu_rc_type_to_str(unsigned int rc_type) { @@ -172,8 +173,16 @@ void nvgpu_rc_ctxsw_timeout(struct gk20a *g, u32 eng_bitmask, * * Do PBDMA fault recovery. Set error notifier as per \a error_notifier and call * \a nvgpu_rc_tsg_and_related_engines to do the recovery. + * + * @return 0 in case of success, < 0 in case of failure. + * @retval -EINVAL in case of following cases: + * 1. the error_notifier is invalid. + * 2. the pbdma status is invalid. + * 3. the channel is not referenceable. + * 4. the channel is not bound to tsg. + * 5. the id type or next_id type are not indicating channel id type or tsg id type. */ -void nvgpu_rc_pbdma_fault(struct gk20a *g, u32 pbdma_id, u32 error_notifier, +int nvgpu_rc_pbdma_fault(struct gk20a *g, u32 pbdma_id, enum nvgpu_err_notif error_notifier, struct nvgpu_pbdma_status_info *pbdma_status); /** diff --git a/libs/dgpu/libnvgpu-drv-dgpu_safe.export b/libs/dgpu/libnvgpu-drv-dgpu_safe.export index 0be8774aa..2a7e0b6d5 100644 --- a/libs/dgpu/libnvgpu-drv-dgpu_safe.export +++ b/libs/dgpu/libnvgpu-drv-dgpu_safe.export @@ -594,6 +594,7 @@ nvgpu_pbdma_status_is_chsw_load nvgpu_pbdma_status_is_chsw_save nvgpu_pbdma_status_is_chsw_switch nvgpu_pbdma_status_is_chsw_valid +nvgpu_pbdma_status_ch_not_loaded nvgpu_pbdma_status_is_id_type_tsg nvgpu_pbdma_status_is_next_id_type_tsg nvgpu_pbdma_setup_sw diff --git a/libs/igpu/libnvgpu-drv-igpu_safe.export b/libs/igpu/libnvgpu-drv-igpu_safe.export index 041be4cde..be292eb22 100644 --- a/libs/igpu/libnvgpu-drv-igpu_safe.export +++ b/libs/igpu/libnvgpu-drv-igpu_safe.export @@ -614,6 +614,7 @@ nvgpu_pbdma_status_is_chsw_switch nvgpu_pbdma_status_is_chsw_valid nvgpu_pbdma_status_is_id_type_tsg nvgpu_pbdma_status_is_next_id_type_tsg +nvgpu_pbdma_status_ch_not_loaded nvgpu_pbdma_setup_sw nvgpu_pd_alloc nvgpu_pd_cache_fini diff --git a/userspace/units/fifo/fifo/gk20a/nvgpu-fifo-intr-gk20a.c b/userspace/units/fifo/fifo/gk20a/nvgpu-fifo-intr-gk20a.c index 0d2a1d1ac..024c81da9 100644 --- a/userspace/units/fifo/fifo/gk20a/nvgpu-fifo-intr-gk20a.c +++ b/userspace/units/fifo/fifo/gk20a/nvgpu-fifo-intr-gk20a.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -172,13 +172,15 @@ done: return ret; } -static void stub_pbdma_handle_intr(struct gk20a *g, u32 pbdma_id, bool recover) +static int stub_pbdma_handle_intr(struct gk20a *g, u32 pbdma_id, bool recover) { if (nvgpu_readl(g, fifo_intr_pbdma_id_r()) != BIT(pbdma_id)) { u.fail = true; } u.count++; + + return 0; } int test_gk20a_fifo_pbdma_isr(struct unit_module *m, diff --git a/userspace/units/fifo/pbdma/nvgpu-pbdma.c b/userspace/units/fifo/pbdma/nvgpu-pbdma.c index c10d3f09f..14ab9a62e 100644 --- a/userspace/units/fifo/pbdma/nvgpu-pbdma.c +++ b/userspace/units/fifo/pbdma/nvgpu-pbdma.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2018-2022, NVIDIA CORPORATION. All rights reserved. 
-	for (chsw_branches = F_RC_IS_CHSW_VALID_OR_SAVE;
-		chsw_branches <= F_RC_IS_CHSW_INVALID; chsw_branches++) {
-		struct nvgpu_pbdma_status_info info = {0};
-
-		if (chsw_branches == F_RC_IS_CHSW_INVALID) {
-			info.chsw_status = NVGPU_PBDMA_CHSW_STATUS_INVALID;
-			unit_info(m, "%s branch: %s\n", __func__, f_rc_chsw[chsw_branches]);
-			nvgpu_rc_pbdma_fault(g, 0U, NVGPU_ERR_NOTIFIER_PBDMA_ERROR, &info);
-			continue;
+	for (i = 0; i < ARRAY_SIZE(notifiers); i++) {
+		err = nvgpu_rc_pbdma_fault(g, 0U, notifiers[i], &info);
+		if (err != (i < 2 ? 0 : -EINVAL)) {
+			unit_err(m, "fault processing error with notifier %d", notifiers[i]);
+			err = UNIT_FAIL;
+			goto out;
		}
+	}
 
-		for (chsw_subbranch = 0U; chsw_subbranch < 2U; chsw_subbranch++) {
+	for (chsw_branches = F_RC_IS_CHSW_VALID_OR_SAVE;
+		chsw_branches <= F_RC_IS_CHSW_LOAD_OR_SWITCH; chsw_branches++) {
+
+		for (chsw_subbranch = 0U; chsw_subbranch <= chsw_branches; chsw_subbranch++) {
			if (chsw_branches == F_RC_IS_CHSW_VALID_OR_SAVE) {
				info.chsw_status = (chsw_subbranch *
					NVGPU_PBDMA_CHSW_STATUS_VALID) +
@@ -411,7 +438,7 @@ int test_rc_pbdma_fault(struct unit_module *m, struct gk20a *g, void *args)
			}
		}
 
-		for (id_type_branches = F_RC_ID_TYPE_TSG; id_type_branches <= F_RC_ID_TYPE_INVALID;
+		for (id_type_branches = F_RC_ID_TYPE_TSG; id_type_branches <= F_RC_ID_TYPE_INVALID_MAX;
			id_type_branches++) {
			u32 id_type_ch_sub_branches = 0U;
			if (id_type_branches == F_RC_ID_TYPE_CH) {
@@ -425,27 +452,81 @@ int test_rc_pbdma_fault(struct unit_module *m, struct gk20a *g, void *args)
						f_rc_id_type[id_type_branches],
						f_rc_id_ch_subbranch[id_type_ch_sub_branches]);
 
-					nvgpu_rc_pbdma_fault(g, 0U, NVGPU_ERR_NOTIFIER_PBDMA_ERROR, &info);
+					err = nvgpu_rc_pbdma_fault(g, 0U, NVGPU_ERR_NOTIFIER_PBDMA_ERROR, &info);
+					if ((id_type_branches >= F_RC_ID_TYPE_INVALID_MIN) ||
+						(id_type_ch_sub_branches < F_RC_ID_TYPE_CH_FULL)) {
+						if (err != -EINVAL) {
+							unit_err(m, "invalid id type or null ch/tsg did not return -EINVAL");
+							err = UNIT_FAIL;
+							goto out;
+						}
+					} else if (err != 0) {
+						unit_err(m, "valid id type with full ch failed");
+						err = UNIT_FAIL;
+						goto out;
+					}
				}
			} else {
				set_pbdma_info_id_type(chsw_branches,
					&info, ch_without_tsg,
					id_type_branches, id_type_ch_sub_branches);
-
				unit_info(m, "%s branch: %s - %s\n", __func__, f_rc_chsw[chsw_branches], f_rc_id_type[id_type_branches]);
-				nvgpu_rc_pbdma_fault(g, 0U, NVGPU_ERR_NOTIFIER_PBDMA_ERROR, &info);
+				err = nvgpu_rc_pbdma_fault(g, 0U, NVGPU_ERR_NOTIFIER_PBDMA_ERROR, &info);
+				if (id_type_branches >= F_RC_ID_TYPE_INVALID_MIN) {
+					if (err != -EINVAL) {
+						unit_err(m, "invalid id type did not return -EINVAL");
+						err = UNIT_FAIL;
+						goto out;
+					}
+				} else if (err != 0) {
+					unit_err(m, "valid id type with tsg failed");
+					err = UNIT_FAIL;
+					goto out;
+				}
			}
		}
	}
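+
+	/*
+	 * The plain "invalid" chsw state means no channel is loaded and must
+	 * return 0; out-of-range chsw states must fail with -EINVAL.
+	 */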
+	for (chsw_branches = F_RC_IS_CHSW_INVALID;
+		chsw_branches <= F_RC_IS_CHSW_INVALID_STATE_MAX; chsw_branches++) {
+
+		if (chsw_branches == F_RC_IS_CHSW_INVALID) {
+			info.chsw_status = NVGPU_PBDMA_CHSW_STATUS_INVALID;
+		}
+
+		if (chsw_branches == F_RC_IS_CHSW_INVALID_STATE_MIN) {
+			info.chsw_status = NVGPU_PBDMA_CHSW_STATUS_SWITCH + 1;
+		}
+
+		if (chsw_branches == F_RC_IS_CHSW_INVALID_STATE_RANDOM) {
+			info.chsw_status = NVGPU_PBDMA_CHSW_STATUS_SWITCH + 2 +
+				get_random_u32(NVGPU_PBDMA_CHSW_STATUS_SWITCH + 1, INT_MAX);
+		}
+
+		if (chsw_branches == F_RC_IS_CHSW_INVALID_STATE_MAX) {
+			info.chsw_status = INT_MAX;
+		}
+
+		unit_info(m, "%s branch: %s\n", __func__, f_rc_chsw[chsw_branches]);
+		err = nvgpu_rc_pbdma_fault(g, 0U, NVGPU_ERR_NOTIFIER_PBDMA_ERROR, &info);
+		if (err != ((chsw_branches == F_RC_IS_CHSW_INVALID) ? 0 : -EINVAL)) {
+			unit_err(m, "pbdma status check failed");
+			err = UNIT_FAIL;
+			goto out;
+		}
+	}
+
+	err = UNIT_SUCCESS;
+
+out:
	g->sw_quiesce_pending = false;
 
	nvgpu_channel_close(ch_without_tsg);
 
-	return UNIT_SUCCESS;
+	return err;
 }
 
 struct unit_module_test nvgpu_rc_tests[] = {
diff --git a/userspace/units/rc/nvgpu-rc.h b/userspace/units/rc/nvgpu-rc.h
index f53326c36..3b5b10803 100644
--- a/userspace/units/rc/nvgpu-rc.h
+++ b/userspace/units/rc/nvgpu-rc.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -249,33 +249,49 @@ int test_rc_mmu_fault(struct unit_module *m, struct gk20a *g, void *args);
 *
 * Description: Coverage test for nvgpu_rc_pbdma_fault
 *
- * Test Type: Feature
+ * Test Type: Feature, Boundary Value
 *
 * Targets: nvgpu_rc_pbdma_fault
 *
 * Input: test_rc_init run for this GPU
 *
+ * Equivalence classes:
+ * Variable: error_notifier
+ * - Valid: [NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT, NVGPU_ERR_NOTIFIER_PBDMA_PUSHBUFFER_CRC_MISMATCH]
+ * - Invalid: [NVGPU_ERR_NOTIFIER_INVAL, INT_MAX]
+ * Variable: chsw state
+ * - Valid: [NVGPU_PBDMA_CHSW_STATUS_INVALID, NVGPU_PBDMA_CHSW_STATUS_SWITCH]
+ * - Invalid: [NVGPU_PBDMA_CHSW_STATUS_SWITCH + 1, INT_MAX]
+ * Variable: id_type
+ * - Valid: [PBDMA_STATUS_ID_TYPE_CHID, PBDMA_STATUS_ID_TYPE_TSGID]
+ * - Invalid: [PBDMA_STATUS_ID_TYPE_TSGID + 1, PBDMA_STATUS_ID_TYPE_INVALID]
+ *
 * Steps:
 * - initialize Channel error_notifier
+ * - test with valid and invalid error notifier values
 * - set g->sw_quiesce_pending = true
 * - For each branch check with the following pbdma_status values
 *   - set chsw_status to chsw_valid_or_save
 *     - set id_type to TSG
 *     - set id_type to Channel
 *       - set Channel Id to Invalid
-*      - set Channel Id to a channel without TSG
-*      - set Channel Id to a channel with a valid TSG
-*    - set id_type to Invalid
+*       - set Channel Id to a channel without TSG
+*       - set Channel Id to a channel with a valid TSG
+*     - set id_type to chid, tsgid, tsgid + 1, tsgid + 1 + random, invalid_id
+*     - verify that nvgpu_rc_pbdma_fault fails for invalid id_types and invalid channel ids and succeeds otherwise.
 *   - set chsw_status to is_chsw_load_or_switch
 *     - set id_type to TSG
 *     - set id_type to Channel
 *       - set Channel Id to Invalid
-*      - set Channel Id to a channel without TSG
-*      - set Channel Id to a channel with a valid TSG
-*    - set id_type to Invalid
-*  - set chsw_status to chsw_invalid
+*       - set Channel Id to a channel without TSG
+*       - set Channel Id to a channel with a valid TSG
+*     - set id_type to chid, tsgid, tsgid + 1, tsgid + 1 + random, invalid_id
+*     - verify that nvgpu_rc_pbdma_fault fails for invalid id_types and invalid channel ids and succeeds otherwise.
+*   - set chsw_status to chsw_invalid and verify that nvgpu_rc_pbdma_fault succeeds.
+*   - set chsw_status to invalid states and verify that nvgpu_rc_pbdma_fault fails.
 *
- * Output: Cover all branch in safety build.
+ * Output: Returns PASS if nvgpu_rc_pbdma_fault succeeds for valid inputs
+ * and fails for invalid inputs. Returns FAIL otherwise.
 */
int test_rc_pbdma_fault(struct unit_module *m, struct gk20a *g, void *args);