gpu: nvgpu: correct handling of pbdma rc

nvgpu_rc_pbdma_fault reads id and id_type directly from struct
nvgpu_pbdma_status_info. These fields hold invalid values while the PBDMA
is in the chsw_load or chsw_switch state, where the context being switched
in is described by next_id and next_id_type instead. Fix this by checking
the channel-switch (chsw) status first and loading id and id_type from the
appropriate fields.
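
As a condensed sketch, the new selection logic looks roughly as follows
(mirroring the rc.c hunk below; id/id_type start out as
PBDMA_STATUS_ID_TYPE_INVALID):

    if (nvgpu_pbdma_status_is_chsw_valid(pbdma_status) ||
            nvgpu_pbdma_status_is_chsw_save(pbdma_status)) {
        id = pbdma_status->id;              /* resident context is valid */
        id_type = pbdma_status->id_type;
    } else if (nvgpu_pbdma_status_is_chsw_load(pbdma_status) ||
            nvgpu_pbdma_status_is_chsw_switch(pbdma_status)) {
        id = pbdma_status->next_id;         /* incoming context is valid */
        id_type = pbdma_status->next_id_type;
    } else {
        return;                             /* no valid context to recover */
    }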

The current code also reads the pbdma_status info only after clearing the
interrupt. Servicing of other interrupts can delay execution long enough
for the PBDMA to switch channels in between, leaving an invalid channel/TSG
ID. Fix this by reading the pbdma_status info register before clearing the
PBDMA interrupt, so the context information is captured before the PBDMA
can switch out the context.
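
Condensed from the gm20b hunk below, the interrupt handler now snapshots
the PBDMA status while the fault is still pending and only then clears the
interrupt:

    if (g->ops.pbdma.handle_intr_0(g, pbdma_id, pbdma_intr_0,
            &intr_error_notifier)) {
        /* capture channel/TSG info before the context can switch out */
        g->ops.pbdma_status.read_pbdma_status_info(g, pbdma_id, pbdma_status);
        recover = true;
    }
    /* clear the pending PBDMA interrupt only after the status is captured */
    nvgpu_writel(g, pbdma_intr_0_r(pbdma_id), pbdma_intr_0);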

Bug 2648298

Change-Id: Ic2f0682526e00d14ad58f0411472f34388183f2b
Signed-off-by: Debarshi Dutta <ddutta@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/2165047
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
Reviewed-by: svc-mobile-misra <svc-mobile-misra@nvidia.com>
Reviewed-by: Deepak Nibade <dnibade@nvidia.com>
GVS: Gerrit_Virtual_Submit
Reviewed-by: Vijayakumar Subbu <vsubbu@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Debarshi Dutta authored 2019-07-31 16:53:40 +05:30; committed by mobile promotions
commit 0ef96e4b1a (parent 8a691fcf6c)
7 changed files with 38 additions and 18 deletions

@@ -77,25 +77,36 @@ void nvgpu_rc_ctxsw_timeout(struct gk20a *g, u32 eng_bitmask,
 }
 
 void nvgpu_rc_pbdma_fault(struct gk20a *g, struct nvgpu_fifo *f,
-                u32 pbdma_id, u32 error_notifier)
+                u32 pbdma_id, u32 error_notifier,
+                struct nvgpu_pbdma_status_info *pbdma_status)
 {
         u32 id;
-        struct nvgpu_pbdma_status_info pbdma_status;
+        u32 id_type = PBDMA_STATUS_ID_TYPE_INVALID;
 
         nvgpu_log(g, gpu_dbg_info, "pbdma id %d error notifier %d",
                         pbdma_id, error_notifier);
 
-        g->ops.pbdma_status.read_pbdma_status_info(g, pbdma_id,
-                &pbdma_status);
-        /* Remove channel from runlist */
-        id = pbdma_status.id;
-        if (pbdma_status.id_type == PBDMA_STATUS_ID_TYPE_TSGID) {
+        if (nvgpu_pbdma_status_is_chsw_valid(pbdma_status) ||
+                        nvgpu_pbdma_status_is_chsw_save(pbdma_status)) {
+                id = pbdma_status->id;
+                id_type = pbdma_status->id_type;
+        } else if (nvgpu_pbdma_status_is_chsw_load(pbdma_status) ||
+                        nvgpu_pbdma_status_is_chsw_switch(pbdma_status)) {
+                id = pbdma_status->next_id;
+                id_type = pbdma_status->next_id_type;
+        } else {
+                /* Nothing to do here */
+                nvgpu_err(g, "Invalid pbdma_status.id");
+                return;
+        }
+
+        if (id_type == PBDMA_STATUS_ID_TYPE_TSGID) {
                 struct nvgpu_tsg *tsg = nvgpu_tsg_get_from_id(g, id);
 
                 nvgpu_tsg_set_error_notifier(g, tsg, error_notifier);
                 nvgpu_rc_tsg_and_related_engines(g, tsg, true,
                                 RC_TYPE_PBDMA_FAULT);
-        } else if(pbdma_status.id_type == PBDMA_STATUS_ID_TYPE_CHID) {
+        } else if(id_type == PBDMA_STATUS_ID_TYPE_CHID) {
                 struct nvgpu_channel *ch = nvgpu_channel_from_id(g, id);
                 struct nvgpu_tsg *tsg;
 
                 if (ch == NULL) {

@@ -96,16 +96,17 @@ u32 gk20a_fifo_pbdma_isr(struct gk20a *g)
         u32 pbdma_pending_bitmask = nvgpu_readl(g, fifo_intr_pbdma_id_r());
         u32 error_notifier;
         bool recover;
+        struct nvgpu_pbdma_status_info pbdma_status;
 
         for (pbdma_id = 0; pbdma_id < num_pbdma; pbdma_id++) {
                 if (fifo_intr_pbdma_id_status_v(pbdma_pending_bitmask, pbdma_id) != 0U) {
                         nvgpu_log(g, gpu_dbg_intr, "pbdma id %d intr pending",
                                 pbdma_id);
                         recover = g->ops.pbdma.handle_intr(g, pbdma_id,
-                                        &error_notifier);
+                                        &error_notifier, &pbdma_status);
                         if (recover) {
                                 nvgpu_rc_pbdma_fault(g, f, pbdma_id,
-                                                error_notifier);
+                                                error_notifier, &pbdma_status);
                         }
                 }
         }

@@ -29,11 +29,13 @@ struct gk20a;
 struct nvgpu_debug_context;
 struct nvgpu_channel_dump_info;
 struct nvgpu_gpfifo_entry;
+struct nvgpu_pbdma_status_info;
 
 bool gm20b_pbdma_handle_intr_0(struct gk20a *g, u32 pbdma_id,
                 u32 pbdma_intr_0, u32 *error_notifier);
 
 bool gm20b_pbdma_handle_intr(struct gk20a *g, u32 pbdma_id,
-                u32 *error_notifier);
+                u32 *error_notifier,
+                struct nvgpu_pbdma_status_info *pbdma_status);
 u32 gm20b_pbdma_read_data(struct gk20a *g, u32 pbdma_id);
 void gm20b_pbdma_reset_header(struct gk20a *g, u32 pbdma_id);

@@ -356,7 +356,8 @@ u32 gm20b_pbdma_restartable_0_intr_descs(void)
 }
 
 bool gm20b_pbdma_handle_intr(struct gk20a *g, u32 pbdma_id,
-                u32 *error_notifier)
+                u32 *error_notifier,
+                struct nvgpu_pbdma_status_info *pbdma_status)
 {
         u32 intr_error_notifier = NVGPU_ERR_NOTIFIER_PBDMA_ERROR;
 
@@ -372,6 +373,8 @@ bool gm20b_pbdma_handle_intr(struct gk20a *g, u32 pbdma_id,
 
                 if (g->ops.pbdma.handle_intr_0(g, pbdma_id, pbdma_intr_0,
                         &intr_error_notifier)) {
+                        g->ops.pbdma_status.read_pbdma_status_info(g,
+                                pbdma_id, pbdma_status);
                         recover = true;
                 }
                 nvgpu_writel(g, pbdma_intr_0_r(pbdma_id), pbdma_intr_0);
@@ -384,6 +387,8 @@ bool gm20b_pbdma_handle_intr(struct gk20a *g, u32 pbdma_id,
 
                 if (g->ops.pbdma.handle_intr_1(g, pbdma_id, pbdma_intr_1,
                         &intr_error_notifier)) {
+                        g->ops.pbdma_status.read_pbdma_status_info(g,
+                                pbdma_id, pbdma_status);
                         recover = true;
                 }
                 nvgpu_writel(g, pbdma_intr_1_r(pbdma_id), pbdma_intr_1);

@@ -173,10 +173,7 @@ int gv11b_fifo_preempt_poll_pbdma(struct gk20a *g, u32 tsgid,
                  */
 
                 /* Ignore un-needed return value "recover" */
-                (void)g->ops.pbdma.handle_intr(g, pbdma_id, NULL);
-
-                g->ops.pbdma_status.read_pbdma_status_info(g, pbdma_id,
-                        &pbdma_status);
+                (void)g->ops.pbdma.handle_intr(g, pbdma_id, NULL, &pbdma_status);
 
                 if (nvgpu_pbdma_status_is_chsw_valid(&pbdma_status) ||
                                 nvgpu_pbdma_status_is_chsw_save(&pbdma_status)) {

@@ -116,6 +116,7 @@ struct _resmgr_context;
 struct nvgpu_gpfifo_entry;
 struct vm_gk20a_mapping_batch;
 struct pmu_pg_stats_data;
+struct nvgpu_pbdma_status_info;
 
 enum nvgpu_flush_op;
 enum gk20a_mem_rw_flag;
@@ -1172,7 +1173,8 @@ struct gpu_ops {
                         u32 *error_notifier);
                 /* error_notifier can be NULL */
                 bool (*handle_intr)(struct gk20a *g, u32 pbdma_id,
-                                u32 *error_notifier);
+                                u32 *error_notifier,
+                                struct nvgpu_pbdma_status_info *pbdma_status);
                 u32 (*get_signature)(struct gk20a *g);
                 void (*dump_status)(struct gk20a *g,
                                 struct nvgpu_debug_context *o);

@@ -41,12 +41,14 @@ struct gk20a;
 struct nvgpu_fifo;
 struct nvgpu_tsg;
 struct nvgpu_channel;
+struct nvgpu_pbdma_status_info;
 
 void nvgpu_rc_ctxsw_timeout(struct gk20a *g, u32 eng_bitmask,
                 struct nvgpu_tsg *tsg, bool debug_dump);
 
 void nvgpu_rc_pbdma_fault(struct gk20a *g, struct nvgpu_fifo *f,
-                u32 pbdma_id, u32 error_notifier);
+                u32 pbdma_id, u32 error_notifier,
+                struct nvgpu_pbdma_status_info *pbdma_status);
 
 void nvgpu_rc_runlist_update(struct gk20a *g, u32 runlist_id);