mirror of git://nv-tegra.nvidia.com/linux-nvgpu.git
gpu: nvgpu: correct handling of pbdma rc
nvgpu_rc_pbdma_fault previously took only the id and id_type fields from struct nvgpu_pbdma_status_info. These fields hold invalid values while the PBDMA is in the chsw_load or chsw_switch state. Fix this by checking the chsw status first and then loading id and id_type from the appropriate fields (next_id/next_id_type during a load or switch).

The current code also reads the pbdma_status info after clearing the interrupt. Other interrupts can introduce enough delay between clearing the interrupt and the PBDMA switching the channel to yield an invalid channel/TSG ID. Correct this by reading the pbdma_status info register before clearing the PBDMA interrupt, so the context information is captured before the PBDMA can switch out the context.

Bug 2648298

Change-Id: Ic2f0682526e00d14ad58f0411472f34388183f2b
Signed-off-by: Debarshi Dutta <ddutta@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/2165047
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
Reviewed-by: svc-mobile-misra <svc-mobile-misra@nvidia.com>
Reviewed-by: Deepak Nibade <dnibade@nvidia.com>
GVS: Gerrit_Virtual_Submit
Reviewed-by: Vijayakumar Subbu <vsubbu@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
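In outline, the fix makes two changes: the PBDMA status register is sampled before the interrupt is cleared, and the faulting context ID is chosen according to the channel-switch (chsw) state. The following standalone sketch condenses both points. It is illustrative only: the types and names here (pbdma_status, chsw_state, hw_read_status, hw_clear_intr, pick_faulting_id) are simplified stand-ins, not the real nvgpu structures and helpers shown in the diff below.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified stand-ins for struct nvgpu_pbdma_status_info and its
 * chsw helpers; illustrative only, not the nvgpu API. */
enum chsw_state { CHSW_INVALID, CHSW_VALID, CHSW_SAVE, CHSW_LOAD, CHSW_SWITCH };

struct pbdma_status {
	enum chsw_state chsw;
	uint32_t id;      /* outgoing context: valid during chsw_valid/save   */
	uint32_t next_id; /* incoming context: valid during chsw_load/switch  */
};

/* Hypothetical register accessors standing in for the real reads/writes. */
static struct pbdma_status hw_read_status(void)
{
	/* Pretend the PBDMA is mid-load: 'id' is stale, 'next_id' is real. */
	struct pbdma_status st = { CHSW_LOAD, 0xffffffffu, 7u };
	return st;
}

static void hw_clear_intr(void)
{
	/* Real hardware: write-1-to-clear the pbdma_intr_* register. */
}

/*
 * Point 1 of the fix: sample the status register *before* clearing the
 * interrupt. Once the interrupt is cleared, the PBDMA may switch the
 * context out and the status no longer describes the faulting work.
 */
static struct pbdma_status handle_intr(void)
{
	struct pbdma_status st = hw_read_status(); /* read first...       */
	hw_clear_intr();                           /* ...clear afterwards */
	return st;
}

/*
 * Point 2 of the fix: pick the context ID based on the chsw state.
 * During a load or switch the outgoing 'id' field is invalid and the
 * incoming 'next_id' identifies the affected context.
 */
static bool pick_faulting_id(const struct pbdma_status *st, uint32_t *id)
{
	switch (st->chsw) {
	case CHSW_VALID:
	case CHSW_SAVE:
		*id = st->id;
		return true;
	case CHSW_LOAD:
	case CHSW_SWITCH:
		*id = st->next_id;
		return true;
	default:
		return false; /* no context to recover */
	}
}

int main(void)
{
	struct pbdma_status st = handle_intr();
	uint32_t id;

	if (pick_faulting_id(&st, &id))
		printf("recovering context id %u\n", (unsigned)id);
	else
		printf("no valid context id; skipping recovery\n");
	return 0;
}

The key observation behind the second change is that during chsw_load and chsw_switch the PBDMA is bringing in the next context, so id/id_type no longer identify the faulting work; next_id/next_id_type do.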
committed by mobile promotions
parent 8a691fcf6c
commit 0ef96e4b1a
@@ -77,25 +77,36 @@ void nvgpu_rc_ctxsw_timeout(struct gk20a *g, u32 eng_bitmask,
 }
 
 void nvgpu_rc_pbdma_fault(struct gk20a *g, struct nvgpu_fifo *f,
-		u32 pbdma_id, u32 error_notifier)
+		u32 pbdma_id, u32 error_notifier,
+		struct nvgpu_pbdma_status_info *pbdma_status)
 {
 	u32 id;
-	struct nvgpu_pbdma_status_info pbdma_status;
+	u32 id_type = PBDMA_STATUS_ID_TYPE_INVALID;
 
 	nvgpu_log(g, gpu_dbg_info, "pbdma id %d error notifier %d",
 			pbdma_id, error_notifier);
 
-	g->ops.pbdma_status.read_pbdma_status_info(g, pbdma_id,
-			&pbdma_status);
-	/* Remove channel from runlist */
-	id = pbdma_status.id;
-	if (pbdma_status.id_type == PBDMA_STATUS_ID_TYPE_TSGID) {
+	if (nvgpu_pbdma_status_is_chsw_valid(pbdma_status) ||
+			nvgpu_pbdma_status_is_chsw_save(pbdma_status)) {
+		id = pbdma_status->id;
+		id_type = pbdma_status->id_type;
+	} else if (nvgpu_pbdma_status_is_chsw_load(pbdma_status) ||
+			nvgpu_pbdma_status_is_chsw_switch(pbdma_status)) {
+		id = pbdma_status->next_id;
+		id_type = pbdma_status->next_id_type;
+	} else {
+		/* Nothing to do here */
+		nvgpu_err(g, "Invalid pbdma_status.id");
+		return;
+	}
+
+	if (id_type == PBDMA_STATUS_ID_TYPE_TSGID) {
 		struct nvgpu_tsg *tsg = nvgpu_tsg_get_from_id(g, id);
 
 		nvgpu_tsg_set_error_notifier(g, tsg, error_notifier);
 		nvgpu_rc_tsg_and_related_engines(g, tsg, true,
 				RC_TYPE_PBDMA_FAULT);
-	} else if (pbdma_status.id_type == PBDMA_STATUS_ID_TYPE_CHID) {
+	} else if (id_type == PBDMA_STATUS_ID_TYPE_CHID) {
 		struct nvgpu_channel *ch = nvgpu_channel_from_id(g, id);
 		struct nvgpu_tsg *tsg;
 
 		if (ch == NULL) {
@@ -96,16 +96,17 @@ u32 gk20a_fifo_pbdma_isr(struct gk20a *g)
 	u32 pbdma_pending_bitmask = nvgpu_readl(g, fifo_intr_pbdma_id_r());
 	u32 error_notifier;
 	bool recover;
+	struct nvgpu_pbdma_status_info pbdma_status;
 
 	for (pbdma_id = 0; pbdma_id < num_pbdma; pbdma_id++) {
 		if (fifo_intr_pbdma_id_status_v(pbdma_pending_bitmask, pbdma_id) != 0U) {
 			nvgpu_log(g, gpu_dbg_intr, "pbdma id %d intr pending",
 				pbdma_id);
 			recover = g->ops.pbdma.handle_intr(g, pbdma_id,
-					&error_notifier);
+					&error_notifier, &pbdma_status);
 			if (recover) {
 				nvgpu_rc_pbdma_fault(g, f, pbdma_id,
-					error_notifier);
+					error_notifier, &pbdma_status);
 			}
 		}
 	}
@@ -29,11 +29,13 @@ struct gk20a;
 struct nvgpu_debug_context;
 struct nvgpu_channel_dump_info;
 struct nvgpu_gpfifo_entry;
+struct nvgpu_pbdma_status_info;
 
 bool gm20b_pbdma_handle_intr_0(struct gk20a *g, u32 pbdma_id,
 		u32 pbdma_intr_0, u32 *error_notifier);
 bool gm20b_pbdma_handle_intr(struct gk20a *g, u32 pbdma_id,
-		u32 *error_notifier);
+		u32 *error_notifier,
+		struct nvgpu_pbdma_status_info *pbdma_status);
 
 u32 gm20b_pbdma_read_data(struct gk20a *g, u32 pbdma_id);
 void gm20b_pbdma_reset_header(struct gk20a *g, u32 pbdma_id);
@@ -356,7 +356,8 @@ u32 gm20b_pbdma_restartable_0_intr_descs(void)
 }
 
 bool gm20b_pbdma_handle_intr(struct gk20a *g, u32 pbdma_id,
-		u32 *error_notifier)
+		u32 *error_notifier,
+		struct nvgpu_pbdma_status_info *pbdma_status)
 {
 	u32 intr_error_notifier = NVGPU_ERR_NOTIFIER_PBDMA_ERROR;
@@ -372,6 +373,8 @@ bool gm20b_pbdma_handle_intr(struct gk20a *g, u32 pbdma_id,
 
 	if (g->ops.pbdma.handle_intr_0(g, pbdma_id, pbdma_intr_0,
 		&intr_error_notifier)) {
+		g->ops.pbdma_status.read_pbdma_status_info(g,
+			pbdma_id, pbdma_status);
 		recover = true;
 	}
 	nvgpu_writel(g, pbdma_intr_0_r(pbdma_id), pbdma_intr_0);
@@ -384,6 +387,8 @@ bool gm20b_pbdma_handle_intr(struct gk20a *g, u32 pbdma_id,
 
 	if (g->ops.pbdma.handle_intr_1(g, pbdma_id, pbdma_intr_1,
 		&intr_error_notifier)) {
+		g->ops.pbdma_status.read_pbdma_status_info(g,
+			pbdma_id, pbdma_status);
 		recover = true;
 	}
 	nvgpu_writel(g, pbdma_intr_1_r(pbdma_id), pbdma_intr_1);
@@ -173,10 +173,7 @@ int gv11b_fifo_preempt_poll_pbdma(struct gk20a *g, u32 tsgid,
 	 */
 
 	/* Ignore un-needed return value "recover" */
-	(void)g->ops.pbdma.handle_intr(g, pbdma_id, NULL);
-
-	g->ops.pbdma_status.read_pbdma_status_info(g, pbdma_id,
-		&pbdma_status);
+	(void)g->ops.pbdma.handle_intr(g, pbdma_id, NULL, &pbdma_status);
 
 	if (nvgpu_pbdma_status_is_chsw_valid(&pbdma_status) ||
 			nvgpu_pbdma_status_is_chsw_save(&pbdma_status)) {
@@ -116,6 +116,7 @@ struct _resmgr_context;
 struct nvgpu_gpfifo_entry;
 struct vm_gk20a_mapping_batch;
 struct pmu_pg_stats_data;
+struct nvgpu_pbdma_status_info;
 
 enum nvgpu_flush_op;
 enum gk20a_mem_rw_flag;
@@ -1172,7 +1173,8 @@ struct gpu_ops {
 			u32 *error_notifier);
 		/* error_notifier can be NULL */
 		bool (*handle_intr)(struct gk20a *g, u32 pbdma_id,
-			u32 *error_notifier);
+			u32 *error_notifier,
+			struct nvgpu_pbdma_status_info *pbdma_status);
 		u32 (*get_signature)(struct gk20a *g);
 		void (*dump_status)(struct gk20a *g,
 			struct nvgpu_debug_context *o);
@@ -41,12 +41,14 @@ struct gk20a;
 struct nvgpu_fifo;
 struct nvgpu_tsg;
 struct nvgpu_channel;
+struct nvgpu_pbdma_status_info;
 
 void nvgpu_rc_ctxsw_timeout(struct gk20a *g, u32 eng_bitmask,
 		struct nvgpu_tsg *tsg, bool debug_dump);
 
 void nvgpu_rc_pbdma_fault(struct gk20a *g, struct nvgpu_fifo *f,
-		u32 pbdma_id, u32 error_notifier);
+		u32 pbdma_id, u32 error_notifier,
+		struct nvgpu_pbdma_status_info *pbdma_status);
 
 void nvgpu_rc_runlist_update(struct gk20a *g, u32 runlist_id);