gpu: nvgpu: wait ACK for FECS watchdog timeout

From gv11b onwards, FECS ucode returns an ACK for set watchdog
timeout method. Failure to wait for this ACK was leading to races,
and in some cases, the ACK could be mistaken for the reply to the
next method.

In particular, this happened for the discover golden image size
method which is sent after set watchdog timeout.

With instrumented FECS ucode, it takes longer for the code to
process the set watchdog timeout method, and the write to ack
that method could happen after nvgpu driver clears the mailbox to
send the discover image size method.

With an invalid golden context image size, FECS ended up causing
an MMU fault while attempting to save past allocated buffer.

Added NVGPU_GR_FALCON_METHOD_SET_WATCHDOG_TIMEOUT to be used with
gops_gr_falcon.ctrl_ctxsw, and implemented 2 variants:
- gm20b_gr_falcon_ctrl_ctxsw, without ACK
- gv11b_gr_falcon_ctrl_ctxsw, with ACK

Added NVGPU_GR_FALCON_SUBMIT_METHOD_F_LOCKED flag to allow
executing above method without re-acquiring FECS lock. Longer term,
the 'flags' could be added to gops_gr_falcon.ctrl_ctxsw parameters.

Use gops_gr_falcon.ctrl_ctxsw instead of register writes to invoke
set watchdog timeout method in gm20b_gr_falcon_wait_ctxsw_ready.

Also replaced calls to gm20b_gr_falcon_ctrl_ctxsw with
gops_gr_falcon.ctrl_ctxsw where appropriate, since there are
multiple variants (gm20b, gp10b and gv11b).

Last, fixed clearing of mailbox 0 in gm20b_gr_falcon_bind_instblk.

Bug 200586923

Change-Id: I653b9a216555eec8cd4bb01d6f202bc77b75a939
Signed-off-by: Thomas Fleury <tfleury@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2287340
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
Thomas Fleury
2020-01-29 14:02:57 -05:00
committed by Alex Waterman
parent e21d70574c
commit 9a16bc3fd4
7 changed files with 94 additions and 28 deletions

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -47,7 +47,7 @@ void gm20b_gr_falcon_bind_instblk(struct gk20a *g,
int gm20b_gr_falcon_wait_mem_scrubbing(struct gk20a *g);
int gm20b_gr_falcon_wait_ctxsw_ready(struct gk20a *g);
int gm20b_gr_falcon_submit_fecs_method_op(struct gk20a *g,
struct nvgpu_fecs_method_op op, bool sleepduringwait);
struct nvgpu_fecs_method_op op, u32 flags);
int gm20b_gr_falcon_ctrl_ctxsw(struct gk20a *g, u32 fecs_method,
u32 data, u32 *ret_val);
void gm20b_gr_falcon_set_current_ctx_invalid(struct gk20a *g);

View File

@@ -127,7 +127,7 @@ void gm20b_gr_falcon_bind_instblk(struct gk20a *g,
FECS_ARB_CMD_TIMEOUT_DEFAULT_US;
u32 inst_ptr_u32;
nvgpu_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0), 0x0);
nvgpu_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0), U32_MAX);
while (((nvgpu_readl(g, gr_fecs_ctxsw_status_1_r()) &
gr_fecs_ctxsw_status_1_arb_busy_m()) != 0U) &&
@@ -520,11 +520,12 @@ int gm20b_gr_falcon_wait_ctxsw_ready(struct gk20a *g)
}
#endif
nvgpu_log_info(g, "configuring ctxsw_ucode wdt = 0x%x", wdt_val);
nvgpu_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0), U32_MAX);
nvgpu_writel(g, gr_fecs_method_data_r(), wdt_val);
nvgpu_writel(g, gr_fecs_method_push_r(),
gr_fecs_method_push_adr_set_watchdog_timeout_f());
ret = g->ops.gr.falcon.ctrl_ctxsw(g,
NVGPU_GR_FALCON_METHOD_SET_WATCHDOG_TIMEOUT, wdt_val, NULL);
if (ret != 0) {
nvgpu_err(g, "fail to set watchdog timeout");
return ret;
}
nvgpu_log_fn(g, "done");
return 0;
@@ -537,7 +538,7 @@ int gm20b_gr_falcon_init_ctx_state(struct gk20a *g,
nvgpu_log_fn(g, " ");
ret = gm20b_gr_falcon_ctrl_ctxsw(g,
ret = g->ops.gr.falcon.ctrl_ctxsw(g,
NVGPU_GR_FALCON_METHOD_CTXSW_DISCOVER_IMAGE_SIZE,
0, &sizes->golden_image_size);
if (ret != 0) {
@@ -548,7 +549,7 @@ int gm20b_gr_falcon_init_ctx_state(struct gk20a *g,
#if defined(CONFIG_NVGPU_DEBUGGER) || \
defined(CONFIG_NVGPU_CTXSW_FW_ERROR_CODE_TESTING)
ret = gm20b_gr_falcon_ctrl_ctxsw(g,
ret = g->ops.gr.falcon.ctrl_ctxsw(g,
NVGPU_GR_FALCON_METHOD_CTXSW_DISCOVER_PM_IMAGE_SIZE,
#ifndef CONFIG_NVGPU_CTXSW_FW_ERROR_CODE_TESTING
0, &sizes->pm_ctxsw_image_size);
@@ -565,7 +566,7 @@ defined(CONFIG_NVGPU_CTXSW_FW_ERROR_CODE_TESTING)
#endif
#ifdef CONFIG_NVGPU_GRAPHICS
ret = gm20b_gr_falcon_ctrl_ctxsw(g,
ret = g->ops.gr.falcon.ctrl_ctxsw(g,
NVGPU_GR_FALCON_METHOD_CTXSW_DISCOVER_ZCULL_IMAGE_SIZE,
0, &sizes->zcull_image_size);
if (ret != 0) {
@@ -619,17 +620,21 @@ void gm20b_gr_falcon_set_current_ctx_invalid(struct gk20a *g)
gr_fecs_current_ctx_valid_false_f());
}
/* The following is a less brittle way to call gr_gk20a_submit_fecs_method(...)
/*
* The following is a less brittle way to call gr_gk20a_submit_fecs_method(...)
* We should replace most, if not all, fecs method calls to this instead.
*/
int gm20b_gr_falcon_submit_fecs_method_op(struct gk20a *g,
struct nvgpu_fecs_method_op op,
bool sleepduringwait)
struct nvgpu_fecs_method_op op, u32 flags)
{
int ret;
struct nvgpu_gr_falcon *gr_falcon = nvgpu_gr_get_falcon_ptr(g);
bool sleepduringwait =
(flags & NVGPU_GR_FALCON_SUBMIT_METHOD_F_SLEEP) != 0U;
nvgpu_mutex_acquire(&gr_falcon->fecs_mutex);
if ((flags & NVGPU_GR_FALCON_SUBMIT_METHOD_F_LOCKED) == 0U) {
nvgpu_mutex_acquire(&gr_falcon->fecs_mutex);
}
if (op.mailbox.id != 0U) {
nvgpu_writel(g, gr_fecs_ctxsw_mailbox_r(op.mailbox.id),
@@ -659,7 +664,9 @@ int gm20b_gr_falcon_submit_fecs_method_op(struct gk20a *g,
op.method.data, op.method.addr);
}
nvgpu_mutex_release(&gr_falcon->fecs_mutex);
if ((flags & NVGPU_GR_FALCON_SUBMIT_METHOD_F_LOCKED) == 0U) {
nvgpu_mutex_release(&gr_falcon->fecs_mutex);
}
return ret;
}
@@ -673,8 +680,8 @@ int gm20b_gr_falcon_ctrl_ctxsw(struct gk20a *g, u32 fecs_method,
.method.data = 0U,
.cond.ok = GR_IS_UCODE_OP_NOT_EQUAL,
.cond.fail = GR_IS_UCODE_OP_SKIP,
};
bool sleepduringwait = false;
};
u32 flags = 0U;
nvgpu_log_info(g, "fecs method %d data 0x%x ret_value %p",
fecs_method, data, ret_val);
@@ -689,7 +696,7 @@ int gm20b_gr_falcon_ctrl_ctxsw(struct gk20a *g, u32 fecs_method,
op.mailbox.fail = gr_fecs_ctxsw_mailbox_value_fail_v();
op.cond.ok = GR_IS_UCODE_OP_EQUAL;
op.cond.fail = GR_IS_UCODE_OP_EQUAL;
sleepduringwait = true;
flags |= NVGPU_GR_FALCON_SUBMIT_METHOD_F_SLEEP;
break;
case NVGPU_GR_FALCON_METHOD_CTXSW_START:
@@ -700,7 +707,7 @@ int gm20b_gr_falcon_ctrl_ctxsw(struct gk20a *g, u32 fecs_method,
op.mailbox.fail = gr_fecs_ctxsw_mailbox_value_fail_v();
op.cond.ok = GR_IS_UCODE_OP_EQUAL;
op.cond.fail = GR_IS_UCODE_OP_EQUAL;
sleepduringwait = true;
flags |= NVGPU_GR_FALCON_SUBMIT_METHOD_F_SLEEP;
break;
#endif
#ifdef CONFIG_NVGPU_ENGINE_RESET
@@ -733,7 +740,7 @@ defined(CONFIG_NVGPU_CTXSW_FW_ERROR_CODE_TESTING)
op.method.addr =
gr_fecs_method_push_adr_discover_pm_image_size_v();
op.mailbox.ret = ret_val;
sleepduringwait = true;
flags |= NVGPU_GR_FALCON_SUBMIT_METHOD_F_SLEEP;
break;
#endif
#ifdef CONFIG_NVGPU_POWER_PG
@@ -770,7 +777,7 @@ defined(CONFIG_NVGPU_CTXSW_FW_ERROR_CODE_TESTING)
op.mailbox.fail = 0x20U;
op.cond.ok = GR_IS_UCODE_OP_AND;
op.cond.fail = GR_IS_UCODE_OP_AND;
sleepduringwait = true;
flags |= NVGPU_GR_FALCON_SUBMIT_METHOD_F_SLEEP;
break;
case NVGPU_GR_FALCON_METHOD_GOLDEN_IMAGE_SAVE:
@@ -781,7 +788,7 @@ defined(CONFIG_NVGPU_CTXSW_FW_ERROR_CODE_TESTING)
op.mailbox.fail = 0x2U;
op.cond.ok = GR_IS_UCODE_OP_AND;
op.cond.fail = GR_IS_UCODE_OP_AND;
sleepduringwait = true;
flags |= NVGPU_GR_FALCON_SUBMIT_METHOD_F_SLEEP;
break;
#ifdef CONFIG_NVGPU_FECS_TRACE
case NVGPU_GR_FALCON_METHOD_FECS_TRACE_FLUSH:
@@ -789,12 +796,20 @@ defined(CONFIG_NVGPU_CTXSW_FW_ERROR_CODE_TESTING)
gr_fecs_method_push_adr_write_timestamp_record_v();
break;
#endif
case NVGPU_GR_FALCON_METHOD_SET_WATCHDOG_TIMEOUT:
op.method.addr =
gr_fecs_method_push_adr_set_watchdog_timeout_f();
op.method.data = data;
op.cond.ok = GR_IS_UCODE_OP_SKIP;
flags |= NVGPU_GR_FALCON_SUBMIT_METHOD_F_LOCKED;
break;
default:
nvgpu_err(g, "unsupported fecs mode %d", fecs_method);
break;
}
return gm20b_gr_falcon_submit_fecs_method_op(g, op, sleepduringwait);
return gm20b_gr_falcon_submit_fecs_method_op(g, op, flags);
}
u32 gm20b_gr_falcon_get_current_ctx(struct gk20a *g)

View File

@@ -80,7 +80,7 @@ int gp10b_gr_falcon_ctrl_ctxsw(struct gk20a *g, u32 fecs_method,
op.method.addr =
gr_fecs_method_push_adr_discover_preemption_image_size_v();
op.mailbox.ret = ret_val;
ret = gm20b_gr_falcon_submit_fecs_method_op(g, op, false);
ret = gm20b_gr_falcon_submit_fecs_method_op(g, op, 0U);
break;
case NVGPU_GR_FALCON_METHOD_CONFIGURE_CTXSW_INTR:

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -29,5 +29,7 @@ struct nvgpu_fecs_ecc_status;
void gv11b_gr_falcon_handle_fecs_ecc_error(struct gk20a *g,
struct nvgpu_fecs_ecc_status *fecs_ecc_status);
void gv11b_gr_falcon_fecs_host_int_enable(struct gk20a *g);
int gv11b_gr_falcon_ctrl_ctxsw(struct gk20a *g, u32 fecs_method,
u32 data, u32 *ret_val);
#endif /* NVGPU_GR_FALCON_GV11B_H */

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -24,7 +24,10 @@
#include <nvgpu/io.h>
#include <nvgpu/gr/gr_falcon.h>
#include "gr_falcon_gp10b.h"
#include "gr_falcon_gm20b.h"
#include "gr_falcon_gv11b.h"
#include "common/gr/gr_falcon_priv.h"
#include <nvgpu/hw/gv11b/hw_gr_gv11b.h>
@@ -117,6 +120,43 @@ void gv11b_gr_falcon_handle_fecs_ecc_error(struct gk20a *g,
}
}
/*
 * gv11b variant of gops_gr_falcon.ctrl_ctxsw.
 *
 * From gv11b onwards the FECS ucode returns an ACK for the "set watchdog
 * timeout" method, so this variant submits that method with a mailbox
 * condition that waits for the ACK (default .cond.ok below). Every other
 * method is delegated unchanged to the gp10b implementation.
 *
 * @param g           GPU device pointer.
 * @param fecs_method NVGPU_GR_FALCON_METHOD_* selector.
 * @param data        Method payload (watchdog timeout value for the
 *                    SET_WATCHDOG_TIMEOUT case).
 * @param ret_val     Optional out-parameter for methods that return a
 *                    value; unused by the SET_WATCHDOG_TIMEOUT path.
 *
 * @return 0 on success, negative error code otherwise (as propagated
 *         from the submit/gp10b helpers).
 */
int gv11b_gr_falcon_ctrl_ctxsw(struct gk20a *g, u32 fecs_method,
	u32 data, u32 *ret_val)
{
	/*
	 * Default op: clear all mailbox 0 bits (.clr = ~0) before pushing the
	 * method, then treat any non-zero mailbox value as the ACK
	 * (.cond.ok = NOT_EQUAL) and never match a failure value
	 * (.cond.fail = SKIP).
	 */
	struct nvgpu_fecs_method_op op = {
		.mailbox = { .id = 0U, .data = 0U, .ret = NULL,
			     .clr = ~U32(0U), .ok = 0U, .fail = 0U},
		.method.data = 0U,
		.cond.ok = GR_IS_UCODE_OP_NOT_EQUAL,
		.cond.fail = GR_IS_UCODE_OP_SKIP,
	};
	u32 flags = 0U;
	int ret;

	nvgpu_log_info(g, "fecs method %d data 0x%x ret_val %p",
		fecs_method, data, ret_val);

	switch (fecs_method) {
	case NVGPU_GR_FALCON_METHOD_SET_WATCHDOG_TIMEOUT:
		op.method.addr =
			gr_fecs_method_push_adr_set_watchdog_timeout_f();
		op.method.data = data;
		/* Caller (wait_ctxsw_ready path) already holds the FECS
		 * mutex; skip re-acquiring it in the submit helper. */
		flags |= NVGPU_GR_FALCON_SUBMIT_METHOD_F_LOCKED;
#ifdef CONFIG_NVGPU_SIM
		/* Simulation FECS does not produce the ACK; skip waiting. */
		op.cond.ok = GR_IS_UCODE_OP_SKIP;
#endif
		ret = gm20b_gr_falcon_submit_fecs_method_op(g, op, flags);
		break;

	default:
		/* All other methods behave as on gp10b. */
		ret = gp10b_gr_falcon_ctrl_ctxsw(g, fecs_method,
				data, ret_val);
		break;
	}

	return ret;
}
void gv11b_gr_falcon_fecs_host_int_enable(struct gk20a *g)
{
nvgpu_writel(g, gr_fecs_host_int_enable_r(),

View File

@@ -766,7 +766,7 @@ NVGPU_COV_WHITELIST_BLOCK_END(NVGPU_MISRA(Rule, 8_7))
.wait_mem_scrubbing =
gm20b_gr_falcon_wait_mem_scrubbing,
.wait_ctxsw_ready = gm20b_gr_falcon_wait_ctxsw_ready,
.ctrl_ctxsw = gp10b_gr_falcon_ctrl_ctxsw,
.ctrl_ctxsw = gv11b_gr_falcon_ctrl_ctxsw,
.get_current_ctx = gm20b_gr_falcon_get_current_ctx,
.get_ctx_ptr = gm20b_gr_falcon_get_ctx_ptr,
.get_fecs_current_ctx_data =

View File

@@ -73,6 +73,15 @@ defined(CONFIG_NVGPU_CTXSW_FW_ERROR_CODE_TESTING)
#define NVGPU_GR_FALCON_METHOD_CONFIGURE_CTXSW_INTR 13
#endif
/** Falcon method to set watchdog timeout. */
#define NVGPU_GR_FALCON_METHOD_SET_WATCHDOG_TIMEOUT 14
/** Sleep while waiting for Falcon ACK */
#define NVGPU_GR_FALCON_SUBMIT_METHOD_F_SLEEP BIT32(0)
/** Falcon lock already held */
#define NVGPU_GR_FALCON_SUBMIT_METHOD_F_LOCKED BIT32(1)
/** Falcon index of mailbox 0. */
#define NVGPU_GR_FALCON_FECS_CTXSW_MAILBOX0 0U
/** Falcon index of mailbox 1. */