Files
linux-nvgpu/drivers/gpu/nvgpu/common/fifo/channel_wdt.c
Rajesh Devaraj 2e36ad9e35 gpu: nvgpu: add null check for gp_get, pb_get
This patch adds NULL checks for the gp_get and pb_get operations.

JIRA NVGPU-9325

Change-Id: If41c1c526c58a18cc91a95686e71bdfae9edb328
Signed-off-by: Rajesh Devaraj <rdevaraj@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2836366
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
Reviewed-by: svc-mobile-misra <svc-mobile-misra@nvidia.com>
Reviewed-by: svc-mobile-cert <svc-mobile-cert@nvidia.com>
Reviewed-by: Tejal Kudav <tkudav@nvidia.com>
Reviewed-by: Seema Khowala <seemaj@nvidia.com>
GVS: Gerrit_Virtual_Submit <buildbot_gerritrpt@nvidia.com>
2023-01-03 19:10:11 -08:00

/*
 * Copyright (c) 2015-2023, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include "channel_wdt.h"
#include "channel_worker.h"
#include <nvgpu/watchdog.h>
#include <nvgpu/channel.h>
#include <nvgpu/error_notifier.h>
#include <nvgpu/gk20a.h>
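
/* Control whether a watchdog timeout on this channel triggers a GR debug dump. */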
void nvgpu_channel_set_wdt_debug_dump(struct nvgpu_channel *ch, bool dump)
{
	ch->wdt_debug_dump = dump;
}
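
/*
 * Snapshot the channel's current GP_GET and PB_GET pointers so the watchdog
 * can check whether the channel is still making progress.
 */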
static struct nvgpu_channel_wdt_state nvgpu_channel_collect_wdt_state(
		struct nvgpu_channel *ch)
{
	struct gk20a *g = ch->g;
	struct nvgpu_channel_wdt_state state = { 0, 0 };

	/*
	 * Note: just checking for nvgpu_channel_wdt_enabled() is not enough at
	 * the moment because system suspend puts g->regs away but doesn't stop
	 * the worker thread that runs the watchdog. This might need to be
	 * cleared up in the future.
	 */
	if (nvgpu_channel_wdt_running(ch->wdt)) {
		/*
		 * Read the state only if the wdt is on to avoid unnecessary
		 * accesses. The kernel mem for userd may not even exist; this
		 * channel could be in usermode submit mode.
		 */
		if (g->ops.userd.gp_get != NULL) {
			state.gp_get = g->ops.userd.gp_get(g, ch);
		}
		if (g->ops.userd.pb_get != NULL) {
			state.pb_get = g->ops.userd.pb_get(g, ch);
		}
	}

	return state;
}
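
/*
 * Arm the channel watchdog with a fresh progress snapshot, unless the channel
 * has already been marked unserviceable.
 */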
void nvgpu_channel_launch_wdt(struct nvgpu_channel *ch)
{
	struct nvgpu_channel_wdt_state state = nvgpu_channel_collect_wdt_state(ch);

	/*
	 * FIXME: channel recovery can race the submit path and can start even
	 * after this, but this check is the best we can do for now.
	 */
	if (!nvgpu_channel_check_unserviceable(ch)) {
		nvgpu_channel_wdt_start(ch->wdt, &state);
	}
}
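
/* Rewind the watchdog of every serviceable channel to a fresh progress snapshot. */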
void nvgpu_channel_restart_all_wdts(struct gk20a *g)
{
	struct nvgpu_fifo *f = &g->fifo;
	u32 chid;

	for (chid = 0; chid < f->num_channels; chid++) {
		struct nvgpu_channel *ch = nvgpu_channel_from_id(g, chid);

		if (ch != NULL) {
			if ((ch->wdt != NULL) &&
			    !nvgpu_channel_check_unserviceable(ch)) {
				struct nvgpu_channel_wdt_state state =
					nvgpu_channel_collect_wdt_state(ch);

				nvgpu_channel_wdt_rewind(ch->wdt, &state);
			}
			nvgpu_channel_put(ch);
		}
	}
}
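
/*
 * Handle an expired watchdog: report the stuck job, optionally dump GR debug
 * state, and (when TSG control is built in) force-reset the channel's TSG.
 */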
static void nvgpu_channel_recover_from_wdt(struct nvgpu_channel *ch)
{
	struct gk20a *g = ch->g;

	nvgpu_log_fn(g, " ");

	if (nvgpu_channel_check_unserviceable(ch)) {
		/* channel is already recovered */
		nvgpu_info(g, "chid: %d unserviceable but wdt was ON", ch->chid);
		return;
	}

	nvgpu_err(g, "Job on channel %d timed out", ch->chid);

	/* force reset calls gk20a_debug_dump but not this */
	if (ch->wdt_debug_dump) {
		gk20a_gr_debug_dump(g);
	}

#ifdef CONFIG_NVGPU_CHANNEL_TSG_CONTROL
	if (g->ops.tsg.force_reset(ch,
			NVGPU_ERR_NOTIFIER_FIFO_ERROR_IDLE_TIMEOUT,
			ch->wdt_debug_dump) != 0) {
		nvgpu_err(g, "failed tsg force reset for chid: %d", ch->chid);
	}
#endif
}

/*
 * Test the watchdog progress. If the channel is stuck, reset it.
 *
 * The gpu is implicitly on at this point because the watchdog can only run on
 * channels that have submitted jobs pending for cleanup.
 */
static void nvgpu_channel_check_wdt(struct nvgpu_channel *ch)
{
	struct nvgpu_channel_wdt_state state = nvgpu_channel_collect_wdt_state(ch);

	if (nvgpu_channel_wdt_check(ch->wdt, &state)) {
		nvgpu_channel_recover_from_wdt(ch);
	}
}
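
/* Initialize the channel worker's watchdog polling interval and poll timer. */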
void nvgpu_channel_worker_poll_init(struct nvgpu_worker *worker)
{
	struct nvgpu_channel_worker *ch_worker =
		nvgpu_channel_worker_from_worker(worker);

	ch_worker->watchdog_interval = 100U;

	nvgpu_timeout_init_cpu_timer_sw(worker->g, &ch_worker->timeout,
			ch_worker->watchdog_interval);
}

/**
 * Loop over every living channel, check timeouts and handle stuck channels.
 */
static void nvgpu_channel_poll_wdt(struct gk20a *g)
{
	unsigned int chid;

	for (chid = 0; chid < g->fifo.num_channels; chid++) {
		struct nvgpu_channel *ch = nvgpu_channel_from_id(g, chid);

		if (ch != NULL) {
			if (!nvgpu_channel_check_unserviceable(ch)) {
				nvgpu_channel_check_wdt(ch);
			}
			nvgpu_channel_put(ch);
		}
	}
}
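
/*
 * Worker post-process hook: when the poll timer has expired, check all
 * channel watchdogs and re-arm the timer.
 */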
void nvgpu_channel_worker_poll_wakeup_post_process_item(
		struct nvgpu_worker *worker)
{
	struct gk20a *g = worker->g;
	struct nvgpu_channel_worker *ch_worker =
		nvgpu_channel_worker_from_worker(worker);

	if (nvgpu_timeout_peek_expired(&ch_worker->timeout)) {
		nvgpu_channel_poll_wdt(g);
		nvgpu_timeout_init_cpu_timer_sw(g, &ch_worker->timeout,
				ch_worker->watchdog_interval);
	}
}
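
/*
 * Return the worker wakeup timeout so that watchdog polling happens at least
 * once per watchdog_interval.
 */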
u32 nvgpu_channel_worker_poll_wakeup_condition_get_timeout(
		struct nvgpu_worker *worker)
{
	struct nvgpu_channel_worker *ch_worker =
		nvgpu_channel_worker_from_worker(worker);

	return ch_worker->watchdog_interval;
}