mirror of
git://nv-tegra.nvidia.com/linux-nvgpu.git
synced 2025-12-22 17:36:20 +03:00
For safety build, nvgpu driver should enter SW quiesce state in case an uncorrectable error has occurred. In this state, any activity on the GPU should be prevented, without powering off the GPU. Also, a minimal set of operations should be used to enter SW quiesce state. Entering SW quiesce state does the following: - set sw_quiesce_pending: when this flag is set, interrupt handlers exit after masking interrupts. This should help mitigate an interrupt storm. - wake up thread to complete quiescing. The thread performs the following: - set NVGPU_DRIVER_IS_DYING to prevent allocation of new resources - disable interrupts - disable fifo scheduling - preempt all runlists - set error notifier for all active channels Note: for channels with usermode submit enabled, userspace can still ring doorbell, but this will not trigger any work on engines since fifo scheduling is disabled. Jira NVGPU-3493 Change-Id: I639a32da754d8833f54dcec1fa23135721d8d89a Signed-off-by: Thomas Fleury <tfleury@nvidia.com> Reviewed-on: https://git-master.nvidia.com/r/2172391 Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com> Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
158 lines
3.7 KiB
C
158 lines
3.7 KiB
C
/*
|
|
* Copyright (c) 2014-2019, NVIDIA CORPORATION. All rights reserved.
|
|
*
|
|
* This program is free software; you can redistribute it and/or modify it
|
|
* under the terms and conditions of the GNU General Public License,
|
|
* version 2, as published by the Free Software Foundation.
|
|
*
|
|
* This program is distributed in the hope it will be useful, but WITHOUT
|
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
|
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
|
|
* more details.
|
|
*/
|
|
|
|
#ifdef CONFIG_NVGPU_TRACE
|
|
#include <trace/events/gk20a.h>
|
|
#endif
|
|
#include <linux/irqreturn.h>
|
|
|
|
#include <nvgpu/gk20a.h>
|
|
|
|
#include <nvgpu/atomic.h>
|
|
#include <nvgpu/unit.h>
|
|
#ifndef CONFIG_NVGPU_RECOVERY
|
|
#include <nvgpu/nvgpu_init.h>
|
|
#endif
|
|
#include "os_linux.h"
|
|
|
|
irqreturn_t nvgpu_intr_stall(struct gk20a *g)
|
|
{
|
|
u32 mc_intr_0;
|
|
|
|
#ifdef CONFIG_NVGPU_TRACE
|
|
trace_mc_gk20a_intr_stall(g->name);
|
|
#endif
|
|
|
|
if (!g->power_on)
|
|
return IRQ_NONE;
|
|
|
|
/* not from gpu when sharing irq with others */
|
|
mc_intr_0 = g->ops.mc.intr_stall(g);
|
|
if (unlikely(!mc_intr_0))
|
|
return IRQ_NONE;
|
|
|
|
g->ops.mc.intr_stall_pause(g);
|
|
#ifndef CONFIG_NVGPU_RECOVERY
|
|
if (g->sw_quiesce_pending) {
|
|
return IRQ_NONE;
|
|
}
|
|
#endif
|
|
|
|
nvgpu_atomic_inc(&g->hw_irq_stall_count);
|
|
|
|
#ifdef CONFIG_NVGPU_TRACE
|
|
trace_mc_gk20a_intr_stall_done(g->name);
|
|
#endif
|
|
|
|
return IRQ_WAKE_THREAD;
|
|
}
|
|
|
|
irqreturn_t nvgpu_intr_thread_stall(struct gk20a *g)
|
|
{
|
|
int hw_irq_count;
|
|
|
|
nvgpu_log(g, gpu_dbg_intr, "interrupt thread launched");
|
|
|
|
#ifdef CONFIG_NVGPU_TRACE
|
|
trace_mc_gk20a_intr_thread_stall(g->name);
|
|
#endif
|
|
|
|
hw_irq_count = nvgpu_atomic_read(&g->hw_irq_stall_count);
|
|
g->ops.mc.isr_stall(g);
|
|
g->ops.mc.intr_stall_resume(g);
|
|
/* sync handled irq counter before re-enabling interrupts */
|
|
nvgpu_atomic_set(&g->sw_irq_stall_last_handled, hw_irq_count);
|
|
|
|
nvgpu_cond_broadcast(&g->sw_irq_stall_last_handled_cond);
|
|
|
|
#ifdef CONFIG_NVGPU_TRACE
|
|
trace_mc_gk20a_intr_thread_stall_done(g->name);
|
|
#endif
|
|
|
|
return IRQ_HANDLED;
|
|
}
|
|
|
|
irqreturn_t nvgpu_intr_nonstall(struct gk20a *g)
|
|
{
|
|
u32 non_stall_intr_val;
|
|
u32 hw_irq_count;
|
|
int ops_old, ops_new, ops = 0;
|
|
struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
|
|
|
|
if (!g->power_on)
|
|
return IRQ_NONE;
|
|
|
|
/* not from gpu when sharing irq with others */
|
|
non_stall_intr_val = g->ops.mc.intr_nonstall(g);
|
|
if (unlikely(!non_stall_intr_val))
|
|
return IRQ_NONE;
|
|
|
|
g->ops.mc.intr_nonstall_pause(g);
|
|
#ifndef CONFIG_NVGPU_RECOVERY
|
|
if (g->sw_quiesce_pending) {
|
|
return IRQ_NONE;
|
|
}
|
|
#endif
|
|
|
|
ops = g->ops.mc.isr_nonstall(g);
|
|
if (ops) {
|
|
do {
|
|
ops_old = atomic_read(&l->nonstall_ops);
|
|
ops_new = ops_old | ops;
|
|
} while (ops_old != atomic_cmpxchg(&l->nonstall_ops,
|
|
ops_old, ops_new));
|
|
|
|
queue_work(l->nonstall_work_queue, &l->nonstall_fn_work);
|
|
}
|
|
|
|
hw_irq_count = nvgpu_atomic_inc_return(&g->hw_irq_nonstall_count);
|
|
|
|
/* sync handled irq counter before re-enabling interrupts */
|
|
nvgpu_atomic_set(&g->sw_irq_nonstall_last_handled, hw_irq_count);
|
|
|
|
g->ops.mc.intr_nonstall_resume(g);
|
|
|
|
nvgpu_cond_broadcast(&g->sw_irq_nonstall_last_handled_cond);
|
|
|
|
return IRQ_HANDLED;
|
|
}
|
|
|
|
static void mc_gk20a_handle_intr_nonstall(struct gk20a *g, u32 ops)
|
|
{
|
|
bool semaphore_wakeup, post_events;
|
|
|
|
semaphore_wakeup =
|
|
(((ops & GK20A_NONSTALL_OPS_WAKEUP_SEMAPHORE) != 0U) ?
|
|
true : false);
|
|
post_events = (((ops & GK20A_NONSTALL_OPS_POST_EVENTS) != 0U) ?
|
|
true: false);
|
|
|
|
if (semaphore_wakeup) {
|
|
g->ops.semaphore_wakeup(g, post_events);
|
|
}
|
|
}
|
|
|
|
void nvgpu_intr_nonstall_cb(struct work_struct *work)
|
|
{
|
|
struct nvgpu_os_linux *l =
|
|
container_of(work, struct nvgpu_os_linux, nonstall_fn_work);
|
|
struct gk20a *g = &l->g;
|
|
|
|
do {
|
|
u32 ops;
|
|
|
|
ops = atomic_xchg(&l->nonstall_ops, 0);
|
|
mc_gk20a_handle_intr_nonstall(g, ops);
|
|
} while (atomic_read(&l->nonstall_ops) != 0);
|
|
}
|