gpu: nvgpu: Resolve failed cond init.
The following changes fix the issue:

1) Threads with higher priority (e.g. RT) may preempt threads running at
   sched-normal priority. As a consequence, a higher-priority thread may
   still not see data initialized by another thread, leading to failures
   such as accessing a condition variable before it is initialized. Any
   initialization done in the parent thread must be paired with a barrier
   to make it visible to the new thread. Added the appropriate barriers to
   prevent reordering of the initialization in the thread construction path.

2) There is a race condition between nvgpu_cond_signal() and
   nvgpu_cond_destroy() in the asynchronous submit code and the
   corresponding worker thread's process_item callback for NVS. This can
   corrupt data and produce the same errors. Fixed by adding a
   refcount-based mechanism for sharing ownership of
   struct nvgpu_nvs_worker_item between the two threads.

Bug 3778235

Signed-off-by: Debarshi Dutta <ddutta@nvidia.com>
Change-Id: Ie9b9ba57bc1dcbb8780801be79863adc39690f72
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2771535
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
Reviewed-by: svc-mobile-cert <svc-mobile-cert@nvidia.com>
Reviewed-by: Prateek Sethi <prsethi@nvidia.com>
Reviewed-by: Ketan Patil <ketanp@nvidia.com>
GVS: Gerrit_Virtual_Submit <buildbot_gerritrpt@nvidia.com>
commit 667867a199
parent 038005986e
committed by Ketan Patil
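For context on fix 1, here is a minimal standalone sketch of the publish/consume pattern the barriers establish. It is illustrative only, not the nvgpu sources: the real patch pairs nvgpu_smp_wmb() at the end of nvgpu_worker_init_common() with nvgpu_smp_rmb() in the thread entry wrappers, while the flag, names, and pthread usage below are assumptions made for the example.

    /* Illustrative C11 sketch, not nvgpu code: parent initializes shared
     * state, then publishes it with release semantics; the thread consumes
     * it with acquire semantics, so even a higher-priority thread scheduled
     * immediately can never observe partially initialized data.
     */
    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>

    struct worker_state {
            int timeslice_us;      /* plain data set up by the parent */
            atomic_bool published; /* release/acquire handoff flag */
    };

    static void *worker(void *arg)
    {
            struct worker_state *ws = arg;

            /* Acquire: pairs with the release store in main(); after this
             * loop the write to timeslice_us is guaranteed visible. */
            while (!atomic_load_explicit(&ws->published,
                                         memory_order_acquire))
                    ;
            printf("timeslice = %d us\n", ws->timeslice_us);
            return NULL;
    }

    int main(void)
    {
            struct worker_state ws = { .timeslice_us = 0,
                                       .published = false };
            pthread_t t;

            pthread_create(&t, NULL, worker, &ws);
            ws.timeslice_us = 100; /* initialization racing with thread start */
            /* Release: everything above becomes visible to the acquire loop. */
            atomic_store_explicit(&ws.published, true, memory_order_release);
            pthread_join(t, NULL);
            return 0;
    }

Without the release/acquire pair, the worker's read of timeslice_us would be a data race with the parent's write.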
@@ -28,6 +28,7 @@
 #include <nvgpu/kmem.h>
 #include <nvgpu/gk20a.h>
 #include <nvgpu/runlist.h>
+#include <nvgpu/kref.h>
 
 static struct nvs_sched_ops nvgpu_nvs_ops = {
         .preempt = NULL,
@@ -55,6 +56,7 @@ struct nvgpu_nvs_worker_item {
         bool wait_for_finish;
         bool locked;
         int status;
+        struct nvgpu_ref ref;
         struct nvgpu_list_node list;
         nvgpu_atomic_t state;
 };
@@ -77,6 +79,13 @@ nvgpu_nvs_worker_from_worker(struct nvgpu_worker *worker)
                 ((uintptr_t)worker - offsetof(struct nvgpu_nvs_worker, worker));
 };
 
+static inline struct nvgpu_nvs_worker_item *
+nvgpu_nvs_worker_item_from_ref(struct nvgpu_ref *ref_node)
+{
+        return (struct nvgpu_nvs_worker_item *)
+                ((uintptr_t)ref_node - offsetof(struct nvgpu_nvs_worker_item, ref));
+};
+
 static void nvgpu_nvs_worker_poll_init(struct nvgpu_worker *worker)
 {
         struct nvgpu_nvs_worker *nvs_worker =
@@ -152,6 +161,16 @@ static u64 nvgpu_nvs_tick(struct gk20a *g)
         return timeslice;
 }
 
+static void nvgpu_nvs_worker_item_release(struct nvgpu_ref *ref)
+{
+        struct nvgpu_nvs_worker_item *work =
+                nvgpu_nvs_worker_item_from_ref(ref);
+        struct gk20a *g = work->g;
+
+        nvgpu_cond_destroy(&work->cond);
+        nvgpu_kfree(g, work);
+}
+
 static void nvgpu_nvs_worker_wakeup_process_item(struct nvgpu_list_node *work_item)
 {
         struct nvgpu_nvs_worker_item *work =
@@ -195,9 +214,14 @@ static void nvgpu_nvs_worker_wakeup_process_item(struct nvgpu_list_node *work_item)
 done:
         nvgpu_mutex_release(&g->sched_mutex);
         work->status = ret;
-        (void)nvgpu_atomic_xchg(&work->state, 1);
+        nvgpu_atomic_set(&work->state, 1);
+
+        nvgpu_smp_mb();
         /* Wakeup threads waiting on runlist submit */
         nvgpu_cond_signal(&work->cond);
+
+        /* This reference was taken as part of nvgpu_nvs_worker_submit */
+        nvgpu_ref_put(&work->ref, nvgpu_nvs_worker_item_release);
 }
 
 static int nvgpu_nvs_worker_submit(struct gk20a *g, struct nvgpu_runlist *rl,
@@ -228,21 +252,29 @@ static int nvgpu_nvs_worker_submit(struct gk20a *g, struct nvgpu_runlist *rl,
         nvgpu_init_list_node(&work->list);
         work->wait_for_finish = wait_for_finish;
         nvgpu_atomic_set(&work->state, 0);
+        nvgpu_ref_init(&work->ref);
 
         nvs_dbg(g, " enqueueing runlist submit");
 
+        /* Add a barrier here to ensure all reads and writes have happened before
+         * enqueuing the job in the worker thread.
+         */
+        nvgpu_smp_mb();
+
+        /* The corresponding refcount is decremented inside the wakeup_process item */
+        nvgpu_ref_get(&work->ref);
         ret = nvgpu_worker_enqueue(&worker->worker, &work->list);
         if (ret != 0) {
+                /* Refcount is decremented here as no additional job is enqueued */
+                nvgpu_ref_put(&work->ref, nvgpu_nvs_worker_item_release);
                 goto fail;
         }
 
-        /* Add a barrier here to ensure that worker thread is interrupted
-         * before waiting on the condition below
-         */
-        nvgpu_mb();
-
         ret = NVGPU_COND_WAIT(&work->cond, nvgpu_atomic_read(&work->state) == 1, 0U);
         if (ret != 0) {
+                /* refcount is not decremented here since even though this thread is
+                 * unblocked, but the job could be still queued.
+                 */
                 nvgpu_err(g, "Runlist submit interrupted while waiting for submit");
                 goto fail;
         }
@@ -252,8 +284,7 @@ static int nvgpu_nvs_worker_submit(struct gk20a *g, struct nvgpu_runlist *rl,
         ret = work->status;
 
 fail:
-        nvgpu_cond_destroy(&work->cond);
-        nvgpu_kfree(g, work);
+        nvgpu_ref_put(&work->ref, nvgpu_nvs_worker_item_release);
 
 free_domain:
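The hunks above share ownership of the work item between the submitting thread and the worker. A condensed sketch of that two-owner refcount pattern follows, written in plain C11 rather than the nvgpu_ref API; the names and atomics here are assumptions for illustration only.

    /* Illustrative sketch (not the nvgpu API): whichever side drops the
     * last reference is the only one allowed to destroy the condition
     * variable and free the item, which is what removes the
     * signal-vs-destroy race.
     */
    #include <stdatomic.h>
    #include <stdlib.h>

    struct work_item {
            atomic_int refcount;
            /* cond, status, state, list node ... elided */
    };

    static struct work_item *work_item_create(void)
    {
            struct work_item *w = calloc(1, sizeof(*w));

            if (w != NULL)
                    atomic_init(&w->refcount, 1); /* submitter's reference */
            return w;
    }

    static void work_item_get(struct work_item *w)
    {
            atomic_fetch_add_explicit(&w->refcount, 1, memory_order_relaxed);
    }

    static void work_item_put(struct work_item *w)
    {
            /* acq_rel on the decrement: all writes to the item
             * happen-before its destruction by the last owner. */
            if (atomic_fetch_sub_explicit(&w->refcount, 1,
                                          memory_order_acq_rel) == 1) {
                    /* last owner: safe to tear down and free */
                    free(w);
            }
    }

Usage mirrors the diff: the submitter starts at refcount 1, takes a second reference for the worker before enqueue (dropping it again if enqueue fails), and drops its own reference at fail:. The worker drops its reference only after signaling, so the condition variable outlives every possible nvgpu_cond_signal() call.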
@@ -25,6 +25,7 @@
 #include <nvgpu/worker.h>
 #include <nvgpu/string.h>
 #include <nvgpu/thread.h>
+#include <nvgpu/barrier.h>
 
 static void nvgpu_worker_pre_process(struct nvgpu_worker *worker)
 {
@@ -331,6 +332,11 @@ static void nvgpu_worker_init_common(struct gk20a *g,
         nvgpu_mutex_init(&worker->start_lock);
 
         worker->ops = worker_ops;
+
+        /* Ensure initialization is complete before actually invoking the thread.
+         * The corresponding read barrier lies in the nvgpu_thread_proxy function.
+         */
+        nvgpu_smp_wmb();
 }
 
 int nvgpu_worker_init(struct gk20a *g, struct nvgpu_worker *worker,
@@ -18,14 +18,22 @@
 #include <linux/sched.h>
 #include <linux/version.h>
 
+#include <nvgpu/barrier.h>
 #include <nvgpu/thread.h>
 #include <nvgpu/timers.h>
 
 int nvgpu_thread_proxy(void *threaddata)
 {
         struct nvgpu_thread *thread = threaddata;
-        int ret = thread->fn(thread->data);
         bool was_running;
+        int ret;
 
+        /* Ensure any initialization required for this thread is completed.
+         * The corresponding write barrier lies at the end of nvgpu_worker_init_common.
+         */
+        nvgpu_smp_rmb();
+
+        ret = thread->fn(thread->data);
+
         was_running = nvgpu_atomic_xchg(&thread->running, false);
 
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2018-2022, NVIDIA CORPORATION. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -155,13 +155,16 @@ void nvgpu_cond_destroy(struct nvgpu_cond *cond)
         if (cond == NULL) {
                 BUG();
         }
 
+        nvgpu_mutex_acquire(&cond->mutex);
         err = pthread_cond_destroy(&cond->cond);
         nvgpu_assert(err == 0);
-        nvgpu_mutex_destroy(&cond->mutex);
         err = pthread_condattr_destroy(&cond->attr);
+        nvgpu_mutex_release(&cond->mutex);
         if (err != 0) {
                 nvgpu_info(NULL, "Cond attr destroy error");
         }
+        nvgpu_mutex_destroy(&cond->mutex);
         cond->initialized = false;
 }
 
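The hunk above serializes teardown against in-flight signals by taking the wrapper's internal mutex around the pthread destroy calls, and only destroys the mutex itself afterwards. A minimal pthread sketch of that ordering, assuming a wrapper struct shaped like nvgpu_cond (the wrapper names below are illustrative, not the nvgpu API):

    #include <pthread.h>

    struct cond_wrapper {
            pthread_mutex_t mutex;
            pthread_cond_t cond;
            pthread_condattr_t attr;
    };

    static void cond_wrapper_signal(struct cond_wrapper *c)
    {
            pthread_mutex_lock(&c->mutex);
            pthread_cond_signal(&c->cond);
            pthread_mutex_unlock(&c->mutex);
    }

    static void cond_wrapper_destroy(struct cond_wrapper *c)
    {
            pthread_mutex_lock(&c->mutex);
            pthread_cond_destroy(&c->cond);   /* no signaler is inside now */
            pthread_condattr_destroy(&c->attr);
            pthread_mutex_unlock(&c->mutex);
            pthread_mutex_destroy(&c->mutex); /* mutex itself goes last */
    }

On its own this narrows rather than eliminates the race, since a signaler already blocked on the mutex could still run after destroy; the refcount in the NVS hunks is what guarantees no signaler remains by the time destroy is reached.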
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2018-2022, NVIDIA CORPORATION. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -22,6 +22,7 @@
 
 #include <nvgpu/bug.h>
 #include <nvgpu/thread.h>
+#include <nvgpu/barrier.h>
 #include <nvgpu/os_sched.h>
 #ifdef NVGPU_UNITTEST_FAULT_INJECTION_ENABLEMENT
 #include <nvgpu/posix/posix-fault-injection.h>
@@ -74,6 +75,8 @@ static void *nvgpu_posix_thread_wrapper(void *data)
         nvgpu_posix_init_fault_injection(nvgpu->fi_container);
 #endif
 
+        nvgpu_smp_rmb();
+
         ret = nvgpu->fn(nvgpu->data);
 
         if (ret != 0L) {