Files
linux-nvgpu/drivers/gpu/nvgpu/os/posix/thread.c
Debarshi Dutta 667867a199 gpu: nvgpu: Resolve failed cond init.
Following changes are added to fix the issue.

1) Threads having higher priority e.g. RT may preempt
threads with sched-normal priority. As a consequence, higher priority
threads might not still see initialization of data in another thread
resulting in failures such as accessing a condition value before initialization.

Any initialization in the parent thread must be accompanied by a barrier
to make it visible in other thread. Added appropriate barriers to prevent
reordering of the initialization in the thread construction path.

2) There is a race condition between nvgpu_cond_signal() and
nvgpu_cond_destroy() in the asynchronous submit code and corresponding
worker thread's process_item callback for NVS. This may lead to
data corruption and resulting in the above errors as well. Fixed
that by adding a refcount based mechanism for ownership sharing
of the struct nvgpu_nvs_worker_item between the two threads.

Bug 3778235

Signed-off-by: Debarshi Dutta <ddutta@nvidia.com>
Change-Id: Ie9b9ba57bc1dcbb8780801be79863adc39690f72
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2771535
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
Reviewed-by: svc-mobile-cert <svc-mobile-cert@nvidia.com>
Reviewed-by: Prateek Sethi <prsethi@nvidia.com>
Reviewed-by: Ketan Patil <ketanp@nvidia.com>
GVS: Gerrit_Virtual_Submit <buildbot_gerritrpt@nvidia.com>
2022-09-27 23:25:55 -07:00

341 lines
8.9 KiB
C

/*
* Copyright (c) 2018-2022, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <nvgpu/bug.h>
#include <nvgpu/thread.h>
#include <nvgpu/barrier.h>
#include <nvgpu/os_sched.h>
#ifdef NVGPU_UNITTEST_FAULT_INJECTION_ENABLEMENT
#include <nvgpu/posix/posix-fault-injection.h>
#endif
#ifdef NVGPU_UNITTEST_FAULT_INJECTION_ENABLEMENT
struct nvgpu_posix_fault_inj *nvgpu_thread_get_fault_injection(void)
{
struct nvgpu_posix_fault_inj_container *c =
nvgpu_posix_fault_injection_get_container();
return &c->thread_fi;
}
struct nvgpu_posix_fault_inj
*nvgpu_thread_running_true_get_fault_injection(void)
{
struct nvgpu_posix_fault_inj_container *c =
nvgpu_posix_fault_injection_get_container();
return &c->thread_running_true_fi;
}
struct nvgpu_posix_fault_inj *nvgpu_thread_serial_get_fault_injection(void)
{
struct nvgpu_posix_fault_inj_container *c =
nvgpu_posix_fault_injection_get_container();
return &c->thread_serial_fi;
}
#endif
/**
* Use pthreads to mostly emulate the Linux kernel APIs. There are some things
* that are quite different - especially the stop/should_stop notions. In user
* space threads can send signals to one another but of course within the kernel
* that is not as simple.
*
* This could use some nice debugging some day as well.
*/
static void *nvgpu_posix_thread_wrapper(void *data)
{
long ret;
struct nvgpu_posix_thread_data *nvgpu =
(struct nvgpu_posix_thread_data *)data;
#ifdef NVGPU_UNITTEST_FAULT_INJECTION_ENABLEMENT
/* setup the fault injection container from the parent */
nvgpu_posix_init_fault_injection(nvgpu->fi_container);
#endif
nvgpu_smp_rmb();
ret = nvgpu->fn(nvgpu->data);
if (ret != 0L) {
nvgpu_info(NULL, "Error %ld return from thread: %d",
ret, nvgpu_current_tid(NULL));
}
return NULL;
}
static void nvgpu_thread_cancel_sync(struct nvgpu_thread *thread)
{
int err;
err = pthread_cancel(thread->thread);
if (err != 0) {
nvgpu_info(NULL, "Thread cancel error");
}
}
int nvgpu_thread_create(struct nvgpu_thread *thread,
void *data,
int (*threadfn)(void *data), const char *name)
{
pthread_attr_t attr;
int ret;
#ifdef NVGPU_UNITTEST_FAULT_INJECTION_ENABLEMENT
if (nvgpu_posix_fault_injection_handle_call(
nvgpu_thread_get_fault_injection())) {
return -EINVAL;
}
#endif
(void) memset(thread, 0, sizeof(*thread));
/*
* By subtracting 1 the above memset ensures that we have a zero
* terminated string.
*/
if (name != NULL) {
(void) strncpy(thread->tname, name,
NVGPU_THREAD_POSIX_MAX_NAMELEN - 1);
}
thread->nvgpu.data = data;
thread->nvgpu.fn = threadfn;
#ifdef NVGPU_UNITTEST_FAULT_INJECTION_ENABLEMENT
/* pass the fault injection container to the child */
thread->nvgpu.fi_container =
nvgpu_posix_fault_injection_get_container();
#endif
nvgpu_atomic_set(&thread->running, 1);
ret = pthread_attr_init(&attr);
if (ret != 0) {
return ret;
}
ret = pthread_create(&thread->thread, &attr,
nvgpu_posix_thread_wrapper,
&thread->nvgpu);
if (ret != 0) {
if ((pthread_attr_destroy(&attr)) != 0) {
nvgpu_info(NULL, "Thread attr destroy error");
}
return ret;
}
#ifdef _GNU_SOURCE
pthread_setname_np(thread->thread, thread->tname);
#endif
ret = pthread_attr_destroy(&attr);
if (ret != 0) {
if ((pthread_cancel(thread->thread)) != 0) {
nvgpu_info(NULL, "Thread cancel error");
}
return ret;
}
#ifdef NVGPU_UNITTEST_FAULT_INJECTION_ENABLEMENT
/*
* For some code that is tested using public APIs it's not safe for the
* parent thread to continue while the child thread is running. Fault
* injection per thread pointer points to the same container for both
* parent thread as well as the created one. So, there is chance of a
* race in fault injection functionality. Serialize the run so that race
* can be mitigated. Caller of nvgpu_thread_create() API should ensure
* that the created thread is stopped using some fault injection or
* otherwise.
*/
if (nvgpu_posix_fault_injection_handle_call(
nvgpu_thread_serial_get_fault_injection())) {
if ((pthread_join(thread->thread, NULL)) != 0) {
nvgpu_info(NULL, "Thread join error");
}
}
#endif
return 0;
}
int nvgpu_thread_create_priority(struct nvgpu_thread *thread,
void *data, int (*threadfn)(void *data),
int priority, const char *name)
{
pthread_attr_t attr;
struct sched_param param;
int ret;
#ifdef NVGPU_UNITTEST_FAULT_INJECTION_ENABLEMENT
if (nvgpu_posix_fault_injection_handle_call(
nvgpu_thread_get_fault_injection())) {
return -EINVAL;
}
#endif
(void) memset(thread, 0, sizeof(*thread));
(void) memset(&param, 0, sizeof(struct sched_param));
/*
* By subtracting 1 the above memset ensures that we have a zero
* terminated string.
*/
if (name != NULL) {
(void) strncpy(thread->tname, name,
NVGPU_THREAD_POSIX_MAX_NAMELEN - 1);
}
thread->nvgpu.data = data;
thread->nvgpu.fn = threadfn;
#ifdef NVGPU_UNITTEST_FAULT_INJECTION_ENABLEMENT
/* pass the fault injection container to the child */
thread->nvgpu.fi_container =
nvgpu_posix_fault_injection_get_container();
#endif
nvgpu_atomic_set(&thread->running, 1);
ret = pthread_attr_init(&attr);
if (ret != 0) {
return ret;
}
ret = pthread_attr_setinheritsched(&attr, PTHREAD_EXPLICIT_SCHED);
if (ret != 0) {
goto attr_destroy;
}
ret = pthread_attr_setschedpolicy(&attr, SCHED_RR);
if (ret != 0) {
goto attr_destroy;
}
param.sched_priority = priority;
ret = pthread_attr_setschedparam(&attr, &param);
if (ret != 0) {
goto attr_destroy;
}
ret = pthread_create(&thread->thread, &attr,
nvgpu_posix_thread_wrapper, &thread->nvgpu);
if (ret != 0) {
goto attr_destroy;
}
#ifdef _GNU_SOURCE
pthread_setname_np(thread->thread, thread->tname);
#endif
ret = pthread_attr_destroy(&attr);
if (ret != 0) {
if ((pthread_cancel(thread->thread)) != 0) {
nvgpu_info(NULL, "Thread cancel error");
}
return ret;
}
#ifdef NVGPU_UNITTEST_FAULT_INJECTION_ENABLEMENT
/*
* For some code that is tested using public APIs it's not safe for the
* parent thread to continue while the child thread is running. Fault
* injection per thread pointer points to the same container for both
* parent thread as well as the created one. So, there is chance of a
* race in fault injection functionality. Serialize the run so that race
* can be mitigated. Caller of nvgpu_thread_create() API should ensure
* that the created thread is stopped using some fault injection or
* otherwise.
*/
if (nvgpu_posix_fault_injection_handle_call(
nvgpu_thread_serial_get_fault_injection())) {
(void) pthread_join(thread->thread, NULL);
}
#endif
return 0;
attr_destroy:
if ((pthread_attr_destroy(&attr)) != 0) {
nvgpu_info(NULL, "Thread attr destroy error");
}
return ret;
}
void nvgpu_thread_stop(struct nvgpu_thread *thread)
{
int old = nvgpu_atomic_cmpxchg(&thread->running, 1, 0);
if (old != 0) {
nvgpu_thread_cancel_sync(thread);
nvgpu_thread_join(thread);
}
}
void nvgpu_thread_stop_graceful(struct nvgpu_thread *thread,
void (*thread_stop_fn)(void *data), void *data)
{
int old = nvgpu_atomic_cmpxchg(&thread->running, 1, 0);
if (old != 0) {
if (thread_stop_fn != NULL) {
thread_stop_fn(data);
}
nvgpu_thread_join(thread);
}
}
bool nvgpu_thread_should_stop(struct nvgpu_thread *thread)
{
#ifdef NVGPU_UNITTEST_FAULT_INJECTION_ENABLEMENT
if (nvgpu_posix_fault_injection_handle_call(
nvgpu_thread_get_fault_injection())) {
return true;
}
#endif
return (nvgpu_atomic_read(&thread->running) == 0);
}
bool nvgpu_thread_is_running(struct nvgpu_thread *thread)
{
#ifdef NVGPU_UNITTEST_FAULT_INJECTION_ENABLEMENT
if (nvgpu_posix_fault_injection_handle_call(
nvgpu_thread_running_true_get_fault_injection())) {
return true;
}
#endif
return (nvgpu_atomic_read(&thread->running) == 1);
}
void nvgpu_thread_join(struct nvgpu_thread *thread)
{
int err;
err = pthread_join(thread->thread, NULL);
if(err != 0 && err != ESRCH) {
BUG();
}
}