gpu: nvgpu: add BUG() callbacks

Add support for registering callbacks that will
be called on BUG().

Jira NVGPU-4512

Change-Id: I35c9b6c17db3b9fa5d098918223083f0b4aaace4
Signed-off-by: Thomas Fleury <tfleury@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2266391
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
Thomas Fleury
2019-12-19 15:00:39 -05:00
committed by Alex Waterman
parent 569f34470e
commit 1f3f34b906
4 changed files with 123 additions and 3 deletions

View File

@@ -28,6 +28,7 @@
#include <nvgpu/posix/bug.h> #include <nvgpu/posix/bug.h>
#endif #endif
#include <nvgpu/cov_whitelist.h> #include <nvgpu/cov_whitelist.h>
#include <nvgpu/list.h>
/* /*
* Define an assert macro that code within nvgpu can use. * Define an assert macro that code within nvgpu can use.
@@ -92,4 +93,25 @@ struct gk20a;
nvgpu_do_assert(); \ nvgpu_do_assert(); \
} while (false) } while (false)
/*
 * Descriptor for one callback to be run when BUG() is hit.
 */
struct nvgpu_bug_cb {
	/* Function to call; it receives @arg as its only parameter. */
	void (*cb)(void *arg);
	/* Opaque value handed to @cb when it is invoked. */
	void *arg;
	/* Link used to chain this descriptor into the callback list. */
	struct nvgpu_list_node node;
};
/*
 * Convert a pointer to the list node embedded in a struct nvgpu_bug_cb
 * back into a pointer to the enclosing structure.
 *
 * @node must point at the 'node' member of a struct nvgpu_bug_cb; the
 * result is obtained by subtracting the offset of that member from the
 * node address.
 */
static inline struct nvgpu_bug_cb *
nvgpu_bug_cb_from_node(struct nvgpu_list_node *node)
{
	return (struct nvgpu_bug_cb *)
		((uintptr_t)node - offsetof(struct nvgpu_bug_cb, node));
}
#ifdef __KERNEL__
/*
 * When __KERNEL__ is defined the BUG() callback entry points are
 * provided as empty inline stubs: calling them has no effect.
 */
static inline void nvgpu_bug_exit(void)
{
}

static inline void nvgpu_bug_register_cb(struct nvgpu_bug_cb *cb)
{
	(void)cb;
}

static inline void nvgpu_bug_unregister_cb(struct nvgpu_bug_cb *cb)
{
	(void)cb;
}
#endif
#endif /* NVGPU_BUG_H */ #endif /* NVGPU_BUG_H */

View File

@@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2011-2019, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved.
* *
* GK20A Graphics * GK20A Graphics
* *
@@ -698,6 +698,7 @@ struct gk20a {
struct nvgpu_cond sw_quiesce_cond; struct nvgpu_cond sw_quiesce_cond;
struct nvgpu_thread sw_quiesce_thread; struct nvgpu_thread sw_quiesce_thread;
#endif #endif
struct nvgpu_list_node bug_node;
/** Controls which messages are logged */ /** Controls which messages are logged */
u64 log_mask; u64 log_mask;

View File

@@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
* *
* Permission is hereby granted, free of charge, to any person obtaining a * Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"), * copy of this software and associated documentation files (the "Software"),
@@ -134,4 +134,33 @@ void bug_handler_cancel(void);
bug_result; \ bug_result; \
}) })
#endif #endif
struct nvgpu_bug_cb;
/**
 * @brief Register callback to be invoked on BUG()
 *
 * @param cb [in]	Pointer to callback structure
 *
 * Register a callback to be invoked on BUG().
 * The nvgpu_bug_cb structure contains a function pointer
 * and an argument to be passed to this function.
 * This mechanism can be used to perform some emergency
 * operations on a GPU before exiting the process.
 *
 * The structure is linked into the callback list through its
 * embedded list node rather than copied, so it must remain valid
 * for as long as it is registered. Callbacks are appended at the
 * tail of the list and are invoked starting from the head, i.e.
 * in registration order.
 *
 * Note: callback is automatically unregistered before
 * being invoked.
 */
void nvgpu_bug_register_cb(struct nvgpu_bug_cb *cb);
/**
 * @brief Unregister a callback for BUG()
 *
 * @param cb [in]	Pointer to callback structure
 *
 * Remove a callback from the list of callbacks to be
 * invoked on BUG().
 *
 * NOTE(review): the behaviour when @cb is not currently on the
 * list (never registered, or already removed because it was
 * invoked) depends on nvgpu_list_del() - confirm before relying
 * on it.
 */
void nvgpu_bug_unregister_cb(struct nvgpu_bug_cb *cb);
#endif /* NVGPU_POSIX_BUG_H */ #endif /* NVGPU_POSIX_BUG_H */

View File

@@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
* *
* Permission is hereby granted, free of charge, to any person obtaining a * Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"), * copy of this software and associated documentation files (the "Software"),
@@ -21,6 +21,8 @@
*/ */
#include <nvgpu/log.h> #include <nvgpu/log.h>
#include <nvgpu/lock.h>
#include <nvgpu/list.h>
#include <nvgpu/posix/bug.h> #include <nvgpu/posix/bug.h>
#include <pthread.h> #include <pthread.h>
#include <stdbool.h> #include <stdbool.h>
@@ -75,11 +77,48 @@ void dump_stack(void)
nvgpu_posix_dump_stack(2); nvgpu_posix_dump_stack(2);
} }
/*
 * State shared between the BUG() callback registration functions and
 * nvgpu_posix_bug().
 */
struct nvgpu_bug_desc {
	/*
	 * Set to true by nvgpu_bug_init(). nvgpu_posix_bug() does not touch
	 * the lock or the list while this is still false.
	 */
	bool in_use;
	/* pthread_once() control used to run nvgpu_bug_init(). */
	pthread_once_t once;
	/* Taken around every addition to / removal from the list below. */
	struct nvgpu_spinlock lock;
	/* Head of the list of registered struct nvgpu_bug_cb entries. */
	struct nvgpu_list_node head;
};
/*
 * Single descriptor holding the BUG() callback list.
 *
 * Only 'once' needs a static initializer; the remaining members are set
 * up by nvgpu_bug_init() on first use. The object is private to this
 * file (its type is not visible elsewhere), so it is given internal
 * linkage to keep the very generic name 'bug' out of the global
 * namespace.
 */
static struct nvgpu_bug_desc bug = {
	.once = PTHREAD_ONCE_INIT
};
/*
 * One-time initialisation of the BUG() callback descriptor.
 *
 * Run through pthread_once() from the register/unregister entry points.
 * Initialises the spinlock and the (empty) callback list, then marks the
 * descriptor as in use so that nvgpu_posix_bug() will walk the list.
 */
static void nvgpu_bug_init(void)
{
	/*
	 * This is normal start-up, not a failure: report it at info level
	 * so it does not show up as an error in the log.
	 */
	nvgpu_info(NULL, "doing init for bug cb");
	nvgpu_spinlock_init(&bug.lock);
	nvgpu_init_list_node(&bug.head);
	/* Set last, once the lock and list are ready to be used. */
	bug.in_use = true;
}
/*
 * Append @cb to the list of callbacks run by nvgpu_posix_bug().
 *
 * The shared descriptor is set up on first use via pthread_once(); the
 * list is then updated with the descriptor spinlock held.
 */
void nvgpu_bug_register_cb(struct nvgpu_bug_cb *cb)
{
	struct nvgpu_spinlock *list_lock = &bug.lock;
	struct nvgpu_list_node *cb_list = &bug.head;

	(void) pthread_once(&bug.once, nvgpu_bug_init);

	nvgpu_spinlock_acquire(list_lock);
	nvgpu_list_add_tail(&cb->node, cb_list);
	nvgpu_spinlock_release(list_lock);
}
/*
 * Unlink @cb from the list of callbacks run by nvgpu_posix_bug().
 *
 * The shared descriptor is set up on first use via pthread_once() so
 * that the spinlock is valid even if nothing was registered before; the
 * node is then removed with the descriptor spinlock held.
 */
void nvgpu_bug_unregister_cb(struct nvgpu_bug_cb *cb)
{
	struct nvgpu_spinlock *list_lock = &bug.lock;

	(void) pthread_once(&bug.once, nvgpu_bug_init);

	nvgpu_spinlock_acquire(list_lock);
	nvgpu_list_del(&cb->node);
	nvgpu_spinlock_release(list_lock);
}
/* /*
* Ahhh! A bug! * Ahhh! A bug!
*/ */
void nvgpu_posix_bug(const char *fmt, ...) void nvgpu_posix_bug(const char *fmt, ...)
{ {
struct nvgpu_bug_cb *cb;
#ifdef __NVGPU_UNIT_TEST__ #ifdef __NVGPU_UNIT_TEST__
if (expect_bug) { if (expect_bug) {
nvgpu_info(NULL, "Expected BUG detected!"); nvgpu_info(NULL, "Expected BUG detected!");
@@ -94,6 +133,35 @@ void nvgpu_posix_bug(const char *fmt, ...)
*/ */
nvgpu_err(NULL, "BUG detected!"); nvgpu_err(NULL, "BUG detected!");
dump_stack(); dump_stack();
if (!bug.in_use) {
goto done;
}
nvgpu_spinlock_acquire(&bug.lock);
while (!nvgpu_list_empty(&bug.head)) {
/*
 * Always process the first entry, to handle the unlikely case
 * where a callback unregisters another one.
 */
cb = nvgpu_list_first_entry(&bug.head,
nvgpu_bug_cb, node);
/* Remove callback from list */
nvgpu_list_del(&cb->node);
/*
* Release spinlock before invoking callback.
* This allows callback to register/unregister other
* callbacks (unlikely).
* This allows using a longjmp in a callback
* for unit testing.
*/
nvgpu_spinlock_release(&bug.lock);
cb->cb(cb->arg);
nvgpu_spinlock_acquire(&bug.lock);
}
nvgpu_spinlock_release(&bug.lock);
done:
(void) raise(SIGSEGV); (void) raise(SIGSEGV);
pthread_exit(NULL); pthread_exit(NULL);
} }