gpu: nvgpu: Capture thread name for every channel created

This change ensures that when the GPU enters a bad state because of
work submitted by a misbehaving thread, the name of that thread is
captured as part of the first set of failure logs.
Changes for the QNX environment are pending.

JIRA NVGPU-7783

Change-Id: I65d55a6ade749ff91739458e0642ed2dafaae5cc
Signed-off-by: Kishan <kpalankar@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2879197
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
Reviewed-by: svc-mobile-misra <svc-mobile-misra@nvidia.com>
Reviewed-by: svc-mobile-cert <svc-mobile-cert@nvidia.com>
Reviewed-by: Tejal Kudav <tkudav@nvidia.com>
Reviewed-by: Ankur Kishore <ankkishore@nvidia.com>
GVS: Gerrit_Virtual_Submit <buildbot_gerritrpt@nvidia.com>
This commit is contained in:
Kishan
2023-03-29 11:07:51 +00:00
committed by mobile promotions
parent af48120169
commit c6d5fb348c
8 changed files with 52 additions and 11 deletions

View File

@@ -1260,6 +1260,7 @@ struct nvgpu_channel *nvgpu_channel_open_new(struct gk20a *g,
ch->pid = tid; ch->pid = tid;
ch->tgid = pid; /* process granularity for FECS traces */ ch->tgid = pid; /* process granularity for FECS traces */
nvgpu_get_thread_name(ch->thread_name);
#ifdef CONFIG_NVGPU_USERD #ifdef CONFIG_NVGPU_USERD
if (nvgpu_userd_init_channel(g, ch) != 0) { if (nvgpu_userd_init_channel(g, ch) != 0) {
@@ -2125,11 +2126,12 @@ static void nvgpu_channel_info_debug_dump(struct gk20a *g,
*/ */
u32 ver = nvgpu_safe_add_u32(g->params.gpu_arch, g->params.gpu_impl); u32 ver = nvgpu_safe_add_u32(g->params.gpu_arch, g->params.gpu_impl);
gk20a_debug_output(o, "%d-%s, TSG: %u, pid %d, refs: %d, deterministic: %s, domain name: %s", gk20a_debug_output(o, "%d-%s, TSG: %u, pid %d, thread name %s, refs: %d, deterministic: %s, domain name: %s",
info->chid, info->chid,
g->name, g->name,
info->tsgid, info->tsgid,
info->pid, info->pid,
info->thread_name,
info->refs, info->refs,
info->deterministic ? "yes" : "no", info->deterministic ? "yes" : "no",
info->nvs_domain_name); info->nvs_domain_name);
@@ -2229,6 +2231,7 @@ void nvgpu_channel_debug_dump_all(struct gk20a *g,
info->chid = ch->chid; info->chid = ch->chid;
info->tsgid = ch->tsgid; info->tsgid = ch->tsgid;
info->pid = ch->pid; info->pid = ch->pid;
(void)memcpy(info->thread_name, ch->thread_name, sizeof(info->thread_name));
info->refs = nvgpu_atomic_read(&ch->ref_count); info->refs = nvgpu_atomic_read(&ch->ref_count);
#ifdef CONFIG_NVGPU_KERNEL_MODE_SUBMIT #ifdef CONFIG_NVGPU_KERNEL_MODE_SUBMIT
info->deterministic = nvgpu_channel_is_deterministic(ch); info->deterministic = nvgpu_channel_is_deterministic(ch);

View File

@@ -51,6 +51,10 @@ struct nvgpu_channel_wdt;
struct nvgpu_user_fence; struct nvgpu_user_fence;
struct nvgpu_runlist; struct nvgpu_runlist;
/**
 * Size, in bytes, of the buffers that hold the name of the thread which
 * created a channel (including the terminating NUL written by the Linux
 * implementation of nvgpu_get_thread_name()).
 * Must be kept strictly equal to TASK_COMM_LEN.
 */
#define TASK_NAME_LEN (16U)
/** /**
* S/W defined invalid channel identifier. * S/W defined invalid channel identifier.
*/ */
@@ -187,6 +191,11 @@ struct nvgpu_channel_dump_info {
u32 tsgid; u32 tsgid;
/** Pid of the process that created this channel. */ /** Pid of the process that created this channel. */
int pid; int pid;
/**
* Name of the thread that created the channel.
* Same size as task_struct.comm[] on linux.
*/
char thread_name[TASK_NAME_LEN];
/** Number of references to this channel. */ /** Number of references to this channel. */
int refs; int refs;
/** Channel uses deterministic submit (kernel submit only). */ /** Channel uses deterministic submit (kernel submit only). */
@@ -356,6 +365,11 @@ struct nvgpu_channel {
* Confusingly, at userspace level, this is what is seen as the "pid". * Confusingly, at userspace level, this is what is seen as the "pid".
*/ */
pid_t tgid; pid_t tgid;
/**
* Name of the thread that created the channel.
* Same size as task_struct.comm[] on linux.
*/
char thread_name[TASK_NAME_LEN];
/** Lock to serialize ioctls for this channel. */ /** Lock to serialize ioctls for this channel. */
struct nvgpu_mutex ioctl_lock; struct nvgpu_mutex ioctl_lock;

View File

@@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2017-2023, NVIDIA CORPORATION. All rights reserved.
* *
* Permission is hereby granted, free of charge, to any person obtaining a * Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"), * copy of this software and associated documentation files (the "Software"),
@@ -53,6 +53,13 @@ int nvgpu_current_tid(struct gk20a *g);
*/ */
int nvgpu_current_pid(struct gk20a *g); int nvgpu_current_pid(struct gk20a *g);
/**
 * @brief API to get the name of current thread.
 *
 * @param dest [out]	Pointer to the caller-provided string buffer that
 *			receives the name. The Linux implementation writes
 *			up to TASK_COMM_LEN bytes and always NUL-terminates,
 *			so the buffer must be at least that large. Other OS
 *			layers may not provide a thread name.
 */
void nvgpu_get_thread_name(char *dest);
/** /**
* @brief Print the name of current thread. * @brief Print the name of current thread.
* *

View File

@@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2017-2023, NVIDIA CORPORATION. All rights reserved.
* *
* This program is free software; you can redistribute it and/or modify it * This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License, * under the terms and conditions of the GNU General Public License,
@@ -18,6 +18,7 @@
#include <linux/workqueue.h> #include <linux/workqueue.h>
#include <linux/dma-buf.h> #include <linux/dma-buf.h>
#include <linux/sched.h>
#include <nvgpu/types.h> #include <nvgpu/types.h>

View File

@@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2017-2022, NVIDIA Corporation. All rights reserved. * Copyright (c) 2017-2023, NVIDIA Corporation. All rights reserved.
* *
* This program is free software; you can redistribute it and/or modify it * This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License, * under the terms and conditions of the GNU General Public License,
@@ -150,12 +150,12 @@ void nvgpu_set_err_notifier_locked(struct nvgpu_channel *ch, u32 error)
if (error == NVGPU_CHANNEL_RESETCHANNEL_VERIF_ERROR) { if (error == NVGPU_CHANNEL_RESETCHANNEL_VERIF_ERROR) {
nvgpu_log_info(ch->g, nvgpu_log_info(ch->g,
"error notifier set to %d for ch %d", "error notifier set to %d for ch %d owned by %s",
error, ch->chid); error, ch->chid, ch->thread_name);
} else { } else {
nvgpu_err(ch->g, nvgpu_err(ch->g,
"error notifier set to %d for ch %d", "error notifier set to %d for ch %d owned by %s",
error, ch->chid); error, ch->chid, ch->thread_name);
} }
} }
} }

View File

@@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2017-2023, NVIDIA CORPORATION. All rights reserved.
* *
* This program is free software; you can redistribute it and/or modify it * This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License, * under the terms and conditions of the GNU General Public License,
@@ -30,3 +30,13 @@ void nvgpu_print_current_impl(struct gk20a *g, const char *func_name, int line,
{ {
nvgpu_log_msg_impl(g, func_name, line, type, current->comm); nvgpu_log_msg_impl(g, func_name, line, type, current->comm);
} }
/*
 * Copy the name (comm) of the calling task into @dest.
 *
 * @dest must point to a buffer of at least TASK_COMM_LEN bytes; the last
 * byte of that window is always overwritten with a NUL terminator.
 */
void nvgpu_get_thread_name(char *dest)
{
	char comm[TASK_COMM_LEN];

	/* Snapshot the current task's comm into a correctly sized local array. */
	get_task_comm(comm, current);

	/*
	 * strncpy() does not terminate the destination when the source fills
	 * the whole window, so the final byte is forced to NUL afterwards.
	 */
	strncpy(dest, comm, sizeof(comm));
	dest[sizeof(comm) - 1] = '\0';
}

View File

@@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2018-2022, NVIDIA CORPORATION. All rights reserved. * Copyright (c) 2018-2023, NVIDIA CORPORATION. All rights reserved.
* *
* Permission is hereby granted, free of charge, to any person obtaining a * Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"), * copy of this software and associated documentation files (the "Software"),
@@ -53,6 +53,11 @@ int nvgpu_current_tid(struct gk20a *g)
return (int)pthread_self(); return (int)pthread_self();
} }
/*
 * POSIX stand-in for nvgpu_get_thread_name().
 *
 * No thread-name lookup is implemented for this OS layer yet, so an empty
 * string is reported. Writing the terminator (instead of leaving the buffer
 * untouched) guarantees that callers which later print the name with "%s"
 * always see a valid, NUL-terminated string, regardless of how the buffer
 * was initialised.
 *
 * @dest: caller-provided buffer of at least one byte; a NULL pointer is
 *        tolerated and ignored.
 */
void nvgpu_get_thread_name(char *dest)
{
	if (dest != NULL) {
		dest[0] = '\0';
	}
}
void nvgpu_print_current_impl(struct gk20a *g, const char *func_name, int line, void nvgpu_print_current_impl(struct gk20a *g, const char *func_name, int line,
void *ctx, enum nvgpu_log_type type) void *ctx, enum nvgpu_log_type type)
{ {

View File

@@ -1,4 +1,4 @@
# Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. # Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
bitmap_find_next_zero_area bitmap_find_next_zero_area
fb_gv11b_write_mmu_fault_buffer_get fb_gv11b_write_mmu_fault_buffer_get
@@ -342,6 +342,7 @@ nvgpu_cond_timedwait
nvgpu_cond_unlock nvgpu_cond_unlock
nvgpu_current_pid nvgpu_current_pid
nvgpu_current_tid nvgpu_current_tid
nvgpu_get_thread_name
nvgpu_current_time_ms nvgpu_current_time_ms
nvgpu_current_time_ns nvgpu_current_time_ns
nvgpu_current_time_us nvgpu_current_time_us