gpu: nvgpu: Capture thread name for every channel created

This change ensures that, when the GPU enters a bad state because of work submitted by a misbehaving thread, the name of that thread is captured as part of the first set of failure logs. Changes for the QNX environment are still pending.

JIRA NVGPU-7783
Change-Id: I65d55a6ade749ff91739458e0642ed2dafaae5cc
Signed-off-by: Kishan <kpalankar@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2879197
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
Reviewed-by: svc-mobile-misra <svc-mobile-misra@nvidia.com>
Reviewed-by: svc-mobile-cert <svc-mobile-cert@nvidia.com>
Reviewed-by: Tejal Kudav <tkudav@nvidia.com>
Reviewed-by: Ankur Kishore <ankkishore@nvidia.com>
GVS: Gerrit_Virtual_Submit <buildbot_gerritrpt@nvidia.com>
2025-12-22 17:36:20 +03:00 · 2023-03-29 11:07:51 +00:00
parent af48120169
commit c6d5fb348c
8 changed files with 52 additions and 11 deletions
--- a/drivers/gpu/nvgpu/common/fifo/channel.c
+++ b/drivers/gpu/nvgpu/common/fifo/channel.c
@@ -1260,6 +1260,7 @@ struct nvgpu_channel *nvgpu_channel_open_new(struct gk20a *g,

 	ch->pid = tid;
 	ch->tgid = pid;  /* process granularity for FECS traces */
+	nvgpu_get_thread_name(ch->thread_name);

 #ifdef CONFIG_NVGPU_USERD
 	if (nvgpu_userd_init_channel(g, ch) != 0) {
@@ -2125,11 +2126,12 @@ static void nvgpu_channel_info_debug_dump(struct gk20a *g,
 	 */
 	u32 ver = nvgpu_safe_add_u32(g->params.gpu_arch, g->params.gpu_impl);

-	gk20a_debug_output(o, "%d-%s, TSG: %u, pid %d, refs: %d, deterministic: %s, domain name: %s",
+	gk20a_debug_output(o, "%d-%s, TSG: %u, pid %d, thread name %s, refs: %d, deterministic: %s, domain name: %s",
 			info->chid,
 			g->name,
 			info->tsgid,
 			info->pid,
+			info->thread_name,
 			info->refs,
 			info->deterministic ? "yes" : "no",
 			info->nvs_domain_name);
@@ -2229,6 +2231,7 @@ void nvgpu_channel_debug_dump_all(struct gk20a *g,
 		info->chid = ch->chid;
 		info->tsgid = ch->tsgid;
 		info->pid = ch->pid;
+		(void)memcpy(info->thread_name, ch->thread_name, sizeof(info->thread_name));
 		info->refs = nvgpu_atomic_read(&ch->ref_count);
 #ifdef CONFIG_NVGPU_KERNEL_MODE_SUBMIT
 		info->deterministic = nvgpu_channel_is_deterministic(ch);
--- a/drivers/gpu/nvgpu/include/nvgpu/channel.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/channel.h
@@ -51,6 +51,10 @@ struct nvgpu_channel_wdt;
 struct nvgpu_user_fence;
 struct nvgpu_runlist;

+/**
+ * Size, in bytes, of the buffers that hold a thread name (the thread_name
+ * fields of the channel structures). Should strictly be equal to
+ * TASK_COMM_LEN: the Linux implementation of nvgpu_get_thread_name() fills
+ * such a buffer using TASK_COMM_LEN as its length and places the NUL
+ * terminator in the last byte.
+ */
+#define TASK_NAME_LEN		(16U)
 /**
 * S/W defined invalid channel identifier.
 */
@@ -187,6 +191,11 @@ struct nvgpu_channel_dump_info {
 	u32 tsgid;
 	/** Pid of the process that created this channel. */
 	int pid;
+	/**
+	 * Name of the thread that created the channel.
+	 * Same size as task_struct.comm[] on linux.
+	 */
+	char thread_name[TASK_NAME_LEN];
 	/** Number of references to this channel. */
 	int refs;
 	/** Channel uses deterministic submit (kernel submit only). */
@@ -356,6 +365,11 @@ struct nvgpu_channel {
 	 * Confusingly, at userspace level, this is what is seen as the "pid".
 	 */
 	pid_t tgid;
+	/**
+	 * Name of the thread that created the channel.
+	 * Same size as task_struct.comm[] on linux.
+	 */
+	char thread_name[TASK_NAME_LEN];
 	/** Lock to serialize ioctls for this channel. */
 	struct nvgpu_mutex ioctl_lock;

--- a/drivers/gpu/nvgpu/include/nvgpu/os_sched.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/os_sched.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2021, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2023, NVIDIA CORPORATION.  All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -53,6 +53,13 @@ int nvgpu_current_tid(struct gk20a *g);
 */
 int nvgpu_current_pid(struct gk20a *g);

+/**
+ * @brief  API to get the name of current thread.
+ *
+ * The implementation is OS specific. On Linux the name is copied into
+ * \a dest and is always NUL-terminated.
+ *
+ * @param dest [out]		Pointer to the string buffer. The Linux
+ *				implementation writes TASK_COMM_LEN bytes,
+ *				so the buffer must be at least that large.
+ */
+void nvgpu_get_thread_name(char *dest);
+
 /**
 * @brief Print the name of current thread.
 *
--- a/drivers/gpu/nvgpu/os/linux/channel.h
+++ b/drivers/gpu/nvgpu/os/linux/channel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2023, NVIDIA CORPORATION.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
@@ -18,6 +18,7 @@

 #include <linux/workqueue.h>
 #include <linux/dma-buf.h>
+#include <linux/sched.h>

 #include <nvgpu/types.h>

--- a/drivers/gpu/nvgpu/os/linux/linux-channel.c
+++ b/drivers/gpu/nvgpu/os/linux/linux-channel.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2022, NVIDIA Corporation.  All rights reserved.
+ * Copyright (c) 2017-2023, NVIDIA Corporation.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
@@ -150,12 +150,12 @@ void nvgpu_set_err_notifier_locked(struct nvgpu_channel *ch, u32 error)

 		if (error == NVGPU_CHANNEL_RESETCHANNEL_VERIF_ERROR) {
 			nvgpu_log_info(ch->g,
-			    "error notifier set to %d for ch %d",
-			    error, ch->chid);
+			    "error notifier set to %d for ch %d owned by %s",
+			    error, ch->chid, ch->thread_name);
 		} else {
 			nvgpu_err(ch->g,
-			    "error notifier set to %d for ch %d",
-			    error, ch->chid);
+			    "error notifier set to %d for ch %d owned by %s",
+			    error, ch->chid, ch->thread_name);
 		}
 	}
 }
--- a/drivers/gpu/nvgpu/os/linux/os_sched.c
+++ b/drivers/gpu/nvgpu/os/linux/os_sched.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2017-2023, NVIDIA CORPORATION.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
@@ -30,3 +30,13 @@ void nvgpu_print_current_impl(struct gk20a *g, const char *func_name, int line,
 {
 	nvgpu_log_msg_impl(g, func_name, line, type, current->comm);
 }
+
+/*
+ * Store the name (comm) of the calling task in dest as a NUL-terminated
+ * string. dest must have room for TASK_COMM_LEN bytes.
+ */
+void nvgpu_get_thread_name(char *dest)
+{
+	/* Local staging array, sized exactly like the task's comm field. */
+	char comm[TASK_COMM_LEN];
+
+	get_task_comm(comm, current);
+	strncpy(dest, comm, sizeof(comm));
+	/* strncpy() alone does not guarantee termination, so force it. */
+	dest[sizeof(comm) - 1] = '\0';
+}
--- a/drivers/gpu/nvgpu/os/posix/os_sched.c
+++ b/drivers/gpu/nvgpu/os/posix/os_sched.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.  All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -53,6 +53,11 @@ int nvgpu_current_tid(struct gk20a *g)
 	return (int)pthread_self();
 }

+/*
+ * POSIX build: no thread-name lookup is implemented here, so report an
+ * empty name. Writing the terminator keeps dest a valid C string for the
+ * "%s" consumers of the channel's thread name even when the caller's
+ * buffer was not zero-initialized. A NULL dest is ignored.
+ */
+void nvgpu_get_thread_name(char *dest)
+{
+	if (dest != NULL) {
+		dest[0] = '\0';
+	}
+}
+
 void nvgpu_print_current_impl(struct gk20a *g, const char *func_name, int line,
 		void *ctx, enum nvgpu_log_type type)
 {
--- a/libs/igpu/libnvgpu-drv-igpu_safe.export
+++ b/libs/igpu/libnvgpu-drv-igpu_safe.export
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2022, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2019-2023, NVIDIA CORPORATION.  All rights reserved.

 bitmap_find_next_zero_area
 fb_gv11b_write_mmu_fault_buffer_get
@@ -342,6 +342,7 @@ nvgpu_cond_timedwait
 nvgpu_cond_unlock
 nvgpu_current_pid
 nvgpu_current_tid
+nvgpu_get_thread_name
 nvgpu_current_time_ms
 nvgpu_current_time_ns
 nvgpu_current_time_us