From 8a0a5345705e069e398a79dbcba96c5db54a37f1 Mon Sep 17 00:00:00 2001 From: Kishan Palankar Date: Wed, 30 Oct 2024 05:57:46 +0000 Subject: [PATCH] gpu: nvgpu: Guard profiler_objects list operations with a lock Both the profiler and debugger device nodes access and update the list g->profiler_objects. List operations were previously not guarded by a lock, leading to synchronisation issues. The stress-ng test attempts to trigger repeated random open/close sessions on all the device nodes exposed by the GPU. This results in a kernel panic at random stages of the test. Failure signature - The profiler node receives a release call and, as part of it, nvgpu_profiler_free attempts to delete the prof_obj_entry and free the prof memory. Simultaneously, the debugger node also receives a release call and, as part of gk20a_dbg_gpu_dev_release, nvgpu attempts to access g->profiler_objects to check for any profiling sessions associated with the debugger node. There is a race to access the list, which results in a kernel panic at address 0x8 because nvgpu tries to access prof_obj->session_id, which is at offset 0x8. As part of this change, g->profiler_objects list access/update is guarded with a mutex lock. 
Bug 4858627 Change-Id: I1e2cf8d27d195bbc9c012cf511029de9eaadb038 Signed-off-by: Kishan Palankar Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/3239897 (cherry picked from commit 2eabcdb8a4e815581487b0a92f680366d6f3718b) Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/3262771 GVS: buildbot_gerritrpt Reviewed-by: Amulya Yarlagadda Tested-by: Brad Griffis Reviewed-by: Brad Griffis --- drivers/gpu/nvgpu/common/profiler/profiler.c | 4 ++- drivers/gpu/nvgpu/include/nvgpu/gk20a.h | 4 ++- drivers/gpu/nvgpu/os/linux/driver_common.c | 5 ++- drivers/gpu/nvgpu/os/linux/ioctl_dbg.c | 32 +++++++++++++++++--- drivers/gpu/nvgpu/os/linux/ioctl_prof.c | 6 +++- 5 files changed, 42 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/nvgpu/common/profiler/profiler.c b/drivers/gpu/nvgpu/common/profiler/profiler.c index 6c6953966..6022d0a7b 100644 --- a/drivers/gpu/nvgpu/common/profiler/profiler.c +++ b/drivers/gpu/nvgpu/common/profiler/profiler.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -/* SPDX-FileCopyrightText: Copyright (c) 2020-2023, NVIDIA CORPORATION & AFFILIATES. +/* SPDX-FileCopyrightText: Copyright (c) 2020-2024, NVIDIA CORPORATION & AFFILIATES. * All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a @@ -72,7 +72,9 @@ int nvgpu_profiler_alloc(struct gk20a *g, nvgpu_mutex_init(&prof->ioctl_lock); nvgpu_init_list_node(&prof->prof_obj_entry); + nvgpu_mutex_acquire(&g->prof_obj_lock); nvgpu_list_add(&prof->prof_obj_entry, &g->profiler_objects); + nvgpu_mutex_release(&g->prof_obj_lock); nvgpu_log(g, gpu_dbg_prof, "Allocated profiler handle %u", prof->prof_handle); diff --git a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h index 2fef09af6..33e5f73b4 100644 --- a/drivers/gpu/nvgpu/include/nvgpu/gk20a.h +++ b/drivers/gpu/nvgpu/include/nvgpu/gk20a.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. 
+ * Copyright (c) 2011-2024, NVIDIA CORPORATION. All rights reserved. * * GK20A Graphics * @@ -702,6 +702,8 @@ struct gk20a { #endif /* CONFIG_NVGPU_DEBUGGER */ #ifdef CONFIG_NVGPU_PROFILER + /* Held while accessing/manipulating profiler_objects */ + struct nvgpu_mutex prof_obj_lock; struct nvgpu_list_node profiler_objects; struct nvgpu_pm_resource_reservations *pm_reservations; nvgpu_atomic_t hwpm_refcount; diff --git a/drivers/gpu/nvgpu/os/linux/driver_common.c b/drivers/gpu/nvgpu/os/linux/driver_common.c index b0ae8c868..e5ff525ec 100644 --- a/drivers/gpu/nvgpu/os/linux/driver_common.c +++ b/drivers/gpu/nvgpu/os/linux/driver_common.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2023, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -165,6 +165,9 @@ static void nvgpu_init_vars(struct gk20a *g) #ifdef CONFIG_NVGPU_TSG_SHARING nvgpu_mutex_init(&g->ctrl_dev_id_lock); #endif +#ifdef CONFIG_NVGPU_PROFILER + nvgpu_mutex_init(&g->prof_obj_lock); +#endif /* Init the clock req count to 0 */ nvgpu_atomic_set(&g->clk_arb_global_nr, 0); diff --git a/drivers/gpu/nvgpu/os/linux/ioctl_dbg.c b/drivers/gpu/nvgpu/os/linux/ioctl_dbg.c index 8ff879473..10ccfb3bd 100644 --- a/drivers/gpu/nvgpu/os/linux/ioctl_dbg.c +++ b/drivers/gpu/nvgpu/os/linux/ioctl_dbg.c @@ -1,7 +1,7 @@ /* * Tegra GK20A GPU Debugger/Profiler Driver * - * Copyright (c) 2017-2023, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2024, NVIDIA CORPORATION. All rights reserved. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -223,6 +223,7 @@ int gk20a_dbg_gpu_dev_release(struct inode *inode, struct file *filp) /* Per-context profiler objects were released when we called * dbg_unbind_all_channels. 
We could still have global ones. */ + nvgpu_mutex_acquire(&g->prof_obj_lock); nvgpu_list_for_each_entry_safe(prof_obj, tmp_obj, &g->profiler_objects, nvgpu_profiler_object, prof_obj_entry) { if (prof_obj->session_id == dbg_s->id) { @@ -231,6 +232,7 @@ int gk20a_dbg_gpu_dev_release(struct inode *inode, struct file *filp) nvgpu_profiler_free(prof_obj); } } + nvgpu_mutex_release(&g->prof_obj_lock); dbg_s->gpu_instance_id = 0U; nvgpu_mutex_release(&g->dbg_sessions_lock); @@ -463,6 +465,7 @@ static int dbg_unbind_single_channel_gk20a(struct dbg_session_gk20a *dbg_s, /* If there's a profiler ctx reservation record associated with this * session/channel pair, release it. */ + nvgpu_mutex_acquire(&g->prof_obj_lock); nvgpu_list_for_each_entry_safe(prof_obj, tmp_obj, &g->profiler_objects, nvgpu_profiler_object, prof_obj_entry) { if ((prof_obj->session_id == dbg_s->id) && @@ -472,6 +475,7 @@ static int dbg_unbind_single_channel_gk20a(struct dbg_session_gk20a *dbg_s, nvgpu_profiler_free(prof_obj); } } + nvgpu_mutex_release(&g->prof_obj_lock); nvgpu_list_del(&ch_data->ch_entry); @@ -1129,6 +1133,7 @@ static int nvgpu_dbg_gpu_ioctl_hwpm_ctxsw_mode(struct dbg_session_gk20a *dbg_s, * return an error, at the point where all client sw has been * cleaned up. 
*/ + nvgpu_mutex_acquire(&g->prof_obj_lock); nvgpu_list_for_each_entry_safe(prof_obj, tmp_obj, &g->profiler_objects, nvgpu_profiler_object, prof_obj_entry) { if (prof_obj->session_id == dbg_s->id) { @@ -1137,6 +1142,7 @@ static int nvgpu_dbg_gpu_ioctl_hwpm_ctxsw_mode(struct dbg_session_gk20a *dbg_s, } } } + nvgpu_mutex_release(&g->prof_obj_lock); if (!reserved) { nvgpu_err(g, "session doesn't have a valid reservation"); @@ -1371,7 +1377,9 @@ static int nvgpu_ioctl_allocate_profiler_object( if (ch != NULL) { tsg = nvgpu_tsg_from_ch(ch); if (tsg == NULL) { + nvgpu_mutex_acquire(&g->prof_obj_lock); nvgpu_profiler_free(prof_obj); + nvgpu_mutex_release(&g->prof_obj_lock); goto clean_up; } @@ -1404,6 +1412,7 @@ static int nvgpu_ioctl_free_profiler_object( nvgpu_mutex_acquire(&g->dbg_sessions_lock); /* Remove profiler object from the list, if a match is found */ + nvgpu_mutex_acquire(&g->prof_obj_lock); nvgpu_list_for_each_entry_safe(prof_obj, tmp_obj, &g->profiler_objects, nvgpu_profiler_object, prof_obj_entry) { if (prof_obj->prof_handle == args->profiler_handle) { @@ -1421,6 +1430,7 @@ static int nvgpu_ioctl_free_profiler_object( break; } } + nvgpu_mutex_release(&g->prof_obj_lock); if (!obj_found) { nvgpu_err(g, "profiler %x not found", args->profiler_handle); @@ -1436,8 +1446,9 @@ static struct nvgpu_profiler_object *find_matching_prof_obj( u32 profiler_handle) { struct gk20a *g = dbg_s->g; - struct nvgpu_profiler_object *prof_obj; + struct nvgpu_profiler_object *prof_obj, *ret_obj; + nvgpu_mutex_acquire(&g->prof_obj_lock); nvgpu_list_for_each_entry(prof_obj, &g->profiler_objects, nvgpu_profiler_object, prof_obj_entry) { if (prof_obj->prof_handle == profiler_handle) { @@ -1445,12 +1456,18 @@ static struct nvgpu_profiler_object *find_matching_prof_obj( nvgpu_err(g, "invalid handle %x", profiler_handle); - return NULL; + ret_obj = NULL; + goto ret; } - return prof_obj; + ret_obj = prof_obj; + goto ret; } } - return NULL; + ret_obj = NULL; + +ret: + 
nvgpu_mutex_release(&g->prof_obj_lock); + return ret_obj; } /* used in scenarios where the debugger session can take just the inter-session @@ -1577,7 +1594,9 @@ static int nvgpu_perfbuf_reserve_pma(struct dbg_session_gk20a *dbg_s) NVGPU_PROFILER_PM_RESOURCE_TYPE_PMA_STREAM); if (err != 0) { nvgpu_err(g, "Failed to reserve PMA stream"); + nvgpu_mutex_acquire(&g->prof_obj_lock); nvgpu_profiler_free(dbg_s->prof); + nvgpu_mutex_release(&g->prof_obj_lock); return err; } @@ -2131,6 +2150,8 @@ static int nvgpu_profiler_reserve_acquire(struct dbg_session_gk20a *dbg_s, /* Find matching object. */ prof_obj = find_matching_prof_obj(dbg_s, profiler_handle); + nvgpu_mutex_acquire(&g->prof_obj_lock); + if (!prof_obj) { nvgpu_err(g, "object not found"); err = -EINVAL; @@ -2158,6 +2179,7 @@ static int nvgpu_profiler_reserve_acquire(struct dbg_session_gk20a *dbg_s, NVGPU_PROFILER_PM_RESOURCE_TYPE_HWPM_LEGACY); exit: + nvgpu_mutex_release(&g->prof_obj_lock); nvgpu_mutex_release(&g->dbg_sessions_lock); return err; } diff --git a/drivers/gpu/nvgpu/os/linux/ioctl_prof.c b/drivers/gpu/nvgpu/os/linux/ioctl_prof.c index 20e70008c..7c37118bf 100644 --- a/drivers/gpu/nvgpu/os/linux/ioctl_prof.c +++ b/drivers/gpu/nvgpu/os/linux/ioctl_prof.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. All rights reserved. 
* * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -130,7 +130,9 @@ static int nvgpu_prof_fops_open(struct gk20a *g, struct file *filp, free_umd_buf: nvgpu_kfree(g, prof_priv->regops_umd_copy_buf); free_prof: + nvgpu_mutex_acquire(&g->prof_obj_lock); nvgpu_profiler_free(prof); + nvgpu_mutex_release(&g->prof_obj_lock); free_priv: nvgpu_kfree(g, prof_priv); return err; @@ -211,7 +213,9 @@ int nvgpu_prof_fops_release(struct inode *inode, struct file *filp) nvgpu_prof_free_pma_stream_priv_data(prof_priv); + nvgpu_mutex_acquire(&g->prof_obj_lock); nvgpu_profiler_free(prof); + nvgpu_mutex_release(&g->prof_obj_lock); nvgpu_kfree(g, prof_priv->regops_umd_copy_buf); nvgpu_kfree(g, prof_priv->regops_staging_buf);