gpu: nvgpu: vgpu: ecc sysfs support for vgpu

- fetch ecc info from RM server and create sysfs nodes - new file ecc_vgpu.c for platform-independent code - add 2 new commands: GET_ECC_INFO and GET_ECC_COUNTER_VALUE JIRA EVLR-2590 Change-Id: I040a9fcd23326e432ca93e9a028319f9c1c570f0 Signed-off-by: Kyle Guo <kyleg@nvidia.com> Reviewed-on: https://git-master.nvidia.com/r/1777428 Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com> Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
2025-12-22 17:36:20 +03:00 · 2018-07-12 17:51:42 -07:00
parent 91390d857f
commit 2a25d03f2b
6 changed files with 247 additions and 0 deletions
--- a/drivers/gpu/nvgpu/Makefile
+++ b/drivers/gpu/nvgpu/Makefile
@@ -245,6 +245,7 @@ nvgpu-$(CONFIG_TEGRA_GR_VIRTUALIZATION) += \
 	vgpu/dbg_vgpu.o \
 	vgpu/tsg_vgpu.o \
 	vgpu/css_vgpu.o \
 	vgpu/ecc_vgpu.o \
 	vgpu/gm20b/vgpu_gr_gm20b.o \
 	vgpu/gp10b/vgpu_hal_gp10b.o  \
 	vgpu/gp10b/vgpu_gr_gp10b.o  \
--- a/drivers/gpu/nvgpu/include/nvgpu/vgpu/tegra_vgpu.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/vgpu/tegra_vgpu.h
@@ -26,6 +26,7 @@
 #define __TEGRA_VGPU_H
 #include <nvgpu/types.h>
 #include <nvgpu/ecc.h>	/* For NVGPU_ECC_STAT_NAME_MAX_SIZE */
 enum {
 	TEGRA_VGPU_MODULE_GPU = 0,
@@ -120,6 +121,8 @@ enum {
 	TEGRA_VGPU_CMD_UPDATE_PC_SAMPLING = 81,
 	TEGRA_VGPU_CMD_SUSPEND = 82,
 	TEGRA_VGPU_CMD_RESUME = 83,
 	TEGRA_VGPU_CMD_GET_ECC_INFO = 84,
 	TEGRA_VGPU_CMD_GET_ECC_COUNTER_VALUE = 85,
 };
 struct tegra_vgpu_connect_params {
@@ -378,6 +381,20 @@ struct tegra_vgpu_channel_free_hwpm_ctx {
 	u64 handle;
 };
 /*
  * Parameters of TEGRA_VGPU_CMD_GET_ECC_INFO.
  *
  * The guest sends the command with this structure zeroed and reads
  * ecc_stats_count from the reply; the per-stat entries themselves are
  * fetched separately as an array of struct tegra_vgpu_ecc_info_entry.
  */
 struct tegra_vgpu_ecc_info_params {
 	/* Number of tegra_vgpu_ecc_info_entry elements reported. */
 	u32 ecc_stats_count;
 };
 /*
  * Description of a single ECC stat, as read by the guest after a
  * TEGRA_VGPU_CMD_GET_ECC_INFO exchange.
  */
 struct tegra_vgpu_ecc_info_entry {
 	/* Identifier used to query the counter value for this stat. */
 	u32 ecc_id;
 	/*
 	 * Stat name. The guest copies at most NVGPU_ECC_STAT_NAME_MAX_SIZE
 	 * bytes from here into a larger buffer.
 	 * NOTE(review): presumably the name may fill the whole array without
 	 * a NUL terminator - confirm against the server implementation.
 	 */
 	char name[NVGPU_ECC_STAT_NAME_MAX_SIZE];
 };
 /* Parameters of TEGRA_VGPU_CMD_GET_ECC_COUNTER_VALUE. */
 struct tegra_vgpu_ecc_counter_params {
 	/* Filled in by the guest: which ECC stat to query. */
 	u32 ecc_id;
 	/* Read by the guest from the reply: the counter value. */
 	u32 value;
 };
 struct tegra_vgpu_gr_ctx_params {
 	u64 gr_ctx_handle;
 	u64 as_handle;
@@ -659,6 +676,8 @@ struct tegra_vgpu_cmd_msg {
 		struct tegra_vgpu_map_syncpt_params map_syncpt;
 		struct tegra_vgpu_tsg_bind_channel_ex_params tsg_bind_channel_ex;
 		struct tegra_vgpu_channel_update_pc_sampling update_pc_sampling;
 		struct tegra_vgpu_ecc_info_params ecc_info;
 		struct tegra_vgpu_ecc_counter_params ecc_counter;
 		char padding[192];
 	} params;
 };
--- a/drivers/gpu/nvgpu/include/nvgpu/vgpu/vgpu.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/vgpu/vgpu.h
@@ -37,11 +37,14 @@ struct gk20a;
 struct vm_gk20a;
 struct nvgpu_gr_ctx;
 struct nvgpu_cpu_time_correlation_sample;
 struct vgpu_ecc_stat;
 /* Per-GPU private state of the vgpu guest driver. */
 struct vgpu_priv_data {
 	u64 virt_handle;
 	struct nvgpu_thread intr_handler;
 	struct tegra_vgpu_constants_params constants;
 	/*
 	 * Table of ECC stats allocated by vgpu_ecc_get_info() and freed by
 	 * vgpu_ecc_remove_info(); NULL when no table is held.
 	 */
 	struct vgpu_ecc_stat *ecc_stats;
 	/* Number of entries in ecc_stats; reset to 0 on removal. */
 	int ecc_stats_count;
 };
 struct vgpu_priv_data *vgpu_get_priv_data(struct gk20a *g);
--- a/drivers/gpu/nvgpu/os/linux/vgpu/sysfs_vgpu.c
+++ b/drivers/gpu/nvgpu/os/linux/vgpu/sysfs_vgpu.c
@@ -18,6 +18,8 @@
 #include <nvgpu/vgpu/vgpu.h>
 #include "os/linux/platform_gk20a.h"
 #include "os/linux/os_linux.h"
 #include "vgpu/ecc_vgpu.h"
 static ssize_t vgpu_load_show(struct device *dev,
 			      struct device_attribute *attr,
@@ -38,13 +40,104 @@ static ssize_t vgpu_load_show(struct device *dev,
 }
 static DEVICE_ATTR(load, S_IRUGO, vgpu_load_show, NULL);
 /*
  * sysfs "show" callback shared by all vgpu ECC counter nodes.
  *
  * The vgpu_ecc_stat behind the node is recovered from the enclosing
  * dev_ext_attribute. Its ecc_id is sent to the server with
  * TEGRA_VGPU_CMD_GET_ECC_COUNTER_VALUE and the returned value is printed
  * into buf as an unsigned decimal followed by a newline.
  *
  * Returns the snprintf() result on success, or the non-zero error taken
  * from vgpu_comm_sendrecv() or, failing that, from msg.ret.
  */
 static ssize_t vgpu_ecc_stat_show(struct device *dev,
 			      struct device_attribute *attr,
 			      char *buf)
 {
 	struct gk20a *g = get_gk20a(dev);
 	struct dev_ext_attribute *node =
 		container_of(attr, struct dev_ext_attribute, attr);
 	struct vgpu_ecc_stat *stat = node->var;
 	struct tegra_vgpu_cmd_msg msg = {0};
 	int ret;
 	/* Build the request: which counter, which command, which vgpu. */
 	msg.params.ecc_counter.ecc_id = stat->ecc_id;
 	msg.cmd = TEGRA_VGPU_CMD_GET_ECC_COUNTER_VALUE;
 	msg.handle = vgpu_get_handle(g);
 	/* A transport error takes precedence over the server's status. */
 	ret = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg));
 	if (!ret)
 		ret = msg.ret;
 	if (unlikely(ret)) {
 		nvgpu_err(g, "ecc: cannot get ECC counter value: %d", ret);
 		return ret;
 	}
 	return snprintf(buf, PAGE_SIZE, "%u\n", msg.params.ecc_counter.value);
 }
 /*
  * Create one read-only sysfs node per ECC stat reported by the server.
  *
  * The stat table is fetched with vgpu_ecc_get_info(). An array of
  * dev_ext_attribute, one per stat, is then allocated; each element points
  * at its vgpu_ecc_stat so that vgpu_ecc_stat_show() can find the ecc_id.
  * The array is stored in l->ecc_attrs for vgpu_remove_ecc_sysfs().
  *
  * A failure to create an individual file is only logged: the remaining
  * nodes are still created and 0 is returned.
  *
  * Returns 0 on success, the error from vgpu_ecc_get_info(), or -ENOMEM if
  * the attribute array cannot be allocated (the stat table is released
  * again in that case).
  */
 static int vgpu_create_ecc_sysfs(struct device *dev)
 {
 	struct gk20a *g = get_gk20a(dev);
 	struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
 	struct vgpu_priv_data *priv = vgpu_get_priv_data(g);
 	struct vgpu_ecc_stat *stats;
 	struct dev_ext_attribute *attrs;
 	int err, i, count;
 	err = vgpu_ecc_get_info(g);
 	if (unlikely(err)) {
 		nvgpu_err(g, "ecc: cannot get ECC info: %d", err);
 		return err;
 	}
 	stats = priv->ecc_stats;
 	count = priv->ecc_stats_count;
 	attrs = nvgpu_kzalloc(g, count * sizeof(*attrs));
 	if (unlikely(!attrs)) {
 		nvgpu_err(g, "ecc: no memory");
 		vgpu_ecc_remove_info(g);
 		return -ENOMEM;
 	}
 	for (i = 0; i < count; i++) {
 		/*
 		 * sysfs_attr_init() must be handed the innermost
 		 * struct attribute (dev_ext_attribute -> device_attribute ->
 		 * attribute): with lock debugging enabled the macro writes
 		 * the attribute's lockdep key, which struct device_attribute
 		 * itself does not have.
 		 */
 		sysfs_attr_init(&attrs[i].attr.attr);
 		attrs[i].attr.attr.name = stats[i].name;
 		attrs[i].attr.attr.mode = VERIFY_OCTAL_PERMISSIONS(S_IRUGO);
 		attrs[i].attr.show = vgpu_ecc_stat_show;
 		attrs[i].attr.store = NULL;
 		/* Lets the show callback map the node back to its stat. */
 		attrs[i].var = &stats[i];
 		err = device_create_file(dev, &attrs[i].attr);
 		if (unlikely(err)) {
 			/* Best effort: keep going with the other nodes. */
 			nvgpu_warn(g, "ecc: cannot create file \"%s\": %d",
 				   stats[i].name, err);
 		}
 	}
 	l->ecc_attrs = attrs;
 	return 0;
 }
 /*
  * Tear down everything vgpu_create_ecc_sysfs() set up.
  *
  * If an attribute array is recorded in l->ecc_attrs, every node in it is
  * removed from sysfs, the array is freed and the pointer cleared. The
  * cached ECC stat table is released in all cases.
  */
 static void vgpu_remove_ecc_sysfs(struct device *dev)
 {
 	struct gk20a *g = get_gk20a(dev);
 	struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
 	struct vgpu_priv_data *priv = vgpu_get_priv_data(g);
 	int idx;
 	if (l->ecc_attrs != NULL) {
 		/* Nodes must go before the stat table they point into. */
 		for (idx = 0; idx < priv->ecc_stats_count; idx++) {
 			device_remove_file(dev, &l->ecc_attrs[idx].attr);
 		}
 		nvgpu_kfree(g, l->ecc_attrs);
 		l->ecc_attrs = NULL;
 	}
 	vgpu_ecc_remove_info(g);
 }
 /*
  * Create the vgpu sysfs nodes for dev: the "load" attribute followed by
  * the ECC counter nodes.
  *
  * Nothing is reported to the caller. A failure to create "load" is
  * logged here, and the return value of vgpu_create_ecc_sysfs() is not
  * examined.
  */
 void vgpu_create_sysfs(struct device *dev)
 {
 	int ret = device_create_file(dev, &dev_attr_load);
 	if (ret != 0) {
 		dev_err(dev, "Failed to create vgpu sysfs attributes!\n");
 	}
 	vgpu_create_ecc_sysfs(dev);
 }
 /*
  * Remove the vgpu sysfs nodes of dev: first the "load" attribute, then
  * the ECC counter nodes together with their backing data.
  */
 void vgpu_remove_sysfs(struct device *dev)
 {
 	/* Counterpart of the device_create_file() in vgpu_create_sysfs(). */
 	device_remove_file(dev, &dev_attr_load);
 	/* Counterpart of vgpu_create_ecc_sysfs(). */
 	vgpu_remove_ecc_sysfs(dev);
 }
--- a/drivers/gpu/nvgpu/vgpu/ecc_vgpu.c
+++ b/drivers/gpu/nvgpu/vgpu/ecc_vgpu.c
@@ -0,0 +1,92 @@
 /*
 * Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */
 #include <nvgpu/kmem.h>
 #include <nvgpu/vgpu/vgpu_ivc.h>
 #include <nvgpu/vgpu/vgpu.h>
 #include <nvgpu/errno.h>
 #include "vgpu/ecc_vgpu.h"
 /*
  * Fetch the list of ECC stats from the server and cache it in the vgpu
  * private data.
  *
  * TEGRA_VGPU_CMD_GET_ECC_INFO is sent first; the reply carries the number
  * of stats. The entries themselves are read through the pointer returned
  * by vgpu_ivc_oob_get_ptr() and copied into a newly allocated table of
  * struct vgpu_ecc_stat, which is stored in priv->ecc_stats together with
  * its length in priv->ecc_stats_count.
  *
  * Returns 0 on success, or:
  *   - the error from vgpu_comm_sendrecv() or msg.ret if the command fails,
  *   - -EINVAL if the OOB pointer cannot be obtained,
  *   - -E2BIG if the reported count does not fit in the OOB area (or in
  *     the int used to store it),
  *   - -ENOMEM if the table cannot be allocated.
  * On failure priv is left untouched.
  */
 int vgpu_ecc_get_info(struct gk20a *g)
 {
 	struct vgpu_priv_data *priv = vgpu_get_priv_data(g);
 	struct tegra_vgpu_cmd_msg msg = {0};
 	struct tegra_vgpu_ecc_info_params *p = &msg.params.ecc_info;
 	struct tegra_vgpu_ecc_info_entry *entry;
 	struct vgpu_ecc_stat *stats;
 	void *handle;
 	int err, i, count;
 	size_t oob_size;
 	msg.cmd = TEGRA_VGPU_CMD_GET_ECC_INFO;
 	msg.handle = vgpu_get_handle(g);
 	err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg));
 	err = err ? err : msg.ret;
 	if (unlikely(err)) {
 		nvgpu_err(g, "vgpu get_ecc_info failed, err=%d", err);
 		return err;
 	}
 	count = p->ecc_stats_count;
 	handle = vgpu_ivc_oob_get_ptr(vgpu_ivc_get_server_vmid(),
 					TEGRA_VGPU_QUEUE_CMD,
 					(void **)&entry, &oob_size);
 	if (unlikely(!handle))
 		return -EINVAL;
 	/*
 	 * The count comes from the other side of the IVC channel, so do not
 	 * trust it: reject values that do not fit in the signed count, and
 	 * compare by division so that "count * sizeof(*entry)" can never
 	 * wrap around and slip past the bound check.
 	 */
 	if (unlikely(count < 0 ||
 		     p->ecc_stats_count > oob_size / sizeof(*entry))) {
 		err = -E2BIG;
 		goto out;
 	}
 	stats = nvgpu_kzalloc(g, count * sizeof(*stats));
 	if (unlikely(!stats)) {
 		err = -ENOMEM;
 		goto out;
 	}
 	for (i = 0; i < count; i++) {
 		stats[i].ecc_id = entry[i].ecc_id;
 		strncpy(stats[i].name, entry[i].name,
 			NVGPU_ECC_STAT_NAME_MAX_SIZE);
 		/*
 		 * strncpy() does not terminate when the source fills the
 		 * whole bound; the destination has one extra byte for this.
 		 */
 		stats[i].name[NVGPU_ECC_STAT_NAME_MAX_SIZE] = '\0';
 	}
 	priv->ecc_stats = stats;
 	priv->ecc_stats_count = count;
 out:
 	/* err is still 0 here on the success path. */
 	vgpu_ivc_oob_put_ptr(handle);
 	return err;
 }
 /*
  * Drop the ECC stat table cached in the vgpu private data.
  *
  * The count is always reset to 0; the table is freed and its pointer
  * cleared only if one is currently held, so the call is harmless when
  * nothing was allocated.
  */
 void vgpu_ecc_remove_info(struct gk20a *g)
 {
 	struct vgpu_priv_data *priv = vgpu_get_priv_data(g);
 	struct vgpu_ecc_stat *table = priv->ecc_stats;
 	priv->ecc_stats_count = 0;
 	if (table == NULL)
 		return;
 	nvgpu_kfree(g, table);
 	priv->ecc_stats = NULL;
 }
--- a/drivers/gpu/nvgpu/vgpu/ecc_vgpu.h
+++ b/drivers/gpu/nvgpu/vgpu/ecc_vgpu.h
@@ -0,0 +1,39 @@
 /*
 * Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */
 #ifndef _ECC_VGPU_H_
 #define _ECC_VGPU_H_
 #include <nvgpu/types.h>
 #include <nvgpu/ecc.h>	/* For NVGPU_ECC_STAT_NAME_MAX_SIZE */
 struct gk20a;
 /* Guest-side copy of one ECC stat description obtained from the server. */
 struct vgpu_ecc_stat {
 	/* Identifier sent to the server when the counter value is queried. */
 	u32 ecc_id;
 	/*
 	 * Stat name, also used as the sysfs file name. One byte larger than
 	 * the name field of the wire entry, so a name of the full
 	 * NVGPU_ECC_STAT_NAME_MAX_SIZE bytes still ends in a NUL.
 	 */
 	char name[NVGPU_ECC_STAT_NAME_MAX_SIZE + 1];
 };
 /*
  * Query the server for the list of ECC stats and store it in the vgpu
  * private data (ecc_stats / ecc_stats_count).
  * Returns 0 on success or a non-zero error code (command failure,
  * -EINVAL, -E2BIG or -ENOMEM).
  */
 int vgpu_ecc_get_info(struct gk20a *g);
 /*
  * Free the stored ECC stat list, if any, and reset the count to 0.
  */
 void vgpu_ecc_remove_info(struct gk20a *g);
 #endif