gpu: nvgpu: vgpu: ecc sysfs support for vgpu

- fetch ECC info from the RM server and create sysfs nodes
- add new file ecc_vgpu.c for platform-independent code
- add 2 new commands: GET_ECC_INFO and GET_ECC_COUNTER_VALUE

JIRA EVLR-2590

Change-Id: I040a9fcd23326e432ca93e9a028319f9c1c570f0
Signed-off-by: Kyle Guo <kyleg@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1777428
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
Kyle Guo
2018-07-12 17:51:42 -07:00
committed by mobile promotions
parent 91390d857f
commit 2a25d03f2b
6 changed files with 247 additions and 0 deletions

View File

@@ -245,6 +245,7 @@ nvgpu-$(CONFIG_TEGRA_GR_VIRTUALIZATION) += \
vgpu/dbg_vgpu.o \ vgpu/dbg_vgpu.o \
vgpu/tsg_vgpu.o \ vgpu/tsg_vgpu.o \
vgpu/css_vgpu.o \ vgpu/css_vgpu.o \
vgpu/ecc_vgpu.o \
vgpu/gm20b/vgpu_gr_gm20b.o \ vgpu/gm20b/vgpu_gr_gm20b.o \
vgpu/gp10b/vgpu_hal_gp10b.o \ vgpu/gp10b/vgpu_hal_gp10b.o \
vgpu/gp10b/vgpu_gr_gp10b.o \ vgpu/gp10b/vgpu_gr_gp10b.o \

View File

@@ -26,6 +26,7 @@
#define __TEGRA_VGPU_H #define __TEGRA_VGPU_H
#include <nvgpu/types.h> #include <nvgpu/types.h>
#include <nvgpu/ecc.h> /* For NVGPU_ECC_STAT_NAME_MAX_SIZE */
enum { enum {
TEGRA_VGPU_MODULE_GPU = 0, TEGRA_VGPU_MODULE_GPU = 0,
@@ -120,6 +121,8 @@ enum {
TEGRA_VGPU_CMD_UPDATE_PC_SAMPLING = 81, TEGRA_VGPU_CMD_UPDATE_PC_SAMPLING = 81,
TEGRA_VGPU_CMD_SUSPEND = 82, TEGRA_VGPU_CMD_SUSPEND = 82,
TEGRA_VGPU_CMD_RESUME = 83, TEGRA_VGPU_CMD_RESUME = 83,
TEGRA_VGPU_CMD_GET_ECC_INFO = 84,
TEGRA_VGPU_CMD_GET_ECC_COUNTER_VALUE = 85,
}; };
struct tegra_vgpu_connect_params { struct tegra_vgpu_connect_params {
@@ -378,6 +381,20 @@ struct tegra_vgpu_channel_free_hwpm_ctx {
u64 handle; u64 handle;
}; };
/*
 * Parameters of TEGRA_VGPU_CMD_GET_ECC_INFO, carried in
 * tegra_vgpu_cmd_msg.params.ecc_info.
 */
struct tegra_vgpu_ecc_info_params {
/*
 * Number of ECC stats reported by the server. The guest reads this many
 * struct tegra_vgpu_ecc_info_entry records from the command queue's
 * out-of-band buffer after the command completes.
 */
u32 ecc_stats_count;
};
/*
 * One ECC stat descriptor as laid out in the out-of-band buffer that
 * accompanies TEGRA_VGPU_CMD_GET_ECC_INFO.
 */
struct tegra_vgpu_ecc_info_entry {
/* Identifier the guest passes back when requesting the counter value. */
u32 ecc_id;
/*
 * Stat name; the guest uses it as the sysfs attribute name.
 * NOTE(review): the guest copies at most NVGPU_ECC_STAT_NAME_MAX_SIZE bytes
 * and supplies its own terminator, so this field is presumably not
 * guaranteed to be NUL-terminated - confirm against the server.
 */
char name[NVGPU_ECC_STAT_NAME_MAX_SIZE];
};
/*
 * Parameters of TEGRA_VGPU_CMD_GET_ECC_COUNTER_VALUE, carried in
 * tegra_vgpu_cmd_msg.params.ecc_counter.
 */
struct tegra_vgpu_ecc_counter_params {
/* In: identifier of the ECC stat to query, filled in by the guest. */
u32 ecc_id;
/* Out: counter value read by the guest from the reply. */
u32 value;
};
struct tegra_vgpu_gr_ctx_params { struct tegra_vgpu_gr_ctx_params {
u64 gr_ctx_handle; u64 gr_ctx_handle;
u64 as_handle; u64 as_handle;
@@ -659,6 +676,8 @@ struct tegra_vgpu_cmd_msg {
struct tegra_vgpu_map_syncpt_params map_syncpt; struct tegra_vgpu_map_syncpt_params map_syncpt;
struct tegra_vgpu_tsg_bind_channel_ex_params tsg_bind_channel_ex; struct tegra_vgpu_tsg_bind_channel_ex_params tsg_bind_channel_ex;
struct tegra_vgpu_channel_update_pc_sampling update_pc_sampling; struct tegra_vgpu_channel_update_pc_sampling update_pc_sampling;
struct tegra_vgpu_ecc_info_params ecc_info;
struct tegra_vgpu_ecc_counter_params ecc_counter;
char padding[192]; char padding[192];
} params; } params;
}; };

View File

@@ -37,11 +37,14 @@ struct gk20a;
struct vm_gk20a; struct vm_gk20a;
struct nvgpu_gr_ctx; struct nvgpu_gr_ctx;
struct nvgpu_cpu_time_correlation_sample; struct nvgpu_cpu_time_correlation_sample;
struct vgpu_ecc_stat;
/* Per-GPU private state kept by the vgpu (virtualized guest) driver. */
struct vgpu_priv_data {
	u64 virt_handle;
	struct nvgpu_thread intr_handler;
	struct tegra_vgpu_constants_params constants;
	/*
	 * ECC stat descriptors fetched from the server by vgpu_ecc_get_info()
	 * and released by vgpu_ecc_remove_info(); NULL when not populated.
	 */
	struct vgpu_ecc_stat *ecc_stats;
	/* Number of entries in ecc_stats. */
	int ecc_stats_count;
};
struct vgpu_priv_data *vgpu_get_priv_data(struct gk20a *g); struct vgpu_priv_data *vgpu_get_priv_data(struct gk20a *g);

View File

@@ -18,6 +18,8 @@
#include <nvgpu/vgpu/vgpu.h> #include <nvgpu/vgpu/vgpu.h>
#include "os/linux/platform_gk20a.h" #include "os/linux/platform_gk20a.h"
#include "os/linux/os_linux.h"
#include "vgpu/ecc_vgpu.h"
static ssize_t vgpu_load_show(struct device *dev, static ssize_t vgpu_load_show(struct device *dev,
struct device_attribute *attr, struct device_attribute *attr,
@@ -38,13 +40,104 @@ static ssize_t vgpu_load_show(struct device *dev,
} }
static DEVICE_ATTR(load, S_IRUGO, vgpu_load_show, NULL); static DEVICE_ATTR(load, S_IRUGO, vgpu_load_show, NULL);
/*
 * sysfs "show" callback for a single ECC stat node.
 *
 * The device attribute is embedded in a struct dev_ext_attribute whose
 * ->var points at the struct vgpu_ecc_stat describing this node. The stat's
 * ecc_id is sent to the server with TEGRA_VGPU_CMD_GET_ECC_COUNTER_VALUE
 * and the returned value is printed as an unsigned decimal followed by a
 * newline.
 *
 * Returns the number of characters written to @buf, or the error reported
 * by the transport or by the server.
 */
static ssize_t vgpu_ecc_stat_show(struct device *dev,
				  struct device_attribute *attr,
				  char *buf)
{
	struct dev_ext_attribute *ext = container_of(attr,
					struct dev_ext_attribute, attr);
	struct vgpu_ecc_stat *stat = ext->var;
	struct gk20a *g = get_gk20a(dev);
	struct tegra_vgpu_cmd_msg msg = {0};
	struct tegra_vgpu_ecc_counter_params *counter =
						&msg.params.ecc_counter;
	int ret;

	msg.cmd = TEGRA_VGPU_CMD_GET_ECC_COUNTER_VALUE;
	msg.handle = vgpu_get_handle(g);
	counter->ecc_id = stat->ecc_id;

	ret = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg));
	/* A transport failure takes precedence over the server's status. */
	if (!ret)
		ret = msg.ret;

	if (unlikely(ret)) {
		nvgpu_err(g, "ecc: cannot get ECC counter value: %d", ret);
		return ret;
	}

	return snprintf(buf, PAGE_SIZE, "%u\n", counter->value);
}
/*
 * Create one read-only sysfs node per ECC stat reported by the server.
 *
 * The stat list is fetched with vgpu_ecc_get_info(); an array of
 * dev_ext_attribute (one per stat) is then allocated and stored in
 * l->ecc_attrs so that vgpu_remove_ecc_sysfs() can tear the nodes down.
 * Each attribute's ->var points at its struct vgpu_ecc_stat, which is what
 * vgpu_ecc_stat_show() uses to find the ecc_id to query.
 *
 * Failure to create an individual node is only logged as a warning; the
 * remaining nodes are still created and the function still returns 0.
 *
 * Returns 0 on success, the error from vgpu_ecc_get_info(), or -ENOMEM if
 * the attribute array cannot be allocated (the fetched stat list is
 * released in that case).
 */
static int vgpu_create_ecc_sysfs(struct device *dev)
{
	struct gk20a *g = get_gk20a(dev);
	struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
	struct vgpu_priv_data *priv = vgpu_get_priv_data(g);
	struct vgpu_ecc_stat *stats;
	struct dev_ext_attribute *attrs;
	int err, i, count;

	err = vgpu_ecc_get_info(g);
	if (unlikely(err)) {
		nvgpu_err(g, "ecc: cannot get ECC info: %d", err);
		return err;
	}

	stats = priv->ecc_stats;
	count = priv->ecc_stats_count;

	attrs = nvgpu_kzalloc(g, count * sizeof(*attrs));
	if (unlikely(!attrs)) {
		nvgpu_err(g, "ecc: no memory");
		vgpu_ecc_remove_info(g);
		return -ENOMEM;
	}

	for (i = 0; i < count; i++) {
		/*
		 * sysfs_attr_init() operates on a struct attribute (it sets
		 * its lockdep key when CONFIG_DEBUG_LOCK_ALLOC is enabled),
		 * so pass the innermost attribute, not the wrapping
		 * device_attribute.
		 */
		sysfs_attr_init(&attrs[i].attr.attr);
		attrs[i].attr.attr.name = stats[i].name;
		attrs[i].attr.attr.mode = VERIFY_OCTAL_PERMISSIONS(S_IRUGO);
		attrs[i].attr.show = vgpu_ecc_stat_show;
		attrs[i].attr.store = NULL;
		attrs[i].var = &stats[i];

		err = device_create_file(dev, &attrs[i].attr);
		if (unlikely(err)) {
			/* Best effort: keep going with the other stats. */
			nvgpu_warn(g, "ecc: cannot create file \"%s\": %d",
				   stats[i].name, err);
		}
	}

	l->ecc_attrs = attrs;
	return 0;
}
/*
 * Tear down what vgpu_create_ecc_sysfs() set up: remove every ECC stat
 * node, free the attribute array held in l->ecc_attrs, and finally release
 * the cached stat list. The stat list is released even when no attribute
 * array was ever allocated.
 */
static void vgpu_remove_ecc_sysfs(struct device *dev)
{
	struct gk20a *g = get_gk20a(dev);
	struct vgpu_priv_data *priv = vgpu_get_priv_data(g);
	struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
	int idx;

	if (l->ecc_attrs) {
		idx = 0;
		while (idx < priv->ecc_stats_count) {
			device_remove_file(dev, &l->ecc_attrs[idx].attr);
			idx++;
		}

		nvgpu_kfree(g, l->ecc_attrs);
		l->ecc_attrs = NULL;
	}

	/* The attributes pointed into the stat list, so free it last. */
	vgpu_ecc_remove_info(g);
}
void vgpu_create_sysfs(struct device *dev) void vgpu_create_sysfs(struct device *dev)
{ {
if (device_create_file(dev, &dev_attr_load)) if (device_create_file(dev, &dev_attr_load))
dev_err(dev, "Failed to create vgpu sysfs attributes!\n"); dev_err(dev, "Failed to create vgpu sysfs attributes!\n");
vgpu_create_ecc_sysfs(dev);
} }
void vgpu_remove_sysfs(struct device *dev) void vgpu_remove_sysfs(struct device *dev)
{ {
device_remove_file(dev, &dev_attr_load); device_remove_file(dev, &dev_attr_load);
vgpu_remove_ecc_sysfs(dev);
} }

View File

@@ -0,0 +1,92 @@
/*
* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <nvgpu/kmem.h>
#include <nvgpu/vgpu/vgpu_ivc.h>
#include <nvgpu/vgpu/vgpu.h>
#include <nvgpu/errno.h>
#include "vgpu/ecc_vgpu.h"
/*
 * Fetch the list of ECC stats from the RM server and cache it in the vgpu
 * private data.
 *
 * Sends TEGRA_VGPU_CMD_GET_ECC_INFO, then copies ecc_stats_count entries
 * (id and name) from the command queue's out-of-band buffer into a newly
 * allocated array stored in priv->ecc_stats / priv->ecc_stats_count. The
 * array is released by vgpu_ecc_remove_info().
 *
 * Returns 0 on success; the transport or server error if the command
 * fails; -EINVAL if the out-of-band buffer cannot be obtained; -E2BIG if
 * the reported count is not representable or does not fit in the
 * out-of-band buffer; -ENOMEM if the array cannot be allocated.
 */
int vgpu_ecc_get_info(struct gk20a *g)
{
	struct vgpu_priv_data *priv = vgpu_get_priv_data(g);
	struct tegra_vgpu_cmd_msg msg = {0};
	struct tegra_vgpu_ecc_info_params *p = &msg.params.ecc_info;
	struct tegra_vgpu_ecc_info_entry *entry;
	struct vgpu_ecc_stat *stats;
	void *handle;
	int err, i, count;
	size_t oob_size;

	msg.cmd = TEGRA_VGPU_CMD_GET_ECC_INFO;
	msg.handle = vgpu_get_handle(g);
	err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg));
	err = err ? err : msg.ret;
	if (unlikely(err)) {
		nvgpu_err(g, "vgpu get_ecc_info failed, err=%d", err);
		return err;
	}

	count = p->ecc_stats_count;

	handle = vgpu_ivc_oob_get_ptr(vgpu_ivc_get_server_vmid(),
				      TEGRA_VGPU_QUEUE_CMD,
				      (void **)&entry, &oob_size);
	if (unlikely(!handle))
		return -EINVAL;

	/*
	 * ecc_stats_count is a u32 supplied by the server but is kept in an
	 * int here and in priv->ecc_stats_count. Reject values that turn
	 * negative, and bound the count by division so that the size check
	 * cannot be defeated by count * sizeof(*entry) wrapping around.
	 */
	if (unlikely(count < 0 ||
		     (size_t)count > oob_size / sizeof(*entry))) {
		err = -E2BIG;
		goto out;
	}

	stats = nvgpu_kzalloc(g, count * sizeof(*stats));
	if (unlikely(!stats)) {
		err = -ENOMEM;
		goto out;
	}

	for (i = 0; i < count; i++) {
		stats[i].ecc_id = entry[i].ecc_id;
		/*
		 * The destination is one byte longer than the copy bound
		 * and was zero-allocated, so the name is always
		 * NUL-terminated even if the source fills its whole array.
		 */
		strncpy(stats[i].name, entry[i].name,
			NVGPU_ECC_STAT_NAME_MAX_SIZE);
	}

	priv->ecc_stats = stats;
	priv->ecc_stats_count = count;
out:
	/* The OOB buffer is released on both success and failure paths. */
	vgpu_ivc_oob_put_ptr(handle);
	return err;
}
/*
 * Drop the cached ECC stat list: reset the count to zero and, if an array
 * is present, free it and clear the pointer. Calling this when nothing is
 * cached only resets the count.
 */
void vgpu_ecc_remove_info(struct gk20a *g)
{
	struct vgpu_priv_data *priv = vgpu_get_priv_data(g);

	priv->ecc_stats_count = 0;

	if (!priv->ecc_stats)
		return;

	nvgpu_kfree(g, priv->ecc_stats);
	priv->ecc_stats = NULL;
}

View File

@@ -0,0 +1,39 @@
/*
* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#ifndef _ECC_VGPU_H_
#define _ECC_VGPU_H_
#include <nvgpu/types.h>
#include <nvgpu/ecc.h> /* For NVGPU_ECC_STAT_NAME_MAX_SIZE */
struct gk20a;
/* Guest-side description of one ECC stat reported by the server. */
struct vgpu_ecc_stat {
/* Identifier used when asking the server for this stat's counter value. */
u32 ecc_id;
/*
 * Stat name. One byte larger than NVGPU_ECC_STAT_NAME_MAX_SIZE so that a
 * name copied with that bound into a zeroed struct keeps a trailing NUL.
 */
char name[NVGPU_ECC_STAT_NAME_MAX_SIZE + 1];
};
int vgpu_ecc_get_info(struct gk20a *g);
void vgpu_ecc_remove_info(struct gk20a *g);
#endif