gpu: nvgpu: add device alarms

Add event definitions for:
- Clock alarm (target frequency not met)
- Thermal alarm (temperature above threshold)
- Power alarm (power above threshold)
- GPU shut down

Jira DNVGPU-186

Change-Id: I52edd44352ed0cba83033949272f41cc9e1c630f
Signed-off-by: Thomas Fleury <tfleury@nvidia.com>
Reviewed-on: http://git-master/r/1249342
(cherry picked from commit 67a6681aade241ff24982771778f7e2193d1cd7f)
Reviewed-on: http://git-master/r/1267157
Reviewed-by: Automatic_Commit_Validation_User
GVS: Gerrit_Virtual_Submit
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
This commit is contained in:
Thomas Fleury
2016-11-07 10:17:56 -08:00
committed by mobile promotions
parent 0250221955
commit ec011cd1ee
3 changed files with 79 additions and 9 deletions

View File

@@ -1158,9 +1158,9 @@ static int nvgpu_gpu_clk_get_info(struct gk20a *g,
return 0; return 0;
} }
static int nvgpu_gpu_clk_get_event_fd(struct gk20a *g, static int nvgpu_gpu_get_event_fd(struct gk20a *g,
struct gk20a_ctrl_priv *priv, struct gk20a_ctrl_priv *priv,
struct nvgpu_gpu_clk_get_event_fd_args *args) struct nvgpu_gpu_get_event_fd_args *args)
{ {
struct nvgpu_clk_session *session = priv->clk_session; struct nvgpu_clk_session *session = priv->clk_session;
@@ -1567,9 +1567,9 @@ long gk20a_ctrl_dev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg
(struct nvgpu_gpu_clk_get_info_args *)buf); (struct nvgpu_gpu_clk_get_info_args *)buf);
break; break;
case NVGPU_GPU_IOCTL_CLK_GET_EVENT_FD: case NVGPU_GPU_IOCTL_GET_EVENT_FD:
err = nvgpu_gpu_clk_get_event_fd(g, priv, err = nvgpu_gpu_get_event_fd(g, priv,
(struct nvgpu_gpu_clk_get_event_fd_args *)buf); (struct nvgpu_gpu_get_event_fd_args *)buf);
break; break;
case NVGPU_GPU_IOCTL_GET_VOLTAGE: case NVGPU_GPU_IOCTL_GET_VOLTAGE:

View File

@@ -2071,6 +2071,7 @@ int gk20a_init_gpu_characteristics(struct gk20a *g)
gpu->dbg_gpu_ioctl_nr_last = NVGPU_DBG_GPU_IOCTL_LAST; gpu->dbg_gpu_ioctl_nr_last = NVGPU_DBG_GPU_IOCTL_LAST;
gpu->ioctl_channel_nr_last = NVGPU_IOCTL_CHANNEL_LAST; gpu->ioctl_channel_nr_last = NVGPU_IOCTL_CHANNEL_LAST;
gpu->as_ioctl_nr_last = NVGPU_AS_IOCTL_LAST; gpu->as_ioctl_nr_last = NVGPU_AS_IOCTL_LAST;
gpu->event_ioctl_nr_last = NVGPU_EVENT_IOCTL_LAST;
gpu->gpu_va_bit_count = 40; gpu->gpu_va_bit_count = 40;
strlcpy(gpu->chipname, g->ops.name, sizeof(gpu->chipname)); strlcpy(gpu->chipname, g->ops.name, sizeof(gpu->chipname));

View File

@@ -126,6 +126,10 @@ struct nvgpu_gpu_zbc_query_table_args {
#define NVGPU_GPU_FLAGS_SUPPORT_GET_POWER (1ULL << 12) #define NVGPU_GPU_FLAGS_SUPPORT_GET_POWER (1ULL << 12)
/* NVGPU_GPU_IOCTL_GET_TEMPERATURE is available */ /* NVGPU_GPU_IOCTL_GET_TEMPERATURE is available */
#define NVGPU_GPU_FLAGS_SUPPORT_GET_TEMPERATURE (1ULL << 13) #define NVGPU_GPU_FLAGS_SUPPORT_GET_TEMPERATURE (1ULL << 13)
/* NVGPU_GPU_IOCTL_SET_THERM_ALERT_LIMIT is available */
#define NVGPU_GPU_FLAGS_SUPPORT_SET_THERM_ALERT_LIMIT (1ULL << 14)
/* NVGPU_GPU_IOCTL_GET_EVENT_FD is available */
#define NVGPU_GPU_FLAGS_SUPPORT_DEVICE_EVENTS (1ULL << 15)
struct nvgpu_gpu_characteristics { struct nvgpu_gpu_characteristics {
__u32 arch; __u32 arch;
@@ -223,6 +227,8 @@ struct nvgpu_gpu_characteristics {
- If the last field is reserved/padding, it is not - If the last field is reserved/padding, it is not
generally safe to repurpose the field in future revisions. generally safe to repurpose the field in future revisions.
*/ */
__s16 event_ioctl_nr_last;
__u16 pad[3];
}; };
struct nvgpu_gpu_get_characteristics { struct nvgpu_gpu_get_characteristics {
@@ -684,12 +690,18 @@ struct nvgpu_gpu_clk_set_info_args {
__s32 completion_fd; __s32 completion_fd;
}; };
struct nvgpu_gpu_clk_get_event_fd_args { struct nvgpu_gpu_get_event_fd_args {
/* in: Flags (not currently used). */ /* in: Flags (not currently used). */
__u32 flags; __u32 flags;
/* out: File descriptor for events, i.e. any clock update. */ /* out: File descriptor for events, i.e. clock update.
* After a successful poll on this event_fd, the application is
* expected to read the status (struct nvgpu_gpu_event_info),
* which provides detailed event information.
* For a poll operation, alarms will be reported with POLLPRI,
* and GPU shutdown will be reported with POLLHUP.
*/
__s32 event_fd; __s32 event_fd;
}; };
@@ -815,8 +827,8 @@ struct nvgpu_gpu_get_temperature_args {
_IOWR(NVGPU_GPU_IOCTL_MAGIC, 30, struct nvgpu_gpu_clk_get_info_args) _IOWR(NVGPU_GPU_IOCTL_MAGIC, 30, struct nvgpu_gpu_clk_get_info_args)
#define NVGPU_GPU_IOCTL_CLK_SET_INFO \ #define NVGPU_GPU_IOCTL_CLK_SET_INFO \
_IOWR(NVGPU_GPU_IOCTL_MAGIC, 31, struct nvgpu_gpu_clk_set_info_args) _IOWR(NVGPU_GPU_IOCTL_MAGIC, 31, struct nvgpu_gpu_clk_set_info_args)
#define NVGPU_GPU_IOCTL_CLK_GET_EVENT_FD \ #define NVGPU_GPU_IOCTL_GET_EVENT_FD \
_IOWR(NVGPU_GPU_IOCTL_MAGIC, 32, struct nvgpu_gpu_clk_get_event_fd_args) _IOWR(NVGPU_GPU_IOCTL_MAGIC, 32, struct nvgpu_gpu_get_event_fd_args)
#define NVGPU_GPU_IOCTL_GET_MEMORY_STATE \ #define NVGPU_GPU_IOCTL_GET_MEMORY_STATE \
_IOWR(NVGPU_GPU_IOCTL_MAGIC, 33, \ _IOWR(NVGPU_GPU_IOCTL_MAGIC, 33, \
struct nvgpu_gpu_get_memory_state_args) struct nvgpu_gpu_get_memory_state_args)
@@ -835,6 +847,63 @@ struct nvgpu_gpu_get_temperature_args {
#define NVGPU_GPU_IOCTL_MAX_ARG_SIZE \ #define NVGPU_GPU_IOCTL_MAX_ARG_SIZE \
sizeof(struct nvgpu_gpu_get_cpu_time_correlation_info_args) sizeof(struct nvgpu_gpu_get_cpu_time_correlation_info_args)
/*
 * Event session
 *
 * NVGPU_GPU_IOCTL_GET_EVENT_FD opens an event session.
 * The ioctls below can be used on the resulting session fds.
 */
#define NVGPU_EVENT_IOCTL_MAGIC 'E'
/*
 * Event identifiers, grouped by the poll flag used to report them.
 * NOTE(review): value 1 is not assigned in this list -- confirm whether it
 * is intentionally reserved before reusing it.
 */
/* Normal events (POLLIN) */
/* Event associated with a VF update */
#define NVGPU_GPU_EVENT_VF_UPDATE 0
/* Recoverable alarms (POLLPRI) */
/* Alarm when target frequency on any session is not possible */
#define NVGPU_GPU_EVENT_ALARM_TARGET_VF_NOT_POSSIBLE 2
/* Alarm when target frequency on current session is not possible */
#define NVGPU_GPU_EVENT_ALARM_LOCAL_TARGET_VF_NOT_POSSIBLE 3
/* Alarm when Clock Arbiter failed */
#define NVGPU_GPU_EVENT_ALARM_CLOCK_ARBITER_FAILED 4
/* Alarm when VF table update failed */
#define NVGPU_GPU_EVENT_ALARM_VF_TABLE_UPDATE_FAILED 5
/* Alarm on thermal condition */
#define NVGPU_GPU_EVENT_ALARM_THERMAL_ABOVE_THRESHOLD 6
/* Alarm on power condition */
#define NVGPU_GPU_EVENT_ALARM_POWER_ABOVE_THRESHOLD 7
/* Non-recoverable alarm (POLLHUP) */
/* Alarm on GPU shutdown/fall from bus */
#define NVGPU_GPU_EVENT_ALARM_GPU_LOST 8
/*
 * Detailed information about a single event.
 * Per the event_fd description in nvgpu_gpu_get_event_fd_args, the
 * application is expected to read this after a successful poll on the
 * event fd.
 */
struct nvgpu_gpu_event_info {
__u32 event_id; /* NVGPU_GPU_EVENT_* */
/* NOTE(review): presumably padding so that timestamp stays 64-bit aligned -- confirm */
__u32 reserved;
__u64 timestamp; /* GPU timestamp */
};
/*
 * Argument structure for NVGPU_EVENT_IOCTL_SET_FILTER.
 * Describes a caller-provided bit mask selecting which events to monitor.
 */
struct nvgpu_gpu_set_event_filter_args {
/* in: Flags (not currently used). */
__u32 flags;
/* in: Size of event filter in 32-bit words */
__u32 size;
/* in: Address of buffer containing bit mask of events.
 * Bit #n is set if event #n should be monitored.
 * NOTE(review): presumably a user-space pointer carried as a 64-bit
 * integer, holding `size` 32-bit words -- confirm against the handler.
 */
__u64 buffer;
};
/* Event-session ioctl number 1: takes struct nvgpu_gpu_set_event_filter_args. */
#define NVGPU_EVENT_IOCTL_SET_FILTER \
_IOW(NVGPU_EVENT_IOCTL_MAGIC, 1, struct nvgpu_gpu_set_event_filter_args)
/*
 * Number of the last event-session ioctl (currently SET_FILTER).
 * Update this when a new NVGPU_EVENT_IOCTL_* with a higher number is added.
 */
#define NVGPU_EVENT_IOCTL_LAST \
_IOC_NR(NVGPU_EVENT_IOCTL_SET_FILTER)
/*
 * Size of the event-session ioctl argument structure defined here.
 * NOTE(review): presumably must track the largest argument struct if more
 * event ioctls are added -- confirm.
 */
#define NVGPU_EVENT_IOCTL_MAX_ARG_SIZE \
sizeof(struct nvgpu_gpu_set_event_filter_args)
/* /*
* /dev/nvhost-tsg-gpu device * /dev/nvhost-tsg-gpu device
* *