gpu: nvgpu: Add Ctrl API to read SM error state

Expose an IOCTL on the Ctrl node to read the error state of a single SM, under NVGPU_GPU_IOCTL_READ_SINGLE_SM_ERROR_STATE.

bug 200412642
JIRA NVGPU-700

Change-Id: I3cbcf4d7f23a53dbd2350b38a5e259559d5fd3af
Signed-off-by: Vinod G <vinodg@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1728931
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
2025-12-22 17:36:20 +03:00 · 2018-05-23 17:22:03 -07:00
parent 40cefb666f
commit d84e822128
2 changed files with 91 additions and 1 deletions
--- a/drivers/gpu/nvgpu/common/linux/ioctl_ctrl.c
+++ b/drivers/gpu/nvgpu/common/linux/ioctl_ctrl.c
@@ -1575,6 +1575,56 @@ out:
 	return err;
 }

+/*
+ * Copy the saved error state of SM args->sm_id to the user buffer at
+ * args->record_mem, writing at most args->record_size bytes and storing
+ * the copied byte count back in args->record_size. Returns 0, -EINVAL for
+ * an out-of-range SM id, or -EFAULT if the user buffer is not writable.
+ */
+static int nvgpu_gpu_read_single_sm_error_state(struct gk20a *g,
+		struct nvgpu_gpu_read_single_sm_error_state_args *args)
+{
+	struct gr_gk20a *gr = &g->gr;
+	struct nvgpu_gr_sm_error_state *state;
+	struct nvgpu_gpu_sm_error_state_record record;
+	u32 sm_id = args->sm_id;
+
+	if (sm_id >= gr->no_of_sm)
+		return -EINVAL;
+
+	nvgpu_speculation_barrier();
+
+	state = gr->sm_error_states + sm_id;
+	record.global_esr = state->hww_global_esr;
+	record.warp_esr = state->hww_warp_esr;
+	record.warp_esr_pc = state->hww_warp_esr_pc;
+	record.global_esr_report_mask = state->hww_global_esr_report_mask;
+	record.warp_esr_report_mask = state->hww_warp_esr_report_mask;
+
+	if (args->record_size > 0) {
+		size_t write_size = sizeof(record);
+		unsigned long not_copied;
+
+		if (write_size > args->record_size)
+			write_size = args->record_size;
+
+		nvgpu_mutex_acquire(&g->dbg_sessions_lock);
+		not_copied = copy_to_user(
+			(void __user *)(uintptr_t)args->record_mem,
+			&record, write_size);
+		nvgpu_mutex_release(&g->dbg_sessions_lock);
+		if (not_copied) {
+			/* copy_to_user() returns bytes left, not an errno */
+			nvgpu_err(g, "copy_to_user failed!");
+			return -EFAULT;
+		}
+
+		args->record_size = write_size;
+	}
+
+	return 0;
+}
+
 long gk20a_ctrl_dev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
 	struct gk20a_ctrl_priv *priv = filp->private_data;
@@ -1887,6 +1937,11 @@ long gk20a_ctrl_dev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg
 			(struct nvgpu_gpu_set_deterministic_opts_args *)buf);
 		break;

+	case NVGPU_GPU_IOCTL_READ_SINGLE_SM_ERROR_STATE:
+		err = nvgpu_gpu_read_single_sm_error_state(g,
+			(struct nvgpu_gpu_read_single_sm_error_state_args *)buf);
+		break;
+
 	default:
 		nvgpu_log_info(g, "unrecognized gpu ioctl cmd: 0x%x", cmd);
 		err = -ENOTTY;
--- a/include/uapi/linux/nvgpu.h
+++ b/include/uapi/linux/nvgpu.h
@@ -864,6 +864,38 @@ struct nvgpu_gpu_set_deterministic_opts_args {
 	__u64 channels; /* in */
 };

+/*
+ * This struct reports the SM error state of a single SM.
+ * This acts upon the currently resident GR context.
+ */
+struct nvgpu_gpu_sm_error_state_record {
+	/* Global Error status register */
+	__u32 global_esr;
+	/* Warp Error status register */
+	__u32 warp_esr;
+	/* Warp Error status register PC */
+	__u64 warp_esr_pc;
+	/* Global Error status register Report Mask */
+	__u32 global_esr_report_mask;
+	/* Warp Error status register Report Mask */
+	__u32 warp_esr_report_mask;
+};
+
+/*
+ * Arguments for NVGPU_GPU_IOCTL_READ_SINGLE_SM_ERROR_STATE.
+ */
+struct nvgpu_gpu_read_single_sm_error_state_args {
+	/* in: SM ID; must be less than the number of SMs */
+	__u32 sm_id;
+	__u32 reserved;
+	/* in: user pointer to a struct nvgpu_gpu_sm_error_state_record */
+	__u64 record_mem;
+	/*
+	 * in: size in bytes of the buffer at record_mem; out: bytes copied
+	 */
+	__u64 record_size;
+};
+
 #define NVGPU_GPU_IOCTL_ZCULL_GET_CTX_SIZE \
 	_IOR(NVGPU_GPU_IOCTL_MAGIC, 1, struct nvgpu_gpu_zcull_get_ctx_size_args)
 #define NVGPU_GPU_IOCTL_ZCULL_GET_INFO \
@@ -949,8 +981,11 @@ struct nvgpu_gpu_set_deterministic_opts_args {
 #define NVGPU_GPU_IOCTL_SET_DETERMINISTIC_OPTS \
 	_IOWR(NVGPU_GPU_IOCTL_MAGIC, 40, \
 			struct nvgpu_gpu_set_deterministic_opts_args)
+#define NVGPU_GPU_IOCTL_READ_SINGLE_SM_ERROR_STATE \
+	_IOWR(NVGPU_GPU_IOCTL_MAGIC, 41, \
+			struct nvgpu_gpu_read_single_sm_error_state_args)
 #define NVGPU_GPU_IOCTL_LAST		\
-	_IOC_NR(NVGPU_GPU_IOCTL_SET_DETERMINISTIC_OPTS)
+	_IOC_NR(NVGPU_GPU_IOCTL_READ_SINGLE_SM_ERROR_STATE)
 #define NVGPU_GPU_IOCTL_MAX_ARG_SIZE	\
 	sizeof(struct nvgpu_gpu_get_cpu_time_correlation_info_args)