gpu: nvgpu: Add Ctrl API to read SM error state

Expose IOCTL to Ctrl node to read Single SM error under NVGPU_GPU_IOCTL_READ_SINGLE_SM_ERROR_STATE bug 200412642 JIRA NVGPU-700 Change-Id: I3cbcf4d7f23a53dbd2350b38a5e259559d5fd3af Signed-off-by: Vinod G <vinodg@nvidia.com> Reviewed-on: https://git-master.nvidia.com/r/1728931 Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com> Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
2025-12-22 17:36:20 +03:00 · 2018-05-23 17:22:03 -07:00
parent 40cefb666f
commit d84e822128
2 changed files with 91 additions and 1 deletions
--- a/drivers/gpu/nvgpu/common/linux/ioctl_ctrl.c
+++ b/drivers/gpu/nvgpu/common/linux/ioctl_ctrl.c
@@ -1575,6 +1575,56 @@ out:
 	return err;
 }
 static int nvgpu_gpu_read_single_sm_error_state(struct gk20a *g,
 		struct nvgpu_gpu_read_single_sm_error_state_args *args)
 {
 	struct gr_gk20a *gr = &g->gr;
 	struct nvgpu_gr_sm_error_state *sm_error_state;
 	struct nvgpu_gpu_sm_error_state_record sm_error_state_record;
 	u32 sm_id;
 	int err = 0;
 	sm_id = args->sm_id;
 	if (sm_id >= gr->no_of_sm)
 		return -EINVAL;
 	nvgpu_speculation_barrier();
 	sm_error_state = gr->sm_error_states + sm_id;
 	sm_error_state_record.global_esr =
 		sm_error_state->hww_global_esr;
 	sm_error_state_record.warp_esr =
 		sm_error_state->hww_warp_esr;
 	sm_error_state_record.warp_esr_pc =
 		sm_error_state->hww_warp_esr_pc;
 	sm_error_state_record.global_esr_report_mask =
 		sm_error_state->hww_global_esr_report_mask;
 	sm_error_state_record.warp_esr_report_mask =
 		sm_error_state->hww_warp_esr_report_mask;
 	if (args->record_size > 0) {
 		size_t write_size = sizeof(*sm_error_state);
 		if (write_size > args->record_size)
 			write_size = args->record_size;
 		nvgpu_mutex_acquire(&g->dbg_sessions_lock);
 		err = copy_to_user((void __user *)(uintptr_t)
 						args->record_mem,
 				   &sm_error_state_record,
 				   write_size);
 		nvgpu_mutex_release(&g->dbg_sessions_lock);
 		if (err) {
 			nvgpu_err(g, "copy_to_user failed!");
 			return err;
 		}
 		args->record_size = write_size;
 	}
 	return 0;
 }
 long gk20a_ctrl_dev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
 	struct gk20a_ctrl_priv *priv = filp->private_data;
@@ -1887,6 +1937,11 @@ long gk20a_ctrl_dev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg
 			(struct nvgpu_gpu_set_deterministic_opts_args *)buf);
 		break;
 	case NVGPU_GPU_IOCTL_READ_SINGLE_SM_ERROR_STATE:
 		err = nvgpu_gpu_read_single_sm_error_state(g,
 			(struct nvgpu_gpu_read_single_sm_error_state_args *)buf);
 		break;
 	default:
 		nvgpu_log_info(g, "unrecognized gpu ioctl cmd: 0x%x", cmd);
 		err = -ENOTTY;
--- a/include/uapi/linux/nvgpu.h
+++ b/include/uapi/linux/nvgpu.h
@@ -864,6 +864,38 @@ struct nvgpu_gpu_set_deterministic_opts_args {
 	__u64 channels; /* in */
 };
 /*
  * Error state record of a single SM, in the layout copied to user space.
  * This acts upon the currently resident GR context.
  */
 struct nvgpu_gpu_sm_error_state_record {
 	/* Global Error status register */
 	__u32 global_esr;
 	/* Warp Error status register */
 	__u32 warp_esr;
 	/* Warp Error status register PC */
 	__u64 warp_esr_pc;
 	/* Global Error status register Report Mask */
 	__u32 global_esr_report_mask;
 	/* Warp Error status register Report Mask */
 	__u32 warp_esr_report_mask;
 };
 /*
  * Arguments for reading the error state of a single SM
  * (NVGPU_GPU_IOCTL_READ_SINGLE_SM_ERROR_STATE).
  */
 struct nvgpu_gpu_read_single_sm_error_state_args {
 	/* in: ID of the SM to query; out-of-range IDs are rejected (-EINVAL) */
 	__u32 sm_id;
 	__u32 reserved;
 	/*
 	 * in: user-space address of the buffer that receives a
 	 * struct nvgpu_gpu_sm_error_state_record
 	 */
 	__u64 record_mem;
 	/*
 	 * in: size in bytes of the buffer at record_mem; the driver copies at
 	 * most this many bytes, and nothing at all when it is 0.
 	 * out: after a successful copy, the number of bytes written.
 	 */
 	__u64 record_size;
 };
 #define NVGPU_GPU_IOCTL_ZCULL_GET_CTX_SIZE \
 	_IOR(NVGPU_GPU_IOCTL_MAGIC, 1, struct nvgpu_gpu_zcull_get_ctx_size_args)
 #define NVGPU_GPU_IOCTL_ZCULL_GET_INFO \
@@ -949,8 +981,11 @@ struct nvgpu_gpu_set_deterministic_opts_args {
 #define NVGPU_GPU_IOCTL_SET_DETERMINISTIC_OPTS \
 	_IOWR(NVGPU_GPU_IOCTL_MAGIC, 40, \
 			struct nvgpu_gpu_set_deterministic_opts_args)
 /*
  * Read the error state record of a single SM; arguments are passed in
  * struct nvgpu_gpu_read_single_sm_error_state_args.
  */
 #define NVGPU_GPU_IOCTL_READ_SINGLE_SM_ERROR_STATE \
 	_IOWR(NVGPU_GPU_IOCTL_MAGIC, 41, \
 			struct nvgpu_gpu_read_single_sm_error_state_args)
 #define NVGPU_GPU_IOCTL_LAST		\
-	_IOC_NR(NVGPU_GPU_IOCTL_SET_DETERMINISTIC_OPTS)
+	_IOC_NR(NVGPU_GPU_IOCTL_READ_SINGLE_SM_ERROR_STATE)
 #define NVGPU_GPU_IOCTL_MAX_ARG_SIZE	\
 	sizeof(struct nvgpu_gpu_get_cpu_time_correlation_info_args)