gpu: nvgpu: IOCTL to write/clear SM error states

Add the following IOCTLs to write and clear SM error states:

NVGPU_DBG_GPU_IOCTL_CLEAR_SINGLE_SM_ERROR_STATE
NVGPU_DBG_GPU_IOCTL_WRITE_SINGLE_SM_ERROR_STATE

Bug 200156699

Change-Id: I89e3ec51c33b8e131a67d28807d5acf57b3a48fd
Signed-off-by: Deepak Nibade <dnibade@nvidia.com>
Reviewed-on: http://git-master/r/1120330
Reviewed-by: Automatic_Commit_Validation_User
GVS: Gerrit_Virtual_Submit
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Deepak Nibade
2015-12-24 18:41:15 +05:30
committed by Terje Bergstrom
parent 04e45bc943
commit c651adbeaa
5 changed files with 335 additions and 1 deletion


@@ -564,6 +564,86 @@ static int nvgpu_dbg_gpu_ioctl_read_single_sm_error_state(
	return 0;
}

static int nvgpu_dbg_gpu_ioctl_clear_single_sm_error_state(
		struct dbg_session_gk20a *dbg_s,
		struct nvgpu_dbg_gpu_clear_single_sm_error_state_args *args)
{
	struct gk20a *g = get_gk20a(dbg_s->dev);
	struct gr_gk20a *gr = &g->gr;
	u32 sm_id;
	struct channel_gk20a *ch = dbg_s->ch;
	int err = 0;

	sm_id = args->sm_id;
	if (sm_id >= gr->no_of_sm)
		return -EINVAL;

	err = gk20a_busy(g->dev);
	if (err)
		return err;

	err = gr_gk20a_elpg_protected_call(g,
			g->ops.gr.clear_sm_error_state(g, ch, sm_id));

	gk20a_idle(g->dev);

	return err;
}

static int nvgpu_dbg_gpu_ioctl_write_single_sm_error_state(
		struct dbg_session_gk20a *dbg_s,
		struct nvgpu_dbg_gpu_write_single_sm_error_state_args *args)
{
	struct gk20a *g = get_gk20a(dbg_s->dev);
	struct gr_gk20a *gr = &g->gr;
	u32 sm_id;
	struct channel_gk20a *ch = dbg_s->ch;
	struct nvgpu_dbg_gpu_sm_error_state_record *sm_error_state;
	int err = 0;

	sm_id = args->sm_id;
	if (sm_id >= gr->no_of_sm)
		return -EINVAL;

	sm_error_state = kzalloc(sizeof(*sm_error_state), GFP_KERNEL);
	if (!sm_error_state)
		return -ENOMEM;

	if (args->sm_error_state_record_size > 0) {
		size_t read_size = sizeof(*sm_error_state);

		if (read_size > args->sm_error_state_record_size)
			read_size = args->sm_error_state_record_size;

		mutex_lock(&g->dbg_sessions_lock);
		err = copy_from_user(sm_error_state,
				(void __user *)(uintptr_t)
					args->sm_error_state_record_mem,
				read_size);
		mutex_unlock(&g->dbg_sessions_lock);
		if (err) {
			err = -ENOMEM;
			goto err_free;
		}
	}

	err = gk20a_busy(g->dev);
	if (err)
		goto err_free;

	err = gr_gk20a_elpg_protected_call(g,
			g->ops.gr.update_sm_error_state(g, ch,
					sm_id, sm_error_state));

	gk20a_idle(g->dev);

err_free:
	kfree(sm_error_state);

	return err;
}
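Both new handlers hand the actual register work to the per-chip op through gr_gk20a_elpg_protected_call(), which keeps engine-level power gating (ELPG) from kicking in while GR state is being touched. As a rough illustration only (this is not the driver's exact macro body; it assumes the gk20a_pmu_disable_elpg()/gk20a_pmu_enable_elpg() helpers and the g->elpg_enabled flag from elsewhere in the driver), the clear path behaves approximately like:

/*
 * Rough sketch of what gr_gk20a_elpg_protected_call(g, expr) amounts to
 * for the clear path above -- an assumption for illustration, not the
 * macro as written in the driver sources.
 */
static int clear_sm_error_state_elpg_protected(struct gk20a *g,
		struct channel_gk20a *ch, u32 sm_id)
{
	int err;

	/* Keep GR powered while its error-state registers are touched. */
	if (g->elpg_enabled) {
		err = gk20a_pmu_disable_elpg(g);
		if (err)
			return err;
	}

	err = g->ops.gr.clear_sm_error_state(g, ch, sm_id);

	/* Restore power gating regardless of the op's result. */
	if (g->elpg_enabled)
		gk20a_pmu_enable_elpg(g);

	return err;
}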
long gk20a_dbg_gpu_dev_ioctl(struct file *filp, unsigned int cmd,
		unsigned long arg)
{
@@ -666,6 +746,16 @@ long gk20a_dbg_gpu_dev_ioctl(struct file *filp, unsigned int cmd,
			(struct nvgpu_dbg_gpu_read_single_sm_error_state_args *)buf);
		break;

	case NVGPU_DBG_GPU_IOCTL_CLEAR_SINGLE_SM_ERROR_STATE:
		err = nvgpu_dbg_gpu_ioctl_clear_single_sm_error_state(dbg_s,
			(struct nvgpu_dbg_gpu_clear_single_sm_error_state_args *)buf);
		break;

	case NVGPU_DBG_GPU_IOCTL_WRITE_SINGLE_SM_ERROR_STATE:
		err = nvgpu_dbg_gpu_ioctl_write_single_sm_error_state(dbg_s,
			(struct nvgpu_dbg_gpu_write_single_sm_error_state_args *)buf);
		break;

	default:
		gk20a_err(dev_from_gk20a(g),
			"unrecognized dbg gpu ioctl cmd: 0x%x",


@@ -242,6 +242,12 @@ struct gpu_ops {
		u32 (*get_lrf_tex_ltc_dram_override)(struct gk20a *g);
		int (*record_sm_error_state)(struct gk20a *g,
				u32 gpc, u32 tpc);
		int (*update_sm_error_state)(struct gk20a *g,
				struct channel_gk20a *ch, u32 sm_id,
				struct nvgpu_dbg_gpu_sm_error_state_record *
							sm_error_state);
		int (*clear_sm_error_state)(struct gk20a *g,
				struct channel_gk20a *ch, u32 sm_id);
	} gr;
	const char *name;
	struct {


@@ -5535,6 +5535,111 @@ static int gk20a_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc)
	return 0;
}

static int gk20a_gr_update_sm_error_state(struct gk20a *g,
		struct channel_gk20a *ch, u32 sm_id,
		struct nvgpu_dbg_gpu_sm_error_state_record *sm_error_state)
{
	u32 gpc, tpc, offset;
	struct gr_gk20a *gr = &g->gr;
	struct channel_ctx_gk20a *ch_ctx = &ch->ch_ctx;
	int err = 0;

	mutex_lock(&g->dbg_sessions_lock);

	gr->sm_error_states[sm_id].hww_global_esr =
			sm_error_state->hww_global_esr;
	gr->sm_error_states[sm_id].hww_warp_esr =
			sm_error_state->hww_warp_esr;
	gr->sm_error_states[sm_id].hww_global_esr_report_mask =
			sm_error_state->hww_global_esr_report_mask;
	gr->sm_error_states[sm_id].hww_warp_esr_report_mask =
			sm_error_state->hww_warp_esr_report_mask;

	err = gr_gk20a_disable_ctxsw(g);
	if (err) {
		gk20a_err(dev_from_gk20a(g), "unable to stop gr ctxsw\n");
		goto fail;
	}

	gpc = g->gr.sm_to_cluster[sm_id].gpc_index;
	tpc = g->gr.sm_to_cluster[sm_id].tpc_index;

	offset = proj_gpc_stride_v() * gpc +
		 proj_tpc_in_gpc_stride_v() * tpc;

	if (gk20a_is_channel_ctx_resident(ch)) {
		gk20a_writel(g, gr_gpc0_tpc0_sm_hww_global_esr_r() + offset,
				gr->sm_error_states[sm_id].hww_global_esr);
		gk20a_writel(g, gr_gpc0_tpc0_sm_hww_warp_esr_r() + offset,
				gr->sm_error_states[sm_id].hww_warp_esr);
		gk20a_writel(g, gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r() + offset,
				gr->sm_error_states[sm_id].hww_global_esr_report_mask);
		gk20a_writel(g, gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r() + offset,
				gr->sm_error_states[sm_id].hww_warp_esr_report_mask);
	} else {
		err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
		if (err)
			goto enable_ctxsw;

		gr_gk20a_ctx_patch_write(g, ch_ctx,
				gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r() + offset,
				gr->sm_error_states[sm_id].hww_global_esr_report_mask,
				true);
		gr_gk20a_ctx_patch_write(g, ch_ctx,
				gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r() + offset,
				gr->sm_error_states[sm_id].hww_warp_esr_report_mask,
				true);

		gr_gk20a_ctx_patch_write_end(g, ch_ctx);
	}

enable_ctxsw:
	err = gr_gk20a_enable_ctxsw(g);

fail:
	mutex_unlock(&g->dbg_sessions_lock);
	return err;
}

static int gk20a_gr_clear_sm_error_state(struct gk20a *g,
		struct channel_gk20a *ch, u32 sm_id)
{
	u32 gpc, tpc, offset;
	u32 val;
	struct gr_gk20a *gr = &g->gr;
	int err = 0;

	mutex_lock(&g->dbg_sessions_lock);

	memset(&gr->sm_error_states[sm_id], 0, sizeof(*gr->sm_error_states));

	err = gr_gk20a_disable_ctxsw(g);
	if (err) {
		gk20a_err(dev_from_gk20a(g), "unable to stop gr ctxsw\n");
		goto fail;
	}

	if (gk20a_is_channel_ctx_resident(ch)) {
		gpc = g->gr.sm_to_cluster[sm_id].gpc_index;
		tpc = g->gr.sm_to_cluster[sm_id].tpc_index;

		offset = proj_gpc_stride_v() * gpc +
			 proj_tpc_in_gpc_stride_v() * tpc;

		val = gk20a_readl(g, gr_gpc0_tpc0_sm_hww_global_esr_r() + offset);
		gk20a_writel(g, gr_gpc0_tpc0_sm_hww_global_esr_r() + offset,
				val);
		gk20a_writel(g, gr_gpc0_tpc0_sm_hww_warp_esr_r() + offset,
				0);
	}

	err = gr_gk20a_enable_ctxsw(g);

fail:
	mutex_unlock(&g->dbg_sessions_lock);
	return err;
}

int gr_gk20a_handle_sm_exception(struct gk20a *g, u32 gpc, u32 tpc,
		bool *post_event, struct channel_gk20a *fault_ch)
{
@@ -8415,4 +8520,6 @@ void gk20a_init_gr_ops(struct gpu_ops *gops)
	gops->gr.update_smpc_ctxsw_mode = gr_gk20a_update_smpc_ctxsw_mode;
	gops->gr.update_hwpm_ctxsw_mode = gr_gk20a_update_hwpm_ctxsw_mode;
	gops->gr.record_sm_error_state = gk20a_gr_record_sm_error_state;
	gops->gr.update_sm_error_state = gk20a_gr_update_sm_error_state;
	gops->gr.clear_sm_error_state = gk20a_gr_clear_sm_error_state;
}


@@ -1219,6 +1219,115 @@ static int gm20b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc)
	return 0;
}

static int gm20b_gr_update_sm_error_state(struct gk20a *g,
		struct channel_gk20a *ch, u32 sm_id,
		struct nvgpu_dbg_gpu_sm_error_state_record *sm_error_state)
{
	u32 gpc, tpc, offset;
	struct gr_gk20a *gr = &g->gr;
	struct channel_ctx_gk20a *ch_ctx = &ch->ch_ctx;
	int err = 0;

	mutex_lock(&g->dbg_sessions_lock);

	gr->sm_error_states[sm_id].hww_global_esr =
			sm_error_state->hww_global_esr;
	gr->sm_error_states[sm_id].hww_warp_esr =
			sm_error_state->hww_warp_esr;
	gr->sm_error_states[sm_id].hww_warp_esr_pc =
			sm_error_state->hww_warp_esr_pc;
	gr->sm_error_states[sm_id].hww_global_esr_report_mask =
			sm_error_state->hww_global_esr_report_mask;
	gr->sm_error_states[sm_id].hww_warp_esr_report_mask =
			sm_error_state->hww_warp_esr_report_mask;

	err = gr_gk20a_disable_ctxsw(g);
	if (err) {
		gk20a_err(dev_from_gk20a(g), "unable to stop gr ctxsw\n");
		goto fail;
	}

	gpc = g->gr.sm_to_cluster[sm_id].gpc_index;
	tpc = g->gr.sm_to_cluster[sm_id].tpc_index;

	offset = proj_gpc_stride_v() * gpc +
		 proj_tpc_in_gpc_stride_v() * tpc;

	if (gk20a_is_channel_ctx_resident(ch)) {
		gk20a_writel(g, gr_gpc0_tpc0_sm_hww_global_esr_r() + offset,
				gr->sm_error_states[sm_id].hww_global_esr);
		gk20a_writel(g, gr_gpc0_tpc0_sm_hww_warp_esr_r() + offset,
				gr->sm_error_states[sm_id].hww_warp_esr);
		gk20a_writel(g, gr_gpc0_tpc0_sm_hww_warp_esr_pc_r() + offset,
				gr->sm_error_states[sm_id].hww_warp_esr_pc);
		gk20a_writel(g, gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r() + offset,
				gr->sm_error_states[sm_id].hww_global_esr_report_mask);
		gk20a_writel(g, gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r() + offset,
				gr->sm_error_states[sm_id].hww_warp_esr_report_mask);
	} else {
		err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
		if (err)
			goto enable_ctxsw;

		gr_gk20a_ctx_patch_write(g, ch_ctx,
				gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r() + offset,
				gr->sm_error_states[sm_id].hww_global_esr_report_mask,
				true);
		gr_gk20a_ctx_patch_write(g, ch_ctx,
				gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r() + offset,
				gr->sm_error_states[sm_id].hww_warp_esr_report_mask,
				true);

		gr_gk20a_ctx_patch_write_end(g, ch_ctx);
	}

enable_ctxsw:
	err = gr_gk20a_enable_ctxsw(g);

fail:
	mutex_unlock(&g->dbg_sessions_lock);
	return err;
}

static int gm20b_gr_clear_sm_error_state(struct gk20a *g,
		struct channel_gk20a *ch, u32 sm_id)
{
	u32 gpc, tpc, offset;
	u32 val;
	struct gr_gk20a *gr = &g->gr;
	int err = 0;

	mutex_lock(&g->dbg_sessions_lock);

	memset(&gr->sm_error_states[sm_id], 0, sizeof(*gr->sm_error_states));

	err = gr_gk20a_disable_ctxsw(g);
	if (err) {
		gk20a_err(dev_from_gk20a(g), "unable to stop gr ctxsw\n");
		goto fail;
	}

	if (gk20a_is_channel_ctx_resident(ch)) {
		gpc = g->gr.sm_to_cluster[sm_id].gpc_index;
		tpc = g->gr.sm_to_cluster[sm_id].tpc_index;

		offset = proj_gpc_stride_v() * gpc +
			 proj_tpc_in_gpc_stride_v() * tpc;

		val = gk20a_readl(g, gr_gpc0_tpc0_sm_hww_global_esr_r() + offset);
		gk20a_writel(g, gr_gpc0_tpc0_sm_hww_global_esr_r() + offset,
				val);
		gk20a_writel(g, gr_gpc0_tpc0_sm_hww_warp_esr_r() + offset,
				0);
	}

	err = gr_gk20a_enable_ctxsw(g);

fail:
	mutex_unlock(&g->dbg_sessions_lock);
	return err;
}

void gm20b_init_gr(struct gpu_ops *gops)
{
	gops->gr.init_gpc_mmu = gr_gm20b_init_gpc_mmu;
@@ -1286,4 +1395,6 @@ void gm20b_init_gr(struct gpu_ops *gops)
	gops->gr.update_smpc_ctxsw_mode = gr_gk20a_update_smpc_ctxsw_mode;
	gops->gr.update_hwpm_ctxsw_mode = gr_gk20a_update_hwpm_ctxsw_mode;
	gops->gr.record_sm_error_state = gm20b_gr_record_sm_error_state;
	gops->gr.update_sm_error_state = gm20b_gr_update_sm_error_state;
	gops->gr.clear_sm_error_state = gm20b_gr_clear_sm_error_state;
}


@@ -703,8 +703,28 @@ struct nvgpu_dbg_gpu_read_single_sm_error_state_args {
	_IOWR(NVGPU_DBG_GPU_IOCTL_MAGIC, 14, struct nvgpu_dbg_gpu_read_single_sm_error_state_args)

struct nvgpu_dbg_gpu_clear_single_sm_error_state_args {
	__u32 sm_id;
	__u32 padding;
};

#define NVGPU_DBG_GPU_IOCTL_CLEAR_SINGLE_SM_ERROR_STATE \
	_IOW(NVGPU_DBG_GPU_IOCTL_MAGIC, 15, struct nvgpu_dbg_gpu_clear_single_sm_error_state_args)

struct nvgpu_dbg_gpu_write_single_sm_error_state_args {
	__u32 sm_id;
	__u32 padding;
	__u64 sm_error_state_record_mem;
	__u64 sm_error_state_record_size;
};

#define NVGPU_DBG_GPU_IOCTL_WRITE_SINGLE_SM_ERROR_STATE \
	_IOW(NVGPU_DBG_GPU_IOCTL_MAGIC, 16, struct nvgpu_dbg_gpu_write_single_sm_error_state_args)
#define NVGPU_DBG_GPU_IOCTL_LAST		\
-	_IOC_NR(NVGPU_DBG_GPU_IOCTL_READ_SINGLE_SM_ERROR_STATE)
+	_IOC_NR(NVGPU_DBG_GPU_IOCTL_WRITE_SINGLE_SM_ERROR_STATE)
#define NVGPU_DBG_GPU_IOCTL_MAX_ARG_SIZE \
	sizeof(struct nvgpu_dbg_gpu_perfbuf_map_args)
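
For context, a hypothetical user-space caller of the two new IOCTLs could look like the sketch below. Everything outside this header is an assumption: the debug-session fd is taken to be already opened on the nvgpu debugger node and bound to a channel, the report-mask values are illustrative only, and struct nvgpu_dbg_gpu_sm_error_state_record comes from the existing READ_SINGLE_SM_ERROR_STATE support in this header.

/* Hypothetical user-space sketch -- not part of this commit. */
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/nvgpu.h>

static int reset_sm0_error_state(int dbg_fd)
{
	struct nvgpu_dbg_gpu_clear_single_sm_error_state_args clear_args;
	struct nvgpu_dbg_gpu_write_single_sm_error_state_args write_args;
	struct nvgpu_dbg_gpu_sm_error_state_record record;

	/* Drop whatever error state SM 0 currently has recorded. */
	memset(&clear_args, 0, sizeof(clear_args));
	clear_args.sm_id = 0;
	if (ioctl(dbg_fd, NVGPU_DBG_GPU_IOCTL_CLEAR_SINGLE_SM_ERROR_STATE,
		  &clear_args))
		return -1;

	/* Write back a record that only re-arms the report masks. */
	memset(&record, 0, sizeof(record));
	record.hww_global_esr_report_mask = ~0u;	/* illustrative value */
	record.hww_warp_esr_report_mask = ~0u;		/* illustrative value */

	memset(&write_args, 0, sizeof(write_args));
	write_args.sm_id = 0;
	write_args.sm_error_state_record_mem = (uintptr_t)&record;
	write_args.sm_error_state_record_size = sizeof(record);

	return ioctl(dbg_fd, NVGPU_DBG_GPU_IOCTL_WRITE_SINGLE_SM_ERROR_STATE,
		     &write_args);
}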