mirror of git://nv-tegra.nvidia.com/linux-nvgpu.git
gpu: nvgpu: IOCTL to write/clear SM error states
Add below IOCTLs to write/clear SM error states:

NVGPU_DBG_GPU_IOCTL_CLEAR_SINGLE_SM_ERROR_STATE
NVGPU_DBG_GPU_IOCTL_WRITE_SINGLE_SM_ERROR_STATE

Bug 200156699

Change-Id: I89e3ec51c33b8e131a67d28807d5acf57b3a48fd
Signed-off-by: Deepak Nibade <dnibade@nvidia.com>
Reviewed-on: http://git-master/r/1120330
Reviewed-by: Automatic_Commit_Validation_User
GVS: Gerrit_Virtual_Submit
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
committed by Terje Bergstrom
parent 04e45bc943
commit c651adbeaa
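The commit message only names the new ioctls; the sketch below shows how a userspace debugger process might drive them once a dbg-gpu session is already open and bound to a channel. It is a minimal, hypothetical example: the device node path, the uapi header include path, and the prior session/channel setup are assumptions, not something this change defines.

/*
 * Hypothetical userspace sketch for the two new ioctls.
 * Assumed (not part of this commit): the dbg-gpu session device node path,
 * the install path of the nvgpu uapi header, and that the session is
 * already bound to a channel.
 */
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/nvgpu.h>	/* assumed uapi header location */

int main(void)
{
	/* assumed device node for a dbg-gpu session; platform dependent */
	int fd = open("/dev/nvhost-dbg-gpu", O_RDWR);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Clear the recorded error state of SM 0. */
	struct nvgpu_dbg_gpu_clear_single_sm_error_state_args clear_args;
	memset(&clear_args, 0, sizeof(clear_args));
	clear_args.sm_id = 0;
	if (ioctl(fd, NVGPU_DBG_GPU_IOCTL_CLEAR_SINGLE_SM_ERROR_STATE, &clear_args))
		perror("clear sm error state");

	/* Write a new error-state record for SM 0 (here: only the report masks). */
	struct nvgpu_dbg_gpu_sm_error_state_record record;
	memset(&record, 0, sizeof(record));
	record.hww_global_esr_report_mask = ~0u;
	record.hww_warp_esr_report_mask = ~0u;

	struct nvgpu_dbg_gpu_write_single_sm_error_state_args write_args;
	memset(&write_args, 0, sizeof(write_args));
	write_args.sm_id = 0;
	write_args.sm_error_state_record_mem = (uintptr_t)&record;
	write_args.sm_error_state_record_size = sizeof(record);
	if (ioctl(fd, NVGPU_DBG_GPU_IOCTL_WRITE_SINGLE_SM_ERROR_STATE, &write_args))
		perror("write sm error state");

	close(fd);
	return 0;
}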
@@ -564,6 +564,86 @@ static int nvgpu_dbg_gpu_ioctl_read_single_sm_error_state(
	return 0;
}

static int nvgpu_dbg_gpu_ioctl_clear_single_sm_error_state(
		struct dbg_session_gk20a *dbg_s,
		struct nvgpu_dbg_gpu_clear_single_sm_error_state_args *args)
{
	struct gk20a *g = get_gk20a(dbg_s->dev);
	struct gr_gk20a *gr = &g->gr;
	u32 sm_id;
	struct channel_gk20a *ch = dbg_s->ch;
	int err = 0;

	sm_id = args->sm_id;

	if (sm_id >= gr->no_of_sm)
		return -EINVAL;

	err = gk20a_busy(g->dev);
	if (err)
		return err;

	err = gr_gk20a_elpg_protected_call(g,
			g->ops.gr.clear_sm_error_state(g, ch, sm_id));

	gk20a_idle(g->dev);

	return err;
}

static int nvgpu_dbg_gpu_ioctl_write_single_sm_error_state(
		struct dbg_session_gk20a *dbg_s,
		struct nvgpu_dbg_gpu_write_single_sm_error_state_args *args)
{
	struct gk20a *g = get_gk20a(dbg_s->dev);
	struct gr_gk20a *gr = &g->gr;
	u32 sm_id;
	struct channel_gk20a *ch = dbg_s->ch;
	struct nvgpu_dbg_gpu_sm_error_state_record *sm_error_state;
	int err = 0;

	sm_id = args->sm_id;
	if (sm_id >= gr->no_of_sm)
		return -EINVAL;

	sm_error_state = kzalloc(sizeof(*sm_error_state), GFP_KERNEL);
	if (!sm_error_state)
		return -ENOMEM;

	if (args->sm_error_state_record_size > 0) {
		size_t read_size = sizeof(*sm_error_state);

		if (read_size > args->sm_error_state_record_size)
			read_size = args->sm_error_state_record_size;

		mutex_lock(&g->dbg_sessions_lock);
		err = copy_from_user(sm_error_state,
				(void __user *)(uintptr_t)
					args->sm_error_state_record_mem,
				read_size);
		mutex_unlock(&g->dbg_sessions_lock);
		if (err) {
			err = -ENOMEM;
			goto err_free;
		}
	}

	err = gk20a_busy(g->dev);
	if (err)
		goto err_free;

	err = gr_gk20a_elpg_protected_call(g,
			g->ops.gr.update_sm_error_state(g, ch,
					sm_id, sm_error_state));

	gk20a_idle(g->dev);

err_free:
	kfree(sm_error_state);

	return err;
}

long gk20a_dbg_gpu_dev_ioctl(struct file *filp, unsigned int cmd,
		unsigned long arg)
{
@@ -666,6 +746,16 @@ long gk20a_dbg_gpu_dev_ioctl(struct file *filp, unsigned int cmd,
			(struct nvgpu_dbg_gpu_read_single_sm_error_state_args *)buf);
		break;

	case NVGPU_DBG_GPU_IOCTL_CLEAR_SINGLE_SM_ERROR_STATE:
		err = nvgpu_dbg_gpu_ioctl_clear_single_sm_error_state(dbg_s,
			(struct nvgpu_dbg_gpu_clear_single_sm_error_state_args *)buf);
		break;

	case NVGPU_DBG_GPU_IOCTL_WRITE_SINGLE_SM_ERROR_STATE:
		err = nvgpu_dbg_gpu_ioctl_write_single_sm_error_state(dbg_s,
			(struct nvgpu_dbg_gpu_write_single_sm_error_state_args *)buf);
		break;

	default:
		gk20a_err(dev_from_gk20a(g),
			"unrecognized dbg gpu ioctl cmd: 0x%x",
@@ -242,6 +242,12 @@ struct gpu_ops {
		u32 (*get_lrf_tex_ltc_dram_override)(struct gk20a *g);
		int (*record_sm_error_state)(struct gk20a *g,
				u32 gpc, u32 tpc);
		int (*update_sm_error_state)(struct gk20a *g,
				struct channel_gk20a *ch, u32 sm_id,
				struct nvgpu_dbg_gpu_sm_error_state_record *
							sm_error_state);
		int (*clear_sm_error_state)(struct gk20a *g,
				struct channel_gk20a *ch, u32 sm_id);
	} gr;
	const char *name;
	struct {
@@ -5535,6 +5535,111 @@ static int gk20a_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc)
	return 0;
}

static int gk20a_gr_update_sm_error_state(struct gk20a *g,
		struct channel_gk20a *ch, u32 sm_id,
		struct nvgpu_dbg_gpu_sm_error_state_record *sm_error_state)
{
	u32 gpc, tpc, offset;
	struct gr_gk20a *gr = &g->gr;
	struct channel_ctx_gk20a *ch_ctx = &ch->ch_ctx;
	int err = 0;

	mutex_lock(&g->dbg_sessions_lock);

	gr->sm_error_states[sm_id].hww_global_esr =
			sm_error_state->hww_global_esr;
	gr->sm_error_states[sm_id].hww_warp_esr =
			sm_error_state->hww_warp_esr;
	gr->sm_error_states[sm_id].hww_global_esr_report_mask =
			sm_error_state->hww_global_esr_report_mask;
	gr->sm_error_states[sm_id].hww_warp_esr_report_mask =
			sm_error_state->hww_warp_esr_report_mask;

	err = gr_gk20a_disable_ctxsw(g);
	if (err) {
		gk20a_err(dev_from_gk20a(g), "unable to stop gr ctxsw\n");
		goto fail;
	}

	gpc = g->gr.sm_to_cluster[sm_id].gpc_index;
	tpc = g->gr.sm_to_cluster[sm_id].tpc_index;

	offset = proj_gpc_stride_v() * gpc +
		 proj_tpc_in_gpc_stride_v() * tpc;

	if (gk20a_is_channel_ctx_resident(ch)) {
		gk20a_writel(g, gr_gpc0_tpc0_sm_hww_global_esr_r() + offset,
				gr->sm_error_states[sm_id].hww_global_esr);
		gk20a_writel(g, gr_gpc0_tpc0_sm_hww_warp_esr_r() + offset,
				gr->sm_error_states[sm_id].hww_warp_esr);
		gk20a_writel(g, gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r() + offset,
				gr->sm_error_states[sm_id].hww_global_esr_report_mask);
		gk20a_writel(g, gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r() + offset,
				gr->sm_error_states[sm_id].hww_warp_esr_report_mask);
	} else {
		err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
		if (err)
			goto enable_ctxsw;

		gr_gk20a_ctx_patch_write(g, ch_ctx,
				gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r() + offset,
				gr->sm_error_states[sm_id].hww_global_esr_report_mask,
				true);
		gr_gk20a_ctx_patch_write(g, ch_ctx,
				gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r() + offset,
				gr->sm_error_states[sm_id].hww_warp_esr_report_mask,
				true);

		gr_gk20a_ctx_patch_write_end(g, ch_ctx);
	}

enable_ctxsw:
	err = gr_gk20a_enable_ctxsw(g);

fail:
	mutex_unlock(&g->dbg_sessions_lock);
	return err;
}

static int gk20a_gr_clear_sm_error_state(struct gk20a *g,
		struct channel_gk20a *ch, u32 sm_id)
{
	u32 gpc, tpc, offset;
	u32 val;
	struct gr_gk20a *gr = &g->gr;
	int err = 0;

	mutex_lock(&g->dbg_sessions_lock);

	memset(&gr->sm_error_states[sm_id], 0, sizeof(*gr->sm_error_states));

	err = gr_gk20a_disable_ctxsw(g);
	if (err) {
		gk20a_err(dev_from_gk20a(g), "unable to stop gr ctxsw\n");
		goto fail;
	}

	if (gk20a_is_channel_ctx_resident(ch)) {
		gpc = g->gr.sm_to_cluster[sm_id].gpc_index;
		tpc = g->gr.sm_to_cluster[sm_id].tpc_index;

		offset = proj_gpc_stride_v() * gpc +
			 proj_tpc_in_gpc_stride_v() * tpc;

		val = gk20a_readl(g, gr_gpc0_tpc0_sm_hww_global_esr_r() + offset);
		gk20a_writel(g, gr_gpc0_tpc0_sm_hww_global_esr_r() + offset,
				val);
		gk20a_writel(g, gr_gpc0_tpc0_sm_hww_warp_esr_r() + offset,
				0);
	}

	err = gr_gk20a_enable_ctxsw(g);

fail:
	mutex_unlock(&g->dbg_sessions_lock);
	return err;
}

int gr_gk20a_handle_sm_exception(struct gk20a *g, u32 gpc, u32 tpc,
		bool *post_event, struct channel_gk20a *fault_ch)
{
@@ -8415,4 +8520,6 @@ void gk20a_init_gr_ops(struct gpu_ops *gops)
	gops->gr.update_smpc_ctxsw_mode = gr_gk20a_update_smpc_ctxsw_mode;
	gops->gr.update_hwpm_ctxsw_mode = gr_gk20a_update_hwpm_ctxsw_mode;
	gops->gr.record_sm_error_state = gk20a_gr_record_sm_error_state;
	gops->gr.update_sm_error_state = gk20a_gr_update_sm_error_state;
	gops->gr.clear_sm_error_state = gk20a_gr_clear_sm_error_state;
}
@@ -1219,6 +1219,115 @@ static int gm20b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc)
	return 0;
}

static int gm20b_gr_update_sm_error_state(struct gk20a *g,
		struct channel_gk20a *ch, u32 sm_id,
		struct nvgpu_dbg_gpu_sm_error_state_record *sm_error_state)
{
	u32 gpc, tpc, offset;
	struct gr_gk20a *gr = &g->gr;
	struct channel_ctx_gk20a *ch_ctx = &ch->ch_ctx;
	int err = 0;

	mutex_lock(&g->dbg_sessions_lock);

	gr->sm_error_states[sm_id].hww_global_esr =
			sm_error_state->hww_global_esr;
	gr->sm_error_states[sm_id].hww_warp_esr =
			sm_error_state->hww_warp_esr;
	gr->sm_error_states[sm_id].hww_warp_esr_pc =
			sm_error_state->hww_warp_esr_pc;
	gr->sm_error_states[sm_id].hww_global_esr_report_mask =
			sm_error_state->hww_global_esr_report_mask;
	gr->sm_error_states[sm_id].hww_warp_esr_report_mask =
			sm_error_state->hww_warp_esr_report_mask;

	err = gr_gk20a_disable_ctxsw(g);
	if (err) {
		gk20a_err(dev_from_gk20a(g), "unable to stop gr ctxsw\n");
		goto fail;
	}

	gpc = g->gr.sm_to_cluster[sm_id].gpc_index;
	tpc = g->gr.sm_to_cluster[sm_id].tpc_index;

	offset = proj_gpc_stride_v() * gpc +
		 proj_tpc_in_gpc_stride_v() * tpc;

	if (gk20a_is_channel_ctx_resident(ch)) {
		gk20a_writel(g, gr_gpc0_tpc0_sm_hww_global_esr_r() + offset,
				gr->sm_error_states[sm_id].hww_global_esr);
		gk20a_writel(g, gr_gpc0_tpc0_sm_hww_warp_esr_r() + offset,
				gr->sm_error_states[sm_id].hww_warp_esr);
		gk20a_writel(g, gr_gpc0_tpc0_sm_hww_warp_esr_pc_r() + offset,
				gr->sm_error_states[sm_id].hww_warp_esr_pc);
		gk20a_writel(g, gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r() + offset,
				gr->sm_error_states[sm_id].hww_global_esr_report_mask);
		gk20a_writel(g, gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r() + offset,
				gr->sm_error_states[sm_id].hww_warp_esr_report_mask);
	} else {
		err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
		if (err)
			goto enable_ctxsw;

		gr_gk20a_ctx_patch_write(g, ch_ctx,
				gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r() + offset,
				gr->sm_error_states[sm_id].hww_global_esr_report_mask,
				true);
		gr_gk20a_ctx_patch_write(g, ch_ctx,
				gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r() + offset,
				gr->sm_error_states[sm_id].hww_warp_esr_report_mask,
				true);

		gr_gk20a_ctx_patch_write_end(g, ch_ctx);
	}

enable_ctxsw:
	err = gr_gk20a_enable_ctxsw(g);

fail:
	mutex_unlock(&g->dbg_sessions_lock);
	return err;
}

static int gm20b_gr_clear_sm_error_state(struct gk20a *g,
		struct channel_gk20a *ch, u32 sm_id)
{
	u32 gpc, tpc, offset;
	u32 val;
	struct gr_gk20a *gr = &g->gr;
	int err = 0;

	mutex_lock(&g->dbg_sessions_lock);

	memset(&gr->sm_error_states[sm_id], 0, sizeof(*gr->sm_error_states));

	err = gr_gk20a_disable_ctxsw(g);
	if (err) {
		gk20a_err(dev_from_gk20a(g), "unable to stop gr ctxsw\n");
		goto fail;
	}

	if (gk20a_is_channel_ctx_resident(ch)) {
		gpc = g->gr.sm_to_cluster[sm_id].gpc_index;
		tpc = g->gr.sm_to_cluster[sm_id].tpc_index;

		offset = proj_gpc_stride_v() * gpc +
			 proj_tpc_in_gpc_stride_v() * tpc;

		val = gk20a_readl(g, gr_gpc0_tpc0_sm_hww_global_esr_r() + offset);
		gk20a_writel(g, gr_gpc0_tpc0_sm_hww_global_esr_r() + offset,
				val);
		gk20a_writel(g, gr_gpc0_tpc0_sm_hww_warp_esr_r() + offset,
				0);
	}

	err = gr_gk20a_enable_ctxsw(g);

fail:
	mutex_unlock(&g->dbg_sessions_lock);
	return err;
}

void gm20b_init_gr(struct gpu_ops *gops)
{
	gops->gr.init_gpc_mmu = gr_gm20b_init_gpc_mmu;
@@ -1286,4 +1395,6 @@ void gm20b_init_gr(struct gpu_ops *gops)
	gops->gr.update_smpc_ctxsw_mode = gr_gk20a_update_smpc_ctxsw_mode;
	gops->gr.update_hwpm_ctxsw_mode = gr_gk20a_update_hwpm_ctxsw_mode;
	gops->gr.record_sm_error_state = gm20b_gr_record_sm_error_state;
	gops->gr.update_sm_error_state = gm20b_gr_update_sm_error_state;
	gops->gr.clear_sm_error_state = gm20b_gr_clear_sm_error_state;
}
@@ -703,8 +703,28 @@ struct nvgpu_dbg_gpu_read_single_sm_error_state_args {
	_IOWR(NVGPU_DBG_GPU_IOCTL_MAGIC, 14, struct nvgpu_dbg_gpu_read_single_sm_error_state_args)


struct nvgpu_dbg_gpu_clear_single_sm_error_state_args {
	__u32 sm_id;
	__u32 padding;
};

#define NVGPU_DBG_GPU_IOCTL_CLEAR_SINGLE_SM_ERROR_STATE	\
	_IOW(NVGPU_DBG_GPU_IOCTL_MAGIC, 15, struct nvgpu_dbg_gpu_clear_single_sm_error_state_args)


struct nvgpu_dbg_gpu_write_single_sm_error_state_args {
	__u32 sm_id;
	__u32 padding;
	__u64 sm_error_state_record_mem;
	__u64 sm_error_state_record_size;
};

#define NVGPU_DBG_GPU_IOCTL_WRITE_SINGLE_SM_ERROR_STATE	\
	_IOW(NVGPU_DBG_GPU_IOCTL_MAGIC, 16, struct nvgpu_dbg_gpu_write_single_sm_error_state_args)


#define NVGPU_DBG_GPU_IOCTL_LAST		\
	_IOC_NR(NVGPU_DBG_GPU_IOCTL_READ_SINGLE_SM_ERROR_STATE)
	_IOC_NR(NVGPU_DBG_GPU_IOCTL_WRITE_SINGLE_SM_ERROR_STATE)

#define NVGPU_DBG_GPU_IOCTL_MAX_ARG_SIZE		\
	sizeof(struct nvgpu_dbg_gpu_perfbuf_map_args)