Mirror of git://nv-tegra.nvidia.com/linux-nvgpu.git (synced 2025-12-22 17:36:20 +03:00)
gpu: nvgpu: Linux specific sm_error_state_record
Create an nvgpu internal nvgpu_gr_sm_error_state to store and propagate
SM error state within driver. Use nvgpu_dbg_gpu_sm_error_state_record
only in Linux code.

JIRA NVGPU-259

Change-Id: I7365cdf5a1a42cbcdb418dfcef3e0020e02a960f
Signed-off-by: Terje Bergstrom <tbergstrom@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1585645
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Commit 9eebb7831f (parent 34ce21a588), committed by mobile promotions
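The change is easiest to read as a translation layer at the ioctl boundary: the Linux UAPI struct nvgpu_dbg_gpu_sm_error_state_record is still what crosses copy_from_user()/copy_to_user(), while the rest of the driver (the gr->sm_error_states array and the g->ops.gr.update_sm_error_state HAL hook) now passes the new internal struct nvgpu_gr_sm_error_state. Below is a minimal, self-contained sketch of that split. The helper functions nvgpu_record_to_sm_error_state() and nvgpu_sm_error_state_to_record() are illustrative only (the patch open-codes these field copies inside the ioctl handlers); the u32/u64 typedefs stand in for the kernel's fixed-width types, and only the UAPI record fields this patch touches are shown.

/* Sketch of the UAPI/internal split introduced by this commit. */
#include <stdint.h>

typedef uint32_t u32;   /* stand-ins for the kernel's fixed-width types */
typedef uint64_t u64;

/* Linux UAPI record: only the ioctl code should see this layout.
 * Only the fields touched by this patch are shown here. */
struct nvgpu_dbg_gpu_sm_error_state_record {
        u32 hww_global_esr;
        u32 hww_warp_esr;
        u64 hww_warp_esr_pc;
        u32 hww_global_esr_report_mask;
        u32 hww_warp_esr_report_mask;
};

/* Driver-internal state added by this patch (see the header hunk in the diff below). */
struct nvgpu_gr_sm_error_state {
        u32 hww_global_esr;
        u32 hww_warp_esr;
        u64 hww_warp_esr_pc;
        u32 hww_global_esr_report_mask;
        u32 hww_warp_esr_report_mask;
};

/* UAPI -> internal, the direction used by the write-state ioctl.
 * Hypothetical helper: the patch open-codes these assignments. */
void nvgpu_record_to_sm_error_state(
                const struct nvgpu_dbg_gpu_sm_error_state_record *rec,
                struct nvgpu_gr_sm_error_state *state)
{
        state->hww_global_esr = rec->hww_global_esr;
        state->hww_warp_esr = rec->hww_warp_esr;
        state->hww_warp_esr_pc = rec->hww_warp_esr_pc;
        state->hww_global_esr_report_mask = rec->hww_global_esr_report_mask;
        state->hww_warp_esr_report_mask = rec->hww_warp_esr_report_mask;
}

/* Internal -> UAPI, the direction used by the read-state ioctl. */
void nvgpu_sm_error_state_to_record(
                const struct nvgpu_gr_sm_error_state *state,
                struct nvgpu_dbg_gpu_sm_error_state_record *rec)
{
        rec->hww_global_esr = state->hww_global_esr;
        rec->hww_warp_esr = state->hww_warp_esr;
        rec->hww_warp_esr_pc = state->hww_warp_esr_pc;
        rec->hww_global_esr_report_mask = state->hww_global_esr_report_mask;
        rec->hww_warp_esr_report_mask = state->hww_warp_esr_report_mask;
}

The point of the split, per the commit message, is that everything below the ioctl layer can describe per-SM error state without pulling in the Linux dbg-gpu UAPI type.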
@@ -239,7 +239,8 @@ static int nvgpu_dbg_gpu_ioctl_write_single_sm_error_state(
         struct gr_gk20a *gr = &g->gr;
         u32 sm_id;
         struct channel_gk20a *ch;
-        struct nvgpu_dbg_gpu_sm_error_state_record *sm_error_state;
+        struct nvgpu_dbg_gpu_sm_error_state_record sm_error_state_record;
+        struct nvgpu_gr_sm_error_state sm_error_state;
         int err = 0;
 
         ch = nvgpu_dbg_gpu_get_session_channel(dbg_s);
@@ -250,41 +251,43 @@ static int nvgpu_dbg_gpu_ioctl_write_single_sm_error_state(
         if (sm_id >= gr->no_of_sm)
                 return -EINVAL;
 
-        sm_error_state = nvgpu_kzalloc(g, sizeof(*sm_error_state));
-        if (!sm_error_state)
-                return -ENOMEM;
-
         if (args->sm_error_state_record_size > 0) {
-                size_t read_size = sizeof(*sm_error_state);
+                size_t read_size = sizeof(sm_error_state_record);
 
                 if (read_size > args->sm_error_state_record_size)
                         read_size = args->sm_error_state_record_size;
 
                 nvgpu_mutex_acquire(&g->dbg_sessions_lock);
-                err = copy_from_user(sm_error_state,
+                err = copy_from_user(&sm_error_state_record,
                                 (void __user *)(uintptr_t)
                                 args->sm_error_state_record_mem,
                                 read_size);
                 nvgpu_mutex_release(&g->dbg_sessions_lock);
-                if (err) {
-                        err = -ENOMEM;
-                        goto err_free;
-                }
+                if (err)
+                        return -ENOMEM;
         }
 
         err = gk20a_busy(g);
         if (err)
-                goto err_free;
+                return err;
 
+        sm_error_state.hww_global_esr =
+                        sm_error_state_record.hww_global_esr;
+        sm_error_state.hww_warp_esr =
+                        sm_error_state_record.hww_warp_esr;
+        sm_error_state.hww_warp_esr_pc =
+                        sm_error_state_record.hww_warp_esr_pc;
+        sm_error_state.hww_global_esr_report_mask =
+                        sm_error_state_record.hww_global_esr_report_mask;
+        sm_error_state.hww_warp_esr_report_mask =
+                        sm_error_state_record.hww_warp_esr_report_mask;
+
         err = gr_gk20a_elpg_protected_call(g,
                         g->ops.gr.update_sm_error_state(g, ch,
-                                        sm_id, sm_error_state));
+                                        sm_id, &sm_error_state));
 
         gk20a_idle(g);
 
-err_free:
-        nvgpu_kfree(g, sm_error_state);
-
         return err;
 }
 
@@ -295,7 +298,8 @@ static int nvgpu_dbg_gpu_ioctl_read_single_sm_error_state(
 {
         struct gk20a *g = dbg_s->g;
         struct gr_gk20a *gr = &g->gr;
-        struct nvgpu_dbg_gpu_sm_error_state_record *sm_error_state;
+        struct nvgpu_gr_sm_error_state *sm_error_state;
+        struct nvgpu_dbg_gpu_sm_error_state_record sm_error_state_record;
         u32 sm_id;
         int err = 0;
 
@@ -304,6 +308,16 @@ static int nvgpu_dbg_gpu_ioctl_read_single_sm_error_state(
                 return -EINVAL;
 
         sm_error_state = gr->sm_error_states + sm_id;
+        sm_error_state_record.hww_global_esr =
+                        sm_error_state->hww_global_esr;
+        sm_error_state_record.hww_warp_esr =
+                        sm_error_state->hww_warp_esr;
+        sm_error_state_record.hww_warp_esr_pc =
+                        sm_error_state->hww_warp_esr_pc;
+        sm_error_state_record.hww_global_esr_report_mask =
+                        sm_error_state->hww_global_esr_report_mask;
+        sm_error_state_record.hww_warp_esr_report_mask =
+                        sm_error_state->hww_warp_esr_report_mask;
 
         if (args->sm_error_state_record_size > 0) {
                 size_t write_size = sizeof(*sm_error_state);
@@ -314,7 +328,7 @@ static int nvgpu_dbg_gpu_ioctl_read_single_sm_error_state(
                 nvgpu_mutex_acquire(&g->dbg_sessions_lock);
                 err = copy_to_user((void __user *)(uintptr_t)
                                 args->sm_error_state_record_mem,
-                                sm_error_state,
+                                &sm_error_state_record,
                                 write_size);
                 nvgpu_mutex_release(&g->dbg_sessions_lock);
                 if (err) {
@@ -361,8 +361,7 @@ struct gpu_ops {
                                 u32 gpc, u32 tpc);
                 int (*update_sm_error_state)(struct gk20a *g,
                                 struct channel_gk20a *ch, u32 sm_id,
-                                struct nvgpu_dbg_gpu_sm_error_state_record *
-                                sm_error_state);
+                                struct nvgpu_gr_sm_error_state *sm_error_state);
                 int (*clear_sm_error_state)(struct gk20a *g,
                                 struct channel_gk20a *ch, u32 sm_id);
                 int (*suspend_contexts)(struct gk20a *g,
@@ -1543,7 +1543,7 @@ restore_fe_go_idle:
          * we initialize gr->no_of_sm in this function
          */
         gr->sm_error_states = nvgpu_kzalloc(g,
-                        sizeof(struct nvgpu_dbg_gpu_sm_error_state_record)
+                        sizeof(struct nvgpu_gr_sm_error_state)
                         * gr->no_of_sm);
         if (!gr->sm_error_states) {
                 err = -ENOMEM;
@@ -4566,7 +4566,7 @@ restore_fe_go_idle:
          * we initialize gr->no_of_sm in this function
          */
         gr->sm_error_states = nvgpu_kzalloc(g,
-                        sizeof(struct nvgpu_dbg_gpu_sm_error_state_record) *
+                        sizeof(struct nvgpu_gr_sm_error_state) *
                        gr->no_of_sm);
         if (!gr->sm_error_states) {
                 err = -ENOMEM;
@@ -231,6 +231,14 @@ struct nvgpu_preemption_modes_rec {
         u32 default_compute_preempt_mode; /* default mode */
 };
 
+struct nvgpu_gr_sm_error_state {
+        u32 hww_global_esr;
+        u32 hww_warp_esr;
+        u64 hww_warp_esr_pc;
+        u32 hww_global_esr_report_mask;
+        u32 hww_warp_esr_report_mask;
+};
+
 struct gr_gk20a {
         struct gk20a *g;
         struct {
@@ -387,7 +395,7 @@ struct gr_gk20a {
         u32 *fbp_rop_l2_en_mask;
         u32 no_of_sm;
         struct sm_info *sm_to_cluster;
-        struct nvgpu_dbg_gpu_sm_error_state_record *sm_error_states;
+        struct nvgpu_gr_sm_error_state *sm_error_states;
 #if defined(CONFIG_GK20A_CYCLE_STATS)
         struct nvgpu_mutex cs_lock;
         struct gk20a_cs_snapshot *cs_data;
@@ -1297,7 +1297,7 @@ int gm20b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc)
 
 int gm20b_gr_update_sm_error_state(struct gk20a *g,
                 struct channel_gk20a *ch, u32 sm_id,
-                struct nvgpu_dbg_gpu_sm_error_state_record *sm_error_state)
+                struct nvgpu_gr_sm_error_state *sm_error_state)
 {
         u32 gpc, tpc, offset;
         struct gr_gk20a *gr = &g->gr;
@@ -119,7 +119,7 @@ void gr_gm20b_get_access_map(struct gk20a *g,
 int gm20b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc);
 int gm20b_gr_update_sm_error_state(struct gk20a *g,
                 struct channel_gk20a *ch, u32 sm_id,
-                struct nvgpu_dbg_gpu_sm_error_state_record *sm_error_state);
+                struct nvgpu_gr_sm_error_state *sm_error_state);
 int gm20b_gr_clear_sm_error_state(struct gk20a *g,
                 struct channel_gk20a *ch, u32 sm_id);
 int gr_gm20b_get_preemption_mode_flags(struct gk20a *g,
@@ -899,7 +899,7 @@ static int vgpu_gr_init_gr_setup_sw(struct gk20a *g)
         nvgpu_mutex_init(&gr->ctx_mutex);
 
         gr->sm_error_states = nvgpu_kzalloc(g,
-                        sizeof(struct nvgpu_dbg_gpu_sm_error_state_record) *
+                        sizeof(struct nvgpu_gr_sm_error_state) *
                         gr->no_of_sm);
         if (!gr->sm_error_states) {
                 err = -ENOMEM;
@@ -1195,7 +1195,7 @@ int vgpu_gr_resume_contexts(struct gk20a *g,
 void vgpu_gr_handle_sm_esr_event(struct gk20a *g,
                         struct tegra_vgpu_sm_esr_info *info)
 {
-        struct nvgpu_dbg_gpu_sm_error_state_record *sm_error_states;
+        struct nvgpu_gr_sm_error_state *sm_error_states;
 
         if (info->sm_id >= g->gr.no_of_sm) {
                 nvgpu_err(g, "invalid smd_id %d / %d",
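As a quick usage example, the snippet below builds on the stand-in definitions from the sketch near the top of this page and models the read direction only: a per-SM array sized the way the nvgpu_kzalloc() hunks above size gr->sm_error_states, indexed by sm_id, then translated into the UAPI record that copy_to_user() would hand back. The no_of_sm, sm_id, and ESR values are arbitrary placeholders, not values from the patch.

/* Usage sketch (userspace-style, for illustration only): reuses the
 * struct and helper definitions from the earlier sketch. */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
        unsigned int no_of_sm = 4, sm_id = 2;   /* placeholder values */
        struct nvgpu_gr_sm_error_state *states;
        struct nvgpu_dbg_gpu_sm_error_state_record rec;

        /* Mirrors: gr->sm_error_states = nvgpu_kzalloc(g,
         *     sizeof(struct nvgpu_gr_sm_error_state) * gr->no_of_sm); */
        states = calloc(no_of_sm, sizeof(*states));
        if (states == NULL)
                return 1;

        /* Pretend SM 2 reported a warp error at some PC. */
        states[sm_id].hww_warp_esr = 0x1u;
        states[sm_id].hww_warp_esr_pc = 0x1000u;

        /* Same field-by-field copy the read ioctl now performs
         * before copy_to_user(). */
        nvgpu_sm_error_state_to_record(&states[sm_id], &rec);
        printf("SM %u: warp esr 0x%x at pc 0x%llx\n", sm_id,
               (unsigned int)rec.hww_warp_esr,
               (unsigned long long)rec.hww_warp_esr_pc);

        free(states);
        return 0;
}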