gpu: nvgpu: Read sm error ioctl support for tsg

Add READ_SM_ERROR IOCTL support at the TSG level.
Move the struct that saves the sm_error details
from gr to tsg, since SM error support is context
based rather than global.

Also correct a MISRA 21.1 violation in the header file.

The nvgpu_dbg_gpu_ioctl_write_single_sm_error_state and
nvgpu_dbg_gpu_ioctl_read_single_sm_error_state
functions are modified to use the TSG struct
nvgpu_tsg_sm_error_state.
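
For illustration only, a minimal userspace sketch of the new TSG-level
read path follows. It assumes tsg_fd is an already-open TSG file
descriptor and that the nvgpu UAPI header exporting
NVGPU_TSG_IOCTL_READ_SINGLE_SM_ERROR_STATE is installed as
<linux/nvgpu.h>; the helper name is hypothetical.

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/nvgpu.h>	/* assumed install path of the nvgpu TSG UAPI */

/* Hypothetical helper: read one SM's error state through a TSG fd. */
int read_tsg_sm_error_state(int tsg_fd, uint32_t sm_id)
{
	struct nvgpu_tsg_sm_error_state_record record;
	struct nvgpu_tsg_read_single_sm_error_state_args args;

	memset(&record, 0, sizeof(record));
	memset(&args, 0, sizeof(args));

	args.sm_id = sm_id;
	args.record_mem = (uint64_t)(uintptr_t)&record;
	args.record_size = sizeof(record);

	/* The kernel copies at most record_size bytes of the record back. */
	if (ioctl(tsg_fd, NVGPU_TSG_IOCTL_READ_SINGLE_SM_ERROR_STATE, &args) != 0) {
		perror("NVGPU_TSG_IOCTL_READ_SINGLE_SM_ERROR_STATE");
		return -1;
	}

	printf("sm %u: global_esr=0x%x warp_esr=0x%x warp_esr_pc=0x%llx\n",
	       sm_id, record.global_esr, record.warp_esr,
	       (unsigned long long)record.warp_esr_pc);
	return 0;
}

The same information was previously exposed through the control-node
NVGPU_GPU_IOCTL_READ_SINGLE_SM_ERROR_STATE ioctl, which this change
removes.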

Bug 200412642

Change-Id: I9e334b059078a4bb0e360b945444cc4bf1cc56ec
Signed-off-by: Vinod G <vinodg@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1794856
Reviewed-by: svc-misra-checker <svc-misra-checker@nvidia.com>
GVS: Gerrit_Virtual_Submit
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Author: Vinod G
Date: 2018-08-07 23:09:30 -07:00
Committed by: mobile promotions
Parent: 3bd47da095
Commit: bfe65407bd
15 changed files with 387 additions and 258 deletions

View File

@@ -396,7 +396,7 @@ struct gpu_ops {
u32 sm, struct channel_gk20a *fault_ch);
int (*update_sm_error_state)(struct gk20a *g,
struct channel_gk20a *ch, u32 sm_id,
struct nvgpu_gr_sm_error_state *sm_error_state);
struct nvgpu_tsg_sm_error_state *sm_error_state);
int (*clear_sm_error_state)(struct gk20a *g,
struct channel_gk20a *ch, u32 sm_id);
int (*suspend_contexts)(struct gk20a *g,

View File

@@ -1561,19 +1561,6 @@ restore_fe_go_idle:
if (err)
goto clean_up;
nvgpu_kfree(g, gr->sm_error_states);
/* we need to allocate this after g->ops.gr.init_fs_state() since
* we initialize gr->no_of_sm in this function
*/
gr->sm_error_states = nvgpu_kzalloc(g,
sizeof(struct nvgpu_gr_sm_error_state)
* gr->no_of_sm);
if (!gr->sm_error_states) {
err = -ENOMEM;
goto restore_fe_go_idle;
}
ctx_header_words = roundup(ctx_header_bytes, sizeof(u32));
ctx_header_words >>= 2;
@@ -3072,7 +3059,6 @@ static void gk20a_remove_gr_support(struct gr_gk20a *gr)
memset(&gr->compbit_store, 0, sizeof(struct compbit_store_desc));
nvgpu_kfree(g, gr->sm_error_states);
nvgpu_kfree(g, gr->gpc_tpc_count);
nvgpu_kfree(g, gr->gpc_zcb_count);
nvgpu_kfree(g, gr->gpc_ppc_count);
@@ -4545,22 +4531,6 @@ restore_fe_go_idle:
err = gr_gk20a_wait_idle(g, gk20a_get_gr_idle_timeout(g),
GR_IDLE_CHECK_DEFAULT);
if (err)
goto out;
nvgpu_kfree(g, gr->sm_error_states);
/* we need to allocate this after g->ops.gr.init_fs_state() since
* we initialize gr->no_of_sm in this function
*/
gr->sm_error_states = nvgpu_kzalloc(g,
sizeof(struct nvgpu_gr_sm_error_state) *
gr->no_of_sm);
if (!gr->sm_error_states) {
err = -ENOMEM;
goto restore_fe_go_idle;
}
out:
nvgpu_log_fn(g, "done");
return err;

View File

@@ -254,14 +254,6 @@ struct nvgpu_preemption_modes_rec {
u32 default_compute_preempt_mode; /* default mode */
};
struct nvgpu_gr_sm_error_state {
u32 hww_global_esr;
u32 hww_warp_esr;
u64 hww_warp_esr_pc;
u32 hww_global_esr_report_mask;
u32 hww_warp_esr_report_mask;
};
struct gr_gk20a {
struct gk20a *g;
struct {
@@ -427,7 +419,6 @@ struct gr_gk20a {
u32 *fbp_rop_l2_en_mask;
u32 no_of_sm;
struct sm_info *sm_to_cluster;
struct nvgpu_gr_sm_error_state *sm_error_states;
#define NVGPU_SM_EXCEPTION_TYPE_MASK_NONE (0x0U)
#define NVGPU_SM_EXCEPTION_TYPE_MASK_FATAL (0x1U << 0)

View File

@@ -275,8 +275,23 @@ struct tsg_gk20a *gk20a_tsg_open(struct gk20a *g, pid_t pid)
int err;
tsg = gk20a_tsg_acquire_unused_tsg(&g->fifo);
if (!tsg)
if (tsg == NULL) {
return NULL;
}
/* we need to allocate this after g->ops.gr.init_fs_state() since
* we initialize gr->no_of_sm in this function
*/
if (g->gr.no_of_sm == 0U) {
nvgpu_err(g, "no_of_sm %d not set, failed allocation",
g->gr.no_of_sm);
return NULL;
}
err = gk20a_tsg_alloc_sm_error_states_mem(g, tsg, g->gr.no_of_sm);
if (err != 0) {
return NULL;
}
tsg->g = g;
tsg->num_active_channels = 0;
@@ -295,7 +310,7 @@ struct tsg_gk20a *gk20a_tsg_open(struct gk20a *g, pid_t pid)
if (g->ops.fifo.tsg_open) {
err = g->ops.fifo.tsg_open(tsg);
if (err) {
if (err != 0) {
nvgpu_err(g, "tsg %d fifo open failed %d",
tsg->tsgid, err);
goto clean_up;
@@ -307,6 +322,12 @@ struct tsg_gk20a *gk20a_tsg_open(struct gk20a *g, pid_t pid)
return tsg;
clean_up:
if(tsg->sm_error_states != NULL) {
nvgpu_kfree(g, tsg->sm_error_states);
tsg->sm_error_states = NULL;
}
nvgpu_ref_put(&tsg->refcount, gk20a_tsg_release);
return NULL;
}
@@ -317,20 +338,28 @@ void gk20a_tsg_release(struct nvgpu_ref *ref)
struct gk20a *g = tsg->g;
struct gk20a_event_id_data *event_id_data, *event_id_data_temp;
if (g->ops.fifo.tsg_release)
if (g->ops.fifo.tsg_release != NULL) {
g->ops.fifo.tsg_release(tsg);
}
if (nvgpu_mem_is_valid(&tsg->gr_ctx.mem))
if (nvgpu_mem_is_valid(&tsg->gr_ctx.mem)) {
gr_gk20a_free_tsg_gr_ctx(tsg);
}
if (g->ops.fifo.deinit_eng_method_buffers)
if (g->ops.fifo.deinit_eng_method_buffers != NULL) {
g->ops.fifo.deinit_eng_method_buffers(g, tsg);
}
if (tsg->vm) {
if (tsg->vm != NULL) {
nvgpu_vm_put(tsg->vm);
tsg->vm = NULL;
}
if(tsg->sm_error_states != NULL) {
nvgpu_kfree(g, tsg->sm_error_states);
tsg->sm_error_states = NULL;
}
/* unhook all events created on this TSG */
nvgpu_mutex_acquire(&tsg->event_id_list_lock);
nvgpu_list_for_each_entry_safe(event_id_data, event_id_data_temp,
@@ -360,3 +389,44 @@ struct tsg_gk20a *tsg_gk20a_from_ch(struct channel_gk20a *ch)
return tsg;
}
int gk20a_tsg_alloc_sm_error_states_mem(struct gk20a *g,
struct tsg_gk20a *tsg,
u32 num_sm)
{
int err = 0;
if (tsg->sm_error_states != NULL) {
return err;
}
tsg->sm_error_states = nvgpu_kzalloc(g,
sizeof(struct nvgpu_tsg_sm_error_state)
* num_sm);
if (tsg->sm_error_states == NULL) {
nvgpu_err(g, "sm_error_states mem allocation failed");
err = -ENOMEM;
}
return err;
}
void gk20a_tsg_update_sm_error_state_locked(struct tsg_gk20a *tsg,
u32 sm_id,
struct nvgpu_tsg_sm_error_state *sm_error_state)
{
struct nvgpu_tsg_sm_error_state *tsg_sm_error_states;
tsg_sm_error_states = tsg->sm_error_states + sm_id;
tsg_sm_error_states->hww_global_esr =
sm_error_state->hww_global_esr;
tsg_sm_error_states->hww_warp_esr =
sm_error_state->hww_warp_esr;
tsg_sm_error_states->hww_warp_esr_pc =
sm_error_state->hww_warp_esr_pc;
tsg_sm_error_states->hww_global_esr_report_mask =
sm_error_state->hww_global_esr_report_mask;
tsg_sm_error_states->hww_warp_esr_report_mask =
sm_error_state->hww_warp_esr_report_mask;
}

View File

@@ -19,8 +19,8 @@
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#ifndef __TSG_GK20A_H_
#define __TSG_GK20A_H_
#ifndef TSG_GK20A_H
#define TSG_GK20A_H
#include <nvgpu/lock.h>
#include <nvgpu/kref.h>
@@ -39,6 +39,14 @@ void gk20a_tsg_release(struct nvgpu_ref *ref);
int gk20a_init_tsg_support(struct gk20a *g, u32 tsgid);
struct tsg_gk20a *tsg_gk20a_from_ch(struct channel_gk20a *ch);
struct nvgpu_tsg_sm_error_state {
u32 hww_global_esr;
u32 hww_warp_esr;
u64 hww_warp_esr_pc;
u32 hww_global_esr_report_mask;
u32 hww_warp_esr_report_mask;
};
struct tsg_gk20a {
struct gk20a *g;
@@ -69,6 +77,7 @@ struct tsg_gk20a {
bool tpc_num_initialized;
bool in_use;
struct nvgpu_tsg_sm_error_state *sm_error_states;
};
int gk20a_enable_tsg(struct tsg_gk20a *tsg);
@@ -84,6 +93,12 @@ int gk20a_tsg_set_timeslice(struct tsg_gk20a *tsg, u32 timeslice);
u32 gk20a_tsg_get_timeslice(struct tsg_gk20a *tsg);
int gk20a_tsg_set_priority(struct gk20a *g, struct tsg_gk20a *tsg,
u32 priority);
int gk20a_tsg_alloc_sm_error_states_mem(struct gk20a *g,
struct tsg_gk20a *tsg,
u32 num_sm);
void gk20a_tsg_update_sm_error_state_locked(struct tsg_gk20a *tsg,
u32 sm_id,
struct nvgpu_tsg_sm_error_state *sm_error_state);
struct gk20a_event_id_data {
struct gk20a *g;
@@ -106,4 +121,4 @@ gk20a_event_id_data_from_event_id_node(struct nvgpu_list_node *node)
((uintptr_t)node - offsetof(struct gk20a_event_id_data, event_id_node));
};
#endif /* __TSG_GK20A_H_ */
#endif /* TSG_GK20A_H */

View File

@@ -1268,32 +1268,68 @@ void gr_gm20b_get_access_map(struct gk20a *g,
*num_entries = ARRAY_SIZE(wl_addr_gm20b);
}
static void gm20b_gr_read_sm_error_state(struct gk20a *g,
u32 offset,
struct nvgpu_tsg_sm_error_state *sm_error_states)
{
sm_error_states->hww_global_esr = gk20a_readl(g,
gr_gpc0_tpc0_sm_hww_global_esr_r() + offset);
sm_error_states->hww_warp_esr = gk20a_readl(g,
gr_gpc0_tpc0_sm_hww_warp_esr_r() + offset);
sm_error_states->hww_warp_esr_pc = (u64)(gk20a_readl(g,
gr_gpc0_tpc0_sm_hww_warp_esr_pc_r() + offset));
sm_error_states->hww_global_esr_report_mask = gk20a_readl(g,
gr_gpc0_tpc0_sm_hww_global_esr_report_mask_r() + offset);
sm_error_states->hww_warp_esr_report_mask = gk20a_readl(g,
gr_gpc0_tpc0_sm_hww_warp_esr_report_mask_r() + offset);
}
static void gm20b_gr_write_sm_error_state(struct gk20a *g,
u32 offset,
struct nvgpu_tsg_sm_error_state *sm_error_states)
{
gk20a_writel(g, gr_gpc0_tpc0_sm_hww_global_esr_r() + offset,
sm_error_states->hww_global_esr);
gk20a_writel(g, gr_gpc0_tpc0_sm_hww_warp_esr_r() + offset,
sm_error_states->hww_warp_esr);
gk20a_writel(g, gr_gpc0_tpc0_sm_hww_warp_esr_pc_r() + offset,
u64_lo32(sm_error_states->hww_warp_esr_pc));
gk20a_writel(g, gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r() + offset,
sm_error_states->hww_global_esr_report_mask);
gk20a_writel(g, gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r() + offset,
sm_error_states->hww_warp_esr_report_mask);
}
int gm20b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc, u32 sm,
struct channel_gk20a *fault_ch)
{
int sm_id;
struct gr_gk20a *gr = &g->gr;
u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g,
GPU_LIT_TPC_IN_GPC_STRIDE);
u32 offset = gpc_stride * gpc + tpc_in_gpc_stride * tpc;
struct nvgpu_tsg_sm_error_state *sm_error_states = NULL;
struct tsg_gk20a *tsg = NULL;
nvgpu_mutex_acquire(&g->dbg_sessions_lock);
sm_id = gr_gpc0_tpc0_sm_cfg_sm_id_v(gk20a_readl(g,
gr_gpc0_tpc0_sm_cfg_r() + offset));
gr->sm_error_states[sm_id].hww_global_esr = gk20a_readl(g,
gr_gpc0_tpc0_sm_hww_global_esr_r() + offset);
gr->sm_error_states[sm_id].hww_warp_esr = gk20a_readl(g,
gr_gpc0_tpc0_sm_hww_warp_esr_r() + offset);
gr->sm_error_states[sm_id].hww_warp_esr_pc = gk20a_readl(g,
gr_gpc0_tpc0_sm_hww_warp_esr_pc_r() + offset);
gr->sm_error_states[sm_id].hww_global_esr_report_mask = gk20a_readl(g,
gr_gpc0_tpc0_sm_hww_global_esr_report_mask_r() + offset);
gr->sm_error_states[sm_id].hww_warp_esr_report_mask = gk20a_readl(g,
gr_gpc0_tpc0_sm_hww_warp_esr_report_mask_r() + offset);
if (fault_ch != NULL) {
tsg = tsg_gk20a_from_ch(fault_ch);
}
if (tsg == NULL) {
nvgpu_err(g, "no valid tsg");
goto record_fail;
}
sm_error_states = tsg->sm_error_states + sm_id;
gm20b_gr_read_sm_error_state(g, offset, sm_error_states);
record_fail:
nvgpu_mutex_release(&g->dbg_sessions_lock);
return sm_id;
@@ -1301,12 +1337,12 @@ int gm20b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc, u32 sm,
int gm20b_gr_update_sm_error_state(struct gk20a *g,
struct channel_gk20a *ch, u32 sm_id,
struct nvgpu_gr_sm_error_state *sm_error_state)
struct nvgpu_tsg_sm_error_state *sm_error_state)
{
u32 gpc, tpc, offset;
struct gr_gk20a *gr = &g->gr;
struct tsg_gk20a *tsg;
struct nvgpu_gr_ctx *ch_ctx;
struct nvgpu_tsg_sm_error_state *tsg_sm_error_states;
u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g,
GPU_LIT_TPC_IN_GPC_STRIDE);
@@ -1320,16 +1356,8 @@ int gm20b_gr_update_sm_error_state(struct gk20a *g,
nvgpu_mutex_acquire(&g->dbg_sessions_lock);
gr->sm_error_states[sm_id].hww_global_esr =
sm_error_state->hww_global_esr;
gr->sm_error_states[sm_id].hww_warp_esr =
sm_error_state->hww_warp_esr;
gr->sm_error_states[sm_id].hww_warp_esr_pc =
sm_error_state->hww_warp_esr_pc;
gr->sm_error_states[sm_id].hww_global_esr_report_mask =
sm_error_state->hww_global_esr_report_mask;
gr->sm_error_states[sm_id].hww_warp_esr_report_mask =
sm_error_state->hww_warp_esr_report_mask;
tsg_sm_error_states = tsg->sm_error_states + sm_id;
gk20a_tsg_update_sm_error_state_locked(tsg, sm_id, sm_error_state);
err = gr_gk20a_disable_ctxsw(g);
if (err) {
@@ -1343,29 +1371,20 @@ int gm20b_gr_update_sm_error_state(struct gk20a *g,
offset = gpc_stride * gpc + tpc_in_gpc_stride * tpc;
if (gk20a_is_channel_ctx_resident(ch)) {
gk20a_writel(g, gr_gpc0_tpc0_sm_hww_global_esr_r() + offset,
gr->sm_error_states[sm_id].hww_global_esr);
gk20a_writel(g, gr_gpc0_tpc0_sm_hww_warp_esr_r() + offset,
gr->sm_error_states[sm_id].hww_warp_esr);
gk20a_writel(g, gr_gpc0_tpc0_sm_hww_warp_esr_pc_r() + offset,
gr->sm_error_states[sm_id].hww_warp_esr_pc);
gk20a_writel(g, gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r() + offset,
gr->sm_error_states[sm_id].hww_global_esr_report_mask);
gk20a_writel(g, gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r() + offset,
gr->sm_error_states[sm_id].hww_warp_esr_report_mask);
gm20b_gr_write_sm_error_state(g, offset, tsg_sm_error_states);
} else {
err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx, false);
if (err)
goto enable_ctxsw;
gr_gk20a_ctx_patch_write(g, ch_ctx,
gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r() + offset,
gr->sm_error_states[sm_id].hww_global_esr_report_mask,
true);
gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r() + offset,
tsg_sm_error_states->hww_global_esr_report_mask,
true);
gr_gk20a_ctx_patch_write(g, ch_ctx,
gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r() + offset,
gr->sm_error_states[sm_id].hww_warp_esr_report_mask,
true);
gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r() + offset,
tsg_sm_error_states->hww_warp_esr_report_mask,
true);
gr_gk20a_ctx_patch_write_end(g, ch_ctx, false);
}
@@ -1383,15 +1402,20 @@ int gm20b_gr_clear_sm_error_state(struct gk20a *g,
{
u32 gpc, tpc, offset;
u32 val;
struct gr_gk20a *gr = &g->gr;
struct tsg_gk20a *tsg;
u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE);
u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g,
GPU_LIT_TPC_IN_GPC_STRIDE);
int err = 0;
tsg = tsg_gk20a_from_ch(ch);
if (tsg == NULL) {
return -EINVAL;
}
nvgpu_mutex_acquire(&g->dbg_sessions_lock);
memset(&gr->sm_error_states[sm_id], 0, sizeof(*gr->sm_error_states));
memset(&tsg->sm_error_states[sm_id], 0, sizeof(*tsg->sm_error_states));
err = gr_gk20a_disable_ctxsw(g);
if (err) {

View File

@@ -119,7 +119,7 @@ int gm20b_gr_record_sm_error_state(struct gk20a *g, u32 gpc,
u32 tpc, u32 sm, struct channel_gk20a *fault_ch);
int gm20b_gr_update_sm_error_state(struct gk20a *g,
struct channel_gk20a *ch, u32 sm_id,
struct nvgpu_gr_sm_error_state *sm_error_state);
struct nvgpu_tsg_sm_error_state *sm_error_state);
int gm20b_gr_clear_sm_error_state(struct gk20a *g,
struct channel_gk20a *ch, u32 sm_id);
int gr_gm20b_get_preemption_mode_flags(struct gk20a *g,

View File

@@ -3212,18 +3212,42 @@ void gv11b_gr_bpt_reg_info(struct gk20a *g, struct nvgpu_warpstate *w_state)
}
}
static void gv11b_gr_write_sm_error_state(struct gk20a *g,
u32 offset,
struct nvgpu_tsg_sm_error_state *sm_error_states)
{
nvgpu_writel(g,
gr_gpc0_tpc0_sm0_hww_global_esr_r() + offset,
sm_error_states->hww_global_esr);
nvgpu_writel(g,
gr_gpc0_tpc0_sm0_hww_warp_esr_r() + offset,
sm_error_states->hww_warp_esr);
nvgpu_writel(g,
gr_gpc0_tpc0_sm0_hww_warp_esr_pc_r() + offset,
u64_lo32(sm_error_states->hww_warp_esr_pc));
nvgpu_writel(g,
gr_gpc0_tpc0_sm0_hww_warp_esr_pc_hi_r() + offset,
u64_hi32(sm_error_states->hww_warp_esr_pc));
nvgpu_writel(g,
gr_gpc0_tpc0_sm0_hww_global_esr_report_mask_r() + offset,
sm_error_states->hww_global_esr_report_mask);
nvgpu_writel(g,
gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_r() + offset,
sm_error_states->hww_warp_esr_report_mask);
}
int gv11b_gr_update_sm_error_state(struct gk20a *g,
struct channel_gk20a *ch, u32 sm_id,
struct nvgpu_gr_sm_error_state *sm_error_state)
struct nvgpu_tsg_sm_error_state *sm_error_state)
{
struct tsg_gk20a *tsg;
u32 gpc, tpc, sm, offset;
struct gr_gk20a *gr = &g->gr;
struct nvgpu_gr_ctx *ch_ctx;
int err = 0;
struct nvgpu_tsg_sm_error_state *tsg_sm_error_states;
tsg = tsg_gk20a_from_ch(ch);
if (!tsg) {
if (tsg == NULL) {
return -EINVAL;
}
@@ -3231,16 +3255,8 @@ int gv11b_gr_update_sm_error_state(struct gk20a *g,
nvgpu_mutex_acquire(&g->dbg_sessions_lock);
gr->sm_error_states[sm_id].hww_global_esr =
sm_error_state->hww_global_esr;
gr->sm_error_states[sm_id].hww_warp_esr =
sm_error_state->hww_warp_esr;
gr->sm_error_states[sm_id].hww_warp_esr_pc =
sm_error_state->hww_warp_esr_pc;
gr->sm_error_states[sm_id].hww_global_esr_report_mask =
sm_error_state->hww_global_esr_report_mask;
gr->sm_error_states[sm_id].hww_warp_esr_report_mask =
sm_error_state->hww_warp_esr_report_mask;
tsg_sm_error_states = tsg->sm_error_states + sm_id;
gk20a_tsg_update_sm_error_state_locked(tsg, sm_id, sm_error_state);
err = gr_gk20a_disable_ctxsw(g);
if (err) {
@@ -3257,21 +3273,7 @@ int gv11b_gr_update_sm_error_state(struct gk20a *g,
gv11b_gr_sm_offset(g, sm);
if (gk20a_is_channel_ctx_resident(ch)) {
gk20a_writel(g,
gr_gpc0_tpc0_sm0_hww_global_esr_r() + offset,
gr->sm_error_states[sm_id].hww_global_esr);
gk20a_writel(g,
gr_gpc0_tpc0_sm0_hww_warp_esr_r() + offset,
gr->sm_error_states[sm_id].hww_warp_esr);
gk20a_writel(g,
gr_gpc0_tpc0_sm0_hww_warp_esr_pc_r() + offset,
gr->sm_error_states[sm_id].hww_warp_esr_pc);
gk20a_writel(g,
gr_gpc0_tpc0_sm0_hww_global_esr_report_mask_r() + offset,
gr->sm_error_states[sm_id].hww_global_esr_report_mask);
gk20a_writel(g,
gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_r() + offset,
gr->sm_error_states[sm_id].hww_warp_esr_report_mask);
gv11b_gr_write_sm_error_state(g, offset, tsg_sm_error_states);
} else {
err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx, false);
if (err) {
@@ -3281,12 +3283,12 @@ int gv11b_gr_update_sm_error_state(struct gk20a *g,
gr_gk20a_ctx_patch_write(g, ch_ctx,
gr_gpcs_tpcs_sms_hww_global_esr_report_mask_r() +
offset,
gr->sm_error_states[sm_id].hww_global_esr_report_mask,
tsg_sm_error_states->hww_global_esr_report_mask,
true);
gr_gk20a_ctx_patch_write(g, ch_ctx,
gr_gpcs_tpcs_sms_hww_warp_esr_report_mask_r() +
offset,
gr->sm_error_states[sm_id].hww_warp_esr_report_mask,
tsg_sm_error_states->hww_warp_esr_report_mask,
true);
gr_gk20a_ctx_patch_write_end(g, ch_ctx, false);
@@ -3362,13 +3364,36 @@ int gv11b_gr_set_sm_debug_mode(struct gk20a *g,
return err;
}
static void gv11b_gr_read_sm_error_state(struct gk20a *g,
u32 offset,
struct nvgpu_tsg_sm_error_state *sm_error_states)
{
sm_error_states->hww_global_esr = nvgpu_readl(g,
gr_gpc0_tpc0_sm0_hww_global_esr_r() + offset);
sm_error_states->hww_warp_esr = nvgpu_readl(g,
gr_gpc0_tpc0_sm0_hww_warp_esr_r() + offset);
sm_error_states->hww_warp_esr_pc = hi32_lo32_to_u64((nvgpu_readl(g,
gr_gpc0_tpc0_sm0_hww_warp_esr_pc_hi_r() + offset)),
(nvgpu_readl(g,
gr_gpc0_tpc0_sm0_hww_warp_esr_pc_r() + offset)));
sm_error_states->hww_global_esr_report_mask = nvgpu_readl(g,
gr_gpc0_tpc0_sm0_hww_global_esr_report_mask_r() + offset);
sm_error_states->hww_warp_esr_report_mask = nvgpu_readl(g,
gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_r() + offset);
}
int gv11b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc, u32 sm,
struct channel_gk20a *fault_ch)
{
int sm_id;
struct gr_gk20a *gr = &g->gr;
u32 offset, sm_per_tpc, tpc_id;
u32 gpc_offset, gpc_tpc_offset;
struct nvgpu_tsg_sm_error_state *sm_error_states = NULL;
struct tsg_gk20a *tsg = NULL;
nvgpu_mutex_acquire(&g->dbg_sessions_lock);
@@ -3381,21 +3406,19 @@ int gv11b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc, u32 sm,
offset = gpc_tpc_offset + gv11b_gr_sm_offset(g, sm);
gr->sm_error_states[sm_id].hww_global_esr = gk20a_readl(g,
gr_gpc0_tpc0_sm0_hww_global_esr_r() + offset);
if (fault_ch != NULL) {
tsg = tsg_gk20a_from_ch(fault_ch);
}
gr->sm_error_states[sm_id].hww_warp_esr = gk20a_readl(g,
gr_gpc0_tpc0_sm0_hww_warp_esr_r() + offset);
if (tsg == NULL) {
nvgpu_err(g, "no valid tsg");
goto record_fail;
}
gr->sm_error_states[sm_id].hww_warp_esr_pc = gk20a_readl(g,
gr_gpc0_tpc0_sm0_hww_warp_esr_pc_r() + offset);
gr->sm_error_states[sm_id].hww_global_esr_report_mask = gk20a_readl(g,
gr_gpc0_tpc0_sm0_hww_global_esr_report_mask_r() + offset);
gr->sm_error_states[sm_id].hww_warp_esr_report_mask = gk20a_readl(g,
gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_r() + offset);
sm_error_states = tsg->sm_error_states + sm_id;
gv11b_gr_read_sm_error_state(g, offset, sm_error_states);
record_fail:
nvgpu_mutex_release(&g->dbg_sessions_lock);
return sm_id;

View File

@@ -43,7 +43,7 @@ struct zbc_entry;
struct zbc_query_params;
struct nvgpu_gr_ctx;
struct nvgpu_warpstate;
struct nvgpu_gr_sm_error_state;
struct nvgpu_tsg_sm_error_state;
struct gr_ctx_desc;
struct gr_gk20a_isr_data;
struct gk20a_debug_output;
@@ -168,7 +168,7 @@ int gv11b_gr_sm_trigger_suspend(struct gk20a *g);
void gv11b_gr_bpt_reg_info(struct gk20a *g, struct nvgpu_warpstate *w_state);
int gv11b_gr_update_sm_error_state(struct gk20a *g,
struct channel_gk20a *ch, u32 sm_id,
struct nvgpu_gr_sm_error_state *sm_error_state);
struct nvgpu_tsg_sm_error_state *sm_error_state);
int gv11b_gr_set_sm_debug_mode(struct gk20a *g,
struct channel_gk20a *ch, u64 sms, bool enable);
int gv11b_gr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc, u32 sm,

View File

@@ -22,8 +22,8 @@
* DEALINGS IN THE SOFTWARE.
*/
#ifndef __TEGRA_VGPU_H
#define __TEGRA_VGPU_H
#ifndef TEGRA_VGPU_H
#define TEGRA_VGPU_H
#include <nvgpu/types.h>
#include <nvgpu/ecc.h> /* For NVGPU_ECC_STAT_NAME_MAX_SIZE */
@@ -737,6 +737,7 @@ struct tegra_vgpu_channel_event_info {
};
struct tegra_vgpu_sm_esr_info {
u32 tsg_id;
u32 sm_id;
u32 hww_global_esr;
u32 hww_warp_esr;

View File

@@ -1567,56 +1567,6 @@ out:
return err;
}
static int nvgpu_gpu_read_single_sm_error_state(struct gk20a *g,
struct nvgpu_gpu_read_single_sm_error_state_args *args)
{
struct gr_gk20a *gr = &g->gr;
struct nvgpu_gr_sm_error_state *sm_error_state;
struct nvgpu_gpu_sm_error_state_record sm_error_state_record;
u32 sm_id;
int err = 0;
sm_id = args->sm_id;
if (sm_id >= gr->no_of_sm)
return -EINVAL;
nvgpu_speculation_barrier();
sm_error_state = gr->sm_error_states + sm_id;
sm_error_state_record.global_esr =
sm_error_state->hww_global_esr;
sm_error_state_record.warp_esr =
sm_error_state->hww_warp_esr;
sm_error_state_record.warp_esr_pc =
sm_error_state->hww_warp_esr_pc;
sm_error_state_record.global_esr_report_mask =
sm_error_state->hww_global_esr_report_mask;
sm_error_state_record.warp_esr_report_mask =
sm_error_state->hww_warp_esr_report_mask;
if (args->record_size > 0) {
size_t write_size = sizeof(*sm_error_state);
if (write_size > args->record_size)
write_size = args->record_size;
nvgpu_mutex_acquire(&g->dbg_sessions_lock);
err = copy_to_user((void __user *)(uintptr_t)
args->record_mem,
&sm_error_state_record,
write_size);
nvgpu_mutex_release(&g->dbg_sessions_lock);
if (err) {
nvgpu_err(g, "copy_to_user failed!");
return err;
}
args->record_size = write_size;
}
return 0;
}
long gk20a_ctrl_dev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct gk20a_ctrl_priv *priv = filp->private_data;
@@ -1925,11 +1875,6 @@ long gk20a_ctrl_dev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg
(struct nvgpu_gpu_set_deterministic_opts_args *)buf);
break;
case NVGPU_GPU_IOCTL_READ_SINGLE_SM_ERROR_STATE:
err = nvgpu_gpu_read_single_sm_error_state(g,
(struct nvgpu_gpu_read_single_sm_error_state_args *)buf);
break;
default:
nvgpu_log_info(g, "unrecognized gpu ioctl cmd: 0x%x", cmd);
err = -ENOTTY;

View File

@@ -35,6 +35,7 @@
#include "gk20a/gk20a.h"
#include "gk20a/gr_gk20a.h"
#include "gk20a/tsg_gk20a.h"
#include "gk20a/regops_gk20a.h"
#include "gk20a/dbg_gpu_gk20a.h"
#include "os_linux.h"
@@ -271,20 +272,23 @@ static int nvgpu_dbg_gpu_ioctl_write_single_sm_error_state(
u32 sm_id;
struct channel_gk20a *ch;
struct nvgpu_dbg_gpu_sm_error_state_record sm_error_state_record;
struct nvgpu_gr_sm_error_state sm_error_state;
struct nvgpu_tsg_sm_error_state sm_error_state;
int err = 0;
/* Not currently supported in the virtual case */
if (g->is_virtual)
if (g->is_virtual) {
return -ENOSYS;
}
ch = nvgpu_dbg_gpu_get_session_channel(dbg_s);
if (!ch)
if (ch == NULL) {
return -EINVAL;
}
sm_id = args->sm_id;
if (sm_id >= gr->no_of_sm)
if (sm_id >= gr->no_of_sm) {
return -EINVAL;
}
nvgpu_speculation_barrier();
@@ -300,13 +304,15 @@ static int nvgpu_dbg_gpu_ioctl_write_single_sm_error_state(
args->sm_error_state_record_mem,
read_size);
nvgpu_mutex_release(&g->dbg_sessions_lock);
if (err)
if (err != 0) {
return -ENOMEM;
}
}
err = gk20a_busy(g);
if (err)
if (err != 0) {
return err;
}
sm_error_state.hww_global_esr =
sm_error_state_record.hww_global_esr;
@@ -335,18 +341,36 @@ static int nvgpu_dbg_gpu_ioctl_read_single_sm_error_state(
{
struct gk20a *g = dbg_s->g;
struct gr_gk20a *gr = &g->gr;
struct nvgpu_gr_sm_error_state *sm_error_state;
struct nvgpu_tsg_sm_error_state *sm_error_state;
struct nvgpu_dbg_gpu_sm_error_state_record sm_error_state_record;
struct channel_gk20a *ch;
struct tsg_gk20a *tsg;
u32 sm_id;
int err = 0;
sm_id = args->sm_id;
if (sm_id >= gr->no_of_sm)
ch = nvgpu_dbg_gpu_get_session_channel(dbg_s);
if (ch == NULL) {
return -EINVAL;
}
tsg = tsg_gk20a_from_ch(ch);
if (tsg == NULL) {
nvgpu_err(g, "no valid tsg from ch");
return -EINVAL;
}
sm_id = args->sm_id;
if (sm_id >= gr->no_of_sm) {
return -EINVAL;
}
if (tsg->sm_error_states == NULL) {
return -EINVAL;
}
nvgpu_speculation_barrier();
sm_error_state = gr->sm_error_states + sm_id;
sm_error_state = tsg->sm_error_states + sm_id;
sm_error_state_record.hww_global_esr =
sm_error_state->hww_global_esr;
sm_error_state_record.hww_warp_esr =
@@ -370,7 +394,7 @@ static int nvgpu_dbg_gpu_ioctl_read_single_sm_error_state(
&sm_error_state_record,
write_size);
nvgpu_mutex_release(&g->dbg_sessions_lock);
if (err) {
if (err != 0) {
nvgpu_err(g, "copy_to_user failed!");
return err;
}
@@ -1500,8 +1524,9 @@ static int nvgpu_dbg_gpu_ioctl_clear_single_sm_error_state(
int err = 0;
ch = nvgpu_dbg_gpu_get_session_channel(dbg_s);
if (!ch)
if (ch == NULL) {
return -EINVAL;
}
sm_id = args->sm_id;
if (sm_id >= gr->no_of_sm)
@@ -1510,8 +1535,9 @@ static int nvgpu_dbg_gpu_ioctl_clear_single_sm_error_state(
nvgpu_speculation_barrier();
err = gk20a_busy(g);
if (err)
if (err != 0) {
return err;
}
err = gr_gk20a_elpg_protected_call(g,
g->ops.gr.clear_sm_error_state(g, ch, sm_id));

View File

@@ -536,6 +536,57 @@ static int gk20a_tsg_ioctl_get_timeslice(struct gk20a *g,
return 0;
}
static int gk20a_tsg_ioctl_read_single_sm_error_state(struct gk20a *g,
struct tsg_gk20a *tsg,
struct nvgpu_tsg_read_single_sm_error_state_args *args)
{
struct gr_gk20a *gr = &g->gr;
struct nvgpu_tsg_sm_error_state *sm_error_state;
struct nvgpu_tsg_sm_error_state_record sm_error_state_record;
u32 sm_id;
int err = 0;
sm_id = args->sm_id;
if (sm_id >= gr->no_of_sm)
return -EINVAL;
nvgpu_speculation_barrier();
sm_error_state = tsg->sm_error_states + sm_id;
sm_error_state_record.global_esr =
sm_error_state->hww_global_esr;
sm_error_state_record.warp_esr =
sm_error_state->hww_warp_esr;
sm_error_state_record.warp_esr_pc =
sm_error_state->hww_warp_esr_pc;
sm_error_state_record.global_esr_report_mask =
sm_error_state->hww_global_esr_report_mask;
sm_error_state_record.warp_esr_report_mask =
sm_error_state->hww_warp_esr_report_mask;
if (args->record_size > 0) {
size_t write_size = sizeof(*sm_error_state);
if (write_size > args->record_size)
write_size = args->record_size;
nvgpu_mutex_acquire(&g->dbg_sessions_lock);
err = copy_to_user((void __user *)(uintptr_t)
args->record_mem,
&sm_error_state_record,
write_size);
nvgpu_mutex_release(&g->dbg_sessions_lock);
if (err) {
nvgpu_err(g, "copy_to_user failed!");
return err;
}
args->record_size = write_size;
}
return 0;
}
long nvgpu_ioctl_tsg_dev_ioctl(struct file *filp, unsigned int cmd,
unsigned long arg)
{
@@ -670,6 +721,13 @@ long nvgpu_ioctl_tsg_dev_ioctl(struct file *filp, unsigned int cmd,
break;
}
case NVGPU_TSG_IOCTL_READ_SINGLE_SM_ERROR_STATE:
{
err = gk20a_tsg_ioctl_read_single_sm_error_state(g, tsg,
(struct nvgpu_tsg_read_single_sm_error_state_args *)buf);
break;
}
default:
nvgpu_err(g, "unrecognized tsg gpu ioctl cmd: 0x%x",
cmd);

View File

@@ -882,9 +882,6 @@ static void vgpu_remove_gr_support(struct gr_gk20a *gr)
gk20a_comptag_allocator_destroy(gr->g, &gr->comp_tags);
nvgpu_kfree(gr->g, gr->sm_error_states);
gr->sm_error_states = NULL;
nvgpu_kfree(gr->g, gr->gpc_tpc_mask);
gr->gpc_tpc_mask = NULL;
@@ -935,14 +932,6 @@ static int vgpu_gr_init_gr_setup_sw(struct gk20a *g)
nvgpu_mutex_init(&gr->ctx_mutex);
nvgpu_spinlock_init(&gr->ch_tlb_lock);
gr->sm_error_states = nvgpu_kzalloc(g,
sizeof(struct nvgpu_gr_sm_error_state) *
gr->no_of_sm);
if (!gr->sm_error_states) {
err = -ENOMEM;
goto clean_up;
}
gr->remove_support = vgpu_remove_gr_support;
gr->sw_ready = true;
@@ -1152,12 +1141,17 @@ int vgpu_gr_update_hwpm_ctxsw_mode(struct gk20a *g,
int vgpu_gr_clear_sm_error_state(struct gk20a *g,
struct channel_gk20a *ch, u32 sm_id)
{
struct gr_gk20a *gr = &g->gr;
struct tegra_vgpu_cmd_msg msg;
struct tegra_vgpu_clear_sm_error_state *p =
&msg.params.clear_sm_error_state;
struct tsg_gk20a *tsg;
int err;
tsg = tsg_gk20a_from_ch(ch);
if (!tsg) {
return -EINVAL;
}
nvgpu_mutex_acquire(&g->dbg_sessions_lock);
msg.cmd = TEGRA_VGPU_CMD_CLEAR_SM_ERROR_STATE;
msg.handle = vgpu_get_handle(g);
@@ -1167,7 +1161,7 @@ int vgpu_gr_clear_sm_error_state(struct gk20a *g,
err = vgpu_comm_sendrecv(&msg, sizeof(msg), sizeof(msg));
WARN_ON(err || msg.ret);
memset(&gr->sm_error_states[sm_id], 0, sizeof(*gr->sm_error_states));
memset(&tsg->sm_error_states[sm_id], 0, sizeof(*tsg->sm_error_states));
nvgpu_mutex_release(&g->dbg_sessions_lock);
return err ? err : msg.ret;
@@ -1264,7 +1258,8 @@ int vgpu_gr_resume_contexts(struct gk20a *g,
void vgpu_gr_handle_sm_esr_event(struct gk20a *g,
struct tegra_vgpu_sm_esr_info *info)
{
struct nvgpu_gr_sm_error_state *sm_error_states;
struct nvgpu_tsg_sm_error_state *sm_error_states;
struct tsg_gk20a *tsg;
if (info->sm_id >= g->gr.no_of_sm) {
nvgpu_err(g, "invalid smd_id %d / %d",
@@ -1272,9 +1267,20 @@ void vgpu_gr_handle_sm_esr_event(struct gk20a *g,
return;
}
if (info->tsg_id >= g->fifo.num_channels) {
nvgpu_err(g, "invalid tsg_id in sm esr event");
return;
}
tsg = &g->fifo.tsg[info->tsg_id];
if (tsg == NULL) {
nvgpu_err(g, "invalid tsg");
return;
}
nvgpu_mutex_acquire(&g->dbg_sessions_lock);
sm_error_states = &g->gr.sm_error_states[info->sm_id];
sm_error_states = &tsg->sm_error_states[info->sm_id];
sm_error_states->hww_global_esr = info->hww_global_esr;
sm_error_states->hww_warp_esr = info->hww_warp_esr;

View File

@@ -861,38 +861,6 @@ struct nvgpu_gpu_set_deterministic_opts_args {
__u64 channels; /* in */
};
/*
* This struct helps to report the SM error state of a single SM.
* This acts upon the currently resident GR context.
* Global Error status register
* Warp Error status register
* Warp Error status register PC
* Global Error status register Report Mask
* Warp Error status register Report Mask
*/
struct nvgpu_gpu_sm_error_state_record {
__u32 global_esr;
__u32 warp_esr;
__u64 warp_esr_pc;
__u32 global_esr_report_mask;
__u32 warp_esr_report_mask;
};
/*
* This struct helps to read the SM error state.
*/
struct nvgpu_gpu_read_single_sm_error_state_args {
/* Valid SM ID */
__u32 sm_id;
__u32 reserved;
/*
* This is pointer to the struct nvgpu_gpu_sm_error_state_record
*/
__u64 record_mem;
/* size of the record size to read */
__u64 record_size;
};
#define NVGPU_GPU_IOCTL_ZCULL_GET_CTX_SIZE \
_IOR(NVGPU_GPU_IOCTL_MAGIC, 1, struct nvgpu_gpu_zcull_get_ctx_size_args)
#define NVGPU_GPU_IOCTL_ZCULL_GET_INFO \
@@ -976,11 +944,8 @@ struct nvgpu_gpu_read_single_sm_error_state_args {
#define NVGPU_GPU_IOCTL_SET_DETERMINISTIC_OPTS \
_IOWR(NVGPU_GPU_IOCTL_MAGIC, 40, \
struct nvgpu_gpu_set_deterministic_opts_args)
#define NVGPU_GPU_IOCTL_READ_SINGLE_SM_ERROR_STATE \
_IOWR(NVGPU_GPU_IOCTL_MAGIC, 41, \
struct nvgpu_gpu_read_single_sm_error_state_args)
#define NVGPU_GPU_IOCTL_LAST \
_IOC_NR(NVGPU_GPU_IOCTL_READ_SINGLE_SM_ERROR_STATE)
_IOC_NR(NVGPU_GPU_IOCTL_SET_DETERMINISTIC_OPTS)
#define NVGPU_GPU_IOCTL_MAX_ARG_SIZE \
sizeof(struct nvgpu_gpu_get_cpu_time_correlation_info_args)
@@ -1063,6 +1028,38 @@ struct nvgpu_tsg_bind_channel_ex_args {
__u8 reserved[11];
};
/*
* This struct helps to report the SM error state of a single SM.
* This acts upon the currently resident TSG context.
* Global Error status register
* Warp Error status register
* Warp Error status register PC
* Global Error status register Report Mask
* Warp Error status register Report Mask
*/
struct nvgpu_tsg_sm_error_state_record {
__u32 global_esr;
__u32 warp_esr;
__u64 warp_esr_pc;
__u32 global_esr_report_mask;
__u32 warp_esr_report_mask;
};
/*
* This struct helps to read the SM error state.
*/
struct nvgpu_tsg_read_single_sm_error_state_args {
/* Valid SM ID */
__u32 sm_id;
__u32 reserved;
/*
* This is a pointer to the struct nvgpu_tsg_sm_error_state_record
*/
__u64 record_mem;
/* size of the record size to read */
__u64 record_size;
};
#define NVGPU_TSG_IOCTL_BIND_CHANNEL \
_IOW(NVGPU_TSG_IOCTL_MAGIC, 1, int)
#define NVGPU_TSG_IOCTL_UNBIND_CHANNEL \
@@ -1083,10 +1080,13 @@ struct nvgpu_tsg_bind_channel_ex_args {
_IOR(NVGPU_TSG_IOCTL_MAGIC, 10, struct nvgpu_timeslice_args)
#define NVGPU_TSG_IOCTL_BIND_CHANNEL_EX \
_IOWR(NVGPU_TSG_IOCTL_MAGIC, 11, struct nvgpu_tsg_bind_channel_ex_args)
#define NVGPU_TSG_IOCTL_READ_SINGLE_SM_ERROR_STATE \
_IOR(NVGPU_TSG_IOCTL_MAGIC, 12, \
struct nvgpu_tsg_read_single_sm_error_state_args)
#define NVGPU_TSG_IOCTL_MAX_ARG_SIZE \
sizeof(struct nvgpu_tsg_bind_channel_ex_args)
#define NVGPU_TSG_IOCTL_LAST \
_IOC_NR(NVGPU_TSG_IOCTL_BIND_CHANNEL_EX)
_IOC_NR(NVGPU_TSG_IOCTL_READ_SINGLE_SM_ERROR_STATE)
/*
* /dev/nvhost-dbg-gpu device