diff --git a/drivers/gpu/nvgpu/common/fifo/tsg.c b/drivers/gpu/nvgpu/common/fifo/tsg.c
index ddbec58cd..337489f2c 100644
--- a/drivers/gpu/nvgpu/common/fifo/tsg.c
+++ b/drivers/gpu/nvgpu/common/fifo/tsg.c
@@ -1002,6 +1002,54 @@ int nvgpu_tsg_alloc_sm_error_states_mem(struct gk20a *g,
 	return 0;
 }
 
+int nvgpu_tsg_store_sm_error_state(struct nvgpu_tsg *tsg, u32 sm_id,
+		u32 hww_global_esr, u32 hww_warp_esr, u64 hww_warp_esr_pc,
+		u32 hww_global_esr_report_mask, u32 hww_warp_esr_report_mask)
+{
+	struct gk20a *g = tsg->g;
+	u32 num_of_sm = g->ops.gr.init.get_no_of_sm(g);
+	struct nvgpu_tsg_sm_error_state *sm_error_states = NULL;
+
+	if (sm_id >= num_of_sm) {
+		nvgpu_err(g, "invalid sm_id %u (num_of_sm %u)", sm_id, num_of_sm);
+		return -EINVAL;
+	}
+
+	if (tsg->sm_error_states == NULL) {
+		nvgpu_err(g, "sm_error_states not allocated");
+		return -ENOMEM;
+	}
+
+	sm_error_states = &tsg->sm_error_states[sm_id];
+
+	sm_error_states->hww_global_esr = hww_global_esr;
+	sm_error_states->hww_warp_esr = hww_warp_esr;
+	sm_error_states->hww_warp_esr_pc = hww_warp_esr_pc;
+	sm_error_states->hww_global_esr_report_mask = hww_global_esr_report_mask;
+	sm_error_states->hww_warp_esr_report_mask = hww_warp_esr_report_mask;
+
+	return 0;
+}
+
+const struct nvgpu_tsg_sm_error_state *nvgpu_tsg_get_sm_error_state(
+		struct nvgpu_tsg *tsg, u32 sm_id)
+{
+	struct gk20a *g = tsg->g;
+	u32 num_of_sm = g->ops.gr.init.get_no_of_sm(g);
+
+	if (sm_id >= num_of_sm) {
+		nvgpu_err(g, "invalid sm_id %u (num_of_sm %u)", sm_id, num_of_sm);
+		return NULL;
+	}
+
+	if (tsg->sm_error_states == NULL) {
+		nvgpu_err(g, "sm_error_states not allocated");
+		return NULL;
+	}
+
+	return &tsg->sm_error_states[sm_id];
+}
+
 #ifdef CONFIG_NVGPU_DEBUGGER
 int nvgpu_tsg_set_sm_exception_type_mask(struct nvgpu_channel *ch,
 		u32 exception_mask)
diff --git a/drivers/gpu/nvgpu/common/vgpu/gr/gr_vgpu.c b/drivers/gpu/nvgpu/common/vgpu/gr/gr_vgpu.c
index 6ce56786c..6c0d31b36 100644
--- a/drivers/gpu/nvgpu/common/vgpu/gr/gr_vgpu.c
+++ b/drivers/gpu/nvgpu/common/vgpu/gr/gr_vgpu.c
@@ -982,7 +982,6 @@ int vgpu_gr_resume_contexts(struct gk20a *g,
 void vgpu_gr_handle_sm_esr_event(struct gk20a *g,
 		struct tegra_vgpu_sm_esr_info *info)
 {
-	struct nvgpu_tsg_sm_error_state *sm_error_states;
 	struct nvgpu_tsg *tsg;
 	u32 no_of_sm = g->ops.gr.init.get_no_of_sm(g);
 
@@ -1004,15 +1003,10 @@ void vgpu_gr_handle_sm_esr_event(struct gk20a *g,
 
 	nvgpu_mutex_acquire(&g->dbg_sessions_lock);
 
-	sm_error_states = &tsg->sm_error_states[info->sm_id];
-
-	sm_error_states->hww_global_esr = info->hww_global_esr;
-	sm_error_states->hww_warp_esr = info->hww_warp_esr;
-	sm_error_states->hww_warp_esr_pc = info->hww_warp_esr_pc;
-	sm_error_states->hww_global_esr_report_mask =
-		info->hww_global_esr_report_mask;
-	sm_error_states->hww_warp_esr_report_mask =
-		info->hww_warp_esr_report_mask;
+	(void)nvgpu_tsg_store_sm_error_state(tsg, info->sm_id,
+			info->hww_global_esr, info->hww_warp_esr,
+			info->hww_warp_esr_pc, info->hww_global_esr_report_mask,
+			info->hww_warp_esr_report_mask);
 
 	nvgpu_mutex_release(&g->dbg_sessions_lock);
 }
diff --git a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gm20b.c b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gm20b.c
index 6559c39c1..fb6b07a0a 100644
--- a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gm20b.c
+++ b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gm20b.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -304,21 +304,24 @@ void gm20b_gr_intr_clear_sm_hww(struct gk20a *g, u32 gpc, u32 tpc, u32 sm,
 			0);
 }
 
-static void gm20b_gr_intr_read_sm_error_state(struct gk20a *g,
-		u32 offset,
-		struct nvgpu_tsg_sm_error_state *sm_error_states)
+static int gm20b_gr_intr_read_sm_error_state(struct gk20a *g,
+		struct nvgpu_tsg *tsg, u32 offset, u32 sm_id)
 {
-	sm_error_states->hww_global_esr = gk20a_readl(g, nvgpu_safe_add_u32(
+	u32 hww_global_esr = gk20a_readl(g, nvgpu_safe_add_u32(
 			gr_gpc0_tpc0_sm_hww_global_esr_r(), offset));
-	sm_error_states->hww_warp_esr = gk20a_readl(g, nvgpu_safe_add_u32(
+	u32 hww_warp_esr = gk20a_readl(g, nvgpu_safe_add_u32(
 			gr_gpc0_tpc0_sm_hww_warp_esr_r(), offset));
-	sm_error_states->hww_warp_esr_pc = (u64)(gk20a_readl(g, nvgpu_safe_add_u32(
+	u64 hww_warp_esr_pc = (u64)(gk20a_readl(g, nvgpu_safe_add_u32(
 			gr_gpc0_tpc0_sm_hww_warp_esr_pc_r(), offset)));
-	sm_error_states->hww_global_esr_report_mask = gk20a_readl(g, nvgpu_safe_add_u32(
+	u32 hww_global_esr_report_mask = gk20a_readl(g, nvgpu_safe_add_u32(
 			gr_gpc0_tpc0_sm_hww_global_esr_report_mask_r(), offset));
-	sm_error_states->hww_warp_esr_report_mask = gk20a_readl(g, nvgpu_safe_add_u32(
+	u32 hww_warp_esr_report_mask = gk20a_readl(g, nvgpu_safe_add_u32(
 			gr_gpc0_tpc0_sm_hww_warp_esr_report_mask_r(), offset));
+	return nvgpu_tsg_store_sm_error_state(tsg, sm_id,
+			hww_global_esr, hww_warp_esr,
+			hww_warp_esr_pc, hww_global_esr_report_mask,
+			hww_warp_esr_report_mask);
 }
 
 u32 gm20b_gr_intr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc, u32 sm,
@@ -329,8 +332,8 @@ u32 gm20b_gr_intr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc, u32 s
 	u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE);
 	u32 offset;
-	struct nvgpu_tsg_sm_error_state *sm_error_states = NULL;
 	struct nvgpu_tsg *tsg = NULL;
+	int err = 0;
 
 	offset = nvgpu_safe_add_u32(
 		nvgpu_safe_mult_u32(gpc_stride, gpc),
@@ -353,8 +356,10 @@ u32 gm20b_gr_intr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc, u32 s
 		goto record_fail;
 	}
 
-	sm_error_states = tsg->sm_error_states + sm_id;
-	gm20b_gr_intr_read_sm_error_state(g, offset, sm_error_states);
+	err = gm20b_gr_intr_read_sm_error_state(g, tsg, offset, sm_id);
+	if (err != 0) {
+		nvgpu_err(g, "failed to store sm_error_state");
+	}
 
 record_fail:
 #ifdef CONFIG_NVGPU_DEBUGGER
diff --git a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b_fusa.c b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b_fusa.c
index b3bd4d693..9e0b3d83c 100644
--- a/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b_fusa.c
+++ b/drivers/gpu/nvgpu/hal/gr/intr/gr_intr_gv11b_fusa.c
@@ -1852,34 +1852,36 @@ void gv11b_gr_intr_handle_ssync_hww(struct gk20a *g, u32 *ssync_esr)
 			gr_ssync_hww_esr_reset_active_f());
 }
 
-static void gv11b_gr_intr_read_sm_error_state(struct gk20a *g,
-		u32 offset,
-		struct nvgpu_tsg_sm_error_state *sm_error_states)
+static int gv11b_gr_intr_read_sm_error_state(struct gk20a *g,
+		struct nvgpu_tsg *tsg, u32 offset, u32 sm_id)
 {
-	u32 addr_hi, addr_lo;
-
-	sm_error_states->hww_global_esr = nvgpu_readl(g, nvgpu_safe_add_u32(
+	u32 hww_global_esr = nvgpu_readl(g, nvgpu_safe_add_u32(
 			gr_gpc0_tpc0_sm0_hww_global_esr_r(), offset));
-	sm_error_states->hww_warp_esr = nvgpu_readl(g, nvgpu_safe_add_u32(
+	u32 hww_warp_esr = nvgpu_readl(g, nvgpu_safe_add_u32(
 			gr_gpc0_tpc0_sm0_hww_warp_esr_r(), offset));
-	addr_hi = nvgpu_readl(g, nvgpu_safe_add_u32(
+	u32 addr_hi = nvgpu_readl(g, nvgpu_safe_add_u32(
 			gr_gpc0_tpc0_sm0_hww_warp_esr_pc_hi_r(), offset));
-	addr_lo = nvgpu_readl(g, nvgpu_safe_add_u32(
+	u32 addr_lo = nvgpu_readl(g, nvgpu_safe_add_u32(
 			gr_gpc0_tpc0_sm0_hww_warp_esr_pc_r(), offset));
-	sm_error_states->hww_warp_esr_pc = hi32_lo32_to_u64(addr_hi, addr_lo);
+	u64 hww_warp_esr_pc = hi32_lo32_to_u64(addr_hi, addr_lo);
 
-	sm_error_states->hww_global_esr_report_mask = nvgpu_readl(g,
+	u32 hww_global_esr_report_mask = nvgpu_readl(g,
 			nvgpu_safe_add_u32(
 			gr_gpc0_tpc0_sm0_hww_global_esr_report_mask_r(), offset));
-	sm_error_states->hww_warp_esr_report_mask = nvgpu_readl(g,
+	u32 hww_warp_esr_report_mask = nvgpu_readl(g,
 			nvgpu_safe_add_u32(
 			gr_gpc0_tpc0_sm0_hww_warp_esr_report_mask_r(), offset));
+
+	return nvgpu_tsg_store_sm_error_state(tsg, sm_id,
+			hww_global_esr, hww_warp_esr,
+			hww_warp_esr_pc, hww_global_esr_report_mask,
+			hww_warp_esr_report_mask);
 }
 
 u32 gv11b_gr_intr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc, u32 sm,
@@ -1888,8 +1890,8 @@ u32 gv11b_gr_intr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc, u32 s
 	u32 sm_id;
 	u32 offset, sm_per_tpc, tpc_id;
 	u32 gpc_offset, gpc_tpc_offset;
-	struct nvgpu_tsg_sm_error_state *sm_error_states = NULL;
 	struct nvgpu_tsg *tsg = NULL;
+	int err = 0;
 
 #ifdef CONFIG_NVGPU_DEBUGGER
 	nvgpu_mutex_acquire(&g->dbg_sessions_lock);
@@ -1918,8 +1920,10 @@ u32 gv11b_gr_intr_record_sm_error_state(struct gk20a *g, u32 gpc, u32 tpc, u32 s
 		goto record_fail;
 	}
 
-	sm_error_states = &tsg->sm_error_states[sm_id];
-	gv11b_gr_intr_read_sm_error_state(g, offset, sm_error_states);
+	err = gv11b_gr_intr_read_sm_error_state(g, tsg, offset, sm_id);
+	if (err != 0) {
+		nvgpu_err(g, "failed to store sm_error_state");
+	}
 
 record_fail:
 #ifdef CONFIG_NVGPU_DEBUGGER
diff --git a/drivers/gpu/nvgpu/include/nvgpu/tsg.h b/drivers/gpu/nvgpu/include/nvgpu/tsg.h
index f061ca44f..93960c2cc 100644
--- a/drivers/gpu/nvgpu/include/nvgpu/tsg.h
+++ b/drivers/gpu/nvgpu/include/nvgpu/tsg.h
@@ -618,6 +618,45 @@ void nvgpu_tsg_set_unserviceable(struct gk20a *g, struct nvgpu_tsg *tsg);
 */
void nvgpu_tsg_wakeup_wqs(struct gk20a *g, struct nvgpu_tsg *tsg);
 
+/** @brief Store error info for an SM's error state
+ *
+ * @param tsg [in] Pointer to the TSG struct.
+ * @param sm_id [in] Index of the SM.
+ * @param hww_global_esr [in] hww_global_esr register value.
+ * @param hww_warp_esr [in] hww_warp_esr register value.
+ * @param hww_warp_esr_pc [in] PC value of hww_warp_esr.
+ * @param hww_global_esr_report_mask [in] hww_global_esr_report_mask register value.
+ * @param hww_warp_esr_report_mask [in] hww_warp_esr_report_mask register value.
+ *
+ * Store the given SM error information into the #nvgpu_tsg_sm_error_state
+ * entry for sm_id in tsg->sm_error_states.
+ *
+ * @return 0 in case of success, < 0 in case of failure.
+ * @retval -EINVAL if sm_id is greater than or equal to the number
+ * of SMs supported by h/w.
+ * @retval -ENOMEM if no memory was allocated to store
+ * SM error states.
+ */
+int nvgpu_tsg_store_sm_error_state(struct nvgpu_tsg *tsg, u32 sm_id,
+		u32 hww_global_esr, u32 hww_warp_esr, u64 hww_warp_esr_pc,
+		u32 hww_global_esr_report_mask, u32 hww_warp_esr_report_mask);
+
+/**
+ * @brief Retrieve pointer to an SM's error state
+ *
+ * @param tsg [in] Pointer to the TSG struct.
+ * @param sm_id [in] Index of the SM.
+ *
+ * Retrieve a pointer to the struct nvgpu_tsg_sm_error_state for
+ * the index sm_id.
+ *
+ * @retval NULL if sm_id is invalid or no memory was allocated for storing
+ * SM error states.
+ * @retval Pointer to a constant struct nvgpu_tsg_sm_error_state otherwise.
+ */
+const struct nvgpu_tsg_sm_error_state *nvgpu_tsg_get_sm_error_state(
+		struct nvgpu_tsg *tsg, u32 sm_id);
+
 #ifdef CONFIG_NVGPU_DEBUGGER
 int nvgpu_tsg_set_sm_exception_type_mask(struct nvgpu_channel *ch,
 		u32 exception_mask);
diff --git a/drivers/gpu/nvgpu/os/linux/ioctl_dbg.c b/drivers/gpu/nvgpu/os/linux/ioctl_dbg.c
index 03dd8da8c..6b99827a7 100644
--- a/drivers/gpu/nvgpu/os/linux/ioctl_dbg.c
+++ b/drivers/gpu/nvgpu/os/linux/ioctl_dbg.c
@@ -276,7 +276,7 @@ static int nvgpu_dbg_gpu_ioctl_read_single_sm_error_state(
 		struct nvgpu_dbg_gpu_read_single_sm_error_state_args *args)
 {
 	struct gk20a *g = dbg_s->g;
-	struct nvgpu_tsg_sm_error_state *sm_error_state;
+	const struct nvgpu_tsg_sm_error_state *sm_error_state = NULL;
 	struct nvgpu_dbg_gpu_sm_error_state_record sm_error_state_record;
 	struct nvgpu_channel *ch;
 	struct nvgpu_tsg *tsg;
@@ -307,7 +307,7 @@ static int nvgpu_dbg_gpu_ioctl_read_single_sm_error_state(
 
 	nvgpu_speculation_barrier();
 
-	sm_error_state = tsg->sm_error_states + sm_id;
+	sm_error_state = nvgpu_tsg_get_sm_error_state(tsg, sm_id);
 	sm_error_state_record.hww_global_esr =
 		sm_error_state->hww_global_esr;
 	sm_error_state_record.hww_warp_esr =
diff --git a/drivers/gpu/nvgpu/os/linux/ioctl_tsg.c b/drivers/gpu/nvgpu/os/linux/ioctl_tsg.c
index c62d9415d..9f5c6b6d0 100644
--- a/drivers/gpu/nvgpu/os/linux/ioctl_tsg.c
+++ b/drivers/gpu/nvgpu/os/linux/ioctl_tsg.c
@@ -583,7 +583,7 @@ static int gk20a_tsg_ioctl_read_single_sm_error_state(struct gk20a *g,
 		struct nvgpu_tsg *tsg,
 		struct nvgpu_tsg_read_single_sm_error_state_args *args)
 {
-	struct nvgpu_tsg_sm_error_state *sm_error_state;
+	const struct nvgpu_tsg_sm_error_state *sm_error_state = NULL;
 	struct nvgpu_tsg_sm_error_state_record sm_error_state_record;
 	u32 sm_id;
 	int err = 0;
@@ -597,7 +597,7 @@ static int gk20a_tsg_ioctl_read_single_sm_error_state(struct gk20a *g,
 
 	nvgpu_speculation_barrier();
 
-	sm_error_state = tsg->sm_error_states + sm_id;
+	sm_error_state = nvgpu_tsg_get_sm_error_state(tsg, sm_id);
 	sm_error_state_record.global_esr =
 		sm_error_state->hww_global_esr;
 	sm_error_state_record.warp_esr =
diff --git a/libs/igpu/libnvgpu-drv-igpu_safe.export b/libs/igpu/libnvgpu-drv-igpu_safe.export
index e82a72bdd..3b70b6fd4 100644
--- a/libs/igpu/libnvgpu-drv-igpu_safe.export
+++ b/libs/igpu/libnvgpu-drv-igpu_safe.export
@@ -700,6 +700,8 @@ nvgpu_timeout_expired_fault_injection
 nvgpu_timeout_init
 nvgpu_timeout_peek_expired
 nvgpu_timers_get_fault_injection
+nvgpu_tsg_store_sm_error_state
+nvgpu_tsg_get_sm_error_state
 nvgpu_tsg_abort
 nvgpu_tsg_bind_channel
 nvgpu_tsg_check_and_get_from_id
diff --git a/userspace/units/fifo/tsg/nvgpu-tsg.c b/userspace/units/fifo/tsg/nvgpu-tsg.c
index 648f21ba9..568bcb216 100644
--- a/userspace/units/fifo/tsg/nvgpu-tsg.c
+++ b/userspace/units/fifo/tsg/nvgpu-tsg.c
@@ -851,6 +851,138 @@ done:
 	return ret;
 }
 
+int test_tsg_sm_error_state_set_get(struct unit_module *m,
+		struct gk20a *g, void *args)
+{
+	struct gpu_ops gops = g->ops;
+	struct nvgpu_channel *ch = NULL;
+	struct nvgpu_tsg *tsg = NULL;
+	int ret = UNIT_FAIL;
+	int err = 0;
+	u32 num_sm = g->ops.gr.init.get_no_of_sm(g);
+	u32 valid_sm_id[][2] = {{0, num_sm - 1}};
+	u32 invalid_sm_id[][2] = {{num_sm, U32_MAX}};
+	u32 i = 0, j = 0, sm_id_range, states, sm_id, t = 0, z = 0;
+	u32 (*working_list)[2];
+	const char *string_states[] = {"Min", "Max", "Mid"};
+	struct nvgpu_tsg_sm_error_state *sm_error_states = NULL;
+	const struct nvgpu_tsg_sm_error_state *get_error_state = NULL;
+	u32 sm_error_states_values[] = {0, 0, 0, 0};
+	u64 hww_warp_esr_pc = 0;
+
+	tsg = nvgpu_tsg_open(g, getpid());
+	unit_assert(tsg != NULL, goto done);
+
+	ch = nvgpu_channel_open_new(g, ~0U, false, getpid(), getpid());
+	unit_assert(ch != NULL, goto done);
+
+	err = nvgpu_tsg_bind_channel(tsg, ch);
+	unit_assert(err == 0, goto done);
+
+	sm_error_states = tsg->sm_error_states;
+
+	/* store must fail when sm_error_states is NULL */
+	tsg->sm_error_states = NULL;
+	err = nvgpu_tsg_store_sm_error_state(tsg, 0, 0, 0, 0, 0, 0);
+	unit_assert(err != 0, goto done);
+
+	tsg->sm_error_states = sm_error_states;
+	err = nvgpu_tsg_store_sm_error_state(tsg, 0, 0, 0, 0, 0, 0);
+	unit_assert(err == 0, goto done);
+
+	/* get must fail when sm_error_states is NULL */
+	tsg->sm_error_states = NULL;
+	get_error_state = nvgpu_tsg_get_sm_error_state(tsg, 0);
+	unit_assert(get_error_state == NULL, goto done);
+	tsg->sm_error_states = sm_error_states;
+
+	/* valid, invalid sm_ids */
+	for (i = 0; i < 2; i++) {
+		working_list = (i == 0) ? valid_sm_id : invalid_sm_id;
+		sm_id_range = (i == 0) ? ARRAY_SIZE(valid_sm_id) : ARRAY_SIZE(invalid_sm_id);
+		for (j = 0; j < sm_id_range; j++) {
+			for (states = 0; states < 3; states++) {
+				if (states == 0) {
+					sm_id = working_list[j][0];
+				} else if (states == 1) {
+					sm_id = working_list[j][1];
+				} else {
+					if (working_list[j][1] - working_list[j][0] > 1) {
+						sm_id = get_random_u32(working_list[j][0] + 1, working_list[j][1] - 1);
+					} else {
+						continue;
+					}
+				}
+
+				/* invalid sm_id case */
+				if (i == 1) {
+					unit_info(m, "BVEC testing for nvgpu_tsg_store_sm_error_state with sm_id = 0x%08x (Invalid range: %s)\n", sm_id, string_states[states]);
+					err = nvgpu_tsg_store_sm_error_state(tsg, sm_id, 0, 0, 0, 0, 0);
+					unit_assert(err != 0, goto done);
+
+					unit_info(m, "BVEC testing for nvgpu_tsg_get_sm_error_state with sm_id = 0x%08x (Invalid range: %s)\n", sm_id, string_states[states]);
+					get_error_state = nvgpu_tsg_get_sm_error_state(tsg, sm_id);
+					unit_assert(get_error_state == NULL, goto done);
+				} else {
+					for (t = 0; t < 3; t++) {
+						/* Fill the SM error values: min, max or random */
+						for (z = 0; z < 4; z++) {
+							if (t == 0) {
+								sm_error_states_values[z] = 0;
+								hww_warp_esr_pc = 0;
+							} else if (t == 1) {
+								sm_error_states_values[z] = U32_MAX;
+								hww_warp_esr_pc = U32_MAX;
+							} else {
+								sm_error_states_values[z] = get_random_u32(1, U32_MAX - 1);
+								hww_warp_esr_pc = 2ULL * U32_MAX;
+							}
+						}
+
+						unit_info(m, "BVEC testing for nvgpu_tsg_store_sm_error_state with sm_id = 0x%08x (Valid range: %s)\n", sm_id, string_states[t]);
+						unit_info(m, "hww_global_esr = 0x%08x\n", sm_error_states_values[0]);
+						unit_info(m, "hww_warp_esr = 0x%08x\n", sm_error_states_values[1]);
+						unit_info(m, "hww_warp_esr_pc = 0x%016llx\n", hww_warp_esr_pc);
+						unit_info(m, "hww_global_esr_report_mask = 0x%08x\n", sm_error_states_values[2]);
+						unit_info(m, "hww_warp_esr_report_mask = 0x%08x\n", sm_error_states_values[3]);
+
+						err = nvgpu_tsg_store_sm_error_state(tsg, sm_id,
+							sm_error_states_values[0], sm_error_states_values[1], hww_warp_esr_pc,
+							sm_error_states_values[2], sm_error_states_values[3]);
+						unit_assert(err == 0, goto done);
+
+						unit_info(m, "BVEC testing for nvgpu_tsg_get_sm_error_state with sm_id = 0x%08x (Valid range: %s)\n", sm_id, string_states[t]);
+						get_error_state = nvgpu_tsg_get_sm_error_state(tsg, sm_id);
+						unit_assert(get_error_state != NULL, goto done);
+
+						unit_assert(get_error_state->hww_global_esr == sm_error_states_values[0], goto done);
+						unit_assert(get_error_state->hww_warp_esr == sm_error_states_values[1], goto done);
+						unit_assert(get_error_state->hww_warp_esr_pc == hww_warp_esr_pc, goto done);
+						unit_assert(get_error_state->hww_global_esr_report_mask == sm_error_states_values[2], goto done);
+						unit_assert(get_error_state->hww_warp_esr_report_mask == sm_error_states_values[3], goto done);
+					}
+				}
+			}
+		}
+	}
+
+	ret = UNIT_SUCCESS;
+done:
+	if (ret == UNIT_FAIL) {
+		unit_err(m, "%s failed\n", __func__);
+	}
+
+	if (ch != NULL) {
+		nvgpu_tsg_force_unbind_channel(tsg, ch);
+		nvgpu_channel_close(ch);
+	}
+	if (tsg != NULL) {
+		nvgpu_ref_put(&tsg->refcount, nvgpu_tsg_release);
+	}
+	g->ops = gops;
+	return ret;
+}
+
 #define F_UNBIND_CHANNEL_CHECK_CTX_RELOAD_SET BIT(0)
 #define F_UNBIND_CHANNEL_CHECK_CTX_RELOAD_CHID_MATCH BIT(1)
 #define F_UNBIND_CHANNEL_CHECK_CTX_RELOAD_LAST BIT(2)
@@ -1650,6 +1782,7 @@ struct unit_module_test nvgpu_tsg_tests[] = {
 	UNIT_TEST(unbind_channel, test_tsg_unbind_channel, &unit_ctx, 0),
 	UNIT_TEST(unbind_channel_check_hw_state,
 		test_tsg_unbind_channel_check_hw_state, &unit_ctx, 0),
+	UNIT_TEST(sm_error_states, test_tsg_sm_error_state_set_get, &unit_ctx, 0),
 	UNIT_TEST(unbind_channel_check_ctx_reload,
 		test_tsg_unbind_channel_check_ctx_reload, &unit_ctx, 0),
 	UNIT_TEST(enable_disable, test_tsg_enable, &unit_ctx, 0),
diff --git a/userspace/units/fifo/tsg/nvgpu-tsg.h b/userspace/units/fifo/tsg/nvgpu-tsg.h
index c90b7190f..5d8d0db9d 100644
--- a/userspace/units/fifo/tsg/nvgpu-tsg.h
+++ b/userspace/units/fifo/tsg/nvgpu-tsg.h
@@ -220,6 +220,40 @@ int test_tsg_release(struct unit_module *m,
 int test_tsg_unbind_channel_check_hw_state(struct unit_module *m,
 		struct gk20a *g, void *args);
 
+/**
+ * Test specification for: test_tsg_sm_error_state_set_get
+ *
+ * Description: Check storing and retrieving of TSG SM error states.
+ *
+ * Test Type: Feature, Boundary Value
+ *
+ * Targets: nvgpu_tsg_store_sm_error_state, nvgpu_tsg_get_sm_error_state
+ *
+ * Input: test_fifo_init_support() run for this GPU
+ * Equivalence classes:
+ * sm_id
+ * - Invalid : [g->ops.gr.init.get_no_of_sm(g), U32_MAX]
+ * - Valid : [0, g->ops.gr.init.get_no_of_sm(g) - 1]
+ * struct nvgpu_tsg_sm_error_state fields
+ * - Valid : [0, U32_MAX]
+ *
+ * Steps:
+ * 1) tsg->sm_error_states = NULL (Invalid Case)
+ *    Verify nvgpu_tsg_store_sm_error_state returns error
+ *    Verify nvgpu_tsg_get_sm_error_state returns NULL
+ * 2) sm_id >= g->ops.gr.init.get_no_of_sm(g) (Invalid Case)
+ *    Verify nvgpu_tsg_store_sm_error_state returns error
+ *    Verify nvgpu_tsg_get_sm_error_state returns NULL
+ * 3) For valid sm_id and tsg->sm_error_states != NULL,
+ *    for each field within struct nvgpu_tsg_sm_error_state,
+ *    test with Min, Max and one random number between [0, U32_MAX].
+ *    a) Verify nvgpu_tsg_store_sm_error_state returns 0
+ *    b) Verify nvgpu_tsg_get_sm_error_state returns non NULL.
+ *
+ * Output: Returns PASS if all branches gave expected results. FAIL otherwise.
+ */
+int test_tsg_sm_error_state_set_get(struct unit_module *m,
+		struct gk20a *g, void *args);
+
 /**
  * Test specification for: test_tsg_unbind_channel_check_ctx_reload
  *
diff --git a/userspace/units/gr/intr/nvgpu-gr-intr.c b/userspace/units/gr/intr/nvgpu-gr-intr.c
index c4c7003b2..bae09d364 100644
--- a/userspace/units/gr/intr/nvgpu-gr-intr.c
+++ b/userspace/units/gr/intr/nvgpu-gr-intr.c
@@ -199,12 +199,13 @@ static int gr_test_intr_allocate_ch(struct unit_module *m,
 
 static int gr_test_intr_block_ptr_as_current_ctx(struct unit_module *m,
-	struct gk20a *g, struct nvgpu_channel *ch,
+	struct gk20a *g, struct nvgpu_channel *ch, struct nvgpu_tsg *tsg,
 	u32 pid)
 {
 	int err, i;
 	struct nvgpu_gr_intr *intr = g->gr->intr;
 	u32 tsgid = nvgpu_inst_block_ptr(g, &ch->inst_block);
+	struct nvgpu_tsg_sm_error_state *sm_error_states = NULL;
 
 	err = EXPECT_BUG(g->ops.gr.intr.stall_isr(g));
 	if (err != 0) {
@@ -219,6 +220,17 @@ static int gr_test_intr_block_ptr_as_current_ctx(struct unit_module *m,
 		unit_return_fail(m, "failed stall isr\n");
 	}
 
+	/* Cover the case where gv11b_gr_intr_read_sm_error_state fails */
+	sm_error_states = tsg->sm_error_states;
+	tsg->sm_error_states = NULL;
+
+	err = g->ops.gr.intr.stall_isr(g);
+	if (err != 0) {
+		unit_return_fail(m, "failed stall isr\n");
+	}
+
+	tsg->sm_error_states = sm_error_states;
+
 	/* Make all entry valid so code with flush one */
 	for (i = 0; i < GR_TEST_CHANNEL_MAP_TLB_SIZE; i++) {
 		intr->chid_tlb[i].curr_ctx = pid;
@@ -289,7 +301,7 @@ static int gr_test_intr_allocate_ch_tsg(struct unit_module *m,
 		goto ch_cleanup;
 	}
 
-	err = gr_test_intr_block_ptr_as_current_ctx(m, g, ch, tsgid);
+	err = gr_test_intr_block_ptr_as_current_ctx(m, g, ch, tsg, tsgid);
 	if (err != 0) {
 		unit_err(m, "isr failed with block_ptr as current_ctx\n");
 		goto tsg_unbind;
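
Usage note: nvgpu_tsg_get_sm_error_state() returns NULL when sm_id is out of
range or when tsg->sm_error_states was never allocated. The two ioctl call
sites above dereference the returned pointer directly, which is safe only
because sm_id is range-checked before the speculation barrier. A minimal
caller-side sketch of the new pair (the dump_sm_error() helper below is
illustrative only, not part of this patch; it assumes the caller already
holds g->dbg_sessions_lock, as the vgpu and ioctl paths do):

	/* Illustrative only: read back one SM's stored error state and log it. */
	static void dump_sm_error(struct gk20a *g, struct nvgpu_tsg *tsg, u32 sm_id)
	{
		const struct nvgpu_tsg_sm_error_state *s =
			nvgpu_tsg_get_sm_error_state(tsg, sm_id);

		if (s == NULL) {
			/* invalid sm_id, or sm_error_states not allocated */
			return;
		}

		nvgpu_err(g, "sm %u: global_esr=0x%08x warp_esr=0x%08x pc=0x%016llx",
			sm_id, s->hww_global_esr, s->hww_warp_esr,
			s->hww_warp_esr_pc);
	}

Any new caller outside a path that pre-validates sm_id should keep the NULL
check, since this patch routes all existing readers and writers of
tsg->sm_error_states through these two helpers.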