gpu: nvgpu: Add IOCTL for SM_EXCEPTION_TYPE_MASK

Add a new ioctl to the dbg session to set the
SM_EXCEPTION_TYPE_MASK.
Currently the SM_EXCEPTION_TYPE_MASK_FATAL type is supported.
If this type is set, the code skips RC recovery and
triggers CILP preemption instead.

bug  200412641
JIRA NVGPU-702

Change-Id: I4b1f18379ee792cd324ccc555939e0f4f5c9e3b4
Signed-off-by: Vinod G <vinodg@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1729792
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
This commit is contained in:
Vinod G
2018-05-24 14:00:19 -07:00
committed by mobile promotions
parent 32bcf21f57
commit a09b9cd587
5 changed files with 118 additions and 5 deletions

View File

@@ -72,6 +72,12 @@ struct dbg_session_gk20a {
bool broadcast_stop_trigger; bool broadcast_stop_trigger;
struct nvgpu_mutex ioctl_lock; struct nvgpu_mutex ioctl_lock;
/*
* sm set exception type mask flag, to check whether
* exception type mask is requested or not.
*/
bool is_sm_exception_type_mask_set;
}; };
struct dbg_session_data { struct dbg_session_data {

View File

@@ -437,6 +437,12 @@ struct gr_gk20a {
u32 no_of_sm; u32 no_of_sm;
struct sm_info *sm_to_cluster; struct sm_info *sm_to_cluster;
struct nvgpu_gr_sm_error_state *sm_error_states; struct nvgpu_gr_sm_error_state *sm_error_states;
#define NVGPU_SM_EXCEPTION_TYPE_MASK_NONE (0x0U)
#define NVGPU_SM_EXCEPTION_TYPE_MASK_FATAL (0x1U << 0)
u32 sm_exception_mask_type;
u32 sm_exception_mask_refcount;
#if defined(CONFIG_GK20A_CYCLE_STATS) #if defined(CONFIG_GK20A_CYCLE_STATS)
struct nvgpu_mutex cs_lock; struct nvgpu_mutex cs_lock;
struct gk20a_cs_snapshot *cs_data; struct gk20a_cs_snapshot *cs_data;

View File

@@ -2182,9 +2182,9 @@ static bool gr_gv11b_check_warp_esr_error(struct gk20a *g, u32 warp_esr_error)
struct warp_esr_error_table_s warp_esr_error_table[] = { struct warp_esr_error_table_s warp_esr_error_table[] = {
{ gr_gpc0_tpc0_sm0_hww_warp_esr_error_stack_error_f(), { gr_gpc0_tpc0_sm0_hww_warp_esr_error_stack_error_f(),
"STACK ERROR"}, "STACK ERROR"},
{ gr_gpc0_tpc0_sm0_hww_warp_esr_error_api_stack_error_f(), { gr_gpc0_tpc0_sm0_hww_warp_esr_error_api_stack_error_f(),
"API STACK ERROR"}, "API STACK ERROR"},
{ gr_gpc0_tpc0_sm0_hww_warp_esr_error_pc_wrap_f(), { gr_gpc0_tpc0_sm0_hww_warp_esr_error_pc_wrap_f(),
"PC WRAP ERROR"}, "PC WRAP ERROR"},
{ gr_gpc0_tpc0_sm0_hww_warp_esr_error_misaligned_pc_f(), { gr_gpc0_tpc0_sm0_hww_warp_esr_error_misaligned_pc_f(),
@@ -2221,7 +2221,7 @@ static bool gr_gv11b_check_warp_esr_error(struct gk20a *g, u32 warp_esr_error)
if (warp_esr_error_table[index].error_value == warp_esr_error) { if (warp_esr_error_table[index].error_value == warp_esr_error) {
esr_err = warp_esr_error_table[index].error_value; esr_err = warp_esr_error_table[index].error_value;
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg,
"ESR %s(0x%x)", "WARP_ESR %s(0x%x)",
warp_esr_error_table[index].error_name, warp_esr_error_table[index].error_name,
esr_err); esr_err);
break; break;
@@ -2250,6 +2250,21 @@ static int gr_gv11b_handle_all_warp_esr_errors(struct gk20a *g,
return 0; return 0;
} }
/*
* Check SET_EXCEPTION_TYPE_MASK is being set.
* If set, skip the recovery and trigger CILP
* If not set, trigger the recovery.
*/
if ((g->gr.sm_exception_mask_type &
NVGPU_SM_EXCEPTION_TYPE_MASK_FATAL) ==
NVGPU_SM_EXCEPTION_TYPE_MASK_FATAL) {
nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg,
"SM Exception Type Mask set %d,"
"skip recovery",
g->gr.sm_exception_mask_type);
return 0;
}
if (fault_ch) { if (fault_ch) {
tsg = &g->fifo.tsg[fault_ch->tsgid]; tsg = &g->fifo.tsg[fault_ch->tsgid];
@@ -2294,7 +2309,6 @@ int gr_gv11b_pre_process_sm_exception(struct gk20a *g,
u32 warp_esr_error = gr_gpc0_tpc0_sm0_hww_warp_esr_error_v(warp_esr); u32 warp_esr_error = gr_gpc0_tpc0_sm0_hww_warp_esr_error_v(warp_esr);
struct tsg_gk20a *tsg; struct tsg_gk20a *tsg;
*early_exit = false; *early_exit = false;
*ignore_debugger = false; *ignore_debugger = false;

View File

@@ -151,6 +151,10 @@ static int dbg_unbind_all_channels_gk20a(struct dbg_session_gk20a *dbg_s);
static int gk20a_dbg_gpu_do_dev_open(struct inode *inode, static int gk20a_dbg_gpu_do_dev_open(struct inode *inode,
struct file *filp, bool is_profiler); struct file *filp, bool is_profiler);
static int nvgpu_set_sm_exception_type_mask_locked(
struct dbg_session_gk20a *dbg_s,
u32 exception_mask);
unsigned int gk20a_dbg_gpu_dev_poll(struct file *filep, poll_table *wait) unsigned int gk20a_dbg_gpu_dev_poll(struct file *filep, poll_table *wait)
{ {
unsigned int mask = 0; unsigned int mask = 0;
@@ -217,6 +221,10 @@ int gk20a_dbg_gpu_dev_release(struct inode *inode, struct file *filp)
nvgpu_kfree(g, prof_obj); nvgpu_kfree(g, prof_obj);
} }
} }
nvgpu_set_sm_exception_type_mask_locked(dbg_s,
NVGPU_SM_EXCEPTION_TYPE_MASK_NONE);
nvgpu_mutex_release(&g->dbg_sessions_lock); nvgpu_mutex_release(&g->dbg_sessions_lock);
nvgpu_mutex_destroy(&dbg_s->ch_list_lock); nvgpu_mutex_destroy(&dbg_s->ch_list_lock);
@@ -466,6 +474,7 @@ static int gk20a_dbg_gpu_do_dev_open(struct inode *inode,
dbg_s->is_profiler = is_profiler; dbg_s->is_profiler = is_profiler;
dbg_s->is_pg_disabled = false; dbg_s->is_pg_disabled = false;
dbg_s->is_timeout_disabled = false; dbg_s->is_timeout_disabled = false;
dbg_s->is_sm_exception_type_mask_set = false;
nvgpu_cond_init(&dbg_s->dbg_events.wait_queue); nvgpu_cond_init(&dbg_s->dbg_events.wait_queue);
nvgpu_init_list_node(&dbg_s->ch_list); nvgpu_init_list_node(&dbg_s->ch_list);
@@ -478,6 +487,9 @@ static int gk20a_dbg_gpu_do_dev_open(struct inode *inode,
dbg_s->dbg_events.events_enabled = false; dbg_s->dbg_events.events_enabled = false;
dbg_s->dbg_events.num_pending_events = 0; dbg_s->dbg_events.num_pending_events = 0;
nvgpu_set_sm_exception_type_mask_locked(dbg_s,
NVGPU_SM_EXCEPTION_TYPE_MASK_NONE);
return 0; return 0;
err_destroy_lock: err_destroy_lock:
@@ -1839,6 +1851,57 @@ out:
return err; return err;
} }
/*
 * Record or withdraw one debug session's SM exception type mask request.
 *
 * The GPU-wide state lives in g->gr: sm_exception_mask_type holds the
 * active mask and sm_exception_mask_refcount counts the sessions that
 * currently have the FATAL mask requested. Each session contributes at
 * most one reference, tracked by dbg_s->is_sm_exception_type_mask_set.
 *
 * This function takes no lock itself; the ioctl wrapper in this file
 * calls it with g->dbg_sessions_lock held.
 *
 * @dbg_s:          debug session issuing the request
 * @exception_mask: NVGPU_DBG_GPU_IOCTL_SET_SM_EXCEPTION_TYPE_MASK_FATAL or
 *                  NVGPU_DBG_GPU_IOCTL_SET_SM_EXCEPTION_TYPE_MASK_NONE
 *
 * Returns 0 on success, -EINVAL for any other mask value.
 */
static int nvgpu_set_sm_exception_type_mask_locked(
		struct dbg_session_gk20a *dbg_s,
		u32 exception_mask)
{
	struct gk20a *g = dbg_s->g;
	struct gr_gk20a *gr = &g->gr;

	if (exception_mask ==
	    NVGPU_DBG_GPU_IOCTL_SET_SM_EXCEPTION_TYPE_MASK_FATAL) {
		/* The global mask is (re)asserted on every FATAL request. */
		gr->sm_exception_mask_type = NVGPU_SM_EXCEPTION_TYPE_MASK_FATAL;

		/* Take a reference only on this session's first request. */
		if (!dbg_s->is_sm_exception_type_mask_set) {
			gr->sm_exception_mask_refcount++;
			dbg_s->is_sm_exception_type_mask_set = true;
		}
		return 0;
	}

	if (exception_mask ==
	    NVGPU_DBG_GPU_IOCTL_SET_SM_EXCEPTION_TYPE_MASK_NONE) {
		/* Drop this session's reference, if it holds one. */
		if (dbg_s->is_sm_exception_type_mask_set) {
			gr->sm_exception_mask_refcount--;
			dbg_s->is_sm_exception_type_mask_set = false;
		}

		/* Clear the global mask once no session references it. */
		if (gr->sm_exception_mask_refcount == 0) {
			gr->sm_exception_mask_type =
				NVGPU_SM_EXCEPTION_TYPE_MASK_NONE;
		}
		return 0;
	}

	nvgpu_err(g,
		"unrecognized dbg sm exception type mask: 0x%x",
		exception_mask);
	return -EINVAL;
}
static int nvgpu_dbg_gpu_set_sm_exception_type_mask(
struct dbg_session_gk20a *dbg_s,
struct nvgpu_dbg_gpu_set_sm_exception_type_mask_args *args)
{
int err = 0;
struct gk20a *g = dbg_s->g;
nvgpu_mutex_acquire(&g->dbg_sessions_lock);
err = nvgpu_set_sm_exception_type_mask_locked(dbg_s,
args->exception_type_mask);
nvgpu_mutex_release(&g->dbg_sessions_lock);
return err;
}
int gk20a_dbg_gpu_dev_open(struct inode *inode, struct file *filp) int gk20a_dbg_gpu_dev_open(struct inode *inode, struct file *filp)
{ {
struct nvgpu_os_linux *l = container_of(inode->i_cdev, struct nvgpu_os_linux *l = container_of(inode->i_cdev,
@@ -1994,6 +2057,11 @@ long gk20a_dbg_gpu_dev_ioctl(struct file *filp, unsigned int cmd,
(struct nvgpu_dbg_gpu_profiler_reserve_args *)buf); (struct nvgpu_dbg_gpu_profiler_reserve_args *)buf);
break; break;
case NVGPU_DBG_GPU_IOCTL_SET_SM_EXCEPTION_TYPE_MASK:
err = nvgpu_dbg_gpu_set_sm_exception_type_mask(dbg_s,
(struct nvgpu_dbg_gpu_set_sm_exception_type_mask_args *)buf);
break;
default: default:
nvgpu_err(g, nvgpu_err(g,
"unrecognized dbg gpu ioctl cmd: 0x%x", "unrecognized dbg gpu ioctl cmd: 0x%x",

View File

@@ -1411,8 +1411,27 @@ struct nvgpu_dbg_gpu_profiler_reserve_args {
#define NVGPU_DBG_GPU_IOCTL_PROFILER_RESERVE \ #define NVGPU_DBG_GPU_IOCTL_PROFILER_RESERVE \
_IOWR(NVGPU_DBG_GPU_IOCTL_MAGIC, 22, struct nvgpu_dbg_gpu_profiler_reserve_args) _IOWR(NVGPU_DBG_GPU_IOCTL_MAGIC, 22, struct nvgpu_dbg_gpu_profiler_reserve_args)
/*
* This struct helps to set the exception mask. If mask is not set
* or set to NVGPU_DBG_GPU_IOCTL_SET_SM_EXCEPTION_TYPE_MASK_NONE
* then kernel code will follow recovery path on sm exception.
* If mask is set to NVGPU_DBG_GPU_IOCTL_SET_SM_EXCEPTION_TYPE_MASK_FATAL, then
* kernel code will skip recovery path on sm exception.
*/
/* Argument block for NVGPU_DBG_GPU_IOCTL_SET_SM_EXCEPTION_TYPE_MASK. */
struct nvgpu_dbg_gpu_set_sm_exception_type_mask_args {
/* Withdraw this session's exception type mask request. */
#define NVGPU_DBG_GPU_IOCTL_SET_SM_EXCEPTION_TYPE_MASK_NONE (0x0U)
/* Request the FATAL exception type mask (bit 0). */
#define NVGPU_DBG_GPU_IOCTL_SET_SM_EXCEPTION_TYPE_MASK_FATAL (0x1U << 0U)
/*
 * Exception type mask value: one of the two constants above. The
 * kernel handler rejects any other value with -EINVAL.
 */
__u32 exception_type_mask;
/*
 * Not read by the kernel handler for this ioctl.
 * NOTE(review): presumably padding kept for future use — confirm.
 */
__u32 reserved;
};
/*
 * ioctl command number 23 of the debug-GPU ioctl set. _IOW marks the
 * argument struct as data passed from userspace to the kernel.
 */
#define NVGPU_DBG_GPU_IOCTL_SET_SM_EXCEPTION_TYPE_MASK \
_IOW(NVGPU_DBG_GPU_IOCTL_MAGIC, 23, \
struct nvgpu_dbg_gpu_set_sm_exception_type_mask_args)
#define NVGPU_DBG_GPU_IOCTL_LAST \ #define NVGPU_DBG_GPU_IOCTL_LAST \
_IOC_NR(NVGPU_DBG_GPU_IOCTL_PROFILER_RESERVE) _IOC_NR(NVGPU_DBG_GPU_IOCTL_SET_SM_EXCEPTION_TYPE_MASK)
#define NVGPU_DBG_GPU_IOCTL_MAX_ARG_SIZE \ #define NVGPU_DBG_GPU_IOCTL_MAX_ARG_SIZE \
sizeof(struct nvgpu_dbg_gpu_access_fb_memory_args) sizeof(struct nvgpu_dbg_gpu_access_fb_memory_args)