/* * GK20A Graphics * * Copyright (c) 2011-2019, NVIDIA CORPORATION. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "gr_gk20a.h" #include "gk20a/fecs_trace_gk20a.h" #include "gr_pri_gk20a.h" #include #include #include #include #include #define BLK_SIZE (256U) #define NV_PERF_PMM_FBP_ROUTER_STRIDE 0x0200U #define NV_PERF_PMMGPCROUTER_STRIDE 0x0200U #define NV_PCFG_BASE 0x00088000U #define NV_XBAR_MXBAR_PRI_GPC_GNIC_STRIDE 0x0020U #define FE_PWR_MODE_TIMEOUT_MAX 2000U #define FE_PWR_MODE_TIMEOUT_DEFAULT 10U #define CTXSW_MEM_SCRUBBING_TIMEOUT_MAX 1000U #define CTXSW_MEM_SCRUBBING_TIMEOUT_DEFAULT 10U #define FECS_ARB_CMD_TIMEOUT_MAX 40 #define FECS_ARB_CMD_TIMEOUT_DEFAULT 2 static int gk20a_init_gr_bind_fecs_elpg(struct gk20a *g); /*elcg init */ static void gr_gk20a_enable_elcg(struct gk20a *g); void gk20a_fecs_dump_falcon_stats(struct gk20a *g) { unsigned int i; nvgpu_falcon_dump_stats(g->fecs_flcn); for (i = 0; i < g->ops.gr.fecs_ctxsw_mailbox_size(); i++) { nvgpu_err(g, "gr_fecs_ctxsw_mailbox_r(%d) : 0x%x", i, gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(i))); } } static void gr_gk20a_load_falcon_dmem(struct gk20a *g) { u32 i, ucode_u32_size; const u32 *ucode_u32_data; u32 checksum; nvgpu_log_fn(g, " "); gk20a_writel(g, gr_gpccs_dmemc_r(0), (gr_gpccs_dmemc_offs_f(0) | gr_gpccs_dmemc_blk_f(0) | gr_gpccs_dmemc_aincw_f(1))); ucode_u32_size = g->netlist_vars->ucode.gpccs.data.count; ucode_u32_data = (const u32 *)g->netlist_vars->ucode.gpccs.data.l; for (i = 0, checksum = 0; i < ucode_u32_size; i++) { gk20a_writel(g, gr_gpccs_dmemd_r(0), ucode_u32_data[i]); checksum += ucode_u32_data[i]; } gk20a_writel(g, gr_fecs_dmemc_r(0), (gr_fecs_dmemc_offs_f(0) | gr_fecs_dmemc_blk_f(0) | gr_fecs_dmemc_aincw_f(1))); ucode_u32_size = g->netlist_vars->ucode.fecs.data.count; ucode_u32_data = (const u32 *)g->netlist_vars->ucode.fecs.data.l; for (i = 0, checksum = 0; i < ucode_u32_size; i++) { gk20a_writel(g, gr_fecs_dmemd_r(0), ucode_u32_data[i]); checksum += ucode_u32_data[i]; } nvgpu_log_fn(g, "done"); } static void gr_gk20a_load_falcon_imem(struct gk20a *g) { u32 cfg, fecs_imem_size, gpccs_imem_size, ucode_u32_size; const u32 *ucode_u32_data; u32 tag, i, pad_start, pad_end; u32 checksum; 
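	/*
	 * Load the GPCCS and FECS instruction memories over PRI: read the
	 * IMEM sizes from the config registers, set up auto-incrementing
	 * IMEM writes, bump the 256-byte block tag while streaming the
	 * ucode words, and zero-pad the remainder of the final block.
	 */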
nvgpu_log_fn(g, " "); cfg = gk20a_readl(g, gr_fecs_cfg_r()); fecs_imem_size = gr_fecs_cfg_imem_sz_v(cfg); cfg = gk20a_readl(g, gr_gpc0_cfg_r()); gpccs_imem_size = gr_gpc0_cfg_imem_sz_v(cfg); /* Use the broadcast address to access all of the GPCCS units. */ gk20a_writel(g, gr_gpccs_imemc_r(0), (gr_gpccs_imemc_offs_f(0) | gr_gpccs_imemc_blk_f(0) | gr_gpccs_imemc_aincw_f(1))); /* Setup the tags for the instruction memory. */ tag = 0; gk20a_writel(g, gr_gpccs_imemt_r(0), gr_gpccs_imemt_tag_f(tag)); ucode_u32_size = g->netlist_vars->ucode.gpccs.inst.count; ucode_u32_data = (const u32 *)g->netlist_vars->ucode.gpccs.inst.l; for (i = 0, checksum = 0; i < ucode_u32_size; i++) { if ((i != 0U) && ((i % (256U/sizeof(u32))) == 0U)) { tag++; gk20a_writel(g, gr_gpccs_imemt_r(0), gr_gpccs_imemt_tag_f(tag)); } gk20a_writel(g, gr_gpccs_imemd_r(0), ucode_u32_data[i]); checksum += ucode_u32_data[i]; } pad_start = i * 4U; pad_end = pad_start + (256U - pad_start % 256U) + 256U; for (i = pad_start; (i < gpccs_imem_size * 256U) && (i < pad_end); i += 4U) { if ((i != 0U) && ((i % 256U) == 0U)) { tag++; gk20a_writel(g, gr_gpccs_imemt_r(0), gr_gpccs_imemt_tag_f(tag)); } gk20a_writel(g, gr_gpccs_imemd_r(0), 0); } gk20a_writel(g, gr_fecs_imemc_r(0), (gr_fecs_imemc_offs_f(0) | gr_fecs_imemc_blk_f(0) | gr_fecs_imemc_aincw_f(1))); /* Setup the tags for the instruction memory. */ tag = 0; gk20a_writel(g, gr_fecs_imemt_r(0), gr_fecs_imemt_tag_f(tag)); ucode_u32_size = g->netlist_vars->ucode.fecs.inst.count; ucode_u32_data = (const u32 *)g->netlist_vars->ucode.fecs.inst.l; for (i = 0, checksum = 0; i < ucode_u32_size; i++) { if ((i != 0U) && ((i % (256U/sizeof(u32))) == 0U)) { tag++; gk20a_writel(g, gr_fecs_imemt_r(0), gr_fecs_imemt_tag_f(tag)); } gk20a_writel(g, gr_fecs_imemd_r(0), ucode_u32_data[i]); checksum += ucode_u32_data[i]; } pad_start = i * 4U; pad_end = pad_start + (256U - pad_start % 256U) + 256U; for (i = pad_start; (i < fecs_imem_size * 256U) && i < pad_end; i += 4U) { if ((i != 0U) && ((i % 256U) == 0U)) { tag++; gk20a_writel(g, gr_fecs_imemt_r(0), gr_fecs_imemt_tag_f(tag)); } gk20a_writel(g, gr_fecs_imemd_r(0), 0); } } int gr_gk20a_wait_idle(struct gk20a *g) { u32 delay = GR_IDLE_CHECK_DEFAULT; bool ctxsw_active; bool gr_busy; u32 gr_engine_id; struct nvgpu_engine_status_info engine_status; bool ctx_status_invalid; struct nvgpu_timeout timeout; nvgpu_log_fn(g, " "); gr_engine_id = nvgpu_engine_get_gr_eng_id(g); nvgpu_timeout_init(g, &timeout, gk20a_get_gr_idle_timeout(g), NVGPU_TIMER_CPU_TIMER); do { /* fmodel: host gets fifo_engine_status(gr) from gr only when gr_status is read */ (void) gk20a_readl(g, gr_status_r()); g->ops.engine_status.read_engine_status_info(g, gr_engine_id, &engine_status); ctxsw_active = engine_status.ctxsw_in_progress; ctx_status_invalid = nvgpu_engine_status_is_ctxsw_invalid( &engine_status); gr_busy = (gk20a_readl(g, gr_engine_status_r()) & gr_engine_status_value_busy_f()) != 0U; if (ctx_status_invalid || (!gr_busy && !ctxsw_active)) { nvgpu_log_fn(g, "done"); return 0; } nvgpu_usleep_range(delay, delay * 2U); delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX); } while (nvgpu_timeout_expired(&timeout) == 0); nvgpu_err(g, "timeout, ctxsw busy : %d, gr busy : %d", ctxsw_active, gr_busy); return -EAGAIN; } int gr_gk20a_wait_fe_idle(struct gk20a *g) { u32 val; u32 delay = GR_IDLE_CHECK_DEFAULT; struct nvgpu_timeout timeout; if (nvgpu_is_enabled(g, NVGPU_IS_FMODEL)) { return 0; } nvgpu_log_fn(g, " "); nvgpu_timeout_init(g, &timeout, gk20a_get_gr_idle_timeout(g), NVGPU_TIMER_CPU_TIMER); 
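	/*
	 * Poll gr_status until the front end reports no method in its lower
	 * pipeline stage, backing off exponentially up to GR_IDLE_CHECK_MAX
	 * between reads; give up with -EAGAIN once the timeout expires.
	 */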
do { val = gk20a_readl(g, gr_status_r()); if (gr_status_fe_method_lower_v(val) == 0U) { nvgpu_log_fn(g, "done"); return 0; } nvgpu_usleep_range(delay, delay * 2U); delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX); } while (nvgpu_timeout_expired(&timeout) == 0); nvgpu_err(g, "timeout, fe busy : %x", val); return -EAGAIN; } int gr_gk20a_ctx_wait_ucode(struct gk20a *g, u32 mailbox_id, u32 *mailbox_ret, u32 opc_success, u32 mailbox_ok, u32 opc_fail, u32 mailbox_fail, bool sleepduringwait) { struct nvgpu_timeout timeout; u32 delay = GR_FECS_POLL_INTERVAL; enum wait_ucode_status check = WAIT_UCODE_LOOP; u32 reg; nvgpu_log_fn(g, " "); if (sleepduringwait) { delay = GR_IDLE_CHECK_DEFAULT; } nvgpu_timeout_init(g, &timeout, gk20a_get_gr_idle_timeout(g), NVGPU_TIMER_CPU_TIMER); while (check == WAIT_UCODE_LOOP) { if (nvgpu_timeout_expired(&timeout) != 0) { check = WAIT_UCODE_TIMEOUT; } reg = gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(mailbox_id)); if (mailbox_ret != NULL) { *mailbox_ret = reg; } switch (opc_success) { case GR_IS_UCODE_OP_EQUAL: if (reg == mailbox_ok) { check = WAIT_UCODE_OK; } break; case GR_IS_UCODE_OP_NOT_EQUAL: if (reg != mailbox_ok) { check = WAIT_UCODE_OK; } break; case GR_IS_UCODE_OP_AND: if ((reg & mailbox_ok) != 0U) { check = WAIT_UCODE_OK; } break; case GR_IS_UCODE_OP_LESSER: if (reg < mailbox_ok) { check = WAIT_UCODE_OK; } break; case GR_IS_UCODE_OP_LESSER_EQUAL: if (reg <= mailbox_ok) { check = WAIT_UCODE_OK; } break; case GR_IS_UCODE_OP_SKIP: /* do no success check */ break; default: nvgpu_err(g, "invalid success opcode 0x%x", opc_success); check = WAIT_UCODE_ERROR; break; } switch (opc_fail) { case GR_IS_UCODE_OP_EQUAL: if (reg == mailbox_fail) { check = WAIT_UCODE_ERROR; } break; case GR_IS_UCODE_OP_NOT_EQUAL: if (reg != mailbox_fail) { check = WAIT_UCODE_ERROR; } break; case GR_IS_UCODE_OP_AND: if ((reg & mailbox_fail) != 0U) { check = WAIT_UCODE_ERROR; } break; case GR_IS_UCODE_OP_LESSER: if (reg < mailbox_fail) { check = WAIT_UCODE_ERROR; } break; case GR_IS_UCODE_OP_LESSER_EQUAL: if (reg <= mailbox_fail) { check = WAIT_UCODE_ERROR; } break; case GR_IS_UCODE_OP_SKIP: /* do no check on fail*/ break; default: nvgpu_err(g, "invalid fail opcode 0x%x", opc_fail); check = WAIT_UCODE_ERROR; break; } if (sleepduringwait) { nvgpu_usleep_range(delay, delay * 2U); delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX); } else { nvgpu_udelay(delay); } } if (check == WAIT_UCODE_TIMEOUT) { nvgpu_err(g, "timeout waiting on mailbox=%d value=0x%08x", mailbox_id, reg); g->ops.gr.dump_gr_falcon_stats(g); gk20a_gr_debug_dump(g); return -1; } else if (check == WAIT_UCODE_ERROR) { nvgpu_err(g, "ucode method failed on mailbox=%d value=0x%08x", mailbox_id, reg); g->ops.gr.dump_gr_falcon_stats(g); return -1; } nvgpu_log_fn(g, "done"); return 0; } /* The following is a less brittle way to call gr_gk20a_submit_fecs_method(...) * We should replace most, if not all, fecs method calls to this instead. 
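 * The fecs_method_op_gk20a struct bundles the method address and data, the
 * mailbox to poll (with its clear mask and expected ok/fail values), and the
 * GR_IS_UCODE_OP_* conditions that gr_gk20a_ctx_wait_ucode() evaluates.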
 */
int gr_gk20a_submit_fecs_method_op(struct gk20a *g,
				   struct fecs_method_op_gk20a op,
				   bool sleepduringwait)
{
	struct gr_gk20a *gr = &g->gr;
	int ret;

	nvgpu_mutex_acquire(&gr->fecs_mutex);

	if (op.mailbox.id != 0U) {
		gk20a_writel(g, gr_fecs_ctxsw_mailbox_r(op.mailbox.id),
			     op.mailbox.data);
	}

	gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0),
		gr_fecs_ctxsw_mailbox_clear_value_f(op.mailbox.clr));

	gk20a_writel(g, gr_fecs_method_data_r(), op.method.data);
	gk20a_writel(g, gr_fecs_method_push_r(),
		gr_fecs_method_push_adr_f(op.method.addr));

	/* Methods submitted with mailbox id 4 report completion on
	 * mailbox 0, so poll mailbox 0 for those cases.
	 */
	if (op.mailbox.id == 4U) {
		op.mailbox.id = 0;
	}

	ret = gr_gk20a_ctx_wait_ucode(g, op.mailbox.id, op.mailbox.ret,
				      op.cond.ok, op.mailbox.ok,
				      op.cond.fail, op.mailbox.fail,
				      sleepduringwait);
	if (ret != 0) {
		nvgpu_err(g, "fecs method: data=0x%08x push adr=0x%08x",
			  op.method.data, op.method.addr);
	}

	nvgpu_mutex_release(&gr->fecs_mutex);

	return ret;
}

/* Sideband mailbox writes are done a bit differently */
int gr_gk20a_submit_fecs_sideband_method_op(struct gk20a *g,
		struct fecs_method_op_gk20a op)
{
	struct gr_gk20a *gr = &g->gr;
	int ret;

	nvgpu_mutex_acquire(&gr->fecs_mutex);

	gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(op.mailbox.id),
		gr_fecs_ctxsw_mailbox_clear_value_f(op.mailbox.clr));

	gk20a_writel(g, gr_fecs_method_data_r(), op.method.data);
	gk20a_writel(g, gr_fecs_method_push_r(),
		gr_fecs_method_push_adr_f(op.method.addr));

	ret = gr_gk20a_ctx_wait_ucode(g, op.mailbox.id, op.mailbox.ret,
				      op.cond.ok, op.mailbox.ok,
				      op.cond.fail, op.mailbox.fail,
				      false);
	if (ret != 0) {
		nvgpu_err(g, "fecs method: data=0x%08x push adr=0x%08x",
			  op.method.data, op.method.addr);
	}

	nvgpu_mutex_release(&gr->fecs_mutex);

	return ret;
}

static int gr_gk20a_ctrl_ctxsw(struct gk20a *g, u32 fecs_method, u32 *ret)
{
	return gr_gk20a_submit_fecs_method_op(g,
	      (struct fecs_method_op_gk20a) {
		      .method.addr = fecs_method,
		      .method.data = ~U32(0U),
		      .mailbox = { .id   = 1U, /* sideband? */
				   .data = ~U32(0U),
				   .clr  = ~U32(0U),
				   .ret  = ret,
				   .ok   = gr_fecs_ctxsw_mailbox_value_pass_v(),
				   .fail = gr_fecs_ctxsw_mailbox_value_fail_v(), },
		      .cond.ok = GR_IS_UCODE_OP_EQUAL,
		      .cond.fail = GR_IS_UCODE_OP_EQUAL }, true);
}

/* Stop processing (stall) context switches at FECS.
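 * Pairs with gr_gk20a_enable_ctxsw(): both are reference counted via
 * g->ctxsw_disable_count under g->ctxsw_disable_lock, so only the first
 * disable and the last enable actually issue the FECS method.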
*/ int gr_gk20a_disable_ctxsw(struct gk20a *g) { int err = 0; nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, " "); nvgpu_mutex_acquire(&g->ctxsw_disable_lock); g->ctxsw_disable_count++; if (g->ctxsw_disable_count == 1) { err = gr_gk20a_ctrl_ctxsw(g, gr_fecs_method_push_adr_stop_ctxsw_v(), NULL); } nvgpu_mutex_release(&g->ctxsw_disable_lock); return err; } /* Start processing (continue) context switches at FECS */ int gr_gk20a_enable_ctxsw(struct gk20a *g) { int err = 0; nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, " "); nvgpu_mutex_acquire(&g->ctxsw_disable_lock); g->ctxsw_disable_count--; WARN_ON(g->ctxsw_disable_count < 0); if (g->ctxsw_disable_count == 0) { err = gr_gk20a_ctrl_ctxsw(g, gr_fecs_method_push_adr_start_ctxsw_v(), NULL); } nvgpu_mutex_release(&g->ctxsw_disable_lock); return err; } int gr_gk20a_halt_pipe(struct gk20a *g) { return gr_gk20a_submit_fecs_method_op(g, (struct fecs_method_op_gk20a) { .method.addr = gr_fecs_method_push_adr_halt_pipeline_v(), .method.data = ~U32(0U), .mailbox = { .id = 1U, /*sideband?*/ .data = ~U32(0U), .clr = ~U32(0U), .ret = NULL, .ok = gr_fecs_ctxsw_mailbox_value_pass_v(), .fail = gr_fecs_ctxsw_mailbox_value_fail_v(), }, .cond.ok = GR_IS_UCODE_OP_EQUAL, .cond.fail = GR_IS_UCODE_OP_EQUAL }, false); } int gr_gk20a_commit_inst(struct channel_gk20a *c, u64 gpu_va) { u32 addr_lo; u32 addr_hi; nvgpu_log_fn(c->g, " "); addr_lo = u64_lo32(gpu_va) >> 12; addr_hi = u64_hi32(gpu_va); nvgpu_mem_wr32(c->g, &c->inst_block, ram_in_gr_wfi_target_w(), ram_in_gr_cs_wfi_f() | ram_in_gr_wfi_mode_virtual_f() | ram_in_gr_wfi_ptr_lo_f(addr_lo)); nvgpu_mem_wr32(c->g, &c->inst_block, ram_in_gr_wfi_ptr_hi_w(), ram_in_gr_wfi_ptr_hi_f(addr_hi)); return 0; } static u32 fecs_current_ctx_data(struct gk20a *g, struct nvgpu_mem *inst_block) { u64 ptr = nvgpu_inst_block_addr(g, inst_block) >> ram_in_base_shift_v(); u32 aperture = nvgpu_aperture_mask(g, inst_block, gr_fecs_current_ctx_target_sys_mem_ncoh_f(), gr_fecs_current_ctx_target_sys_mem_coh_f(), gr_fecs_current_ctx_target_vid_mem_f()); return gr_fecs_current_ctx_ptr_f(u64_lo32(ptr)) | aperture | gr_fecs_current_ctx_valid_f(1); } int gr_gk20a_fecs_ctx_bind_channel(struct gk20a *g, struct channel_gk20a *c) { u32 inst_base_ptr = u64_lo32(nvgpu_inst_block_addr(g, &c->inst_block) >> ram_in_base_shift_v()); u32 data = fecs_current_ctx_data(g, &c->inst_block); int ret; nvgpu_log_info(g, "bind channel %d inst ptr 0x%08x", c->chid, inst_base_ptr); ret = gr_gk20a_submit_fecs_method_op(g, (struct fecs_method_op_gk20a) { .method.addr = gr_fecs_method_push_adr_bind_pointer_v(), .method.data = data, .mailbox = { .id = 0, .data = 0, .clr = 0x30, .ret = NULL, .ok = 0x10, .fail = 0x20, }, .cond.ok = GR_IS_UCODE_OP_AND, .cond.fail = GR_IS_UCODE_OP_AND}, true); if (ret != 0) { nvgpu_err(g, "bind channel instance failed"); } return ret; } static int gr_gk20a_ctx_zcull_setup(struct gk20a *g, struct channel_gk20a *c, struct nvgpu_gr_ctx *gr_ctx) { int ret = 0; nvgpu_log_fn(g, " "); ret = gk20a_disable_channel_tsg(g, c); if (ret != 0) { nvgpu_err(g, "failed to disable channel/TSG"); return ret; } ret = gk20a_fifo_preempt(g, c); if (ret != 0) { gk20a_enable_channel_tsg(g, c); nvgpu_err(g, "failed to preempt channel/TSG"); return ret; } if (c->subctx != NULL) { ret = nvgpu_gr_ctx_zcull_setup(g, gr_ctx, false); if (ret == 0) { nvgpu_gr_subctx_zcull_setup(g, c->subctx, gr_ctx); } } else { ret = nvgpu_gr_ctx_zcull_setup(g, gr_ctx, true); } gk20a_enable_channel_tsg(g, c); return ret; } u32 gk20a_gr_gpc_offset(struct gk20a *g, u32 gpc) { u32 gpc_stride = 
nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE); u32 gpc_offset = gpc_stride * gpc; return gpc_offset; } u32 gk20a_gr_tpc_offset(struct gk20a *g, u32 tpc) { u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE); u32 tpc_offset = tpc_in_gpc_stride * tpc; return tpc_offset; } int gr_gk20a_commit_global_ctx_buffers(struct gk20a *g, struct nvgpu_gr_ctx *gr_ctx, bool patch) { struct gr_gk20a *gr = &g->gr; u64 addr; u32 size; nvgpu_log_fn(g, " "); if (patch) { int err; err = nvgpu_gr_ctx_patch_write_begin(g, gr_ctx, false); if (err != 0) { return err; } } /* global pagepool buffer */ addr = nvgpu_gr_ctx_get_global_ctx_va(gr_ctx, NVGPU_GR_CTX_PAGEPOOL_VA) >> U64(gr_scc_pagepool_base_addr_39_8_align_bits_v()); size = (u32)nvgpu_gr_global_ctx_get_size(gr->global_ctx_buffer, NVGPU_GR_GLOBAL_CTX_PAGEPOOL) / gr_scc_pagepool_total_pages_byte_granularity_v(); if (size == g->ops.gr.pagepool_default_size(g)) { size = gr_scc_pagepool_total_pages_hwmax_v(); } nvgpu_log_info(g, "pagepool buffer addr : 0x%016llx, size : %d", addr, size); g->ops.gr.commit_global_pagepool(g, gr_ctx, addr, size, patch); /* global bundle cb */ addr = nvgpu_gr_ctx_get_global_ctx_va(gr_ctx, NVGPU_GR_CTX_CIRCULAR_VA) >> U64(gr_scc_bundle_cb_base_addr_39_8_align_bits_v()); size = gr->bundle_cb_default_size; nvgpu_log_info(g, "bundle cb addr : 0x%016llx, size : %d", addr, size); g->ops.gr.commit_global_bundle_cb(g, gr_ctx, addr, size, patch); /* global attrib cb */ addr = nvgpu_gr_ctx_get_global_ctx_va(gr_ctx, NVGPU_GR_CTX_ATTRIBUTE_VA) >> U64(gr_gpcs_setup_attrib_cb_base_addr_39_12_align_bits_v()); nvgpu_log_info(g, "attrib cb addr : 0x%016llx", addr); g->ops.gr.commit_global_attrib_cb(g, gr_ctx, addr, patch); g->ops.gr.commit_global_cb_manager(g, gr_ctx, patch); if (patch) { nvgpu_gr_ctx_patch_write_end(g, gr_ctx, false); } return 0; } int gr_gk20a_commit_global_timeslice(struct gk20a *g, struct channel_gk20a *c) { struct gr_gk20a *gr = &g->gr; struct nvgpu_gr_ctx *gr_ctx = NULL; u32 gpm_pd_cfg; u32 pd_ab_dist_cfg0; u32 ds_debug; u32 mpc_vtg_debug; u32 pe_vaf; u32 pe_vsc_vpc; nvgpu_log_fn(g, " "); gpm_pd_cfg = gk20a_readl(g, gr_gpcs_gpm_pd_cfg_r()); pd_ab_dist_cfg0 = gk20a_readl(g, gr_pd_ab_dist_cfg0_r()); ds_debug = gk20a_readl(g, gr_ds_debug_r()); mpc_vtg_debug = gk20a_readl(g, gr_gpcs_tpcs_mpc_vtg_debug_r()); if (gr->timeslice_mode == gr_gpcs_ppcs_cbm_cfg_timeslice_mode_enable_v()) { pe_vaf = gk20a_readl(g, gr_gpcs_tpcs_pe_vaf_r()); pe_vsc_vpc = gk20a_readl(g, gr_gpcs_tpcs_pes_vsc_vpc_r()); gpm_pd_cfg = gr_gpcs_gpm_pd_cfg_timeslice_mode_enable_f() | gpm_pd_cfg; pe_vaf = gr_gpcs_tpcs_pe_vaf_fast_mode_switch_true_f() | pe_vaf; pe_vsc_vpc = gr_gpcs_tpcs_pes_vsc_vpc_fast_mode_switch_true_f() | pe_vsc_vpc; pd_ab_dist_cfg0 = gr_pd_ab_dist_cfg0_timeslice_enable_en_f() | pd_ab_dist_cfg0; ds_debug = gr_ds_debug_timeslice_mode_enable_f() | ds_debug; mpc_vtg_debug = gr_gpcs_tpcs_mpc_vtg_debug_timeslice_mode_enabled_f() | mpc_vtg_debug; nvgpu_gr_ctx_patch_write(g, gr_ctx, gr_gpcs_gpm_pd_cfg_r(), gpm_pd_cfg, false); nvgpu_gr_ctx_patch_write(g, gr_ctx, gr_gpcs_tpcs_pe_vaf_r(), pe_vaf, false); nvgpu_gr_ctx_patch_write(g, gr_ctx, gr_gpcs_tpcs_pes_vsc_vpc_r(), pe_vsc_vpc, false); nvgpu_gr_ctx_patch_write(g, gr_ctx, gr_pd_ab_dist_cfg0_r(), pd_ab_dist_cfg0, false); nvgpu_gr_ctx_patch_write(g, gr_ctx, gr_ds_debug_r(), ds_debug, false); nvgpu_gr_ctx_patch_write(g, gr_ctx, gr_gpcs_tpcs_mpc_vtg_debug_r(), mpc_vtg_debug, false); } else { gpm_pd_cfg = gr_gpcs_gpm_pd_cfg_timeslice_mode_disable_f() | gpm_pd_cfg; pd_ab_dist_cfg0 = 
gr_pd_ab_dist_cfg0_timeslice_enable_dis_f() | pd_ab_dist_cfg0; ds_debug = gr_ds_debug_timeslice_mode_disable_f() | ds_debug; mpc_vtg_debug = gr_gpcs_tpcs_mpc_vtg_debug_timeslice_mode_disabled_f() | mpc_vtg_debug; nvgpu_gr_ctx_patch_write(g, gr_ctx, gr_gpcs_gpm_pd_cfg_r(), gpm_pd_cfg, false); nvgpu_gr_ctx_patch_write(g, gr_ctx, gr_pd_ab_dist_cfg0_r(), pd_ab_dist_cfg0, false); nvgpu_gr_ctx_patch_write(g, gr_ctx, gr_ds_debug_r(), ds_debug, false); nvgpu_gr_ctx_patch_write(g, gr_ctx, gr_gpcs_tpcs_mpc_vtg_debug_r(), mpc_vtg_debug, false); } return 0; } int gr_gk20a_setup_rop_mapping(struct gk20a *g, struct gr_gk20a *gr) { u32 norm_entries, norm_shift; u32 coeff5_mod, coeff6_mod, coeff7_mod, coeff8_mod, coeff9_mod, coeff10_mod, coeff11_mod; u32 map0, map1, map2, map3, map4, map5; if (gr->config->map_tiles == NULL) { return -1; } nvgpu_log_fn(g, " "); gk20a_writel(g, gr_crstr_map_table_cfg_r(), gr_crstr_map_table_cfg_row_offset_f( nvgpu_gr_config_get_map_row_offset(gr->config)) | gr_crstr_map_table_cfg_num_entries_f( nvgpu_gr_config_get_tpc_count(gr->config))); map0 = gr_crstr_gpc_map0_tile0_f(nvgpu_gr_config_get_map_tile_count(gr->config, 0)) | gr_crstr_gpc_map0_tile1_f(nvgpu_gr_config_get_map_tile_count(gr->config, 1)) | gr_crstr_gpc_map0_tile2_f(nvgpu_gr_config_get_map_tile_count(gr->config, 2)) | gr_crstr_gpc_map0_tile3_f(nvgpu_gr_config_get_map_tile_count(gr->config, 3)) | gr_crstr_gpc_map0_tile4_f(nvgpu_gr_config_get_map_tile_count(gr->config, 4)) | gr_crstr_gpc_map0_tile5_f(nvgpu_gr_config_get_map_tile_count(gr->config, 5)); map1 = gr_crstr_gpc_map1_tile6_f(nvgpu_gr_config_get_map_tile_count(gr->config, 6)) | gr_crstr_gpc_map1_tile7_f(nvgpu_gr_config_get_map_tile_count(gr->config, 7)) | gr_crstr_gpc_map1_tile8_f(nvgpu_gr_config_get_map_tile_count(gr->config, 8)) | gr_crstr_gpc_map1_tile9_f(nvgpu_gr_config_get_map_tile_count(gr->config, 9)) | gr_crstr_gpc_map1_tile10_f(nvgpu_gr_config_get_map_tile_count(gr->config, 10)) | gr_crstr_gpc_map1_tile11_f(nvgpu_gr_config_get_map_tile_count(gr->config, 11)); map2 = gr_crstr_gpc_map2_tile12_f(nvgpu_gr_config_get_map_tile_count(gr->config, 12)) | gr_crstr_gpc_map2_tile13_f(nvgpu_gr_config_get_map_tile_count(gr->config, 13)) | gr_crstr_gpc_map2_tile14_f(nvgpu_gr_config_get_map_tile_count(gr->config, 14)) | gr_crstr_gpc_map2_tile15_f(nvgpu_gr_config_get_map_tile_count(gr->config, 15)) | gr_crstr_gpc_map2_tile16_f(nvgpu_gr_config_get_map_tile_count(gr->config, 16)) | gr_crstr_gpc_map2_tile17_f(nvgpu_gr_config_get_map_tile_count(gr->config, 17)); map3 = gr_crstr_gpc_map3_tile18_f(nvgpu_gr_config_get_map_tile_count(gr->config, 18)) | gr_crstr_gpc_map3_tile19_f(nvgpu_gr_config_get_map_tile_count(gr->config, 19)) | gr_crstr_gpc_map3_tile20_f(nvgpu_gr_config_get_map_tile_count(gr->config, 20)) | gr_crstr_gpc_map3_tile21_f(nvgpu_gr_config_get_map_tile_count(gr->config, 21)) | gr_crstr_gpc_map3_tile22_f(nvgpu_gr_config_get_map_tile_count(gr->config, 22)) | gr_crstr_gpc_map3_tile23_f(nvgpu_gr_config_get_map_tile_count(gr->config, 23)); map4 = gr_crstr_gpc_map4_tile24_f(nvgpu_gr_config_get_map_tile_count(gr->config, 24)) | gr_crstr_gpc_map4_tile25_f(nvgpu_gr_config_get_map_tile_count(gr->config, 25)) | gr_crstr_gpc_map4_tile26_f(nvgpu_gr_config_get_map_tile_count(gr->config, 26)) | gr_crstr_gpc_map4_tile27_f(nvgpu_gr_config_get_map_tile_count(gr->config, 27)) | gr_crstr_gpc_map4_tile28_f(nvgpu_gr_config_get_map_tile_count(gr->config, 28)) | gr_crstr_gpc_map4_tile29_f(nvgpu_gr_config_get_map_tile_count(gr->config, 29)); map5 = 
gr_crstr_gpc_map5_tile30_f(nvgpu_gr_config_get_map_tile_count(gr->config, 30)) | gr_crstr_gpc_map5_tile31_f(nvgpu_gr_config_get_map_tile_count(gr->config, 31)) | gr_crstr_gpc_map5_tile32_f(0) | gr_crstr_gpc_map5_tile33_f(0) | gr_crstr_gpc_map5_tile34_f(0) | gr_crstr_gpc_map5_tile35_f(0); gk20a_writel(g, gr_crstr_gpc_map0_r(), map0); gk20a_writel(g, gr_crstr_gpc_map1_r(), map1); gk20a_writel(g, gr_crstr_gpc_map2_r(), map2); gk20a_writel(g, gr_crstr_gpc_map3_r(), map3); gk20a_writel(g, gr_crstr_gpc_map4_r(), map4); gk20a_writel(g, gr_crstr_gpc_map5_r(), map5); switch (nvgpu_gr_config_get_tpc_count(gr->config)) { case 1: norm_shift = 4; break; case 2: case 3: norm_shift = 3; break; case 4: case 5: case 6: case 7: norm_shift = 2; break; case 8: case 9: case 10: case 11: case 12: case 13: case 14: case 15: norm_shift = 1; break; default: norm_shift = 0; break; } norm_entries = nvgpu_gr_config_get_tpc_count(gr->config) << norm_shift; coeff5_mod = BIT32(5) % norm_entries; coeff6_mod = BIT32(6) % norm_entries; coeff7_mod = BIT32(7) % norm_entries; coeff8_mod = BIT32(8) % norm_entries; coeff9_mod = BIT32(9) % norm_entries; coeff10_mod = BIT32(10) % norm_entries; coeff11_mod = BIT32(11) % norm_entries; gk20a_writel(g, gr_ppcs_wwdx_map_table_cfg_r(), gr_ppcs_wwdx_map_table_cfg_row_offset_f( nvgpu_gr_config_get_map_row_offset(gr->config)) | gr_ppcs_wwdx_map_table_cfg_normalized_num_entries_f(norm_entries) | gr_ppcs_wwdx_map_table_cfg_normalized_shift_value_f(norm_shift) | gr_ppcs_wwdx_map_table_cfg_coeff5_mod_value_f(coeff5_mod) | gr_ppcs_wwdx_map_table_cfg_num_entries_f( nvgpu_gr_config_get_tpc_count(gr->config))); gk20a_writel(g, gr_ppcs_wwdx_map_table_cfg2_r(), gr_ppcs_wwdx_map_table_cfg2_coeff6_mod_value_f(coeff6_mod) | gr_ppcs_wwdx_map_table_cfg2_coeff7_mod_value_f(coeff7_mod) | gr_ppcs_wwdx_map_table_cfg2_coeff8_mod_value_f(coeff8_mod) | gr_ppcs_wwdx_map_table_cfg2_coeff9_mod_value_f(coeff9_mod) | gr_ppcs_wwdx_map_table_cfg2_coeff10_mod_value_f(coeff10_mod) | gr_ppcs_wwdx_map_table_cfg2_coeff11_mod_value_f(coeff11_mod)); gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map0_r(), map0); gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map1_r(), map1); gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map2_r(), map2); gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map3_r(), map3); gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map4_r(), map4); gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map5_r(), map5); gk20a_writel(g, gr_rstr2d_map_table_cfg_r(), gr_rstr2d_map_table_cfg_row_offset_f( nvgpu_gr_config_get_map_row_offset(gr->config)) | gr_rstr2d_map_table_cfg_num_entries_f( nvgpu_gr_config_get_tpc_count(gr->config))); gk20a_writel(g, gr_rstr2d_gpc_map0_r(), map0); gk20a_writel(g, gr_rstr2d_gpc_map1_r(), map1); gk20a_writel(g, gr_rstr2d_gpc_map2_r(), map2); gk20a_writel(g, gr_rstr2d_gpc_map3_r(), map3); gk20a_writel(g, gr_rstr2d_gpc_map4_r(), map4); gk20a_writel(g, gr_rstr2d_gpc_map5_r(), map5); return 0; } int gr_gk20a_init_sm_id_table(struct gk20a *g) { u32 gpc, tpc; u32 sm_id = 0; for (tpc = 0; tpc < nvgpu_gr_config_get_max_tpc_per_gpc_count(g->gr.config); tpc++) { for (gpc = 0; gpc < nvgpu_gr_config_get_gpc_count(g->gr.config); gpc++) { if (tpc < nvgpu_gr_config_get_gpc_tpc_count(g->gr.config, gpc)) { g->gr.sm_to_cluster[sm_id].tpc_index = tpc; g->gr.sm_to_cluster[sm_id].gpc_index = gpc; g->gr.sm_to_cluster[sm_id].sm_index = 0; g->gr.sm_to_cluster[sm_id].global_tpc_index = sm_id; sm_id++; } } } g->gr.no_of_sm = sm_id; return 0; } int gr_gk20a_init_fs_state(struct gk20a *g) { struct gr_gk20a *gr = &g->gr; u32 tpc_index, gpc_index; u32 sm_id = 0, gpc_id = 0; 
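	/*
	 * Floorsweeping state: program SM id numbering from the
	 * SM-to-cluster table, write the per-GPC TPC counts (eight GPCs per
	 * gr_pd_num_tpc_per_gpc/gr_ds_num_tpc_per_gpc register), set up the
	 * ROP mapping and PD distribution skip table, and program the CWD
	 * GPC/TPC counts, honoring tpc_fs_mask_user only when the fuse TPC
	 * mask exposes all TPCs.
	 */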
u32 tpc_per_gpc; u32 fuse_tpc_mask; u32 reg_index; int err; nvgpu_log_fn(g, " "); if (g->ops.gr.init_sm_id_table != NULL) { err = g->ops.gr.init_sm_id_table(g); if (err != 0) { return err; } /* Is table empty ? */ if (g->gr.no_of_sm == 0U) { return -EINVAL; } } for (sm_id = 0; sm_id < g->gr.no_of_sm; sm_id++) { tpc_index = g->gr.sm_to_cluster[sm_id].tpc_index; gpc_index = g->gr.sm_to_cluster[sm_id].gpc_index; g->ops.gr.program_sm_id_numbering(g, gpc_index, tpc_index, sm_id); if (g->ops.gr.program_active_tpc_counts != NULL) { g->ops.gr.program_active_tpc_counts(g, gpc_index); } } for (reg_index = 0U, gpc_id = 0U; reg_index < gr_pd_num_tpc_per_gpc__size_1_v(); reg_index++, gpc_id += 8U) { tpc_per_gpc = gr_pd_num_tpc_per_gpc_count0_f( nvgpu_gr_config_get_gpc_tpc_count(gr->config, gpc_id + 0U)) | gr_pd_num_tpc_per_gpc_count1_f( nvgpu_gr_config_get_gpc_tpc_count(gr->config, gpc_id + 1U)) | gr_pd_num_tpc_per_gpc_count2_f( nvgpu_gr_config_get_gpc_tpc_count(gr->config, gpc_id + 2U)) | gr_pd_num_tpc_per_gpc_count3_f( nvgpu_gr_config_get_gpc_tpc_count(gr->config, gpc_id + 3U)) | gr_pd_num_tpc_per_gpc_count4_f( nvgpu_gr_config_get_gpc_tpc_count(gr->config, gpc_id + 4U)) | gr_pd_num_tpc_per_gpc_count5_f( nvgpu_gr_config_get_gpc_tpc_count(gr->config, gpc_id + 5U)) | gr_pd_num_tpc_per_gpc_count6_f( nvgpu_gr_config_get_gpc_tpc_count(gr->config, gpc_id + 6U)) | gr_pd_num_tpc_per_gpc_count7_f( nvgpu_gr_config_get_gpc_tpc_count(gr->config, gpc_id + 7U)); gk20a_writel(g, gr_pd_num_tpc_per_gpc_r(reg_index), tpc_per_gpc); gk20a_writel(g, gr_ds_num_tpc_per_gpc_r(reg_index), tpc_per_gpc); } /* gr__setup_pd_mapping stubbed for gk20a */ g->ops.gr.setup_rop_mapping(g, gr); if (g->ops.gr.setup_alpha_beta_tables != NULL) { g->ops.gr.setup_alpha_beta_tables(g, gr); } for (gpc_index = 0; gpc_index < gr_pd_dist_skip_table__size_1_v() * 4U; gpc_index += 4U) { bool skip_mask = (gr_pd_dist_skip_table_gpc_4n0_mask_f( nvgpu_gr_config_get_gpc_skip_mask(gr->config, gpc_index)) != 0U) || (gr_pd_dist_skip_table_gpc_4n1_mask_f( nvgpu_gr_config_get_gpc_skip_mask(gr->config, gpc_index + 1U)) != 0U) || (gr_pd_dist_skip_table_gpc_4n2_mask_f( nvgpu_gr_config_get_gpc_skip_mask(gr->config, gpc_index + 2U)) != 0U) || (gr_pd_dist_skip_table_gpc_4n3_mask_f( nvgpu_gr_config_get_gpc_skip_mask(gr->config, gpc_index + 3U)) != 0U); gk20a_writel(g, gr_pd_dist_skip_table_r(gpc_index/4U), (u32)skip_mask); } fuse_tpc_mask = g->ops.gr.config.get_gpc_tpc_mask(g, gr->config, 0); if ((g->tpc_fs_mask_user != 0U) && (fuse_tpc_mask == BIT32(nvgpu_gr_config_get_max_tpc_count(gr->config)) - 1U)) { u32 val = g->tpc_fs_mask_user; val &= BIT32(nvgpu_gr_config_get_max_tpc_count(gr->config)) - U32(1); gk20a_writel(g, gr_cwd_fs_r(), gr_cwd_fs_num_gpcs_f(nvgpu_gr_config_get_gpc_count(gr->config)) | gr_cwd_fs_num_tpcs_f((u32)hweight32(val))); } else { gk20a_writel(g, gr_cwd_fs_r(), gr_cwd_fs_num_gpcs_f(nvgpu_gr_config_get_gpc_count(gr->config)) | gr_cwd_fs_num_tpcs_f(nvgpu_gr_config_get_tpc_count(gr->config))); } gk20a_writel(g, gr_bes_zrop_settings_r(), gr_bes_zrop_settings_num_active_fbps_f(gr->num_fbps)); gk20a_writel(g, gr_bes_crop_settings_r(), gr_bes_crop_settings_num_active_fbps_f(gr->num_fbps)); return 0; } int gr_gk20a_fecs_ctx_image_save(struct channel_gk20a *c, u32 save_type) { struct gk20a *g = c->g; int ret; nvgpu_log_fn(g, " "); ret = gr_gk20a_submit_fecs_method_op(g, (struct fecs_method_op_gk20a) { .method.addr = save_type, .method.data = fecs_current_ctx_data(g, &c->inst_block), .mailbox = {.id = 0, .data = 0, .clr = 3, .ret = NULL, .ok = 1, .fail = 
2, }, .cond.ok = GR_IS_UCODE_OP_AND, .cond.fail = GR_IS_UCODE_OP_AND, }, true); if (ret != 0) { nvgpu_err(g, "save context image failed"); } return ret; } int gk20a_init_sw_bundle(struct gk20a *g) { struct netlist_av_list *sw_bundle_init = &g->netlist_vars->sw_bundle_init; u32 last_bundle_data = 0; int err = 0; unsigned int i; /* disable fe_go_idle */ gk20a_writel(g, gr_fe_go_idle_timeout_r(), gr_fe_go_idle_timeout_count_disabled_f()); /* enable pipe mode override */ gk20a_writel(g, gr_pipe_bundle_config_r(), gr_pipe_bundle_config_override_pipe_mode_enabled_f()); /* load bundle init */ for (i = 0U; i < sw_bundle_init->count; i++) { if (i == 0U || last_bundle_data != sw_bundle_init->l[i].value) { gk20a_writel(g, gr_pipe_bundle_data_r(), sw_bundle_init->l[i].value); last_bundle_data = sw_bundle_init->l[i].value; } gk20a_writel(g, gr_pipe_bundle_address_r(), sw_bundle_init->l[i].addr); if (gr_pipe_bundle_address_value_v(sw_bundle_init->l[i].addr) == GR_GO_IDLE_BUNDLE) { err = gr_gk20a_wait_idle(g); if (err != 0) { goto error; } } err = gr_gk20a_wait_fe_idle(g); if (err != 0) { goto error; } } if ((err == 0) && (g->ops.gr.init_sw_veid_bundle != NULL)) { err = g->ops.gr.init_sw_veid_bundle(g); if (err != 0) { goto error; } } if (g->ops.gr.init_sw_bundle64 != NULL) { err = g->ops.gr.init_sw_bundle64(g); if (err != 0) { goto error; } } /* disable pipe mode override */ gk20a_writel(g, gr_pipe_bundle_config_r(), gr_pipe_bundle_config_override_pipe_mode_disabled_f()); err = gr_gk20a_wait_idle(g); /* restore fe_go_idle */ gk20a_writel(g, gr_fe_go_idle_timeout_r(), gr_fe_go_idle_timeout_count_prod_f()); return err; error: /* in case of error skip waiting for GR idle - just restore state */ gk20a_writel(g, gr_pipe_bundle_config_r(), gr_pipe_bundle_config_override_pipe_mode_disabled_f()); /* restore fe_go_idle */ gk20a_writel(g, gr_fe_go_idle_timeout_r(), gr_fe_go_idle_timeout_count_prod_f()); return err; } /* init global golden image from a fresh gr_ctx in channel ctx. save a copy in local_golden_image in ctx_vars */ int gr_gk20a_init_golden_ctx_image(struct gk20a *g, struct channel_gk20a *c, struct nvgpu_gr_ctx *gr_ctx) { struct gr_gk20a *gr = &g->gr; u32 i; struct nvgpu_mem *gr_mem; int err = 0; struct netlist_aiv_list *sw_ctx_load = &g->netlist_vars->sw_ctx_load; struct netlist_av_list *sw_method_init = &g->netlist_vars->sw_method_init; u32 last_method_data = 0; nvgpu_log_fn(g, " "); gr_mem = &gr_ctx->mem; /* golden ctx is global to all channels. 
Although only the first channel initializes golden image, driver needs to prevent multiple channels from initializing golden ctx at the same time */ nvgpu_mutex_acquire(&gr->ctx_mutex); if (gr->ctx_vars.golden_image_initialized) { goto clean_up; } if (!nvgpu_is_enabled(g, NVGPU_IS_FMODEL)) { struct nvgpu_timeout timeout; nvgpu_timeout_init(g, &timeout, FE_PWR_MODE_TIMEOUT_MAX / FE_PWR_MODE_TIMEOUT_DEFAULT, NVGPU_TIMER_RETRY_TIMER); gk20a_writel(g, gr_fe_pwr_mode_r(), gr_fe_pwr_mode_req_send_f() | gr_fe_pwr_mode_mode_force_on_f()); do { u32 req = gr_fe_pwr_mode_req_v(gk20a_readl(g, gr_fe_pwr_mode_r())); if (req == gr_fe_pwr_mode_req_done_v()) { break; } nvgpu_udelay(FE_PWR_MODE_TIMEOUT_DEFAULT); } while (nvgpu_timeout_expired_msg(&timeout, "timeout forcing FE on") == 0); } gk20a_writel(g, gr_fecs_ctxsw_reset_ctl_r(), gr_fecs_ctxsw_reset_ctl_sys_halt_disabled_f() | gr_fecs_ctxsw_reset_ctl_gpc_halt_disabled_f() | gr_fecs_ctxsw_reset_ctl_be_halt_disabled_f() | gr_fecs_ctxsw_reset_ctl_sys_engine_reset_disabled_f() | gr_fecs_ctxsw_reset_ctl_gpc_engine_reset_disabled_f() | gr_fecs_ctxsw_reset_ctl_be_engine_reset_disabled_f() | gr_fecs_ctxsw_reset_ctl_sys_context_reset_enabled_f() | gr_fecs_ctxsw_reset_ctl_gpc_context_reset_enabled_f() | gr_fecs_ctxsw_reset_ctl_be_context_reset_enabled_f()); (void) gk20a_readl(g, gr_fecs_ctxsw_reset_ctl_r()); nvgpu_udelay(10); gk20a_writel(g, gr_fecs_ctxsw_reset_ctl_r(), gr_fecs_ctxsw_reset_ctl_sys_halt_disabled_f() | gr_fecs_ctxsw_reset_ctl_gpc_halt_disabled_f() | gr_fecs_ctxsw_reset_ctl_be_halt_disabled_f() | gr_fecs_ctxsw_reset_ctl_sys_engine_reset_disabled_f() | gr_fecs_ctxsw_reset_ctl_gpc_engine_reset_disabled_f() | gr_fecs_ctxsw_reset_ctl_be_engine_reset_disabled_f() | gr_fecs_ctxsw_reset_ctl_sys_context_reset_disabled_f() | gr_fecs_ctxsw_reset_ctl_gpc_context_reset_disabled_f() | gr_fecs_ctxsw_reset_ctl_be_context_reset_disabled_f()); (void) gk20a_readl(g, gr_fecs_ctxsw_reset_ctl_r()); nvgpu_udelay(10); if (!nvgpu_is_enabled(g, NVGPU_IS_FMODEL)) { struct nvgpu_timeout timeout; nvgpu_timeout_init(g, &timeout, FE_PWR_MODE_TIMEOUT_MAX / FE_PWR_MODE_TIMEOUT_DEFAULT, NVGPU_TIMER_RETRY_TIMER); gk20a_writel(g, gr_fe_pwr_mode_r(), gr_fe_pwr_mode_req_send_f() | gr_fe_pwr_mode_mode_auto_f()); do { u32 req = gr_fe_pwr_mode_req_v(gk20a_readl(g, gr_fe_pwr_mode_r())); if (req == gr_fe_pwr_mode_req_done_v()) { break; } nvgpu_udelay(FE_PWR_MODE_TIMEOUT_DEFAULT); } while (nvgpu_timeout_expired_msg(&timeout, "timeout setting FE power to auto") == 0); } /* clear scc ram */ gk20a_writel(g, gr_scc_init_r(), gr_scc_init_ram_trigger_f()); err = gr_gk20a_fecs_ctx_bind_channel(g, c); if (err != 0) { goto clean_up; } err = gr_gk20a_wait_idle(g); /* load ctx init */ for (i = 0; i < sw_ctx_load->count; i++) { gk20a_writel(g, sw_ctx_load->l[i].addr, sw_ctx_load->l[i].value); } if (g->ops.gr.init_preemption_state != NULL) { g->ops.gr.init_preemption_state(g); } if (g->ops.clock_gating.blcg_gr_load_gating_prod != NULL) { g->ops.clock_gating.blcg_gr_load_gating_prod(g, g->blcg_enabled); } err = gr_gk20a_wait_idle(g); if (err != 0) { goto clean_up; } /* disable fe_go_idle */ gk20a_writel(g, gr_fe_go_idle_timeout_r(), gr_fe_go_idle_timeout_count_disabled_f()); err = g->ops.gr.commit_global_ctx_buffers(g, gr_ctx, false); if (err != 0) { goto clean_up; } /* override a few ctx state registers */ g->ops.gr.commit_global_timeslice(g, c); /* floorsweep anything left */ err = g->ops.gr.init_fs_state(g); if (err != 0) { goto clean_up; } err = gr_gk20a_wait_idle(g); if (err != 0) { goto 
restore_fe_go_idle; } err = gk20a_init_sw_bundle(g); if (err != 0) { goto clean_up; } restore_fe_go_idle: /* restore fe_go_idle */ gk20a_writel(g, gr_fe_go_idle_timeout_r(), gr_fe_go_idle_timeout_count_prod_f()); if ((err != 0) || (gr_gk20a_wait_idle(g) != 0)) { goto clean_up; } /* load method init */ if (sw_method_init->count != 0U) { gk20a_writel(g, gr_pri_mme_shadow_raw_data_r(), sw_method_init->l[0].value); gk20a_writel(g, gr_pri_mme_shadow_raw_index_r(), gr_pri_mme_shadow_raw_index_write_trigger_f() | sw_method_init->l[0].addr); last_method_data = sw_method_init->l[0].value; } for (i = 1; i < sw_method_init->count; i++) { if (sw_method_init->l[i].value != last_method_data) { gk20a_writel(g, gr_pri_mme_shadow_raw_data_r(), sw_method_init->l[i].value); last_method_data = sw_method_init->l[i].value; } gk20a_writel(g, gr_pri_mme_shadow_raw_index_r(), gr_pri_mme_shadow_raw_index_write_trigger_f() | sw_method_init->l[i].addr); } err = gr_gk20a_wait_idle(g); if (err != 0) { goto clean_up; } err = nvgpu_gr_ctx_init_zcull(g, gr_ctx); if (err != 0) { goto clean_up; } gr_gk20a_fecs_ctx_image_save(c, gr_fecs_method_push_adr_wfi_golden_save_v()); gr->local_golden_image = nvgpu_gr_global_ctx_init_local_golden_image(g, gr_mem, gr->ctx_vars.golden_image_size); if (gr->local_golden_image == NULL) { err = -ENOMEM; goto clean_up; } gr->ctx_vars.golden_image_initialized = true; gk20a_writel(g, gr_fecs_current_ctx_r(), gr_fecs_current_ctx_valid_false_f()); clean_up: if (err != 0) { nvgpu_err(g, "fail"); } else { nvgpu_log_fn(g, "done"); } nvgpu_mutex_release(&gr->ctx_mutex); return err; } int gr_gk20a_update_smpc_ctxsw_mode(struct gk20a *g, struct channel_gk20a *c, bool enable_smpc_ctxsw) { struct tsg_gk20a *tsg; int ret; nvgpu_log_fn(g, " "); tsg = tsg_gk20a_from_ch(c); if (tsg == NULL) { return -EINVAL; } ret = gk20a_disable_channel_tsg(g, c); if (ret != 0) { nvgpu_err(g, "failed to disable channel/TSG"); goto out; } ret = gk20a_fifo_preempt(g, c); if (ret != 0) { gk20a_enable_channel_tsg(g, c); nvgpu_err(g, "failed to preempt channel/TSG"); goto out; } ret = nvgpu_gr_ctx_set_smpc_mode(g, tsg->gr_ctx, enable_smpc_ctxsw); out: gk20a_enable_channel_tsg(g, c); return ret; } int gr_gk20a_update_hwpm_ctxsw_mode(struct gk20a *g, struct channel_gk20a *c, u64 gpu_va, u32 mode) { struct tsg_gk20a *tsg; struct nvgpu_gr_ctx *gr_ctx; bool skip_update = false; int ret; nvgpu_log_fn(g, " "); tsg = tsg_gk20a_from_ch(c); if (tsg == NULL) { return -EINVAL; } gr_ctx = tsg->gr_ctx; if (mode != NVGPU_GR_CTX_HWPM_CTXSW_MODE_NO_CTXSW) { nvgpu_gr_ctx_set_size(g->gr.gr_ctx_desc, NVGPU_GR_CTX_PM_CTX, g->gr.ctx_vars.pm_ctxsw_image_size); ret = nvgpu_gr_ctx_alloc_pm_ctx(g, gr_ctx, g->gr.gr_ctx_desc, c->vm, gpu_va); if (ret != 0) { nvgpu_err(g, "failed to allocate pm ctxt buffer"); return ret; } if ((mode == NVGPU_GR_CTX_HWPM_CTXSW_MODE_STREAM_OUT_CTXSW) && (g->ops.gr.init_hwpm_pmm_register != NULL)) { g->ops.gr.init_hwpm_pmm_register(g); } } ret = nvgpu_gr_ctx_prepare_hwpm_mode(g, gr_ctx, mode, &skip_update); if (ret != 0) { return ret; } if (skip_update) { return 0; } ret = gk20a_disable_channel_tsg(g, c); if (ret != 0) { nvgpu_err(g, "failed to disable channel/TSG"); return ret; } ret = gk20a_fifo_preempt(g, c); if (ret != 0) { gk20a_enable_channel_tsg(g, c); nvgpu_err(g, "failed to preempt channel/TSG"); return ret; } if (c->subctx != NULL) { struct channel_gk20a *ch; nvgpu_rwsem_down_read(&tsg->ch_list_lock); nvgpu_list_for_each_entry(ch, &tsg->ch_list, channel_gk20a, ch_entry) { ret = nvgpu_gr_ctx_set_hwpm_mode(g, gr_ctx, 
false); if (ret == 0) { nvgpu_gr_subctx_set_hwpm_mode(g, ch->subctx, gr_ctx); } } nvgpu_rwsem_up_read(&tsg->ch_list_lock); } else { ret = nvgpu_gr_ctx_set_hwpm_mode(g, gr_ctx, true); } /* enable channel */ gk20a_enable_channel_tsg(g, c); return ret; } static void gr_gk20a_start_falcon_ucode(struct gk20a *g) { nvgpu_log_fn(g, " "); gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0U), gr_fecs_ctxsw_mailbox_clear_value_f(~U32(0U))); gk20a_writel(g, gr_gpccs_dmactl_r(), gr_gpccs_dmactl_require_ctx_f(0U)); gk20a_writel(g, gr_fecs_dmactl_r(), gr_fecs_dmactl_require_ctx_f(0U)); gk20a_writel(g, gr_gpccs_cpuctl_r(), gr_gpccs_cpuctl_startcpu_f(1U)); gk20a_writel(g, gr_fecs_cpuctl_r(), gr_fecs_cpuctl_startcpu_f(1U)); nvgpu_log_fn(g, "done"); } static int gr_gk20a_init_ctxsw_ucode_vaspace(struct gk20a *g) { struct mm_gk20a *mm = &g->mm; struct vm_gk20a *vm = mm->pmu.vm; struct gk20a_ctxsw_ucode_info *ucode_info = &g->ctxsw_ucode_info; int err; err = g->ops.mm.alloc_inst_block(g, &ucode_info->inst_blk_desc); if (err != 0) { return err; } g->ops.mm.init_inst_block(&ucode_info->inst_blk_desc, vm, 0); /* Map ucode surface to GMMU */ ucode_info->surface_desc.gpu_va = nvgpu_gmmu_map(vm, &ucode_info->surface_desc, ucode_info->surface_desc.size, 0, /* flags */ gk20a_mem_flag_read_only, false, ucode_info->surface_desc.aperture); if (ucode_info->surface_desc.gpu_va == 0ULL) { nvgpu_err(g, "failed to update gmmu ptes"); return -ENOMEM; } return 0; } static void gr_gk20a_init_ctxsw_ucode_segment( struct gk20a_ctxsw_ucode_segment *p_seg, u32 *offset, u32 size) { p_seg->offset = *offset; p_seg->size = size; *offset = ALIGN(*offset + size, BLK_SIZE); } static void gr_gk20a_init_ctxsw_ucode_segments( struct gk20a_ctxsw_ucode_segments *segments, u32 *offset, struct gk20a_ctxsw_bootloader_desc *bootdesc, u32 code_size, u32 data_size) { u32 boot_size = ALIGN(bootdesc->size, sizeof(u32)); segments->boot_entry = bootdesc->entry_point; segments->boot_imem_offset = bootdesc->imem_offset; gr_gk20a_init_ctxsw_ucode_segment(&segments->boot, offset, boot_size); gr_gk20a_init_ctxsw_ucode_segment(&segments->code, offset, code_size); gr_gk20a_init_ctxsw_ucode_segment(&segments->data, offset, data_size); } static int gr_gk20a_copy_ctxsw_ucode_segments( struct gk20a *g, struct nvgpu_mem *dst, struct gk20a_ctxsw_ucode_segments *segments, u32 *bootimage, u32 *code, u32 *data) { unsigned int i; nvgpu_mem_wr_n(g, dst, segments->boot.offset, bootimage, segments->boot.size); nvgpu_mem_wr_n(g, dst, segments->code.offset, code, segments->code.size); nvgpu_mem_wr_n(g, dst, segments->data.offset, data, segments->data.size); /* compute a "checksum" for the boot binary to detect its version */ segments->boot_signature = 0; for (i = 0; i < segments->boot.size / sizeof(u32); i++) { segments->boot_signature += bootimage[i]; } return 0; } int gr_gk20a_init_ctxsw_ucode(struct gk20a *g) { struct mm_gk20a *mm = &g->mm; struct vm_gk20a *vm = mm->pmu.vm; struct gk20a_ctxsw_bootloader_desc *fecs_boot_desc; struct gk20a_ctxsw_bootloader_desc *gpccs_boot_desc; struct nvgpu_firmware *fecs_fw; struct nvgpu_firmware *gpccs_fw; u32 *fecs_boot_image; u32 *gpccs_boot_image; struct gk20a_ctxsw_ucode_info *ucode_info = &g->ctxsw_ucode_info; u32 ucode_size; int err = 0; fecs_fw = nvgpu_request_firmware(g, GK20A_FECS_UCODE_IMAGE, 0); if (fecs_fw == NULL) { nvgpu_err(g, "failed to load fecs ucode!!"); return -ENOENT; } fecs_boot_desc = (void *)fecs_fw->data; fecs_boot_image = (void *)(fecs_fw->data + sizeof(struct gk20a_ctxsw_bootloader_desc)); gpccs_fw = 
nvgpu_request_firmware(g, GK20A_GPCCS_UCODE_IMAGE, 0); if (gpccs_fw == NULL) { nvgpu_release_firmware(g, fecs_fw); nvgpu_err(g, "failed to load gpccs ucode!!"); return -ENOENT; } gpccs_boot_desc = (void *)gpccs_fw->data; gpccs_boot_image = (void *)(gpccs_fw->data + sizeof(struct gk20a_ctxsw_bootloader_desc)); ucode_size = 0; gr_gk20a_init_ctxsw_ucode_segments(&ucode_info->fecs, &ucode_size, fecs_boot_desc, g->netlist_vars->ucode.fecs.inst.count * (u32)sizeof(u32), g->netlist_vars->ucode.fecs.data.count * (u32)sizeof(u32)); gr_gk20a_init_ctxsw_ucode_segments(&ucode_info->gpccs, &ucode_size, gpccs_boot_desc, g->netlist_vars->ucode.gpccs.inst.count * (u32)sizeof(u32), g->netlist_vars->ucode.gpccs.data.count * (u32)sizeof(u32)); err = nvgpu_dma_alloc_sys(g, ucode_size, &ucode_info->surface_desc); if (err != 0) { goto clean_up; } gr_gk20a_copy_ctxsw_ucode_segments(g, &ucode_info->surface_desc, &ucode_info->fecs, fecs_boot_image, g->netlist_vars->ucode.fecs.inst.l, g->netlist_vars->ucode.fecs.data.l); nvgpu_release_firmware(g, fecs_fw); fecs_fw = NULL; gr_gk20a_copy_ctxsw_ucode_segments(g, &ucode_info->surface_desc, &ucode_info->gpccs, gpccs_boot_image, g->netlist_vars->ucode.gpccs.inst.l, g->netlist_vars->ucode.gpccs.data.l); nvgpu_release_firmware(g, gpccs_fw); gpccs_fw = NULL; err = gr_gk20a_init_ctxsw_ucode_vaspace(g); if (err != 0) { goto clean_up; } return 0; clean_up: if (ucode_info->surface_desc.gpu_va != 0ULL) { nvgpu_gmmu_unmap(vm, &ucode_info->surface_desc, ucode_info->surface_desc.gpu_va); } nvgpu_dma_free(g, &ucode_info->surface_desc); nvgpu_release_firmware(g, gpccs_fw); gpccs_fw = NULL; nvgpu_release_firmware(g, fecs_fw); fecs_fw = NULL; return err; } static void gr_gk20a_wait_for_fecs_arb_idle(struct gk20a *g) { int retries = FECS_ARB_CMD_TIMEOUT_MAX / FECS_ARB_CMD_TIMEOUT_DEFAULT; u32 val; val = gk20a_readl(g, gr_fecs_arb_ctx_cmd_r()); while ((gr_fecs_arb_ctx_cmd_cmd_v(val) != 0U) && (retries != 0)) { nvgpu_udelay(FECS_ARB_CMD_TIMEOUT_DEFAULT); retries--; val = gk20a_readl(g, gr_fecs_arb_ctx_cmd_r()); } if (retries == 0) { nvgpu_err(g, "arbiter cmd timeout, fecs arb ctx cmd: 0x%08x", gk20a_readl(g, gr_fecs_arb_ctx_cmd_r())); } retries = FECS_ARB_CMD_TIMEOUT_MAX / FECS_ARB_CMD_TIMEOUT_DEFAULT; while (((gk20a_readl(g, gr_fecs_ctxsw_status_1_r()) & gr_fecs_ctxsw_status_1_arb_busy_m()) != 0U) && (retries != 0)) { nvgpu_udelay(FECS_ARB_CMD_TIMEOUT_DEFAULT); retries--; } if (retries == 0) { nvgpu_err(g, "arbiter idle timeout, fecs ctxsw status: 0x%08x", gk20a_readl(g, gr_fecs_ctxsw_status_1_r())); } } void gr_gk20a_load_falcon_bind_instblk(struct gk20a *g) { struct gk20a_ctxsw_ucode_info *ucode_info = &g->ctxsw_ucode_info; int retries = FECS_ARB_CMD_TIMEOUT_MAX / FECS_ARB_CMD_TIMEOUT_DEFAULT; u64 inst_ptr_shifted_u64; u32 inst_ptr_shifted_u32; while (((gk20a_readl(g, gr_fecs_ctxsw_status_1_r()) & gr_fecs_ctxsw_status_1_arb_busy_m()) != 0U) && (retries != 0)) { nvgpu_udelay(FECS_ARB_CMD_TIMEOUT_DEFAULT); retries--; } if (retries == 0) { nvgpu_err(g, "arbiter idle timeout, status: %08x", gk20a_readl(g, gr_fecs_ctxsw_status_1_r())); } gk20a_writel(g, gr_fecs_arb_ctx_adr_r(), 0x0); inst_ptr_shifted_u64 = nvgpu_inst_block_addr(g, &ucode_info->inst_blk_desc); inst_ptr_shifted_u64 >>= 12; BUG_ON(u64_hi32(inst_ptr_shifted_u64) != 0U); inst_ptr_shifted_u32 = (u32)inst_ptr_shifted_u64; gk20a_writel(g, gr_fecs_new_ctx_r(), gr_fecs_new_ctx_ptr_f(inst_ptr_shifted_u32) | nvgpu_aperture_mask(g, &ucode_info->inst_blk_desc, gr_fecs_new_ctx_target_sys_mem_ncoh_f(), 
gr_fecs_new_ctx_target_sys_mem_coh_f(), gr_fecs_new_ctx_target_vid_mem_f()) | gr_fecs_new_ctx_valid_m()); gk20a_writel(g, gr_fecs_arb_ctx_ptr_r(), gr_fecs_arb_ctx_ptr_ptr_f(inst_ptr_shifted_u32) | nvgpu_aperture_mask(g, &ucode_info->inst_blk_desc, gr_fecs_arb_ctx_ptr_target_sys_mem_ncoh_f(), gr_fecs_arb_ctx_ptr_target_sys_mem_coh_f(), gr_fecs_arb_ctx_ptr_target_vid_mem_f())); gk20a_writel(g, gr_fecs_arb_ctx_cmd_r(), 0x7); /* Wait for arbiter command to complete */ gr_gk20a_wait_for_fecs_arb_idle(g); gk20a_writel(g, gr_fecs_current_ctx_r(), gr_fecs_current_ctx_ptr_f(inst_ptr_shifted_u32) | gr_fecs_current_ctx_target_m() | gr_fecs_current_ctx_valid_m()); /* Send command to arbiter to flush */ gk20a_writel(g, gr_fecs_arb_ctx_cmd_r(), gr_fecs_arb_ctx_cmd_cmd_s()); gr_gk20a_wait_for_fecs_arb_idle(g); } void gr_gk20a_load_ctxsw_ucode_header(struct gk20a *g, u64 addr_base, struct gk20a_ctxsw_ucode_segments *segments, u32 reg_offset) { u32 addr_code32; u32 addr_data32; addr_code32 = u64_lo32((addr_base + segments->code.offset) >> 8); addr_data32 = u64_lo32((addr_base + segments->data.offset) >> 8); /* * Copy falcon bootloader header into dmem at offset 0. * Configure dmem port 0 for auto-incrementing writes starting at dmem * offset 0. */ gk20a_writel(g, reg_offset + gr_fecs_dmemc_r(0), gr_fecs_dmemc_offs_f(0) | gr_fecs_dmemc_blk_f(0) | gr_fecs_dmemc_aincw_f(1)); /* Write out the actual data */ switch (segments->boot_signature) { case FALCON_UCODE_SIG_T18X_GPCCS_WITH_RESERVED: case FALCON_UCODE_SIG_T21X_FECS_WITH_DMEM_SIZE: case FALCON_UCODE_SIG_T21X_FECS_WITH_RESERVED: case FALCON_UCODE_SIG_T21X_GPCCS_WITH_RESERVED: case FALCON_UCODE_SIG_T12X_FECS_WITH_RESERVED: case FALCON_UCODE_SIG_T12X_GPCCS_WITH_RESERVED: gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0); gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0); gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0); gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0); /* fallthrough */ case FALCON_UCODE_SIG_T12X_FECS_WITHOUT_RESERVED: case FALCON_UCODE_SIG_T12X_GPCCS_WITHOUT_RESERVED: case FALCON_UCODE_SIG_T21X_FECS_WITHOUT_RESERVED: case FALCON_UCODE_SIG_T21X_FECS_WITHOUT_RESERVED2: case FALCON_UCODE_SIG_T21X_GPCCS_WITHOUT_RESERVED: gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0); gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0); gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0); gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0); gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 4); gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), addr_code32); gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0); gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), segments->code.size); gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0); gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0); gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0); gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), addr_data32); gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), segments->data.size); break; case FALCON_UCODE_SIG_T12X_FECS_OLDER: case FALCON_UCODE_SIG_T12X_GPCCS_OLDER: gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0); gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), addr_code32); gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0); gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), segments->code.size); gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0); gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), addr_data32); gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), segments->data.size); gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), addr_code32); gk20a_writel(g, 
reg_offset + gr_fecs_dmemd_r(0), 0); gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0); break; default: nvgpu_err(g, "unknown falcon ucode boot signature 0x%08x" " with reg_offset 0x%08x", segments->boot_signature, reg_offset); BUG(); } } void gr_gk20a_load_ctxsw_ucode_boot(struct gk20a *g, u64 addr_base, struct gk20a_ctxsw_ucode_segments *segments, u32 reg_offset) { u32 addr_load32; u32 blocks; u32 b; u32 dst; addr_load32 = u64_lo32((addr_base + segments->boot.offset) >> 8); blocks = ((segments->boot.size + 0xFFU) & ~0xFFU) >> 8; /* * Set the base FB address for the DMA transfer. Subtract off the 256 * byte IMEM block offset such that the relative FB and IMEM offsets * match, allowing the IMEM tags to be properly created. */ dst = segments->boot_imem_offset; gk20a_writel(g, reg_offset + gr_fecs_dmatrfbase_r(), (addr_load32 - (dst >> 8))); for (b = 0; b < blocks; b++) { /* Setup destination IMEM offset */ gk20a_writel(g, reg_offset + gr_fecs_dmatrfmoffs_r(), dst + (b << 8)); /* Setup source offset (relative to BASE) */ gk20a_writel(g, reg_offset + gr_fecs_dmatrffboffs_r(), dst + (b << 8)); gk20a_writel(g, reg_offset + gr_fecs_dmatrfcmd_r(), gr_fecs_dmatrfcmd_imem_f(0x01) | gr_fecs_dmatrfcmd_write_f(0x00) | gr_fecs_dmatrfcmd_size_f(0x06) | gr_fecs_dmatrfcmd_ctxdma_f(0)); } /* Specify the falcon boot vector */ gk20a_writel(g, reg_offset + gr_fecs_bootvec_r(), gr_fecs_bootvec_vec_f(segments->boot_entry)); } static void gr_gk20a_load_falcon_with_bootloader(struct gk20a *g) { struct gk20a_ctxsw_ucode_info *ucode_info = &g->ctxsw_ucode_info; u64 addr_base = ucode_info->surface_desc.gpu_va; gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0), 0x0); gr_gk20a_load_falcon_bind_instblk(g); g->ops.gr.falcon_load_ucode(g, addr_base, &g->ctxsw_ucode_info.fecs, 0); g->ops.gr.falcon_load_ucode(g, addr_base, &g->ctxsw_ucode_info.gpccs, gr_gpcs_gpccs_falcon_hwcfg_r() - gr_fecs_falcon_hwcfg_r()); } int gr_gk20a_load_ctxsw_ucode(struct gk20a *g) { int err; nvgpu_log_fn(g, " "); if (nvgpu_is_enabled(g, NVGPU_IS_FMODEL)) { gk20a_writel(g, gr_fecs_ctxsw_mailbox_r(7), gr_fecs_ctxsw_mailbox_value_f(0xc0de7777U)); gk20a_writel(g, gr_gpccs_ctxsw_mailbox_r(7), gr_gpccs_ctxsw_mailbox_value_f(0xc0de7777U)); } /* * In case bootloader is not supported, revert to the old way of * loading gr ucode, without the faster bootstrap routine. 
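 * The legacy path programs the FECS/GPCCS DMEM and IMEM directly through
 * the PRI windows (gr_gk20a_load_falcon_dmem/imem) and then starts the
 * falcon CPUs; the bootstrap path binds the ucode instance block and DMAs
 * in the ucode surface prepared by gr_gk20a_init_ctxsw_ucode().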
*/ if (!nvgpu_is_enabled(g, NVGPU_GR_USE_DMA_FOR_FW_BOOTSTRAP)) { gr_gk20a_load_falcon_dmem(g); gr_gk20a_load_falcon_imem(g); gr_gk20a_start_falcon_ucode(g); } else { if (!g->gr.skip_ucode_init) { err = gr_gk20a_init_ctxsw_ucode(g); if (err != 0) { return err; } } gr_gk20a_load_falcon_with_bootloader(g); g->gr.skip_ucode_init = true; } nvgpu_log_fn(g, "done"); return 0; } static int gr_gk20a_wait_ctxsw_ready(struct gk20a *g) { int ret; nvgpu_log_fn(g, " "); ret = gr_gk20a_ctx_wait_ucode(g, 0, NULL, GR_IS_UCODE_OP_EQUAL, eUcodeHandshakeInitComplete, GR_IS_UCODE_OP_SKIP, 0, false); if (ret != 0) { nvgpu_err(g, "falcon ucode init timeout"); return ret; } if (nvgpu_is_enabled(g, NVGPU_GR_USE_DMA_FOR_FW_BOOTSTRAP) || nvgpu_is_enabled(g, NVGPU_SEC_SECUREGPCCS)) { gk20a_writel(g, gr_fecs_current_ctx_r(), gr_fecs_current_ctx_valid_false_f()); } gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0), 0xffffffffU); gk20a_writel(g, gr_fecs_method_data_r(), 0x7fffffff); gk20a_writel(g, gr_fecs_method_push_r(), gr_fecs_method_push_adr_set_watchdog_timeout_f()); nvgpu_log_fn(g, "done"); return 0; } int gr_gk20a_init_ctx_state(struct gk20a *g) { int ret; struct fecs_method_op_gk20a op = { .mailbox = { .id = 0U, .data = 0U, .clr = ~U32(0U), .ok = 0U, .fail = 0U}, .method.data = 0U, .cond.ok = GR_IS_UCODE_OP_NOT_EQUAL, .cond.fail = GR_IS_UCODE_OP_SKIP, }; nvgpu_log_fn(g, " "); /* query ctxsw image sizes, if golden context is not created */ if (!g->gr.ctx_vars.golden_image_initialized) { op.method.addr = gr_fecs_method_push_adr_discover_image_size_v(); op.mailbox.ret = &g->gr.ctx_vars.golden_image_size; ret = gr_gk20a_submit_fecs_method_op(g, op, false); if (ret != 0) { nvgpu_err(g, "query golden image size failed"); return ret; } op.method.addr = gr_fecs_method_push_adr_discover_zcull_image_size_v(); op.mailbox.ret = &g->gr.ctx_vars.zcull_ctxsw_image_size; ret = gr_gk20a_submit_fecs_method_op(g, op, false); if (ret != 0) { nvgpu_err(g, "query zcull ctx image size failed"); return ret; } op.method.addr = gr_fecs_method_push_adr_discover_pm_image_size_v(); op.mailbox.ret = &g->gr.ctx_vars.pm_ctxsw_image_size; ret = gr_gk20a_submit_fecs_method_op(g, op, false); if (ret != 0) { nvgpu_err(g, "query pm ctx image size failed"); return ret; } g->gr.ctx_vars.priv_access_map_size = 512 * 1024; #ifdef CONFIG_GK20A_CTXSW_TRACE g->gr.ctx_vars.fecs_trace_buffer_size = gk20a_fecs_trace_buffer_size(g); #endif } nvgpu_log_fn(g, "done"); return 0; } int gr_gk20a_alloc_global_ctx_buffers(struct gk20a *g) { struct gr_gk20a *gr = &g->gr; int err; u32 size; nvgpu_log_fn(g, " "); size = gr->bundle_cb_default_size * gr_scc_bundle_cb_size_div_256b_byte_granularity_v(); nvgpu_log_info(g, "cb_buffer_size : %d", size); nvgpu_gr_global_ctx_set_size(gr->global_ctx_buffer, NVGPU_GR_GLOBAL_CTX_CIRCULAR, size); nvgpu_gr_global_ctx_set_size(gr->global_ctx_buffer, NVGPU_GR_GLOBAL_CTX_CIRCULAR_VPR, size); size = g->ops.gr.pagepool_default_size(g) * gr_scc_pagepool_total_pages_byte_granularity_v(); nvgpu_log_info(g, "pagepool_buffer_size : %d", size); nvgpu_gr_global_ctx_set_size(gr->global_ctx_buffer, NVGPU_GR_GLOBAL_CTX_PAGEPOOL, size); nvgpu_gr_global_ctx_set_size(gr->global_ctx_buffer, NVGPU_GR_GLOBAL_CTX_PAGEPOOL_VPR, size); size = g->ops.gr.calc_global_ctx_buffer_size(g); nvgpu_log_info(g, "attr_buffer_size : %u", size); nvgpu_gr_global_ctx_set_size(gr->global_ctx_buffer, NVGPU_GR_GLOBAL_CTX_ATTRIBUTE, size); nvgpu_gr_global_ctx_set_size(gr->global_ctx_buffer, NVGPU_GR_GLOBAL_CTX_ATTRIBUTE_VPR, size); nvgpu_log_info(g, "priv_access_map_size : 
%d", gr->ctx_vars.priv_access_map_size); nvgpu_gr_global_ctx_set_size(gr->global_ctx_buffer, NVGPU_GR_GLOBAL_CTX_PRIV_ACCESS_MAP, gr->ctx_vars.priv_access_map_size); #ifdef CONFIG_GK20A_CTXSW_TRACE nvgpu_log_info(g, "fecs_trace_buffer_size : %d", gr->ctx_vars.fecs_trace_buffer_size); nvgpu_gr_global_ctx_set_size(gr->global_ctx_buffer, NVGPU_GR_GLOBAL_CTX_FECS_TRACE_BUFFER, gr->ctx_vars.fecs_trace_buffer_size); #endif err = nvgpu_gr_global_ctx_buffer_alloc(g, gr->global_ctx_buffer); if (err != 0) { return err; } nvgpu_log_fn(g, "done"); return 0; } int gr_gk20a_alloc_gr_ctx(struct gk20a *g, struct nvgpu_gr_ctx *gr_ctx, struct vm_gk20a *vm) { struct gr_gk20a *gr = &g->gr; int err = 0; nvgpu_log_fn(g, " "); nvgpu_gr_ctx_set_size(gr->gr_ctx_desc, NVGPU_GR_CTX_CTX, gr->ctx_vars.golden_image_size); err = nvgpu_gr_ctx_alloc(g, gr_ctx, gr->gr_ctx_desc, vm); if (err != 0) { return err; } return 0; } void gr_gk20a_free_gr_ctx(struct gk20a *g, struct vm_gk20a *vm, struct nvgpu_gr_ctx *gr_ctx) { nvgpu_log_fn(g, " "); if (gr_ctx != NULL) { if ((g->ops.gr.ctxsw_prog.dump_ctxsw_stats != NULL) && g->gr.ctx_vars.dump_ctxsw_stats_on_channel_close) { g->ops.gr.ctxsw_prog.dump_ctxsw_stats(g, &gr_ctx->mem); } nvgpu_gr_ctx_free(g, gr_ctx, g->gr.global_ctx_buffer, vm); } } void gr_gk20a_free_tsg_gr_ctx(struct tsg_gk20a *tsg) { struct gk20a *g = tsg->g; if (tsg->vm == NULL) { nvgpu_err(g, "No address space bound"); return; } tsg->g->ops.gr.free_gr_ctx(g, tsg->vm, tsg->gr_ctx); } u32 gr_gk20a_get_patch_slots(struct gk20a *g) { return PATCH_CTX_SLOTS_PER_PAGE; } int gk20a_alloc_obj_ctx(struct channel_gk20a *c, u32 class_num, u32 flags) { struct gk20a *g = c->g; struct nvgpu_gr_ctx *gr_ctx; struct tsg_gk20a *tsg = NULL; int err = 0; nvgpu_log_fn(g, " "); /* an address space needs to have been bound at this point.*/ if (!gk20a_channel_as_bound(c) && (c->vm == NULL)) { nvgpu_err(g, "not bound to address space at time" " of grctx allocation"); return -EINVAL; } if (!g->ops.gr.is_valid_class(g, class_num)) { nvgpu_err(g, "invalid obj class 0x%x", class_num); err = -EINVAL; goto out; } c->obj_class = class_num; tsg = tsg_gk20a_from_ch(c); if (tsg == NULL) { return -EINVAL; } gr_ctx = tsg->gr_ctx; if (!nvgpu_mem_is_valid(&gr_ctx->mem)) { tsg->vm = c->vm; nvgpu_vm_get(tsg->vm); err = g->ops.gr.alloc_gr_ctx(g, gr_ctx, tsg->vm); if (err != 0) { nvgpu_err(g, "fail to allocate TSG gr ctx buffer"); nvgpu_vm_put(tsg->vm); tsg->vm = NULL; goto out; } gr_ctx->tsgid = tsg->tsgid; /* allocate patch buffer */ if (!nvgpu_mem_is_valid(&gr_ctx->patch_ctx.mem)) { gr_ctx->patch_ctx.data_count = 0; nvgpu_gr_ctx_set_size(g->gr.gr_ctx_desc, NVGPU_GR_CTX_PATCH_CTX, g->ops.gr.get_patch_slots(g) * PATCH_CTX_SLOTS_REQUIRED_PER_ENTRY); err = nvgpu_gr_ctx_alloc_patch_ctx(g, gr_ctx, g->gr.gr_ctx_desc, c->vm); if (err != 0) { nvgpu_err(g, "fail to allocate patch buffer"); goto out; } } g->ops.gr.init_ctxsw_preemption_mode(g, gr_ctx, tsg->vm, class_num, flags); /* map global buffer to channel gpu_va and commit */ err = nvgpu_gr_ctx_map_global_ctx_buffers(g, gr_ctx, g->gr.global_ctx_buffer, tsg->vm, c->vpr); if (err != 0) { nvgpu_err(g, "fail to map global ctx buffer"); goto out; } g->ops.gr.commit_global_ctx_buffers(g, gr_ctx, true); /* commit gr ctx buffer */ err = g->ops.gr.commit_inst(c, gr_ctx->mem.gpu_va); if (err != 0) { nvgpu_err(g, "fail to commit gr ctx buffer"); goto out; } /* init golden image, ELPG enabled after this is done */ err = gr_gk20a_init_golden_ctx_image(g, c, gr_ctx); if (err != 0) { nvgpu_err(g, "fail to init golden ctx 
image"); goto out; } /* load golden image */ nvgpu_gr_ctx_load_golden_ctx_image(g, gr_ctx, g->gr.local_golden_image, c->cde); if (err != 0) { nvgpu_err(g, "fail to load golden ctx image"); goto out; } if (g->ops.gr.update_ctxsw_preemption_mode != NULL) { g->ops.gr.update_ctxsw_preemption_mode(g, gr_ctx, c->subctx); } #ifdef CONFIG_GK20A_CTXSW_TRACE if (g->ops.fecs_trace.bind_channel && !c->vpr) { err = g->ops.fecs_trace.bind_channel(g, c, 0, gr_ctx); if (err != 0) { nvgpu_warn(g, "fail to bind channel for ctxsw trace"); } } #endif if (g->ops.gr.set_czf_bypass != NULL) { g->ops.gr.set_czf_bypass(g, c); } /* PM ctxt switch is off by default */ gr_ctx->pm_ctx.pm_mode = g->ops.gr.ctxsw_prog.hw_get_pm_mode_no_ctxsw(); } else { /* commit gr ctx buffer */ err = g->ops.gr.commit_inst(c, gr_ctx->mem.gpu_va); if (err != 0) { nvgpu_err(g, "fail to commit gr ctx buffer"); goto out; } #ifdef CONFIG_GK20A_CTXSW_TRACE if (g->ops.fecs_trace.bind_channel && !c->vpr) { err = g->ops.fecs_trace.bind_channel(g, c, 0, gr_ctx); if (err != 0) { nvgpu_warn(g, "fail to bind channel for ctxsw trace"); } } #endif } nvgpu_log_fn(g, "done"); return 0; out: /* 1. gr_ctx, patch_ctx and global ctx buffer mapping can be reused so no need to release them. 2. golden image init and load is a one time thing so if they pass, no need to undo. */ nvgpu_err(g, "fail"); return err; } static void gk20a_remove_gr_support(struct gr_gk20a *gr) { struct gk20a *g = gr->g; nvgpu_log_fn(g, " "); gr_gk20a_free_cyclestats_snapshot_data(g); nvgpu_gr_global_ctx_buffer_free(g, gr->global_ctx_buffer); nvgpu_gr_global_ctx_desc_free(g, gr->global_ctx_buffer); nvgpu_gr_ctx_desc_free(g, gr->gr_ctx_desc); nvgpu_dma_free(g, &gr->compbit_store.mem); (void) memset(&gr->compbit_store, 0, sizeof(struct compbit_store_desc)); nvgpu_gr_config_deinit(g, gr->config); nvgpu_kfree(g, gr->sm_to_cluster); nvgpu_kfree(g, gr->fbp_rop_l2_en_mask); gr->fbp_rop_l2_en_mask = NULL; nvgpu_netlist_deinit_ctx_vars(g); if (gr->local_golden_image != NULL) { nvgpu_gr_global_ctx_deinit_local_golden_image(g, gr->local_golden_image); gr->local_golden_image = NULL; gr->ctx_vars.golden_image_initialized = false; } if (gr->ctx_vars.hwpm_ctxsw_buffer_offset_map != NULL) { nvgpu_big_free(g, gr->ctx_vars.hwpm_ctxsw_buffer_offset_map); } gr->ctx_vars.hwpm_ctxsw_buffer_offset_map = NULL; gk20a_comptag_allocator_destroy(g, &gr->comp_tags); nvgpu_ecc_remove_support(g); } static int gr_gk20a_init_gr_config(struct gk20a *g, struct gr_gk20a *gr) { u32 sm_per_tpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_SM_PER_TPC); u32 tmp; gr->config = nvgpu_gr_config_init(g); if (gr->config == NULL) { return -ENOMEM; } tmp = gk20a_readl(g, pri_ringmaster_enum_fbp_r()); gr->num_fbps = pri_ringmaster_enum_fbp_count_v(tmp); tmp = gk20a_readl(g, top_num_fbps_r()); gr->max_fbps_count = top_num_fbps_value_v(tmp); gr->fbp_en_mask = g->ops.gr.get_fbp_en_mask(g); if (gr->fbp_rop_l2_en_mask == NULL) { gr->fbp_rop_l2_en_mask = nvgpu_kzalloc(g, gr->max_fbps_count * sizeof(u32)); if (gr->fbp_rop_l2_en_mask == NULL) { goto clean_up; } } else { (void) memset(gr->fbp_rop_l2_en_mask, 0, gr->max_fbps_count * sizeof(u32)); } /* allocate for max tpc per gpc */ if (gr->sm_to_cluster == NULL) { gr->sm_to_cluster = nvgpu_kzalloc(g, (size_t)nvgpu_gr_config_get_gpc_count(gr->config) * (size_t)nvgpu_gr_config_get_max_tpc_per_gpc_count(gr->config) * (size_t)sm_per_tpc * sizeof(struct sm_info)); } else { (void) memset(gr->sm_to_cluster, 0, (size_t)nvgpu_gr_config_get_gpc_count(gr->config) * 
(size_t)nvgpu_gr_config_get_max_tpc_per_gpc_count(gr->config) * (size_t)sm_per_tpc * sizeof(struct sm_info)); } gr->no_of_sm = 0; nvgpu_log_info(g, "fbps: %d", gr->num_fbps); nvgpu_log_info(g, "max_fbps_count: %d", gr->max_fbps_count); g->ops.gr.bundle_cb_defaults(g); g->ops.gr.cb_size_default(g); g->ops.gr.calc_global_ctx_buffer_size(g); gr->timeslice_mode = gr_gpcs_ppcs_cbm_cfg_timeslice_mode_enable_v(); nvgpu_log_info(g, "bundle_cb_default_size: %d", gr->bundle_cb_default_size); nvgpu_log_info(g, "min_gpm_fifo_depth: %d", gr->min_gpm_fifo_depth); nvgpu_log_info(g, "bundle_cb_token_limit: %d", gr->bundle_cb_token_limit); nvgpu_log_info(g, "attrib_cb_default_size: %d", gr->attrib_cb_default_size); nvgpu_log_info(g, "attrib_cb_size: %d", gr->attrib_cb_size); nvgpu_log_info(g, "alpha_cb_default_size: %d", gr->alpha_cb_default_size); nvgpu_log_info(g, "alpha_cb_size: %d", gr->alpha_cb_size); nvgpu_log_info(g, "timeslice_mode: %d", gr->timeslice_mode); return 0; clean_up: return -ENOMEM; } static int gr_gk20a_init_zcull(struct gk20a *g, struct gr_gk20a *gr) { struct gr_zcull_gk20a *zcull = &gr->zcull; zcull->aliquot_width = nvgpu_gr_config_get_tpc_count(gr->config) * 16U; zcull->aliquot_height = 16; zcull->width_align_pixels = nvgpu_gr_config_get_tpc_count(gr->config) * 16U; zcull->height_align_pixels = 32; zcull->aliquot_size = zcull->aliquot_width * zcull->aliquot_height; /* assume no floor sweeping since we only have 1 tpc in 1 gpc */ zcull->pixel_squares_by_aliquots = nvgpu_gr_config_get_zcb_count(gr->config) * 16U * 16U * nvgpu_gr_config_get_tpc_count(gr->config) / (nvgpu_gr_config_get_gpc_count(gr->config) * nvgpu_gr_config_get_gpc_tpc_count(gr->config, 0U)); zcull->total_aliquots = gr_gpc0_zcull_total_ram_size_num_aliquots_f( gk20a_readl(g, gr_gpc0_zcull_total_ram_size_r())); return 0; } u32 gr_gk20a_get_ctxsw_zcull_size(struct gk20a *g, struct gr_gk20a *gr) { /* assuming gr has already been initialized */ return gr->ctx_vars.zcull_ctxsw_image_size; } int gr_gk20a_bind_ctxsw_zcull(struct gk20a *g, struct gr_gk20a *gr, struct channel_gk20a *c, u64 zcull_va, u32 mode) { struct tsg_gk20a *tsg; struct nvgpu_gr_ctx *gr_ctx; tsg = tsg_gk20a_from_ch(c); if (tsg == NULL) { return -EINVAL; } gr_ctx = tsg->gr_ctx; nvgpu_gr_ctx_set_zcull_ctx(g, gr_ctx, mode, zcull_va); /* TBD: don't disable channel in sw method processing */ return gr_gk20a_ctx_zcull_setup(g, c, gr_ctx); } int gr_gk20a_get_zcull_info(struct gk20a *g, struct gr_gk20a *gr, struct gr_zcull_info *zcull_params) { struct gr_zcull_gk20a *zcull = &gr->zcull; zcull_params->width_align_pixels = zcull->width_align_pixels; zcull_params->height_align_pixels = zcull->height_align_pixels; zcull_params->pixel_squares_by_aliquots = zcull->pixel_squares_by_aliquots; zcull_params->aliquot_total = zcull->total_aliquots; zcull_params->region_byte_multiplier = nvgpu_gr_config_get_gpc_count(gr->config) * gr_zcull_bytes_per_aliquot_per_gpu_v(); zcull_params->region_header_size = nvgpu_get_litter_value(g, GPU_LIT_NUM_GPCS) * gr_zcull_save_restore_header_bytes_per_gpc_v(); zcull_params->subregion_header_size = nvgpu_get_litter_value(g, GPU_LIT_NUM_GPCS) * gr_zcull_save_restore_subregion_header_bytes_per_gpc_v(); zcull_params->subregion_width_align_pixels = nvgpu_gr_config_get_tpc_count(gr->config) * gr_gpc0_zcull_zcsize_width_subregion__multiple_v(); zcull_params->subregion_height_align_pixels = gr_gpc0_zcull_zcsize_height_subregion__multiple_v(); zcull_params->subregion_count = gr_zcull_subregion_qty_v(); return 0; } int nvgpu_gr_zbc_add_color(struct 
gk20a *g, struct gr_gk20a *gr, struct zbc_entry *color_val, u32 index) { u32 i; /* update l2 table */ g->ops.ltc.set_zbc_color_entry(g, color_val->color_l2, index + GK20A_STARTOF_ZBC_TABLE); /* update local copy */ for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) { gr->zbc_col_tbl[index].color_l2[i] = color_val->color_l2[i]; gr->zbc_col_tbl[index].color_ds[i] = color_val->color_ds[i]; } gr->zbc_col_tbl[index].format = color_val->format; gr->zbc_col_tbl[index].ref_cnt++; /* update zbc registers */ g->ops.gr.zbc.add_color(g, gr, color_val, index); return 0; } int gk20a_gr_zbc_add_color(struct gk20a *g, struct gr_gk20a *gr, struct zbc_entry *color_val, u32 index) { /* update ds table */ nvgpu_writel(g, gr_ds_zbc_color_r_r(), gr_ds_zbc_color_r_val_f(color_val->color_ds[0])); nvgpu_writel(g, gr_ds_zbc_color_g_r(), gr_ds_zbc_color_g_val_f(color_val->color_ds[1])); nvgpu_writel(g, gr_ds_zbc_color_b_r(), gr_ds_zbc_color_b_val_f(color_val->color_ds[2])); nvgpu_writel(g, gr_ds_zbc_color_a_r(), gr_ds_zbc_color_a_val_f(color_val->color_ds[3])); nvgpu_writel(g, gr_ds_zbc_color_fmt_r(), gr_ds_zbc_color_fmt_val_f(color_val->format)); nvgpu_writel(g, gr_ds_zbc_tbl_index_r(), gr_ds_zbc_tbl_index_val_f(index + GK20A_STARTOF_ZBC_TABLE)); /* trigger the write */ nvgpu_writel(g, gr_ds_zbc_tbl_ld_r(), gr_ds_zbc_tbl_ld_select_c_f() | gr_ds_zbc_tbl_ld_action_write_f() | gr_ds_zbc_tbl_ld_trigger_active_f()); return 0; } int nvgpu_gr_zbc_add_depth(struct gk20a *g, struct gr_gk20a *gr, struct zbc_entry *depth_val, u32 index) { /* update l2 table */ g->ops.ltc.set_zbc_depth_entry(g, depth_val->depth, index + GK20A_STARTOF_ZBC_TABLE); /* update local copy */ gr->zbc_dep_tbl[index].depth = depth_val->depth; gr->zbc_dep_tbl[index].format = depth_val->format; gr->zbc_dep_tbl[index].ref_cnt++; /* update zbc registers */ g->ops.gr.zbc.add_depth(g, gr, depth_val, index); return 0; } int gk20a_gr_zbc_add_depth(struct gk20a *g, struct gr_gk20a *gr, struct zbc_entry *depth_val, u32 index) { /* update ds table */ nvgpu_writel(g, gr_ds_zbc_z_r(), gr_ds_zbc_z_val_f(depth_val->depth)); nvgpu_writel(g, gr_ds_zbc_z_fmt_r(), gr_ds_zbc_z_fmt_val_f(depth_val->format)); nvgpu_writel(g, gr_ds_zbc_tbl_index_r(), gr_ds_zbc_tbl_index_val_f(index + GK20A_STARTOF_ZBC_TABLE)); /* trigger the write */ nvgpu_writel(g, gr_ds_zbc_tbl_ld_r(), gr_ds_zbc_tbl_ld_select_z_f() | gr_ds_zbc_tbl_ld_action_write_f() | gr_ds_zbc_tbl_ld_trigger_active_f()); return 0; } int gr_gk20a_add_zbc(struct gk20a *g, struct gr_gk20a *gr, struct zbc_entry *zbc_val) { struct zbc_color_table *c_tbl; struct zbc_depth_table *d_tbl; u32 i; int ret = -ENOSPC; bool added = false; u32 entries; /* no endian swap ? 
*/ nvgpu_mutex_acquire(&gr->zbc_lock); nvgpu_speculation_barrier(); switch (zbc_val->type) { case GK20A_ZBC_TYPE_COLOR: /* search existing tables */ for (i = 0; i < gr->max_used_color_index; i++) { c_tbl = &gr->zbc_col_tbl[i]; if ((c_tbl->ref_cnt != 0U) && (c_tbl->format == zbc_val->format) && (nvgpu_memcmp((u8 *)c_tbl->color_ds, (u8 *)zbc_val->color_ds, sizeof(zbc_val->color_ds)) == 0) && (nvgpu_memcmp((u8 *)c_tbl->color_l2, (u8 *)zbc_val->color_l2, sizeof(zbc_val->color_l2)) == 0)) { added = true; c_tbl->ref_cnt++; ret = 0; break; } } /* add new table */ if (!added && gr->max_used_color_index < GK20A_ZBC_TABLE_SIZE) { c_tbl = &gr->zbc_col_tbl[gr->max_used_color_index]; WARN_ON(c_tbl->ref_cnt != 0U); ret = nvgpu_gr_zbc_add_color(g, gr, zbc_val, gr->max_used_color_index); if (ret == 0) { gr->max_used_color_index++; } } break; case GK20A_ZBC_TYPE_DEPTH: /* search existing tables */ for (i = 0; i < gr->max_used_depth_index; i++) { d_tbl = &gr->zbc_dep_tbl[i]; if ((d_tbl->ref_cnt != 0U) && (d_tbl->depth == zbc_val->depth) && (d_tbl->format == zbc_val->format)) { added = true; d_tbl->ref_cnt++; ret = 0; break; } } /* add new table */ if (!added && gr->max_used_depth_index < GK20A_ZBC_TABLE_SIZE) { d_tbl = &gr->zbc_dep_tbl[gr->max_used_depth_index]; WARN_ON(d_tbl->ref_cnt != 0U); ret = nvgpu_gr_zbc_add_depth(g, gr, zbc_val, gr->max_used_depth_index); if (ret == 0) { gr->max_used_depth_index++; } } break; case T19X_ZBC: if (nvgpu_is_enabled(g, NVGPU_SUPPORT_ZBC_STENCIL)) { added = nvgpu_gr_zbc_add_type_stencil(g, gr, zbc_val, &ret); } else { nvgpu_err(g, "invalid zbc table type %d", zbc_val->type); ret = -EINVAL; goto err_mutex; } break; default: nvgpu_err(g, "invalid zbc table type %d", zbc_val->type); ret = -EINVAL; goto err_mutex; } if (!added && ret == 0) { /* update zbc for elpg only when new entry is added */ entries = max(gr->max_used_color_index, gr->max_used_depth_index); g->ops.pmu.save_zbc(g, entries); } err_mutex: nvgpu_mutex_release(&gr->zbc_lock); return ret; } /* get a zbc table entry specified by index * return table size when type is invalid */ int nvgpu_gr_zbc_query_table(struct gk20a *g, struct gr_gk20a *gr, struct zbc_query_params *query_params) { u32 index = query_params->index_size; u32 i; nvgpu_speculation_barrier(); switch (query_params->type) { case GK20A_ZBC_TYPE_INVALID: query_params->index_size = GK20A_ZBC_TABLE_SIZE; break; case GK20A_ZBC_TYPE_COLOR: if (index >= GK20A_ZBC_TABLE_SIZE) { nvgpu_err(g, "invalid zbc color table index"); return -EINVAL; } nvgpu_speculation_barrier(); for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) { query_params->color_l2[i] = gr->zbc_col_tbl[index].color_l2[i]; query_params->color_ds[i] = gr->zbc_col_tbl[index].color_ds[i]; } query_params->format = gr->zbc_col_tbl[index].format; query_params->ref_cnt = gr->zbc_col_tbl[index].ref_cnt; break; case GK20A_ZBC_TYPE_DEPTH: if (index >= GK20A_ZBC_TABLE_SIZE) { nvgpu_err(g, "invalid zbc depth table index"); return -EINVAL; } nvgpu_speculation_barrier(); query_params->depth = gr->zbc_dep_tbl[index].depth; query_params->format = gr->zbc_dep_tbl[index].format; query_params->ref_cnt = gr->zbc_dep_tbl[index].ref_cnt; break; case T19X_ZBC: if (nvgpu_is_enabled(g, NVGPU_SUPPORT_ZBC_STENCIL)) { return nvgpu_gr_zbc_stencil_query_table(g, gr, query_params); } else { nvgpu_err(g, "invalid zbc table type"); return -EINVAL; } break; default: nvgpu_err(g, "invalid zbc table type"); return -EINVAL; } return 0; } static int gr_gk20a_load_zbc_table(struct gk20a *g, struct gr_gk20a *gr) { unsigned int i; int 
ret; for (i = 0; i < gr->max_used_color_index; i++) { struct zbc_color_table *c_tbl = &gr->zbc_col_tbl[i]; struct zbc_entry zbc_val; zbc_val.type = GK20A_ZBC_TYPE_COLOR; nvgpu_memcpy((u8 *)zbc_val.color_ds, (u8 *)c_tbl->color_ds, sizeof(zbc_val.color_ds)); nvgpu_memcpy((u8 *)zbc_val.color_l2, (u8 *)c_tbl->color_l2, sizeof(zbc_val.color_l2)); zbc_val.format = c_tbl->format; ret = nvgpu_gr_zbc_add_color(g, gr, &zbc_val, i); if (ret != 0) { return ret; } } for (i = 0; i < gr->max_used_depth_index; i++) { struct zbc_depth_table *d_tbl = &gr->zbc_dep_tbl[i]; struct zbc_entry zbc_val; zbc_val.type = GK20A_ZBC_TYPE_DEPTH; zbc_val.depth = d_tbl->depth; zbc_val.format = d_tbl->format; ret = nvgpu_gr_zbc_add_depth(g, gr, &zbc_val, i); if (ret != 0) { return ret; } } if (nvgpu_is_enabled(g, NVGPU_SUPPORT_ZBC_STENCIL)) { ret = nvgpu_gr_zbc_load_stencil_tbl(g, gr); if (ret != 0) { return ret; } } return 0; } int gr_gk20a_load_zbc_default_table(struct gk20a *g, struct gr_gk20a *gr) { struct zbc_entry zbc_val; u32 i = 0; int err = 0; err = nvgpu_mutex_init(&gr->zbc_lock); if (err != 0) { nvgpu_err(g, "Error in zbc_lock mutex initialization"); return err; } /* load default color table */ zbc_val.type = GK20A_ZBC_TYPE_COLOR; /* Opaque black (i.e. solid black, fmt 0x28 = A8B8G8R8) */ zbc_val.format = gr_ds_zbc_color_fmt_val_a8_b8_g8_r8_v(); for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) { zbc_val.color_ds[i] = 0U; zbc_val.color_l2[i] = 0U; } zbc_val.color_l2[0] = 0xff000000U; zbc_val.color_ds[3] = 0x3f800000U; err = gr_gk20a_add_zbc(g, gr, &zbc_val); if (err != 0) { goto color_fail; } /* Transparent black = (fmt 1 = zero) */ zbc_val.format = gr_ds_zbc_color_fmt_val_zero_v(); for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) { zbc_val.color_ds[i] = 0U; zbc_val.color_l2[i] = 0U; } err = gr_gk20a_add_zbc(g, gr, &zbc_val); if (err != 0) { goto color_fail; } /* Opaque white (i.e. 
solid white) = (fmt 2 = uniform 1) */ zbc_val.format = gr_ds_zbc_color_fmt_val_unorm_one_v(); for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) { zbc_val.color_ds[i] = 0x3f800000U; zbc_val.color_l2[i] = 0xffffffffU; } err = gr_gk20a_add_zbc(g, gr, &zbc_val); if (err != 0) { goto color_fail; } gr->max_default_color_index = 3; /* load default depth table */ zbc_val.type = GK20A_ZBC_TYPE_DEPTH; zbc_val.format = gr_ds_zbc_z_fmt_val_fp32_v(); zbc_val.depth = 0x3f800000; err = gr_gk20a_add_zbc(g, gr, &zbc_val); if (err != 0) { goto depth_fail; } zbc_val.format = gr_ds_zbc_z_fmt_val_fp32_v(); zbc_val.depth = 0; err = gr_gk20a_add_zbc(g, gr, &zbc_val); if (err != 0) { goto depth_fail; } gr->max_default_depth_index = 2; if (nvgpu_is_enabled(g, NVGPU_SUPPORT_ZBC_STENCIL)) { err = nvgpu_gr_zbc_load_stencil_default_tbl(g, gr); if (err != 0) { return err; } } return 0; color_fail: nvgpu_err(g, "fail to load default zbc color table"); return err; depth_fail: nvgpu_err(g, "fail to load default zbc depth table"); return err; } int nvgpu_gr_zbc_set_table(struct gk20a *g, struct gr_gk20a *gr, struct zbc_entry *zbc_val) { nvgpu_log_fn(g, " "); return gr_gk20a_elpg_protected_call(g, gr_gk20a_add_zbc(g, gr, zbc_val)); } void gr_gk20a_init_cg_mode(struct gk20a *g, u32 cgmode, u32 mode_config) { u32 engine_idx; u32 active_engine_id = 0; struct fifo_engine_info_gk20a *engine_info = NULL; struct fifo_gk20a *f = &g->fifo; for (engine_idx = 0; engine_idx < f->num_engines; ++engine_idx) { active_engine_id = f->active_engines_list[engine_idx]; engine_info = &f->engine_info[active_engine_id]; /* gr_engine supports both BLCG and ELCG */ if ((cgmode == BLCG_MODE) && (engine_info->engine_enum == NVGPU_ENGINE_GR_GK20A)) { g->ops.therm.init_blcg_mode(g, mode_config, active_engine_id); break; } else if (cgmode == ELCG_MODE) { g->ops.therm.init_elcg_mode(g, mode_config, active_engine_id); } else { nvgpu_err(g, "invalid cg mode %d, config %d for " "act_eng_id %d", cgmode, mode_config, active_engine_id); } } } void gr_gk20a_program_zcull_mapping(struct gk20a *g, u32 zcull_num_entries, u32 *zcull_map_tiles) { u32 val; nvgpu_log_fn(g, " "); if (zcull_num_entries >= 8U) { nvgpu_log_fn(g, "map0"); val = gr_gpcs_zcull_sm_in_gpc_number_map0_tile_0_f( zcull_map_tiles[0]) | gr_gpcs_zcull_sm_in_gpc_number_map0_tile_1_f( zcull_map_tiles[1]) | gr_gpcs_zcull_sm_in_gpc_number_map0_tile_2_f( zcull_map_tiles[2]) | gr_gpcs_zcull_sm_in_gpc_number_map0_tile_3_f( zcull_map_tiles[3]) | gr_gpcs_zcull_sm_in_gpc_number_map0_tile_4_f( zcull_map_tiles[4]) | gr_gpcs_zcull_sm_in_gpc_number_map0_tile_5_f( zcull_map_tiles[5]) | gr_gpcs_zcull_sm_in_gpc_number_map0_tile_6_f( zcull_map_tiles[6]) | gr_gpcs_zcull_sm_in_gpc_number_map0_tile_7_f( zcull_map_tiles[7]); gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map0_r(), val); } if (zcull_num_entries >= 16U) { nvgpu_log_fn(g, "map1"); val = gr_gpcs_zcull_sm_in_gpc_number_map1_tile_8_f( zcull_map_tiles[8]) | gr_gpcs_zcull_sm_in_gpc_number_map1_tile_9_f( zcull_map_tiles[9]) | gr_gpcs_zcull_sm_in_gpc_number_map1_tile_10_f( zcull_map_tiles[10]) | gr_gpcs_zcull_sm_in_gpc_number_map1_tile_11_f( zcull_map_tiles[11]) | gr_gpcs_zcull_sm_in_gpc_number_map1_tile_12_f( zcull_map_tiles[12]) | gr_gpcs_zcull_sm_in_gpc_number_map1_tile_13_f( zcull_map_tiles[13]) | gr_gpcs_zcull_sm_in_gpc_number_map1_tile_14_f( zcull_map_tiles[14]) | gr_gpcs_zcull_sm_in_gpc_number_map1_tile_15_f( zcull_map_tiles[15]); gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map1_r(), val); } if (zcull_num_entries >= 24U) { nvgpu_log_fn(g, "map2"); val = 
gr_gpcs_zcull_sm_in_gpc_number_map2_tile_16_f( zcull_map_tiles[16]) | gr_gpcs_zcull_sm_in_gpc_number_map2_tile_17_f( zcull_map_tiles[17]) | gr_gpcs_zcull_sm_in_gpc_number_map2_tile_18_f( zcull_map_tiles[18]) | gr_gpcs_zcull_sm_in_gpc_number_map2_tile_19_f( zcull_map_tiles[19]) | gr_gpcs_zcull_sm_in_gpc_number_map2_tile_20_f( zcull_map_tiles[20]) | gr_gpcs_zcull_sm_in_gpc_number_map2_tile_21_f( zcull_map_tiles[21]) | gr_gpcs_zcull_sm_in_gpc_number_map2_tile_22_f( zcull_map_tiles[22]) | gr_gpcs_zcull_sm_in_gpc_number_map2_tile_23_f( zcull_map_tiles[23]); gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map2_r(), val); } if (zcull_num_entries >= 32U) { nvgpu_log_fn(g, "map3"); val = gr_gpcs_zcull_sm_in_gpc_number_map3_tile_24_f( zcull_map_tiles[24]) | gr_gpcs_zcull_sm_in_gpc_number_map3_tile_25_f( zcull_map_tiles[25]) | gr_gpcs_zcull_sm_in_gpc_number_map3_tile_26_f( zcull_map_tiles[26]) | gr_gpcs_zcull_sm_in_gpc_number_map3_tile_27_f( zcull_map_tiles[27]) | gr_gpcs_zcull_sm_in_gpc_number_map3_tile_28_f( zcull_map_tiles[28]) | gr_gpcs_zcull_sm_in_gpc_number_map3_tile_29_f( zcull_map_tiles[29]) | gr_gpcs_zcull_sm_in_gpc_number_map3_tile_30_f( zcull_map_tiles[30]) | gr_gpcs_zcull_sm_in_gpc_number_map3_tile_31_f( zcull_map_tiles[31]); gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map3_r(), val); } } static int gr_gk20a_zcull_init_hw(struct gk20a *g, struct gr_gk20a *gr) { u32 gpc_index, gpc_tpc_count, gpc_zcull_count; u32 *zcull_map_tiles, *zcull_bank_counters; u32 map_counter; u32 rcp_conserv; u32 offset; bool floorsweep = false; u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE); u32 num_gpcs = nvgpu_get_litter_value(g, GPU_LIT_NUM_GPCS); u32 num_tpc_per_gpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_TPC_PER_GPC); u32 zcull_alloc_num = num_gpcs * num_tpc_per_gpc; u32 map_tile_count; if (gr->config->map_tiles == NULL) { return -1; } if (zcull_alloc_num % 8U != 0U) { /* Total 8 fields per map reg i.e. 
tile_0 to tile_7*/ zcull_alloc_num += (zcull_alloc_num % 8U); } zcull_map_tiles = nvgpu_kzalloc(g, zcull_alloc_num * sizeof(u32)); if (zcull_map_tiles == NULL) { nvgpu_err(g, "failed to allocate zcull map titles"); return -ENOMEM; } zcull_bank_counters = nvgpu_kzalloc(g, zcull_alloc_num * sizeof(u32)); if (zcull_bank_counters == NULL) { nvgpu_err(g, "failed to allocate zcull bank counters"); nvgpu_kfree(g, zcull_map_tiles); return -ENOMEM; } for (map_counter = 0; map_counter < nvgpu_gr_config_get_tpc_count(gr->config); map_counter++) { map_tile_count = nvgpu_gr_config_get_map_tile_count(gr->config, map_counter); zcull_map_tiles[map_counter] = zcull_bank_counters[map_tile_count]; zcull_bank_counters[map_tile_count]++; } if (g->ops.gr.program_zcull_mapping != NULL) { g->ops.gr.program_zcull_mapping(g, zcull_alloc_num, zcull_map_tiles); } nvgpu_kfree(g, zcull_map_tiles); nvgpu_kfree(g, zcull_bank_counters); for (gpc_index = 0; gpc_index < nvgpu_gr_config_get_gpc_count(gr->config); gpc_index++) { gpc_tpc_count = nvgpu_gr_config_get_gpc_tpc_count(gr->config, gpc_index); gpc_zcull_count = nvgpu_gr_config_get_gpc_zcb_count(gr->config, gpc_index); if (gpc_zcull_count != nvgpu_gr_config_get_max_zcull_per_gpc_count(gr->config) && gpc_zcull_count < gpc_tpc_count) { nvgpu_err(g, "zcull_banks (%d) less than tpcs (%d) for gpc (%d)", gpc_zcull_count, gpc_tpc_count, gpc_index); return -EINVAL; } if (gpc_zcull_count != nvgpu_gr_config_get_max_zcull_per_gpc_count(gr->config) && gpc_zcull_count != 0U) { floorsweep = true; } } /* ceil(1.0f / SM_NUM * gr_gpc0_zcull_sm_num_rcp_conservative__max_v()) */ rcp_conserv = DIV_ROUND_UP(gr_gpc0_zcull_sm_num_rcp_conservative__max_v(), nvgpu_gr_config_get_gpc_tpc_count(gr->config, 0U)); for (gpc_index = 0; gpc_index < nvgpu_gr_config_get_gpc_count(gr->config); gpc_index++) { offset = gpc_index * gpc_stride; if (floorsweep) { gk20a_writel(g, gr_gpc0_zcull_ram_addr_r() + offset, gr_gpc0_zcull_ram_addr_row_offset_f( nvgpu_gr_config_get_map_row_offset(gr->config)) | gr_gpc0_zcull_ram_addr_tiles_per_hypertile_row_per_gpc_f( nvgpu_gr_config_get_max_zcull_per_gpc_count(gr->config))); } else { gk20a_writel(g, gr_gpc0_zcull_ram_addr_r() + offset, gr_gpc0_zcull_ram_addr_row_offset_f( nvgpu_gr_config_get_map_row_offset(gr->config)) | gr_gpc0_zcull_ram_addr_tiles_per_hypertile_row_per_gpc_f( nvgpu_gr_config_get_gpc_tpc_count(gr->config, gpc_index))); } gk20a_writel(g, gr_gpc0_zcull_fs_r() + offset, gr_gpc0_zcull_fs_num_active_banks_f( nvgpu_gr_config_get_gpc_zcb_count(gr->config, gpc_index)) | gr_gpc0_zcull_fs_num_sms_f( nvgpu_gr_config_get_tpc_count(gr->config))); gk20a_writel(g, gr_gpc0_zcull_sm_num_rcp_r() + offset, gr_gpc0_zcull_sm_num_rcp_conservative_f(rcp_conserv)); } gk20a_writel(g, gr_gpcs_ppcs_wwdx_sm_num_rcp_r(), gr_gpcs_ppcs_wwdx_sm_num_rcp_conservative_f(rcp_conserv)); return 0; } void gk20a_gr_enable_exceptions(struct gk20a *g) { gk20a_writel(g, gr_exception_r(), 0xFFFFFFFFU); gk20a_writel(g, gr_exception_en_r(), 0xFFFFFFFFU); gk20a_writel(g, gr_exception1_r(), 0xFFFFFFFFU); gk20a_writel(g, gr_exception1_en_r(), 0xFFFFFFFFU); gk20a_writel(g, gr_exception2_r(), 0xFFFFFFFFU); gk20a_writel(g, gr_exception2_en_r(), 0xFFFFFFFFU); } void gk20a_gr_enable_gpc_exceptions(struct gk20a *g) { struct gr_gk20a *gr = &g->gr; u32 tpc_mask; gk20a_writel(g, gr_gpcs_tpcs_tpccs_tpc_exception_en_r(), gr_gpcs_tpcs_tpccs_tpc_exception_en_tex_enabled_f() | gr_gpcs_tpcs_tpccs_tpc_exception_en_sm_enabled_f()); tpc_mask = gr_gpcs_gpccs_gpc_exception_en_tpc_f( 
BIT32(nvgpu_gr_config_get_max_tpc_per_gpc_count(gr->config)) - 1U); gk20a_writel(g, gr_gpcs_gpccs_gpc_exception_en_r(), tpc_mask); } void gr_gk20a_enable_hww_exceptions(struct gk20a *g) { /* enable exceptions */ gk20a_writel(g, gr_fe_hww_esr_r(), gr_fe_hww_esr_en_enable_f() | gr_fe_hww_esr_reset_active_f()); gk20a_writel(g, gr_memfmt_hww_esr_r(), gr_memfmt_hww_esr_en_enable_f() | gr_memfmt_hww_esr_reset_active_f()); } void gr_gk20a_fecs_host_int_enable(struct gk20a *g) { gk20a_writel(g, gr_fecs_host_int_enable_r(), gr_fecs_host_int_enable_ctxsw_intr1_enable_f() | gr_fecs_host_int_enable_fault_during_ctxsw_enable_f() | gr_fecs_host_int_enable_umimp_firmware_method_enable_f() | gr_fecs_host_int_enable_umimp_illegal_method_enable_f() | gr_fecs_host_int_enable_watchdog_enable_f()); } static int gk20a_init_gr_setup_hw(struct gk20a *g) { struct gr_gk20a *gr = &g->gr; struct netlist_aiv_list *sw_ctx_load = &g->netlist_vars->sw_ctx_load; struct netlist_av_list *sw_method_init = &g->netlist_vars->sw_method_init; u32 data; u32 last_method_data = 0; u32 i; int err; nvgpu_log_fn(g, " "); if (g->ops.gr.init_gpc_mmu != NULL) { g->ops.gr.init_gpc_mmu(g); } /* load gr floorsweeping registers */ data = gk20a_readl(g, gr_gpc0_ppc0_pes_vsc_strem_r()); data = set_field(data, gr_gpc0_ppc0_pes_vsc_strem_master_pe_m(), gr_gpc0_ppc0_pes_vsc_strem_master_pe_true_f()); gk20a_writel(g, gr_gpc0_ppc0_pes_vsc_strem_r(), data); gr_gk20a_zcull_init_hw(g, gr); if (g->ops.priv_ring.set_ppriv_timeout_settings != NULL) { g->ops.priv_ring.set_ppriv_timeout_settings(g); } /* enable fifo access */ gk20a_writel(g, gr_gpfifo_ctl_r(), gr_gpfifo_ctl_access_enabled_f() | gr_gpfifo_ctl_semaphore_access_enabled_f()); /* TBD: reload gr ucode when needed */ /* enable interrupts */ gk20a_writel(g, gr_intr_r(), 0xFFFFFFFFU); gk20a_writel(g, gr_intr_en_r(), 0xFFFFFFFFU); /* enable fecs error interrupts */ g->ops.gr.fecs_host_int_enable(g); g->ops.gr.enable_hww_exceptions(g); g->ops.gr.set_hww_esr_report_mask(g); /* enable TPC exceptions per GPC */ if (g->ops.gr.enable_gpc_exceptions != NULL) { g->ops.gr.enable_gpc_exceptions(g); } /* enable ECC for L1/SM */ if (g->ops.gr.ecc_init_scrub_reg != NULL) { g->ops.gr.ecc_init_scrub_reg(g); } /* TBD: enable per BE exceptions */ /* reset and enable exceptions */ g->ops.gr.enable_exceptions(g); gr_gk20a_load_zbc_table(g, gr); if (g->ops.ltc.init_cbc != NULL) { g->ops.ltc.init_cbc(g, gr); } if (g->ops.fb.init_cbc != NULL) { g->ops.fb.init_cbc(g, gr); } if (g->ops.gr.disable_rd_coalesce != NULL) { g->ops.gr.disable_rd_coalesce(g); } /* load ctx init */ for (i = 0; i < sw_ctx_load->count; i++) { gk20a_writel(g, sw_ctx_load->l[i].addr, sw_ctx_load->l[i].value); } err = gr_gk20a_wait_idle(g); if (err != 0) { goto out; } if (g->ops.gr.init_preemption_state != NULL) { err = g->ops.gr.init_preemption_state(g); if (err != 0) { goto out; } } /* disable fe_go_idle */ gk20a_writel(g, gr_fe_go_idle_timeout_r(), gr_fe_go_idle_timeout_count_disabled_f()); /* override a few ctx state registers */ g->ops.gr.commit_global_timeslice(g, NULL); /* floorsweep anything left */ err = g->ops.gr.init_fs_state(g); if (err != 0) { goto out; } err = gr_gk20a_wait_idle(g); if (err != 0) { goto restore_fe_go_idle; } restore_fe_go_idle: /* restore fe_go_idle */ gk20a_writel(g, gr_fe_go_idle_timeout_r(), gr_fe_go_idle_timeout_count_prod_f()); if ((err != 0) || (gr_gk20a_wait_idle(g) != 0)) { goto out; } /* load method init */ if (sw_method_init->count != 0U) { gk20a_writel(g, gr_pri_mme_shadow_raw_data_r(), 
sw_method_init->l[0].value); gk20a_writel(g, gr_pri_mme_shadow_raw_index_r(), gr_pri_mme_shadow_raw_index_write_trigger_f() | sw_method_init->l[0].addr); last_method_data = sw_method_init->l[0].value; } for (i = 1; i < sw_method_init->count; i++) { if (sw_method_init->l[i].value != last_method_data) { gk20a_writel(g, gr_pri_mme_shadow_raw_data_r(), sw_method_init->l[i].value); last_method_data = sw_method_init->l[i].value; } gk20a_writel(g, gr_pri_mme_shadow_raw_index_r(), gr_pri_mme_shadow_raw_index_write_trigger_f() | sw_method_init->l[i].addr); } err = gr_gk20a_wait_idle(g); out: nvgpu_log_fn(g, "done"); return err; } static void gr_gk20a_load_gating_prod(struct gk20a *g) { nvgpu_log_fn(g, " "); /* slcg prod values */ if (g->ops.clock_gating.slcg_bus_load_gating_prod != NULL) { g->ops.clock_gating.slcg_bus_load_gating_prod(g, g->slcg_enabled); } if (g->ops.clock_gating.slcg_chiplet_load_gating_prod != NULL) { g->ops.clock_gating.slcg_chiplet_load_gating_prod(g, g->slcg_enabled); } if (g->ops.clock_gating.slcg_gr_load_gating_prod != NULL) { g->ops.clock_gating.slcg_gr_load_gating_prod(g, g->slcg_enabled); } if (g->ops.clock_gating.slcg_ctxsw_firmware_load_gating_prod != NULL) { g->ops.clock_gating.slcg_ctxsw_firmware_load_gating_prod(g, g->slcg_enabled); } if (g->ops.clock_gating.slcg_perf_load_gating_prod != NULL) { g->ops.clock_gating.slcg_perf_load_gating_prod(g, g->slcg_enabled); } if (g->ops.clock_gating.slcg_xbar_load_gating_prod != NULL) { g->ops.clock_gating.slcg_xbar_load_gating_prod(g, g->slcg_enabled); } /* blcg prod values */ if (g->ops.clock_gating.blcg_bus_load_gating_prod != NULL) { g->ops.clock_gating.blcg_bus_load_gating_prod(g, g->blcg_enabled); } if (g->ops.clock_gating.blcg_gr_load_gating_prod != NULL) { g->ops.clock_gating.blcg_gr_load_gating_prod(g, g->blcg_enabled); } if (g->ops.clock_gating.blcg_ctxsw_firmware_load_gating_prod != NULL) { g->ops.clock_gating.blcg_ctxsw_firmware_load_gating_prod(g, g->blcg_enabled); } if (g->ops.clock_gating.blcg_xbar_load_gating_prod != NULL) { g->ops.clock_gating.blcg_xbar_load_gating_prod(g, g->blcg_enabled); } if (g->ops.clock_gating.pg_gr_load_gating_prod != NULL) { g->ops.clock_gating.pg_gr_load_gating_prod(g, true); } nvgpu_log_fn(g, "done"); } static void gk20a_init_gr_prepare(struct gk20a *g) { /* reset gr engine */ g->ops.mc.reset(g, g->ops.mc.reset_mask(g, NVGPU_UNIT_GRAPH) | g->ops.mc.reset_mask(g, NVGPU_UNIT_BLG) | g->ops.mc.reset_mask(g, NVGPU_UNIT_PERFMON)); gr_gk20a_load_gating_prod(g); /* Disable elcg until it gets enabled later in the init*/ gr_gk20a_init_cg_mode(g, ELCG_MODE, ELCG_RUN); /* enable fifo access */ gk20a_writel(g, gr_gpfifo_ctl_r(), gr_gpfifo_ctl_access_enabled_f() | gr_gpfifo_ctl_semaphore_access_enabled_f()); } static int gr_gk20a_wait_mem_scrubbing(struct gk20a *g) { struct nvgpu_timeout timeout; bool fecs_scrubbing; bool gpccs_scrubbing; nvgpu_log_fn(g, " "); nvgpu_timeout_init(g, &timeout, CTXSW_MEM_SCRUBBING_TIMEOUT_MAX / CTXSW_MEM_SCRUBBING_TIMEOUT_DEFAULT, NVGPU_TIMER_RETRY_TIMER); do { fecs_scrubbing = (gk20a_readl(g, gr_fecs_dmactl_r()) & (gr_fecs_dmactl_imem_scrubbing_m() | gr_fecs_dmactl_dmem_scrubbing_m())) != 0U; gpccs_scrubbing = (gk20a_readl(g, gr_gpccs_dmactl_r()) & (gr_gpccs_dmactl_imem_scrubbing_m() | gr_gpccs_dmactl_imem_scrubbing_m())) != 0U; if (!fecs_scrubbing && !gpccs_scrubbing) { nvgpu_log_fn(g, "done"); return 0; } nvgpu_udelay(CTXSW_MEM_SCRUBBING_TIMEOUT_DEFAULT); } while (nvgpu_timeout_expired(&timeout) == 0); nvgpu_err(g, "Falcon mem scrubbing timeout"); return 
-ETIMEDOUT; } static int gr_gk20a_init_ctxsw(struct gk20a *g) { int err = 0; err = g->ops.gr.load_ctxsw_ucode(g); if (err != 0) { goto out; } err = gr_gk20a_wait_ctxsw_ready(g); if (err != 0) { goto out; } out: if (err != 0) { nvgpu_err(g, "fail"); } else { nvgpu_log_fn(g, "done"); } return err; } static int gk20a_init_gr_reset_enable_hw(struct gk20a *g) { struct netlist_av_list *sw_non_ctx_load = &g->netlist_vars->sw_non_ctx_load; u32 i; int err = 0; nvgpu_log_fn(g, " "); /* enable interrupts */ gk20a_writel(g, gr_intr_r(), ~U32(0U)); gk20a_writel(g, gr_intr_en_r(), ~U32(0U)); /* load non_ctx init */ for (i = 0; i < sw_non_ctx_load->count; i++) { gk20a_writel(g, sw_non_ctx_load->l[i].addr, sw_non_ctx_load->l[i].value); } err = gr_gk20a_wait_mem_scrubbing(g); if (err != 0) { goto out; } err = gr_gk20a_wait_idle(g); if (err != 0) { goto out; } out: if (err != 0) { nvgpu_err(g, "fail"); } else { nvgpu_log_fn(g, "done"); } return err; } static int gr_gk20a_init_access_map(struct gk20a *g) { struct gr_gk20a *gr = &g->gr; struct nvgpu_mem *mem; u32 nr_pages = DIV_ROUND_UP(gr->ctx_vars.priv_access_map_size, PAGE_SIZE); u32 *whitelist = NULL; int w, num_entries = 0; mem = nvgpu_gr_global_ctx_buffer_get_mem(gr->global_ctx_buffer, NVGPU_GR_GLOBAL_CTX_PRIV_ACCESS_MAP); if (mem == NULL) { return -EINVAL; } nvgpu_memset(g, mem, 0, 0, PAGE_SIZE * nr_pages); g->ops.gr.get_access_map(g, &whitelist, &num_entries); for (w = 0; w < num_entries; w++) { u32 map_bit, map_byte, map_shift, x; map_bit = whitelist[w] >> 2; map_byte = map_bit >> 3; map_shift = map_bit & 0x7U; /* i.e. 0-7 */ nvgpu_log_info(g, "access map addr:0x%x byte:0x%x bit:%d", whitelist[w], map_byte, map_shift); x = nvgpu_mem_rd32(g, mem, map_byte / (u32)sizeof(u32)); x |= BIT32( (map_byte % sizeof(u32) * BITS_PER_BYTE) + map_shift); nvgpu_mem_wr32(g, mem, map_byte / (u32)sizeof(u32), x); } return 0; } static int gk20a_init_gr_setup_sw(struct gk20a *g) { struct gr_gk20a *gr = &g->gr; int err = 0; nvgpu_log_fn(g, " "); if (gr->sw_ready) { nvgpu_log_fn(g, "skip init"); return 0; } gr->g = g; #if defined(CONFIG_GK20A_CYCLE_STATS) err = nvgpu_mutex_init(&g->gr.cs_lock); if (err != 0) { nvgpu_err(g, "Error in gr.cs_lock mutex initialization"); return err; } #endif err = gr_gk20a_init_gr_config(g, gr); if (err != 0) { goto clean_up; } err = nvgpu_gr_config_init_map_tiles(g, gr->config); if (err != 0) { goto clean_up; } if (g->ops.ltc.init_comptags != NULL) { err = g->ops.ltc.init_comptags(g, gr); if (err != 0) { goto clean_up; } } err = gr_gk20a_init_zcull(g, gr); if (err != 0) { goto clean_up; } gr->gr_ctx_desc = nvgpu_gr_ctx_desc_alloc(g); if (gr->gr_ctx_desc == NULL) { goto clean_up; } gr->global_ctx_buffer = nvgpu_gr_global_ctx_desc_alloc(g); if (gr->global_ctx_buffer == NULL) { goto clean_up; } err = g->ops.gr.alloc_global_ctx_buffers(g); if (err != 0) { goto clean_up; } err = gr_gk20a_init_access_map(g); if (err != 0) { goto clean_up; } gr_gk20a_load_zbc_default_table(g, gr); if (g->ops.gr.init_czf_bypass != NULL) { g->ops.gr.init_czf_bypass(g); } if (g->ops.gr.init_gfxp_wfi_timeout_count != NULL) { g->ops.gr.init_gfxp_wfi_timeout_count(g); } err = nvgpu_mutex_init(&gr->ctx_mutex); if (err != 0) { nvgpu_err(g, "Error in gr.ctx_mutex initialization"); goto clean_up; } nvgpu_spinlock_init(&gr->ch_tlb_lock); gr->remove_support = gk20a_remove_gr_support; gr->sw_ready = true; err = nvgpu_ecc_init_support(g); if (err != 0) { goto clean_up; } nvgpu_log_fn(g, "done"); return 0; clean_up: nvgpu_err(g, "fail"); gk20a_remove_gr_support(gr); return err; }
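/*
 * Bind the FECS power-gating (ELPG) reglist buffer: query the reglist image
 * size from FECS, allocate the PMU PG buffer in the PMU VM if it has not been
 * allocated yet, bind the PMU instance block to FECS and program the buffer's
 * GPU virtual address. Called from gk20a_init_gr_support() only when
 * g->can_elpg is set.
 */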
static int gk20a_init_gr_bind_fecs_elpg(struct gk20a *g) { struct nvgpu_pmu *pmu = &g->pmu; struct mm_gk20a *mm = &g->mm; struct vm_gk20a *vm = mm->pmu.vm; int err = 0; u32 size; nvgpu_log_fn(g, " "); size = 0; err = gr_gk20a_fecs_get_reglist_img_size(g, &size); if (err != 0) { nvgpu_err(g, "fail to query fecs pg buffer size"); return err; } if (pmu->pg_buf.cpu_va == NULL) { err = nvgpu_dma_alloc_map_sys(vm, size, &pmu->pg_buf); if (err != 0) { nvgpu_err(g, "failed to allocate memory"); return -ENOMEM; } } err = gr_gk20a_fecs_set_reglist_bind_inst(g, &mm->pmu.inst_block); if (err != 0) { nvgpu_err(g, "fail to bind pmu inst to gr"); return err; } err = gr_gk20a_fecs_set_reglist_virtual_addr(g, pmu->pg_buf.gpu_va); if (err != 0) { nvgpu_err(g, "fail to set pg buffer pmu va"); return err; } return err; } int gk20a_init_gr_support(struct gk20a *g) { int err = 0; nvgpu_log_fn(g, " "); /* this is required before gr_gk20a_init_ctx_state */ err = nvgpu_mutex_init(&g->gr.fecs_mutex); if (err != 0) { nvgpu_err(g, "Error in gr.fecs_mutex initialization"); return err; } err = gr_gk20a_init_ctxsw(g); if (err != 0) { return err; } /* this appears query for sw states but fecs actually init ramchain, etc so this is hw init */ err = g->ops.gr.init_ctx_state(g); if (err != 0) { return err; } err = gk20a_init_gr_setup_sw(g); if (err != 0) { return err; } err = gk20a_init_gr_setup_hw(g); if (err != 0) { return err; } if (g->can_elpg) { err = gk20a_init_gr_bind_fecs_elpg(g); if (err != 0) { return err; } } gr_gk20a_enable_elcg(g); /* GR is inialized, signal possible waiters */ g->gr.initialized = true; nvgpu_cond_signal(&g->gr.init_wq); return 0; } /* Wait until GR is initialized */ void gk20a_gr_wait_initialized(struct gk20a *g) { NVGPU_COND_WAIT(&g->gr.init_wq, g->gr.initialized, 0U); } #define NVA297_SET_ALPHA_CIRCULAR_BUFFER_SIZE 0x02dcU #define NVA297_SET_CIRCULAR_BUFFER_SIZE 0x1280U #define NVA297_SET_SHADER_EXCEPTIONS 0x1528U #define NVA0C0_SET_SHADER_EXCEPTIONS 0x1528U #define NVA297_SET_SHADER_EXCEPTIONS_ENABLE_FALSE U32(0) void gk20a_gr_set_shader_exceptions(struct gk20a *g, u32 data) { nvgpu_log_fn(g, " "); if (data == NVA297_SET_SHADER_EXCEPTIONS_ENABLE_FALSE) { gk20a_writel(g, gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r(), 0); gk20a_writel(g, gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r(), 0); } else { /* setup sm warp esr report masks */ gk20a_writel(g, gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r(), gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_stack_error_report_f() | gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_api_stack_error_report_f() | gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_ret_empty_stack_error_report_f() | gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_pc_wrap_report_f() | gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_pc_report_f() | gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_pc_overflow_report_f() | gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_immc_addr_report_f() | gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_reg_report_f() | gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_encoding_report_f() | gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_sph_instr_combo_report_f() | gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_param_report_f() | gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_const_addr_report_f() | gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_oor_reg_report_f() | gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_oor_addr_report_f() | gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_addr_report_f() | gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_addr_space_report_f() 
| gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_param2_report_f() | gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_const_addr_ldc_report_f() | gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_geometry_sm_error_report_f() | gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_divergent_report_f()); /* setup sm global esr report mask */ gk20a_writel(g, gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r(), gr_gpcs_tpcs_sm_hww_global_esr_report_mask_sm_to_sm_fault_report_f() | gr_gpcs_tpcs_sm_hww_global_esr_report_mask_l1_error_report_f() | gr_gpcs_tpcs_sm_hww_global_esr_report_mask_multiple_warp_errors_report_f() | gr_gpcs_tpcs_sm_hww_global_esr_report_mask_physical_stack_overflow_error_report_f() | gr_gpcs_tpcs_sm_hww_global_esr_report_mask_bpt_int_report_f() | gr_gpcs_tpcs_sm_hww_global_esr_report_mask_bpt_pause_report_f() | gr_gpcs_tpcs_sm_hww_global_esr_report_mask_single_step_complete_report_f()); } } int gk20a_enable_gr_hw(struct gk20a *g) { int err; nvgpu_log_fn(g, " "); gk20a_init_gr_prepare(g); err = nvgpu_netlist_init_ctx_vars(g); if (err != 0) { nvgpu_err(g, "failed to parse netlist"); return err; } err = gk20a_init_gr_reset_enable_hw(g); if (err != 0) { return err; } nvgpu_log_fn(g, "done"); return 0; } static void gr_gk20a_enable_elcg(struct gk20a *g) { if (g->elcg_enabled) { gr_gk20a_init_cg_mode(g, ELCG_MODE, ELCG_AUTO); } else { gr_gk20a_init_cg_mode(g, ELCG_MODE, ELCG_RUN); } } int gk20a_gr_reset(struct gk20a *g) { int err; u32 size; nvgpu_mutex_acquire(&g->gr.fecs_mutex); err = gk20a_enable_gr_hw(g); if (err != 0) { nvgpu_mutex_release(&g->gr.fecs_mutex); return err; } err = gk20a_init_gr_setup_hw(g); if (err != 0) { nvgpu_mutex_release(&g->gr.fecs_mutex); return err; } err = gr_gk20a_init_ctxsw(g); if (err != 0) { nvgpu_mutex_release(&g->gr.fecs_mutex); return err; } nvgpu_mutex_release(&g->gr.fecs_mutex); /* this appears query for sw states but fecs actually init ramchain, etc so this is hw init */ err = g->ops.gr.init_ctx_state(g); if (err != 0) { return err; } size = 0; err = gr_gk20a_fecs_get_reglist_img_size(g, &size); if (err != 0) { nvgpu_err(g, "fail to query fecs pg buffer size"); return err; } err = gr_gk20a_fecs_set_reglist_bind_inst(g, &g->mm.pmu.inst_block); if (err != 0) { nvgpu_err(g, "fail to bind pmu inst to gr"); return err; } err = gr_gk20a_fecs_set_reglist_virtual_addr(g, g->pmu.pg_buf.gpu_va); if (err != 0) { nvgpu_err(g, "fail to set pg buffer pmu va"); return err; } gr_gk20a_load_gating_prod(g); gr_gk20a_enable_elcg(g); return err; } static void gk20a_gr_set_error_notifier(struct gk20a *g, struct gr_gk20a_isr_data *isr_data, u32 error_notifier) { struct channel_gk20a *ch; struct tsg_gk20a *tsg; struct channel_gk20a *ch_tsg; ch = isr_data->ch; if (ch == NULL) { return; } tsg = tsg_gk20a_from_ch(ch); if (tsg != NULL) { nvgpu_rwsem_down_read(&tsg->ch_list_lock); nvgpu_list_for_each_entry(ch_tsg, &tsg->ch_list, channel_gk20a, ch_entry) { if (gk20a_channel_get(ch_tsg) != NULL) { g->ops.fifo.set_error_notifier(ch_tsg, error_notifier); gk20a_channel_put(ch_tsg); } } nvgpu_rwsem_up_read(&tsg->ch_list_lock); } else { nvgpu_err(g, "chid: %d is not bound to tsg", ch->chid); } } static int gk20a_gr_handle_semaphore_timeout_pending(struct gk20a *g, struct gr_gk20a_isr_data *isr_data) { nvgpu_log_fn(g, " "); gk20a_gr_set_error_notifier(g, isr_data, NVGPU_ERR_NOTIFIER_GR_SEMAPHORE_TIMEOUT); nvgpu_err(g, "gr semaphore timeout"); return -EINVAL; } static int gk20a_gr_intr_illegal_notify_pending(struct gk20a *g, struct gr_gk20a_isr_data *isr_data) { nvgpu_log_fn(g, " "); 
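/* Propagate the error notifier to every channel bound to the TSG that owns
 * the trapped channel; gk20a_gr_set_error_notifier() walks the TSG's channel
 * list.
 */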
gk20a_gr_set_error_notifier(g, isr_data, NVGPU_ERR_NOTIFIER_GR_ILLEGAL_NOTIFY); /* This is an unrecoverable error, reset is needed */ nvgpu_err(g, "gr illegal notify pending"); return -EINVAL; } static int gk20a_gr_handle_illegal_method(struct gk20a *g, struct gr_gk20a_isr_data *isr_data) { int ret = g->ops.gr.handle_sw_method(g, isr_data->addr, isr_data->class_num, isr_data->offset, isr_data->data_lo); if (ret != 0) { gk20a_gr_set_error_notifier(g, isr_data, NVGPU_ERR_NOTIFIER_GR_ILLEGAL_NOTIFY); nvgpu_err(g, "invalid method class 0x%08x" ", offset 0x%08x address 0x%08x", isr_data->class_num, isr_data->offset, isr_data->addr); } return ret; } static int gk20a_gr_handle_illegal_class(struct gk20a *g, struct gr_gk20a_isr_data *isr_data) { nvgpu_log_fn(g, " "); gk20a_gr_set_error_notifier(g, isr_data, NVGPU_ERR_NOTIFIER_GR_ERROR_SW_NOTIFY); nvgpu_err(g, "invalid class 0x%08x, offset 0x%08x", isr_data->class_num, isr_data->offset); return -EINVAL; } int gk20a_gr_handle_fecs_error(struct gk20a *g, struct channel_gk20a *ch, struct gr_gk20a_isr_data *isr_data) { u32 gr_fecs_intr = gk20a_readl(g, gr_fecs_host_int_status_r()); int ret = 0; u32 chid = isr_data->ch != NULL ? isr_data->ch->chid : FIFO_INVAL_CHANNEL_ID; if (gr_fecs_intr == 0U) { return 0; } if ((gr_fecs_intr & gr_fecs_host_int_status_umimp_firmware_method_f(1)) != 0U) { gk20a_gr_set_error_notifier(g, isr_data, NVGPU_ERR_NOTIFIER_FECS_ERR_UNIMP_FIRMWARE_METHOD); nvgpu_err(g, "firmware method error 0x%08x for offset 0x%04x", gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(6)), isr_data->data_lo); ret = -1; } else if ((gr_fecs_intr & gr_fecs_host_int_status_watchdog_active_f()) != 0U) { /* currently, recovery is not initiated */ nvgpu_err(g, "fecs watchdog triggered for channel %u, " "cannot ctxsw anymore !!", chid); g->ops.gr.dump_gr_falcon_stats(g); } else if ((gr_fecs_intr & gr_fecs_host_int_status_ctxsw_intr_f(CTXSW_INTR0)) != 0U) { u32 mailbox_value = gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(6)); #ifdef CONFIG_GK20A_CTXSW_TRACE if (mailbox_value == g->ops.fecs_trace.get_buffer_full_mailbox_val()) { nvgpu_info(g, "ctxsw intr0 set by ucode, " "timestamp buffer full"); gk20a_fecs_trace_reset_buffer(g); } else { nvgpu_err(g, "ctxsw intr0 set by ucode, error_code: 0x%08x", mailbox_value); ret = -1; } #else nvgpu_err(g, "ctxsw intr0 set by ucode, error_code: 0x%08x", mailbox_value); ret = -1; #endif } else { nvgpu_err(g, "unhandled fecs error interrupt 0x%08x for channel %u", gr_fecs_intr, chid); g->ops.gr.dump_gr_falcon_stats(g); } gk20a_writel(g, gr_fecs_host_int_clear_r(), gr_fecs_intr); return ret; } static int gk20a_gr_handle_class_error(struct gk20a *g, struct gr_gk20a_isr_data *isr_data) { u32 gr_class_error; u32 chid = isr_data->ch != NULL ?
isr_data->ch->chid : FIFO_INVAL_CHANNEL_ID; nvgpu_log_fn(g, " "); gr_class_error = gr_class_error_code_v(gk20a_readl(g, gr_class_error_r())); gk20a_gr_set_error_notifier(g, isr_data, NVGPU_ERR_NOTIFIER_GR_ERROR_SW_NOTIFY); nvgpu_err(g, "class error 0x%08x, offset 0x%08x," "sub channel 0x%08x mme generated %d," " mme pc 0x%08xdata high %d priv status %d" " unhandled intr 0x%08x for channel %u", isr_data->class_num, (isr_data->offset << 2), gr_trapped_addr_subch_v(isr_data->addr), gr_trapped_addr_mme_generated_v(isr_data->addr), gr_trapped_data_mme_pc_v( gk20a_readl(g, gr_trapped_data_mme_r())), gr_trapped_addr_datahigh_v(isr_data->addr), gr_trapped_addr_priv_v(isr_data->addr), gr_class_error, chid); nvgpu_err(g, "trapped data low 0x%08x", gk20a_readl(g, gr_trapped_data_lo_r())); if (gr_trapped_addr_datahigh_v(isr_data->addr) != 0U) { nvgpu_err(g, "trapped data high 0x%08x", gk20a_readl(g, gr_trapped_data_hi_r())); } return -EINVAL; } static int gk20a_gr_handle_firmware_method(struct gk20a *g, struct gr_gk20a_isr_data *isr_data) { u32 chid = isr_data->ch != NULL ? isr_data->ch->chid : FIFO_INVAL_CHANNEL_ID; nvgpu_log_fn(g, " "); gk20a_gr_set_error_notifier(g, isr_data, NVGPU_ERR_NOTIFIER_GR_ERROR_SW_NOTIFY); nvgpu_err(g, "firmware method 0x%08x, offset 0x%08x for channel %u", isr_data->class_num, isr_data->offset, chid); return -EINVAL; } int gk20a_gr_handle_semaphore_pending(struct gk20a *g, struct gr_gk20a_isr_data *isr_data) { struct channel_gk20a *ch = isr_data->ch; struct tsg_gk20a *tsg; if (ch == NULL) { return 0; } tsg = tsg_gk20a_from_ch(ch); if (tsg != NULL) { g->ops.fifo.post_event_id(tsg, NVGPU_EVENT_ID_GR_SEMAPHORE_WRITE_AWAKEN); nvgpu_cond_broadcast(&ch->semaphore_wq); } else { nvgpu_err(g, "chid: %d is not bound to tsg", ch->chid); } return 0; } #if defined(CONFIG_GK20A_CYCLE_STATS) static inline bool is_valid_cyclestats_bar0_offset_gk20a(struct gk20a *g, u32 offset) { /* support only 24-bit 4-byte aligned offsets */ bool valid = !(offset & 0xFF000003U); if (g->allow_all) { return true; } /* whitelist check */ valid = valid && is_bar0_global_offset_whitelisted_gk20a(g, offset); /* resource size check in case there was a problem * with allocating the assumed size of bar0 */ valid = valid && gk20a_io_valid_reg(g, offset); return valid; } #endif int gk20a_gr_handle_notify_pending(struct gk20a *g, struct gr_gk20a_isr_data *isr_data) { struct channel_gk20a *ch = isr_data->ch; #if defined(CONFIG_GK20A_CYCLE_STATS) void *virtual_address; u32 buffer_size; u32 offset; bool exit; #endif if (ch == NULL || tsg_gk20a_from_ch(ch) == NULL) { return 0; } #if defined(CONFIG_GK20A_CYCLE_STATS) /* GL will never use payload 0 for cycle state */ if ((ch->cyclestate.cyclestate_buffer == NULL) || (isr_data->data_lo == 0)) { return 0; } nvgpu_mutex_acquire(&ch->cyclestate.cyclestate_buffer_mutex); virtual_address = ch->cyclestate.cyclestate_buffer; buffer_size = ch->cyclestate.cyclestate_buffer_size; offset = isr_data->data_lo; exit = false; while (!exit) { struct share_buffer_head *sh_hdr; u32 min_element_size; /* validate offset */ if (offset + sizeof(struct share_buffer_head) > buffer_size || offset + sizeof(struct share_buffer_head) < offset) { nvgpu_err(g, "cyclestats buffer overrun at offset 0x%x", offset); break; } sh_hdr = (struct share_buffer_head *) ((char *)virtual_address + offset); min_element_size = (sh_hdr->operation == OP_END ? 
sizeof(struct share_buffer_head) : sizeof(struct gk20a_cyclestate_buffer_elem)); /* validate sh_hdr->size */ if (sh_hdr->size < min_element_size || offset + sh_hdr->size > buffer_size || offset + sh_hdr->size < offset) { nvgpu_err(g, "bad cyclestate buffer header size at offset 0x%x", offset); sh_hdr->failed = true; break; } switch (sh_hdr->operation) { case OP_END: exit = true; break; case BAR0_READ32: case BAR0_WRITE32: { struct gk20a_cyclestate_buffer_elem *op_elem = (struct gk20a_cyclestate_buffer_elem *)sh_hdr; bool valid = is_valid_cyclestats_bar0_offset_gk20a( g, op_elem->offset_bar0); u32 raw_reg; u64 mask_orig; u64 v; if (!valid) { nvgpu_err(g, "invalid cycletstats op offset: 0x%x", op_elem->offset_bar0); sh_hdr->failed = exit = true; break; } mask_orig = ((1ULL << (op_elem->last_bit + 1)) -1)&~((1ULL << op_elem->first_bit)-1); raw_reg = gk20a_readl(g, op_elem->offset_bar0); switch (sh_hdr->operation) { case BAR0_READ32: op_elem->data = (raw_reg & mask_orig) >> op_elem->first_bit; break; case BAR0_WRITE32: v = 0; if ((unsigned int)mask_orig != ~((unsigned int)0)) { v = (unsigned int) (raw_reg & ~mask_orig); } v |= ((op_elem->data << op_elem->first_bit) & mask_orig); gk20a_writel(g, op_elem->offset_bar0, (unsigned int)v); break; default: /* nop ok?*/ break; } } break; default: /* no operation content case */ exit = true; break; } sh_hdr->completed = true; offset += sh_hdr->size; } nvgpu_mutex_release(&ch->cyclestate.cyclestate_buffer_mutex); #endif nvgpu_log_fn(g, " "); nvgpu_cond_broadcast_interruptible(&ch->notifier_wq); return 0; } /* Used by sw interrupt thread to translate current ctx to chid. * Also used by regops to translate current ctx to chid and tsgid. * For performance, we don't want to go through 128 channels every time. * curr_ctx should be the value read from gr_fecs_current_ctx_r(). * A small tlb is used here to cache translation. * * Returned channel must be freed with gk20a_channel_put() */ static struct channel_gk20a *gk20a_gr_get_channel_from_ctx( struct gk20a *g, u32 curr_ctx, u32 *curr_tsgid) { struct fifo_gk20a *f = &g->fifo; struct gr_gk20a *gr = &g->gr; u32 chid; u32 tsgid = NVGPU_INVALID_TSG_ID; u32 i; struct channel_gk20a *ret = NULL; /* when contexts are unloaded from GR, the valid bit is reset * but the instance pointer information remains intact. * This might be called from gr_isr where contexts might be * unloaded. 
No need to check ctx_valid bit */ nvgpu_spinlock_acquire(&gr->ch_tlb_lock); /* check cache first */ for (i = 0; i < GR_CHANNEL_MAP_TLB_SIZE; i++) { if (gr->chid_tlb[i].curr_ctx == curr_ctx) { chid = gr->chid_tlb[i].chid; tsgid = gr->chid_tlb[i].tsgid; ret = gk20a_channel_from_id(g, chid); goto unlock; } } /* slow path */ for (chid = 0; chid < f->num_channels; chid++) { struct channel_gk20a *ch = gk20a_channel_from_id(g, chid); if (ch == NULL) { continue; } if ((u32)(nvgpu_inst_block_addr(g, &ch->inst_block) >> ram_in_base_shift_v()) == gr_fecs_current_ctx_ptr_v(curr_ctx)) { tsgid = ch->tsgid; /* found it */ ret = ch; break; } gk20a_channel_put(ch); } if (ret == NULL) { goto unlock; } /* add to free tlb entry */ for (i = 0; i < GR_CHANNEL_MAP_TLB_SIZE; i++) { if (gr->chid_tlb[i].curr_ctx == 0U) { gr->chid_tlb[i].curr_ctx = curr_ctx; gr->chid_tlb[i].chid = chid; gr->chid_tlb[i].tsgid = tsgid; goto unlock; } } /* no free entry, flush one */ gr->chid_tlb[gr->channel_tlb_flush_index].curr_ctx = curr_ctx; gr->chid_tlb[gr->channel_tlb_flush_index].chid = chid; gr->chid_tlb[gr->channel_tlb_flush_index].tsgid = tsgid; gr->channel_tlb_flush_index = (gr->channel_tlb_flush_index + 1U) & (GR_CHANNEL_MAP_TLB_SIZE - 1U); unlock: nvgpu_spinlock_release(&gr->ch_tlb_lock); if (curr_tsgid != NULL) { *curr_tsgid = tsgid; } return ret; } int gk20a_gr_lock_down_sm(struct gk20a *g, u32 gpc, u32 tpc, u32 sm, u32 global_esr_mask, bool check_errors) { u32 offset = gk20a_gr_gpc_offset(g, gpc) + gk20a_gr_tpc_offset(g, tpc); u32 dbgr_control0; nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, "GPC%d TPC%d SM%d: assert stop trigger", gpc, tpc, sm); /* assert stop trigger */ dbgr_control0 = gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_control0_r() + offset); dbgr_control0 |= gr_gpc0_tpc0_sm_dbgr_control0_stop_trigger_enable_f(); gk20a_writel(g, gr_gpc0_tpc0_sm_dbgr_control0_r() + offset, dbgr_control0); return g->ops.gr.wait_for_sm_lock_down(g, gpc, tpc, sm, global_esr_mask, check_errors); } bool gk20a_gr_sm_debugger_attached(struct gk20a *g) { u32 dbgr_control0 = gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_control0_r()); /* check if an sm debugger is attached. * assumption: all SMs will have debug mode enabled/disabled * uniformly. 
*/ if (gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_v(dbgr_control0) == gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_on_v()) { return true; } return false; } int gr_gk20a_handle_sm_exception(struct gk20a *g, u32 gpc, u32 tpc, u32 sm, bool *post_event, struct channel_gk20a *fault_ch, u32 *hww_global_esr) { int ret = 0; bool do_warp_sync = false, early_exit = false, ignore_debugger = false; bool disable_sm_exceptions = true; u32 offset = gk20a_gr_gpc_offset(g, gpc) + gk20a_gr_tpc_offset(g, tpc); bool sm_debugger_attached; u32 global_esr, warp_esr, global_mask; nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, " "); sm_debugger_attached = g->ops.gr.sm_debugger_attached(g); global_esr = g->ops.gr.get_sm_hww_global_esr(g, gpc, tpc, sm); *hww_global_esr = global_esr; warp_esr = g->ops.gr.get_sm_hww_warp_esr(g, gpc, tpc, sm); global_mask = g->ops.gr.get_sm_no_lock_down_hww_global_esr_mask(g); if (!sm_debugger_attached) { nvgpu_err(g, "sm hww global 0x%08x warp 0x%08x", global_esr, warp_esr); return -EFAULT; } nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, "sm hww global 0x%08x warp 0x%08x", global_esr, warp_esr); gr_gk20a_elpg_protected_call(g, g->ops.gr.record_sm_error_state(g, gpc, tpc, sm, fault_ch)); if (g->ops.gr.pre_process_sm_exception != NULL) { ret = g->ops.gr.pre_process_sm_exception(g, gpc, tpc, sm, global_esr, warp_esr, sm_debugger_attached, fault_ch, &early_exit, &ignore_debugger); if (ret != 0) { nvgpu_err(g, "could not pre-process sm error!"); return ret; } } if (early_exit) { nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, "returning early"); return ret; } /* * Disable forwarding of tpc exceptions, * the debugger will reenable exceptions after servicing them. * * Do not disable exceptions if the only SM exception is BPT_INT */ if ((global_esr == gr_gpc0_tpc0_sm_hww_global_esr_bpt_int_pending_f()) && (warp_esr == 0U)) { disable_sm_exceptions = false; } if (!ignore_debugger && disable_sm_exceptions) { u32 tpc_exception_en = gk20a_readl(g, gr_gpc0_tpc0_tpccs_tpc_exception_en_r() + offset); tpc_exception_en &= ~gr_gpc0_tpc0_tpccs_tpc_exception_en_sm_enabled_f(); gk20a_writel(g, gr_gpc0_tpc0_tpccs_tpc_exception_en_r() + offset, tpc_exception_en); nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, "SM Exceptions disabled"); } /* if a debugger is present and an error has occurred, do a warp sync */ if (!ignore_debugger && ((warp_esr != 0U) || ((global_esr & ~global_mask) != 0U))) { nvgpu_log(g, gpu_dbg_intr, "warp sync needed"); do_warp_sync = true; } if (do_warp_sync) { ret = g->ops.gr.lock_down_sm(g, gpc, tpc, sm, global_mask, true); if (ret != 0) { nvgpu_err(g, "sm did not lock down!"); return ret; } } if (ignore_debugger) { nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, "ignore_debugger set, skipping event posting"); } else { *post_event = true; } return ret; } int gr_gk20a_handle_tex_exception(struct gk20a *g, u32 gpc, u32 tpc, bool *post_event) { int ret = 0; u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE); u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE); u32 offset = gpc_stride * gpc + tpc_in_gpc_stride * tpc; u32 esr; nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, " "); esr = gk20a_readl(g, gr_gpc0_tpc0_tex_m_hww_esr_r() + offset); nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, "0x%08x", esr); gk20a_writel(g, gr_gpc0_tpc0_tex_m_hww_esr_r() + offset, esr); return ret; } void gk20a_gr_get_esr_sm_sel(struct gk20a *g, u32 gpc, u32 tpc, u32 *esr_sm_sel) { *esr_sm_sel = 1; } static int gk20a_gr_handle_tpc_exception(struct gk20a *g, u32 gpc, u32 tpc, bool *post_event, 
struct channel_gk20a *fault_ch, u32 *hww_global_esr) { int tmp_ret, ret = 0; u32 offset = gk20a_gr_gpc_offset(g, gpc) + gk20a_gr_tpc_offset(g, tpc); u32 tpc_exception = gk20a_readl(g, gr_gpc0_tpc0_tpccs_tpc_exception_r() + offset); u32 sm_per_tpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_SM_PER_TPC); nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, "GPC%d TPC%d: pending exception 0x%x", gpc, tpc, tpc_exception); /* check if an sm exeption is pending */ if (gr_gpc0_tpc0_tpccs_tpc_exception_sm_v(tpc_exception) == gr_gpc0_tpc0_tpccs_tpc_exception_sm_pending_v()) { u32 esr_sm_sel, sm; nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, "GPC%d TPC%d: SM exception pending", gpc, tpc); if (g->ops.gr.handle_tpc_sm_ecc_exception != NULL) { g->ops.gr.handle_tpc_sm_ecc_exception(g, gpc, tpc, post_event, fault_ch, hww_global_esr); } g->ops.gr.get_esr_sm_sel(g, gpc, tpc, &esr_sm_sel); for (sm = 0; sm < sm_per_tpc; sm++) { if ((esr_sm_sel & BIT32(sm)) == 0U) { continue; } nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, "GPC%d TPC%d: SM%d exception pending", gpc, tpc, sm); tmp_ret = g->ops.gr.handle_sm_exception(g, gpc, tpc, sm, post_event, fault_ch, hww_global_esr); ret = (ret != 0) ? ret : tmp_ret; /* clear the hwws, also causes tpc and gpc * exceptions to be cleared. Should be cleared * only if SM is locked down or empty. */ g->ops.gr.clear_sm_hww(g, gpc, tpc, sm, *hww_global_esr); } } /* check if a tex exeption is pending */ if (gr_gpc0_tpc0_tpccs_tpc_exception_tex_v(tpc_exception) == gr_gpc0_tpc0_tpccs_tpc_exception_tex_pending_v()) { nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, "GPC%d TPC%d: TEX exception pending", gpc, tpc); tmp_ret = g->ops.gr.handle_tex_exception(g, gpc, tpc, post_event); ret = (ret != 0) ? ret : tmp_ret; } if (g->ops.gr.handle_tpc_mpc_exception != NULL) { tmp_ret = g->ops.gr.handle_tpc_mpc_exception(g, gpc, tpc, post_event); ret = (ret != 0) ? ret : tmp_ret; } return ret; } static int gk20a_gr_handle_gpc_exception(struct gk20a *g, bool *post_event, struct channel_gk20a *fault_ch, u32 *hww_global_esr) { int tmp_ret, ret = 0; u32 gpc_offset, gpc, tpc; struct gr_gk20a *gr = &g->gr; u32 exception1 = gk20a_readl(g, gr_exception1_r()); u32 gpc_exception; nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, " "); for (gpc = 0; gpc < nvgpu_gr_config_get_gpc_count(gr->config); gpc++) { if ((exception1 & BIT32(gpc)) == 0U) { continue; } nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, "GPC%d exception pending", gpc); gpc_offset = gk20a_gr_gpc_offset(g, gpc); gpc_exception = gk20a_readl(g, gr_gpc0_gpccs_gpc_exception_r() + gpc_offset); /* check if any tpc has an exception */ for (tpc = 0; tpc < nvgpu_gr_config_get_gpc_tpc_count(gr->config, gpc); tpc++) { if ((gr_gpc0_gpccs_gpc_exception_tpc_v(gpc_exception) & BIT32(tpc)) == 0U) { continue; } nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, "GPC%d: TPC%d exception pending", gpc, tpc); tmp_ret = gk20a_gr_handle_tpc_exception(g, gpc, tpc, post_event, fault_ch, hww_global_esr); ret = (ret != 0) ? ret : tmp_ret; } /* Handle GCC exception */ if ((gr_gpc0_gpccs_gpc_exception_gcc_v(gpc_exception) != 0U) && (g->ops.gr.handle_gcc_exception != NULL)) { tmp_ret = g->ops.gr.handle_gcc_exception(g, gpc, tpc, post_event, fault_ch, hww_global_esr); ret = (ret != 0) ? ret : tmp_ret; } /* Handle GPCCS exceptions */ if (g->ops.gr.handle_gpc_gpccs_exception != NULL) { tmp_ret = g->ops.gr.handle_gpc_gpccs_exception(g, gpc, gpc_exception); ret = (ret != 0) ? 
ret : tmp_ret; } /* Handle GPCMMU exceptions */ if (g->ops.gr.handle_gpc_gpcmmu_exception != NULL) { tmp_ret = g->ops.gr.handle_gpc_gpcmmu_exception(g, gpc, gpc_exception); ret = (ret != 0) ? ret : tmp_ret; } } return ret; } static int gk20a_gr_post_bpt_events(struct gk20a *g, struct tsg_gk20a *tsg, u32 global_esr) { if ((global_esr & gr_gpc0_tpc0_sm_hww_global_esr_bpt_int_pending_f()) != 0U) { g->ops.fifo.post_event_id(tsg, NVGPU_EVENT_ID_BPT_INT); } if ((global_esr & gr_gpc0_tpc0_sm_hww_global_esr_bpt_pause_pending_f()) != 0U) { g->ops.fifo.post_event_id(tsg, NVGPU_EVENT_ID_BPT_PAUSE); } return 0; } int gk20a_gr_isr(struct gk20a *g) { struct gr_gk20a_isr_data isr_data; u32 grfifo_ctl; u32 obj_table; bool need_reset = false; u32 gr_intr = gk20a_readl(g, gr_intr_r()); struct channel_gk20a *ch = NULL; struct channel_gk20a *fault_ch = NULL; u32 tsgid = NVGPU_INVALID_TSG_ID; struct tsg_gk20a *tsg = NULL; u32 gr_engine_id; u32 global_esr = 0; u32 chid; nvgpu_log_fn(g, " "); nvgpu_log(g, gpu_dbg_intr, "pgraph intr 0x%08x", gr_intr); if (gr_intr == 0U) { return 0; } gr_engine_id = nvgpu_engine_get_gr_eng_id(g); if (gr_engine_id != FIFO_INVAL_ENGINE_ID) { gr_engine_id = BIT32(gr_engine_id); } grfifo_ctl = gk20a_readl(g, gr_gpfifo_ctl_r()); grfifo_ctl &= ~gr_gpfifo_ctl_semaphore_access_f(1); grfifo_ctl &= ~gr_gpfifo_ctl_access_f(1); gk20a_writel(g, gr_gpfifo_ctl_r(), grfifo_ctl | gr_gpfifo_ctl_access_f(0) | gr_gpfifo_ctl_semaphore_access_f(0)); isr_data.addr = gk20a_readl(g, gr_trapped_addr_r()); isr_data.data_lo = gk20a_readl(g, gr_trapped_data_lo_r()); isr_data.data_hi = gk20a_readl(g, gr_trapped_data_hi_r()); isr_data.curr_ctx = gk20a_readl(g, gr_fecs_current_ctx_r()); isr_data.offset = gr_trapped_addr_mthd_v(isr_data.addr); isr_data.sub_chan = gr_trapped_addr_subch_v(isr_data.addr); obj_table = (isr_data.sub_chan < 4U) ? gk20a_readl(g, gr_fe_object_table_r(isr_data.sub_chan)) : 0U; isr_data.class_num = gr_fe_object_table_nvclass_v(obj_table); ch = gk20a_gr_get_channel_from_ctx(g, isr_data.curr_ctx, &tsgid); isr_data.ch = ch; chid = ch != NULL ? 
ch->chid : FIFO_INVAL_CHANNEL_ID; if (ch == NULL) { nvgpu_err(g, "pgraph intr: 0x%08x, chid: INVALID", gr_intr); } else { tsg = tsg_gk20a_from_ch(ch); if (tsg == NULL) { nvgpu_err(g, "pgraph intr: 0x%08x, chid: %d " "not bound to tsg", gr_intr, chid); } } nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, "channel %d: addr 0x%08x, " "data 0x%08x 0x%08x," "ctx 0x%08x, offset 0x%08x, " "subchannel 0x%08x, class 0x%08x", chid, isr_data.addr, isr_data.data_hi, isr_data.data_lo, isr_data.curr_ctx, isr_data.offset, isr_data.sub_chan, isr_data.class_num); if ((gr_intr & gr_intr_notify_pending_f()) != 0U) { g->ops.gr.handle_notify_pending(g, &isr_data); gk20a_writel(g, gr_intr_r(), gr_intr_notify_reset_f()); gr_intr &= ~gr_intr_notify_pending_f(); } if ((gr_intr & gr_intr_semaphore_pending_f()) != 0U) { g->ops.gr.handle_semaphore_pending(g, &isr_data); gk20a_writel(g, gr_intr_r(), gr_intr_semaphore_reset_f()); gr_intr &= ~gr_intr_semaphore_pending_f(); } if ((gr_intr & gr_intr_semaphore_timeout_pending_f()) != 0U) { if (gk20a_gr_handle_semaphore_timeout_pending(g, &isr_data) != 0) { need_reset = true; } gk20a_writel(g, gr_intr_r(), gr_intr_semaphore_reset_f()); gr_intr &= ~gr_intr_semaphore_pending_f(); } if ((gr_intr & gr_intr_illegal_notify_pending_f()) != 0U) { if (gk20a_gr_intr_illegal_notify_pending(g, &isr_data) != 0) { need_reset = true; } gk20a_writel(g, gr_intr_r(), gr_intr_illegal_notify_reset_f()); gr_intr &= ~gr_intr_illegal_notify_pending_f(); } if ((gr_intr & gr_intr_illegal_method_pending_f()) != 0U) { if (gk20a_gr_handle_illegal_method(g, &isr_data) != 0) { need_reset = true; } gk20a_writel(g, gr_intr_r(), gr_intr_illegal_method_reset_f()); gr_intr &= ~gr_intr_illegal_method_pending_f(); } if ((gr_intr & gr_intr_illegal_class_pending_f()) != 0U) { if (gk20a_gr_handle_illegal_class(g, &isr_data) != 0) { need_reset = true; } gk20a_writel(g, gr_intr_r(), gr_intr_illegal_class_reset_f()); gr_intr &= ~gr_intr_illegal_class_pending_f(); } if ((gr_intr & gr_intr_fecs_error_pending_f()) != 0U) { if (g->ops.gr.handle_fecs_error(g, ch, &isr_data) != 0) { need_reset = true; } gk20a_writel(g, gr_intr_r(), gr_intr_fecs_error_reset_f()); gr_intr &= ~gr_intr_fecs_error_pending_f(); } if ((gr_intr & gr_intr_class_error_pending_f()) != 0U) { if (gk20a_gr_handle_class_error(g, &isr_data) != 0) { need_reset = true; } gk20a_writel(g, gr_intr_r(), gr_intr_class_error_reset_f()); gr_intr &= ~gr_intr_class_error_pending_f(); } /* this one happens if someone tries to hit a non-whitelisted * register using set_falcon[4] */ if ((gr_intr & gr_intr_firmware_method_pending_f()) != 0U) { if (gk20a_gr_handle_firmware_method(g, &isr_data) != 0) { need_reset = true; } nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, "firmware method intr pending\n"); gk20a_writel(g, gr_intr_r(), gr_intr_firmware_method_reset_f()); gr_intr &= ~gr_intr_firmware_method_pending_f(); } if ((gr_intr & gr_intr_exception_pending_f()) != 0U) { u32 exception = gk20a_readl(g, gr_exception_r()); nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, "exception %08x\n", exception); if ((exception & gr_exception_fe_m()) != 0U) { u32 fe = gk20a_readl(g, gr_fe_hww_esr_r()); u32 info = gk20a_readl(g, gr_fe_hww_esr_info_r()); nvgpu_err(g, "fe exception: esr 0x%08x, info 0x%08x", fe, info); gk20a_writel(g, gr_fe_hww_esr_r(), gr_fe_hww_esr_reset_active_f()); need_reset = true; } if ((exception & gr_exception_memfmt_m()) != 0U) { u32 memfmt = gk20a_readl(g, gr_memfmt_hww_esr_r()); nvgpu_err(g, "memfmt exception: esr %08x", memfmt); gk20a_writel(g, gr_memfmt_hww_esr_r(), 
gr_memfmt_hww_esr_reset_active_f()); need_reset = true; } if ((exception & gr_exception_pd_m()) != 0U) { u32 pd = gk20a_readl(g, gr_pd_hww_esr_r()); nvgpu_err(g, "pd exception: esr 0x%08x", pd); gk20a_writel(g, gr_pd_hww_esr_r(), gr_pd_hww_esr_reset_active_f()); need_reset = true; } if ((exception & gr_exception_scc_m()) != 0U) { u32 scc = gk20a_readl(g, gr_scc_hww_esr_r()); nvgpu_err(g, "scc exception: esr 0x%08x", scc); gk20a_writel(g, gr_scc_hww_esr_r(), gr_scc_hww_esr_reset_active_f()); need_reset = true; } if ((exception & gr_exception_ds_m()) != 0U) { u32 ds = gk20a_readl(g, gr_ds_hww_esr_r()); nvgpu_err(g, "ds exception: esr: 0x%08x", ds); gk20a_writel(g, gr_ds_hww_esr_r(), gr_ds_hww_esr_reset_task_f()); need_reset = true; } if ((exception & gr_exception_ssync_m()) != 0U) { if (g->ops.gr.handle_ssync_hww != NULL) { if (g->ops.gr.handle_ssync_hww(g) != 0) { need_reset = true; } } else { nvgpu_err(g, "unhandled ssync exception"); } } if ((exception & gr_exception_mme_m()) != 0U) { u32 mme = gk20a_readl(g, gr_mme_hww_esr_r()); u32 info = gk20a_readl(g, gr_mme_hww_esr_info_r()); nvgpu_err(g, "mme exception: esr 0x%08x info:0x%08x", mme, info); if (g->ops.gr.log_mme_exception != NULL) { g->ops.gr.log_mme_exception(g); } gk20a_writel(g, gr_mme_hww_esr_r(), gr_mme_hww_esr_reset_active_f()); need_reset = true; } if ((exception & gr_exception_sked_m()) != 0U) { u32 sked = gk20a_readl(g, gr_sked_hww_esr_r()); nvgpu_err(g, "sked exception: esr 0x%08x", sked); gk20a_writel(g, gr_sked_hww_esr_r(), gr_sked_hww_esr_reset_active_f()); need_reset = true; } /* check if a gpc exception has occurred */ if (((exception & gr_exception_gpc_m()) != 0U) && !need_reset) { bool post_event = false; nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, "GPC exception pending"); if (tsg != NULL) { fault_ch = isr_data.ch; } /* fault_ch can be NULL */ /* check if any gpc has an exception */ if (gk20a_gr_handle_gpc_exception(g, &post_event, fault_ch, &global_esr) != 0) { need_reset = true; } #ifdef NVGPU_DEBUGGER /* signal clients waiting on an event */ if (g->ops.gr.sm_debugger_attached(g) && post_event && (fault_ch != NULL)) { g->ops.debugger.post_events(fault_ch); } #endif } gk20a_writel(g, gr_intr_r(), gr_intr_exception_reset_f()); gr_intr &= ~gr_intr_exception_pending_f(); if (need_reset) { nvgpu_err(g, "set gr exception notifier"); gk20a_gr_set_error_notifier(g, &isr_data, NVGPU_ERR_NOTIFIER_GR_EXCEPTION); } } if (need_reset) { if (tsg != NULL) { gk20a_fifo_recover(g, gr_engine_id, tsgid, true, true, true, RC_TYPE_GR_FAULT); } else { if (ch != NULL) { nvgpu_err(g, "chid: %d referenceable but not " "bound to tsg", chid); } gk20a_fifo_recover(g, gr_engine_id, 0, false, false, true, RC_TYPE_GR_FAULT); } } if (gr_intr != 0U) { /* clear unhandled interrupts */ if (ch == NULL) { /* * This is probably an interrupt during * gk20a_free_channel() */ nvgpu_err(g, "unhandled gr intr 0x%08x for " "unreferenceable channel, clearing", gr_intr); } else { nvgpu_err(g, "unhandled gr intr 0x%08x for chid: %d", gr_intr, chid); } gk20a_writel(g, gr_intr_r(), gr_intr); } gk20a_writel(g, gr_gpfifo_ctl_r(), grfifo_ctl | gr_gpfifo_ctl_access_f(1) | gr_gpfifo_ctl_semaphore_access_f(1)); /* Posting of BPT events should be the last thing in this function */ if ((global_esr != 0U) && (tsg != NULL) && (need_reset == false)) { gk20a_gr_post_bpt_events(g, tsg, global_esr); } if (ch != NULL) { gk20a_channel_put(ch); } return 0; } u32 gk20a_gr_nonstall_isr(struct gk20a *g) { u32 ops = 0; u32 gr_intr = gk20a_readl(g, gr_intr_nonstall_r()); nvgpu_log(g, 
gpu_dbg_intr, "pgraph nonstall intr %08x", gr_intr); if ((gr_intr & gr_intr_nonstall_trap_pending_f()) != 0U) { /* Clear the interrupt */ gk20a_writel(g, gr_intr_nonstall_r(), gr_intr_nonstall_trap_pending_f()); ops |= (GK20A_NONSTALL_OPS_WAKEUP_SEMAPHORE | GK20A_NONSTALL_OPS_POST_EVENTS); } return ops; } int gr_gk20a_fecs_get_reglist_img_size(struct gk20a *g, u32 *size) { BUG_ON(size == NULL); return gr_gk20a_submit_fecs_method_op(g, (struct fecs_method_op_gk20a) { .mailbox.id = 0U, .mailbox.data = 0U, .mailbox.clr = ~U32(0U), .method.data = 1U, .method.addr = gr_fecs_method_push_adr_discover_reglist_image_size_v(), .mailbox.ret = size, .cond.ok = GR_IS_UCODE_OP_NOT_EQUAL, .mailbox.ok = 0U, .cond.fail = GR_IS_UCODE_OP_SKIP, .mailbox.fail = 0U}, false); } int gr_gk20a_fecs_set_reglist_bind_inst(struct gk20a *g, struct nvgpu_mem *inst_block) { u32 data = fecs_current_ctx_data(g, inst_block); return gr_gk20a_submit_fecs_method_op(g, (struct fecs_method_op_gk20a){ .mailbox.id = 4U, .mailbox.data = data, .mailbox.clr = ~U32(0U), .method.data = 1U, .method.addr = gr_fecs_method_push_adr_set_reglist_bind_instance_v(), .mailbox.ret = NULL, .cond.ok = GR_IS_UCODE_OP_EQUAL, .mailbox.ok = 1U, .cond.fail = GR_IS_UCODE_OP_SKIP, .mailbox.fail = 0U}, false); } int gr_gk20a_fecs_set_reglist_virtual_addr(struct gk20a *g, u64 pmu_va) { return gr_gk20a_submit_fecs_method_op(g, (struct fecs_method_op_gk20a) { .mailbox.id = 4U, .mailbox.data = u64_lo32(pmu_va >> 8), .mailbox.clr = ~U32(0U), .method.data = 1U, .method.addr = gr_fecs_method_push_adr_set_reglist_virtual_address_v(), .mailbox.ret = NULL, .cond.ok = GR_IS_UCODE_OP_EQUAL, .mailbox.ok = 1U, .cond.fail = GR_IS_UCODE_OP_SKIP, .mailbox.fail = 0U}, false); } int gk20a_gr_suspend(struct gk20a *g) { int ret = 0; nvgpu_log_fn(g, " "); ret = g->ops.gr.wait_empty(g); if (ret != 0) { return ret; } gk20a_writel(g, gr_gpfifo_ctl_r(), gr_gpfifo_ctl_access_disabled_f()); /* disable gr intr */ gk20a_writel(g, gr_intr_r(), 0); gk20a_writel(g, gr_intr_en_r(), 0); /* disable all exceptions */ gk20a_writel(g, gr_exception_r(), 0); gk20a_writel(g, gr_exception_en_r(), 0); gk20a_writel(g, gr_exception1_r(), 0); gk20a_writel(g, gr_exception1_en_r(), 0); gk20a_writel(g, gr_exception2_r(), 0); gk20a_writel(g, gr_exception2_en_r(), 0); gk20a_gr_flush_channel_tlb(&g->gr); g->gr.initialized = false; nvgpu_log_fn(g, "done"); return ret; } static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g, u32 addr, bool is_quad, u32 quad, u32 *context_buffer, u32 context_buffer_size, u32 *priv_offset); static int gr_gk20a_find_priv_offset_in_pm_buffer(struct gk20a *g, u32 addr, u32 *priv_offset); /* This function will decode a priv address and return the partition type and numbers. 
*/ int gr_gk20a_decode_priv_addr(struct gk20a *g, u32 addr, enum ctxsw_addr_type *addr_type, u32 *gpc_num, u32 *tpc_num, u32 *ppc_num, u32 *be_num, u32 *broadcast_flags) { u32 gpc_addr; nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr); /* setup defaults */ *addr_type = CTXSW_ADDR_TYPE_SYS; *broadcast_flags = PRI_BROADCAST_FLAGS_NONE; *gpc_num = 0; *tpc_num = 0; *ppc_num = 0; *be_num = 0; if (pri_is_gpc_addr(g, addr)) { *addr_type = CTXSW_ADDR_TYPE_GPC; gpc_addr = pri_gpccs_addr_mask(addr); if (pri_is_gpc_addr_shared(g, addr)) { *addr_type = CTXSW_ADDR_TYPE_GPC; *broadcast_flags |= PRI_BROADCAST_FLAGS_GPC; } else { *gpc_num = pri_get_gpc_num(g, addr); } if (pri_is_ppc_addr(g, gpc_addr)) { *addr_type = CTXSW_ADDR_TYPE_PPC; if (pri_is_ppc_addr_shared(g, gpc_addr)) { *broadcast_flags |= PRI_BROADCAST_FLAGS_PPC; return 0; } } if (g->ops.gr.is_tpc_addr(g, gpc_addr)) { *addr_type = CTXSW_ADDR_TYPE_TPC; if (pri_is_tpc_addr_shared(g, gpc_addr)) { *broadcast_flags |= PRI_BROADCAST_FLAGS_TPC; return 0; } *tpc_num = g->ops.gr.get_tpc_num(g, gpc_addr); } return 0; } else if (pri_is_be_addr(g, addr)) { *addr_type = CTXSW_ADDR_TYPE_BE; if (pri_is_be_addr_shared(g, addr)) { *broadcast_flags |= PRI_BROADCAST_FLAGS_BE; return 0; } *be_num = pri_get_be_num(g, addr); return 0; } else if (g->ops.ltc.pri_is_ltc_addr(g, addr)) { *addr_type = CTXSW_ADDR_TYPE_LTCS; if (g->ops.ltc.is_ltcs_ltss_addr(g, addr)) { *broadcast_flags |= PRI_BROADCAST_FLAGS_LTCS; } else if (g->ops.ltc.is_ltcn_ltss_addr(g, addr)) { *broadcast_flags |= PRI_BROADCAST_FLAGS_LTSS; } return 0; } else if (pri_is_fbpa_addr(g, addr)) { *addr_type = CTXSW_ADDR_TYPE_FBPA; if (pri_is_fbpa_addr_shared(g, addr)) { *broadcast_flags |= PRI_BROADCAST_FLAGS_FBPA; return 0; } return 0; } else if ((g->ops.gr.is_egpc_addr != NULL) && g->ops.gr.is_egpc_addr(g, addr)) { return g->ops.gr.decode_egpc_addr(g, addr, addr_type, gpc_num, tpc_num, broadcast_flags); } else { *addr_type = CTXSW_ADDR_TYPE_SYS; return 0; } /* PPC!?!?!?! */ /*NOTREACHED*/ return -EINVAL; } void gr_gk20a_split_fbpa_broadcast_addr(struct gk20a *g, u32 addr, u32 num_fbpas, u32 *priv_addr_table, u32 *t) { u32 fbpa_id; for (fbpa_id = 0; fbpa_id < num_fbpas; fbpa_id++) { priv_addr_table[(*t)++] = pri_fbpa_addr(g, pri_fbpa_addr_mask(g, addr), fbpa_id); } } int gr_gk20a_split_ppc_broadcast_addr(struct gk20a *g, u32 addr, u32 gpc_num, u32 *priv_addr_table, u32 *t) { u32 ppc_num; nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr); for (ppc_num = 0; ppc_num < nvgpu_gr_config_get_gpc_ppc_count(g->gr.config, gpc_num); ppc_num++) { priv_addr_table[(*t)++] = pri_ppc_addr(g, pri_ppccs_addr_mask(addr), gpc_num, ppc_num); } return 0; } /* * The context buffer is indexed using BE broadcast addresses and GPC/TPC * unicast addresses. This function will convert a BE unicast address to a BE * broadcast address and split a GPC/TPC broadcast address into a table of * GPC/TPC addresses. 
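 * For example, a GPC/TPC broadcast address is expanded into one unicast
 * entry per (GPC, TPC) pair present in the configuration, so a single
 * input address can produce up to gpc_count * tpc_count table entries.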
The addresses generated by this function can be * successfully processed by gr_gk20a_find_priv_offset_in_buffer */ int gr_gk20a_create_priv_addr_table(struct gk20a *g, u32 addr, u32 *priv_addr_table, u32 *num_registers) { enum ctxsw_addr_type addr_type; u32 gpc_num, tpc_num, ppc_num, be_num; u32 priv_addr, gpc_addr; u32 broadcast_flags; u32 t; int err; t = 0; *num_registers = 0; nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr); err = g->ops.gr.decode_priv_addr(g, addr, &addr_type, &gpc_num, &tpc_num, &ppc_num, &be_num, &broadcast_flags); nvgpu_log(g, gpu_dbg_gpu_dbg, "addr_type = %d", addr_type); if (err != 0) { return err; } if ((addr_type == CTXSW_ADDR_TYPE_SYS) || (addr_type == CTXSW_ADDR_TYPE_BE)) { /* The BE broadcast registers are included in the compressed PRI * table. Convert a BE unicast address to a broadcast address * so that we can look up the offset. */ if ((addr_type == CTXSW_ADDR_TYPE_BE) && ((broadcast_flags & PRI_BROADCAST_FLAGS_BE) == 0U)) { priv_addr_table[t++] = pri_be_shared_addr(g, addr); } else { priv_addr_table[t++] = addr; } *num_registers = t; return 0; } /* The GPC/TPC unicast registers are included in the compressed PRI * tables. Convert a GPC/TPC broadcast address to unicast addresses so * that we can look up the offsets. */ if ((broadcast_flags & PRI_BROADCAST_FLAGS_GPC) != 0U) { for (gpc_num = 0; gpc_num < nvgpu_gr_config_get_gpc_count(g->gr.config); gpc_num++) { if ((broadcast_flags & PRI_BROADCAST_FLAGS_TPC) != 0U) { for (tpc_num = 0; tpc_num < nvgpu_gr_config_get_gpc_tpc_count(g->gr.config, gpc_num); tpc_num++) { priv_addr_table[t++] = pri_tpc_addr(g, pri_tpccs_addr_mask(addr), gpc_num, tpc_num); } } else if ((broadcast_flags & PRI_BROADCAST_FLAGS_PPC) != 0U) { err = gr_gk20a_split_ppc_broadcast_addr(g, addr, gpc_num, priv_addr_table, &t); if (err != 0) { return err; } } else { priv_addr = pri_gpc_addr(g, pri_gpccs_addr_mask(addr), gpc_num); gpc_addr = pri_gpccs_addr_mask(priv_addr); tpc_num = g->ops.gr.get_tpc_num(g, gpc_addr); if (tpc_num >= nvgpu_gr_config_get_gpc_tpc_count(g->gr.config, gpc_num)) { continue; } priv_addr_table[t++] = priv_addr; } } } else if (((addr_type == CTXSW_ADDR_TYPE_EGPC) || (addr_type == CTXSW_ADDR_TYPE_ETPC)) && (g->ops.gr.egpc_etpc_priv_addr_table != NULL)) { nvgpu_log(g, gpu_dbg_gpu_dbg, "addr_type : EGPC/ETPC"); g->ops.gr.egpc_etpc_priv_addr_table(g, addr, gpc_num, tpc_num, broadcast_flags, priv_addr_table, &t); } else if ((broadcast_flags & PRI_BROADCAST_FLAGS_LTSS) != 0U) { g->ops.ltc.split_lts_broadcast_addr(g, addr, priv_addr_table, &t); } else if ((broadcast_flags & PRI_BROADCAST_FLAGS_LTCS) != 0U) { g->ops.ltc.split_ltc_broadcast_addr(g, addr, priv_addr_table, &t); } else if ((broadcast_flags & PRI_BROADCAST_FLAGS_FBPA) != 0U) { g->ops.gr.split_fbpa_broadcast_addr(g, addr, nvgpu_get_litter_value(g, GPU_LIT_NUM_FBPAS), priv_addr_table, &t); } else if ((broadcast_flags & PRI_BROADCAST_FLAGS_GPC) == 0U) { if ((broadcast_flags & PRI_BROADCAST_FLAGS_TPC) != 0U) { for (tpc_num = 0; tpc_num < nvgpu_gr_config_get_gpc_tpc_count(g->gr.config, gpc_num); tpc_num++) { priv_addr_table[t++] = pri_tpc_addr(g, pri_tpccs_addr_mask(addr), gpc_num, tpc_num); } } else if ((broadcast_flags & PRI_BROADCAST_FLAGS_PPC) != 0U) { err = gr_gk20a_split_ppc_broadcast_addr(g, addr, gpc_num, priv_addr_table, &t); } else { priv_addr_table[t++] = addr; } } *num_registers = t; return 0; } int gr_gk20a_get_ctx_buffer_offsets(struct gk20a *g, u32 addr, u32 max_offsets, u32 *offsets, u32 *offset_addrs, u32 *num_offsets, bool is_quad, u32 quad) 
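/*
 * Translate a single pri register address into the byte offset(s) at
 * which it appears in the golden context image. The address is first
 * expanded into unicast addresses via create_priv_addr_table(), then
 * each expanded address is located in the golden image. max_offsets
 * bounds the caller-provided offsets/offset_addrs arrays.
 */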
{ u32 i; u32 priv_offset = 0; u32 *priv_registers; u32 num_registers = 0; int err = 0; struct gr_gk20a *gr = &g->gr; u32 sm_per_tpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_SM_PER_TPC); u32 potential_offsets = nvgpu_gr_config_get_max_gpc_count(gr->config) * nvgpu_gr_config_get_max_tpc_per_gpc_count(gr->config) * sm_per_tpc; nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr); /* implementation is crossed-up if either of these happen */ if (max_offsets > potential_offsets) { nvgpu_log_fn(g, "max_offsets > potential_offsets"); return -EINVAL; } if (!g->gr.ctx_vars.golden_image_initialized) { return -ENODEV; } priv_registers = nvgpu_kzalloc(g, sizeof(u32) * potential_offsets); if (priv_registers == NULL) { nvgpu_log_fn(g, "failed alloc for potential_offsets=%d", potential_offsets); err = -ENOMEM; goto cleanup; } (void) memset(offsets, 0, sizeof(u32) * max_offsets); (void) memset(offset_addrs, 0, sizeof(u32) * max_offsets); *num_offsets = 0; g->ops.gr.create_priv_addr_table(g, addr, &priv_registers[0], &num_registers); if ((max_offsets > 1U) && (num_registers > max_offsets)) { nvgpu_log_fn(g, "max_offsets = %d, num_registers = %d", max_offsets, num_registers); err = -EINVAL; goto cleanup; } if ((max_offsets == 1U) && (num_registers > 1U)) { num_registers = 1; } if (!g->gr.ctx_vars.golden_image_initialized) { nvgpu_log_fn(g, "no context switch header info to work with"); err = -EINVAL; goto cleanup; } for (i = 0; i < num_registers; i++) { err = gr_gk20a_find_priv_offset_in_buffer(g, priv_registers[i], is_quad, quad, nvgpu_gr_global_ctx_get_local_golden_image_ptr( g->gr.local_golden_image), g->gr.ctx_vars.golden_image_size, &priv_offset); if (err != 0) { nvgpu_log_fn(g, "Could not determine priv_offset for addr:0x%x", addr); /*, grPriRegStr(addr)));*/ goto cleanup; } offsets[i] = priv_offset; offset_addrs[i] = priv_registers[i]; } *num_offsets = num_registers; cleanup: if (!IS_ERR_OR_NULL(priv_registers)) { nvgpu_kfree(g, priv_registers); } return err; } int gr_gk20a_get_pm_ctx_buffer_offsets(struct gk20a *g, u32 addr, u32 max_offsets, u32 *offsets, u32 *offset_addrs, u32 *num_offsets) { u32 i; u32 priv_offset = 0; u32 *priv_registers; u32 num_registers = 0; int err = 0; struct gr_gk20a *gr = &g->gr; u32 sm_per_tpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_SM_PER_TPC); u32 potential_offsets = nvgpu_gr_config_get_max_gpc_count(gr->config) * nvgpu_gr_config_get_max_tpc_per_gpc_count(gr->config) * sm_per_tpc; nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr); /* implementation is crossed-up if either of these happen */ if (max_offsets > potential_offsets) { return -EINVAL; } if (!g->gr.ctx_vars.golden_image_initialized) { return -ENODEV; } priv_registers = nvgpu_kzalloc(g, sizeof(u32) * potential_offsets); if (priv_registers == NULL) { nvgpu_log_fn(g, "failed alloc for potential_offsets=%d", potential_offsets); return -ENOMEM; } (void) memset(offsets, 0, sizeof(u32) * max_offsets); (void) memset(offset_addrs, 0, sizeof(u32) * max_offsets); *num_offsets = 0; g->ops.gr.create_priv_addr_table(g, addr, priv_registers, &num_registers); if ((max_offsets > 1U) && (num_registers > max_offsets)) { err = -EINVAL; goto cleanup; } if ((max_offsets == 1U) && (num_registers > 1U)) { num_registers = 1; } if (!g->gr.ctx_vars.golden_image_initialized) { nvgpu_log_fn(g, "no context switch header info to work with"); err = -EINVAL; goto cleanup; } for (i = 0; i < num_registers; i++) { err = gr_gk20a_find_priv_offset_in_pm_buffer(g, priv_registers[i], &priv_offset); if (err != 0) { nvgpu_log_fn(g, 
"Could not determine priv_offset for addr:0x%x", addr); /*, grPriRegStr(addr)));*/ goto cleanup; } offsets[i] = priv_offset; offset_addrs[i] = priv_registers[i]; } *num_offsets = num_registers; cleanup: nvgpu_kfree(g, priv_registers); return err; } /* Setup some register tables. This looks hacky; our * register/offset functions are just that, functions. * So they can't be used as initializers... TBD: fix to * generate consts at least on an as-needed basis. */ static const u32 _num_ovr_perf_regs = 17; static u32 _ovr_perf_regs[17] = { 0, }; /* Following are the blocks of registers that the ucode stores in the extended region.*/ void gk20a_gr_init_ovr_sm_dsm_perf(void) { if (_ovr_perf_regs[0] != 0U) { return; } _ovr_perf_regs[0] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control_sel0_r(); _ovr_perf_regs[1] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control_sel1_r(); _ovr_perf_regs[2] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control0_r(); _ovr_perf_regs[3] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control5_r(); _ovr_perf_regs[4] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_status1_r(); _ovr_perf_regs[5] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter0_control_r(); _ovr_perf_regs[6] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter1_control_r(); _ovr_perf_regs[7] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter2_control_r(); _ovr_perf_regs[8] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter3_control_r(); _ovr_perf_regs[9] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter4_control_r(); _ovr_perf_regs[10] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter5_control_r(); _ovr_perf_regs[11] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter6_control_r(); _ovr_perf_regs[12] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter7_control_r(); _ovr_perf_regs[13] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter4_r(); _ovr_perf_regs[14] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter5_r(); _ovr_perf_regs[15] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter6_r(); _ovr_perf_regs[16] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter7_r(); } /* TBD: would like to handle this elsewhere, at a higher level. * these are currently constructed in a "test-then-write" style * which makes it impossible to know externally whether a ctx * write will actually occur. 
so later we should put a lazy, * map-and-hold system in the patch write state */ static int gr_gk20a_ctx_patch_smpc(struct gk20a *g, struct channel_gk20a *ch, u32 addr, u32 data, struct nvgpu_gr_ctx *gr_ctx) { u32 num_gpc = nvgpu_gr_config_get_gpc_count(g->gr.config); u32 num_tpc; u32 tpc, gpc, reg; u32 chk_addr; u32 num_ovr_perf_regs = 0; u32 *ovr_perf_regs = NULL; u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE); u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE); g->ops.gr.init_ovr_sm_dsm_perf(); g->ops.gr.init_sm_dsm_reg_info(); g->ops.gr.get_ovr_perf_regs(g, &num_ovr_perf_regs, &ovr_perf_regs); nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr); for (reg = 0; reg < num_ovr_perf_regs; reg++) { for (gpc = 0; gpc < num_gpc; gpc++) { num_tpc = nvgpu_gr_config_get_gpc_tpc_count(g->gr.config, gpc); for (tpc = 0; tpc < num_tpc; tpc++) { chk_addr = ((gpc_stride * gpc) + (tpc_in_gpc_stride * tpc) + ovr_perf_regs[reg]); if (chk_addr != addr) { continue; } /* reset the patch count from previous runs,if ucode has already processed it */ nvgpu_gr_ctx_reset_patch_count(g, gr_ctx); nvgpu_gr_ctx_patch_write(g, gr_ctx, addr, data, true); if (ch->subctx != NULL) { nvgpu_gr_ctx_set_patch_ctx(g, gr_ctx, false); nvgpu_gr_subctx_set_patch_ctx(g, ch->subctx, gr_ctx); } else { nvgpu_gr_ctx_set_patch_ctx(g, gr_ctx, true); } /* we're not caching these on cpu side, but later watch for it */ return 0; } } } return 0; } #define ILLEGAL_ID ~U32(0U) void gk20a_gr_get_ovr_perf_regs(struct gk20a *g, u32 *num_ovr_perf_regs, u32 **ovr_perf_regs) { *num_ovr_perf_regs = _num_ovr_perf_regs; *ovr_perf_regs = _ovr_perf_regs; } static int gr_gk20a_find_priv_offset_in_ext_buffer(struct gk20a *g, u32 addr, bool is_quad, u32 quad, u32 *context_buffer, u32 context_buffer_size, u32 *priv_offset) { u32 i; u32 gpc_num, tpc_num; u32 num_gpcs; u32 chk_addr; u32 ext_priv_offset, ext_priv_size; u8 *context; u32 offset_to_segment, offset_to_segment_end; u32 sm_dsm_perf_reg_id = ILLEGAL_ID; u32 sm_dsm_perf_ctrl_reg_id = ILLEGAL_ID; u32 num_ext_gpccs_ext_buffer_segments; u32 inter_seg_offset; u32 max_tpc_count; u32 *sm_dsm_perf_ctrl_regs = NULL; u32 num_sm_dsm_perf_ctrl_regs = 0; u32 *sm_dsm_perf_regs = NULL; u32 num_sm_dsm_perf_regs = 0; u32 buffer_segments_size = 0; u32 marker_size = 0; u32 control_register_stride = 0; u32 perf_register_stride = 0; struct gr_gk20a *gr = &g->gr; u32 gpc_base = nvgpu_get_litter_value(g, GPU_LIT_GPC_BASE); u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE); u32 tpc_in_gpc_base = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_BASE); u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE); u32 tpc_gpc_mask = (tpc_in_gpc_stride - 1U); /* Only have TPC registers in extended region, so if not a TPC reg, then return error so caller can look elsewhere. 
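 * The extended region is organized as fixed-size (256B) segments: one
 * segment of FECS data first, then ceil(max_tpc_per_gpc / 2) segments
 * per GPC. Each GPC block begins with a head marker and is followed by
 * the per-TPC perf/control registers at a fixed stride, which is what
 * the offset arithmetic below walks through.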
*/ if (pri_is_gpc_addr(g, addr)) { u32 gpc_addr = 0; gpc_num = pri_get_gpc_num(g, addr); gpc_addr = pri_gpccs_addr_mask(addr); if (g->ops.gr.is_tpc_addr(g, gpc_addr)) { tpc_num = g->ops.gr.get_tpc_num(g, gpc_addr); } else { return -EINVAL; } nvgpu_log_info(g, " gpc = %d tpc = %d", gpc_num, tpc_num); } else if ((g->ops.gr.is_etpc_addr != NULL) && g->ops.gr.is_etpc_addr(g, addr)) { g->ops.gr.get_egpc_etpc_num(g, addr, &gpc_num, &tpc_num); gpc_base = g->ops.gr.get_egpc_base(g); } else { nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "does not exist in extended region"); return -EINVAL; } buffer_segments_size = g->ops.gr.ctxsw_prog.hw_get_extended_buffer_segments_size_in_bytes(); /* note below is in words/num_registers */ marker_size = g->ops.gr.ctxsw_prog.hw_extended_marker_size_in_bytes() >> 2; context = (u8 *)context_buffer; /* sanity check main header */ if (!g->ops.gr.ctxsw_prog.check_main_image_header_magic(context)) { nvgpu_err(g, "Invalid main header: magic value"); return -EINVAL; } num_gpcs = g->ops.gr.ctxsw_prog.get_num_gpcs(context); if (gpc_num >= num_gpcs) { nvgpu_err(g, "GPC 0x%08x is greater than total count 0x%08x!", gpc_num, num_gpcs); return -EINVAL; } g->ops.gr.ctxsw_prog.get_extended_buffer_size_offset(context, &ext_priv_size, &ext_priv_offset); if (0U == ext_priv_size) { nvgpu_log_info(g, " No extended memory in context buffer"); return -EINVAL; } offset_to_segment = ext_priv_offset * 256U; offset_to_segment_end = offset_to_segment + (ext_priv_size * buffer_segments_size); /* check local header magic */ context += g->ops.gr.ctxsw_prog.hw_get_fecs_header_size(); if (!g->ops.gr.ctxsw_prog.check_local_header_magic(context)) { nvgpu_err(g, "Invalid local header: magic value"); return -EINVAL; } /* * See if the incoming register address is in the first table of * registers. We check this by decoding only the TPC addr portion. * If we get a hit on the TPC bit, we then double check the address * by computing it from the base gpc/tpc strides. Then make sure * it is a real match. */ g->ops.gr.get_sm_dsm_perf_regs(g, &num_sm_dsm_perf_regs, &sm_dsm_perf_regs, &perf_register_stride); g->ops.gr.init_sm_dsm_reg_info(); for (i = 0; i < num_sm_dsm_perf_regs; i++) { if ((addr & tpc_gpc_mask) == (sm_dsm_perf_regs[i] & tpc_gpc_mask)) { sm_dsm_perf_reg_id = i; nvgpu_log_info(g, "register match: 0x%08x", sm_dsm_perf_regs[i]); chk_addr = (gpc_base + gpc_stride * gpc_num) + tpc_in_gpc_base + (tpc_in_gpc_stride * tpc_num) + (sm_dsm_perf_regs[sm_dsm_perf_reg_id] & tpc_gpc_mask); if (chk_addr != addr) { nvgpu_err(g, "Oops addr miss-match! : 0x%08x != 0x%08x", addr, chk_addr); return -EINVAL; } break; } } /* Didn't find reg in supported group 1. * so try the second group now */ g->ops.gr.get_sm_dsm_perf_ctrl_regs(g, &num_sm_dsm_perf_ctrl_regs, &sm_dsm_perf_ctrl_regs, &control_register_stride); if (ILLEGAL_ID == sm_dsm_perf_reg_id) { for (i = 0; i < num_sm_dsm_perf_ctrl_regs; i++) { if ((addr & tpc_gpc_mask) == (sm_dsm_perf_ctrl_regs[i] & tpc_gpc_mask)) { sm_dsm_perf_ctrl_reg_id = i; nvgpu_log_info(g, "register match: 0x%08x", sm_dsm_perf_ctrl_regs[i]); chk_addr = (gpc_base + gpc_stride * gpc_num) + tpc_in_gpc_base + tpc_in_gpc_stride * tpc_num + (sm_dsm_perf_ctrl_regs[sm_dsm_perf_ctrl_reg_id] & tpc_gpc_mask); if (chk_addr != addr) { nvgpu_err(g, "Oops addr miss-match! : 0x%08x != 0x%08x", addr, chk_addr); return -EINVAL; } break; } } } if ((ILLEGAL_ID == sm_dsm_perf_ctrl_reg_id) && (ILLEGAL_ID == sm_dsm_perf_reg_id)) { return -EINVAL; } /* Skip the FECS extended header, nothing there for us now. 
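 * With the register identified above, the offset computed below works
 * out to roughly:
 *   ext_priv_offset * 256
 *     + buffer_segments_size * (1 + gpc_num * ((max_tpc_count + 1) / 2))
 *     + (marker_size + tpc_num * control_register_stride + reg_id) * 4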
*/ offset_to_segment += buffer_segments_size; /* skip through the GPCCS extended headers until we get to the data for * our GPC. The size of each gpc extended segment is enough to hold the * max tpc count for the gpcs,in 256b chunks. */ max_tpc_count = nvgpu_gr_config_get_max_tpc_per_gpc_count(gr->config); num_ext_gpccs_ext_buffer_segments = (u32)((max_tpc_count + 1U) / 2U); offset_to_segment += (num_ext_gpccs_ext_buffer_segments * buffer_segments_size * gpc_num); /* skip the head marker to start with */ inter_seg_offset = marker_size; if (ILLEGAL_ID != sm_dsm_perf_ctrl_reg_id) { /* skip over control regs of TPC's before the one we want. * then skip to the register in this tpc */ inter_seg_offset = inter_seg_offset + (tpc_num * control_register_stride) + sm_dsm_perf_ctrl_reg_id; } else { return -EINVAL; } /* set the offset to the segment offset plus the inter segment offset to * our register */ offset_to_segment += (inter_seg_offset * 4U); /* last sanity check: did we somehow compute an offset outside the * extended buffer? */ if (offset_to_segment > offset_to_segment_end) { nvgpu_err(g, "Overflow ctxsw buffer! 0x%08x > 0x%08x", offset_to_segment, offset_to_segment_end); return -EINVAL; } *priv_offset = offset_to_segment; return 0; } static int gr_gk20a_process_context_buffer_priv_segment(struct gk20a *g, enum ctxsw_addr_type addr_type, u32 pri_addr, u32 gpc_num, u32 num_tpcs, u32 num_ppcs, u32 ppc_mask, u32 *priv_offset) { u32 i; u32 address, base_address; u32 sys_offset, gpc_offset, tpc_offset, ppc_offset; u32 ppc_num, tpc_num, tpc_addr, gpc_addr, ppc_addr; struct netlist_aiv *reg; u32 gpc_base = nvgpu_get_litter_value(g, GPU_LIT_GPC_BASE); u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE); u32 ppc_in_gpc_base = nvgpu_get_litter_value(g, GPU_LIT_PPC_IN_GPC_BASE); u32 ppc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_PPC_IN_GPC_STRIDE); u32 tpc_in_gpc_base = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_BASE); u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE); nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "pri_addr=0x%x", pri_addr); if (!g->netlist_valid) { return -EINVAL; } /* Process the SYS/BE segment. */ if ((addr_type == CTXSW_ADDR_TYPE_SYS) || (addr_type == CTXSW_ADDR_TYPE_BE)) { for (i = 0; i < g->netlist_vars->ctxsw_regs.sys.count; i++) { reg = &g->netlist_vars->ctxsw_regs.sys.l[i]; address = reg->addr; sys_offset = reg->index; if (pri_addr == address) { *priv_offset = sys_offset; return 0; } } } /* Process the TPC segment. */ if (addr_type == CTXSW_ADDR_TYPE_TPC) { for (tpc_num = 0; tpc_num < num_tpcs; tpc_num++) { for (i = 0; i < g->netlist_vars->ctxsw_regs.tpc.count; i++) { reg = &g->netlist_vars->ctxsw_regs.tpc.l[i]; address = reg->addr; tpc_addr = pri_tpccs_addr_mask(address); base_address = gpc_base + (gpc_num * gpc_stride) + tpc_in_gpc_base + (tpc_num * tpc_in_gpc_stride); address = base_address + tpc_addr; /* * The data for the TPCs is interleaved in the context buffer. * Example with num_tpcs = 2 * 0 1 2 3 4 5 6 7 8 9 10 11 ... * 0-0 1-0 0-1 1-1 0-2 1-2 0-3 1-3 0-4 1-4 0-5 1-5 ... 
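 * i.e. the byte offset computed below is
 * (reg->index * num_tpcs) + (tpc_num * 4).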
*/ tpc_offset = (reg->index * num_tpcs) + (tpc_num * 4U); if (pri_addr == address) { *priv_offset = tpc_offset; return 0; } } } } else if ((addr_type == CTXSW_ADDR_TYPE_EGPC) || (addr_type == CTXSW_ADDR_TYPE_ETPC)) { if (g->ops.gr.get_egpc_base == NULL) { return -EINVAL; } for (tpc_num = 0; tpc_num < num_tpcs; tpc_num++) { for (i = 0; i < g->netlist_vars->ctxsw_regs.etpc.count; i++) { reg = &g->netlist_vars->ctxsw_regs.etpc.l[i]; address = reg->addr; tpc_addr = pri_tpccs_addr_mask(address); base_address = g->ops.gr.get_egpc_base(g) + (gpc_num * gpc_stride) + tpc_in_gpc_base + (tpc_num * tpc_in_gpc_stride); address = base_address + tpc_addr; /* * The data for the TPCs is interleaved in the context buffer. * Example with num_tpcs = 2 * 0 1 2 3 4 5 6 7 8 9 10 11 ... * 0-0 1-0 0-1 1-1 0-2 1-2 0-3 1-3 0-4 1-4 0-5 1-5 ... */ tpc_offset = (reg->index * num_tpcs) + (tpc_num * 4U); if (pri_addr == address) { *priv_offset = tpc_offset; nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "egpc/etpc priv_offset=0x%#08x", *priv_offset); return 0; } } } } /* Process the PPC segment. */ if (addr_type == CTXSW_ADDR_TYPE_PPC) { for (ppc_num = 0; ppc_num < num_ppcs; ppc_num++) { for (i = 0; i < g->netlist_vars->ctxsw_regs.ppc.count; i++) { reg = &g->netlist_vars->ctxsw_regs.ppc.l[i]; address = reg->addr; ppc_addr = pri_ppccs_addr_mask(address); base_address = gpc_base + (gpc_num * gpc_stride) + ppc_in_gpc_base + (ppc_num * ppc_in_gpc_stride); address = base_address + ppc_addr; /* * The data for the PPCs is interleaved in the context buffer. * Example with numPpcs = 2 * 0 1 2 3 4 5 6 7 8 9 10 11 ... * 0-0 1-0 0-1 1-1 0-2 1-2 0-3 1-3 0-4 1-4 0-5 1-5 ... */ ppc_offset = (reg->index * num_ppcs) + (ppc_num * 4U); if (pri_addr == address) { *priv_offset = ppc_offset; return 0; } } } } /* Process the GPC segment. */ if (addr_type == CTXSW_ADDR_TYPE_GPC) { for (i = 0; i < g->netlist_vars->ctxsw_regs.gpc.count; i++) { reg = &g->netlist_vars->ctxsw_regs.gpc.l[i]; address = reg->addr; gpc_addr = pri_gpccs_addr_mask(address); gpc_offset = reg->index; base_address = gpc_base + (gpc_num * gpc_stride); address = base_address + gpc_addr; if (pri_addr == address) { *priv_offset = gpc_offset; return 0; } } } return -EINVAL; } static int gr_gk20a_determine_ppc_configuration(struct gk20a *g, u8 *context, u32 *num_ppcs, u32 *ppc_mask, u32 *reg_ppc_count) { u32 num_pes_per_gpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_PES_PER_GPC); /* * if there is only 1 PES_PER_GPC, then we put the PES registers * in the GPC reglist, so we can't error out if ppc.count == 0 */ if ((!g->netlist_valid) || ((g->netlist_vars->ctxsw_regs.ppc.count == 0U) && (num_pes_per_gpc > 1U))) { return -EINVAL; } g->ops.gr.ctxsw_prog.get_ppc_info(context, num_ppcs, ppc_mask); *reg_ppc_count = g->netlist_vars->ctxsw_regs.ppc.count; return 0; } int gr_gk20a_get_offset_in_gpccs_segment(struct gk20a *g, enum ctxsw_addr_type addr_type, u32 num_tpcs, u32 num_ppcs, u32 reg_list_ppc_count, u32 *__offset_in_segment) { u32 offset_in_segment = 0; if (addr_type == CTXSW_ADDR_TYPE_TPC) { /* * reg = g->netlist_vars->ctxsw_regs.tpc.l; * offset_in_segment = 0; */ } else if ((addr_type == CTXSW_ADDR_TYPE_EGPC) || (addr_type == CTXSW_ADDR_TYPE_ETPC)) { offset_in_segment = ((g->netlist_vars->ctxsw_regs.tpc.count * num_tpcs) << 2); nvgpu_log(g, gpu_dbg_info | gpu_dbg_gpu_dbg, "egpc etpc offset_in_segment 0x%#08x", offset_in_segment); } else if (addr_type == CTXSW_ADDR_TYPE_PPC) { /* * The ucode stores TPC data before PPC data. * Advance offset past TPC data to PPC data. 
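 * i.e. skip (tpc.count + etpc.count) * num_tpcs registers of 4 bytes
 * each before the PPC data begins.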
*/ offset_in_segment = (((g->netlist_vars->ctxsw_regs.tpc.count + g->netlist_vars->ctxsw_regs.etpc.count) * num_tpcs) << 2); } else if (addr_type == CTXSW_ADDR_TYPE_GPC) { /* * The ucode stores TPC/PPC data before GPC data. * Advance offset past TPC/PPC data to GPC data. * * Note 1 PES_PER_GPC case */ u32 num_pes_per_gpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_PES_PER_GPC); if (num_pes_per_gpc > 1U) { offset_in_segment = ((((g->netlist_vars->ctxsw_regs.tpc.count + g->netlist_vars->ctxsw_regs.etpc.count) * num_tpcs) << 2) + ((reg_list_ppc_count * num_ppcs) << 2)); } else { offset_in_segment = (((g->netlist_vars->ctxsw_regs.tpc.count + g->netlist_vars->ctxsw_regs.etpc.count) * num_tpcs) << 2); } } else { nvgpu_log_fn(g, "Unknown address type."); return -EINVAL; } *__offset_in_segment = offset_in_segment; return 0; } /* * This function will return the 32 bit offset for a priv register if it is * present in the context buffer. The context buffer is in CPU memory. */ static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g, u32 addr, bool is_quad, u32 quad, u32 *context_buffer, u32 context_buffer_size, u32 *priv_offset) { u32 i; int err; enum ctxsw_addr_type addr_type; u32 broadcast_flags; u32 gpc_num, tpc_num, ppc_num, be_num; u32 num_gpcs, num_tpcs, num_ppcs; u32 offset; u32 sys_priv_offset, gpc_priv_offset; u32 ppc_mask, reg_list_ppc_count; u8 *context; u32 offset_to_segment, offset_in_segment = 0; nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr); err = g->ops.gr.decode_priv_addr(g, addr, &addr_type, &gpc_num, &tpc_num, &ppc_num, &be_num, &broadcast_flags); nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr_type = %d, broadcast_flags: %08x", addr_type, broadcast_flags); if (err != 0) { return err; } context = (u8 *)context_buffer; if (!g->ops.gr.ctxsw_prog.check_main_image_header_magic(context)) { nvgpu_err(g, "Invalid main header: magic value"); return -EINVAL; } num_gpcs = g->ops.gr.ctxsw_prog.get_num_gpcs(context); /* Parse the FECS local header. */ context += g->ops.gr.ctxsw_prog.hw_get_fecs_header_size(); if (!g->ops.gr.ctxsw_prog.check_local_header_magic(context)) { nvgpu_err(g, "Invalid FECS local header: magic value"); return -EINVAL; } sys_priv_offset = g->ops.gr.ctxsw_prog.get_local_priv_register_ctl_offset(context); nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "sys_priv_offset=0x%x", sys_priv_offset); /* If found in Ext buffer, ok. * If it failed and we expected to find it there (quad offset) * then return the error. Otherwise continue on. */ err = gr_gk20a_find_priv_offset_in_ext_buffer(g, addr, is_quad, quad, context_buffer, context_buffer_size, priv_offset); if ((err == 0) || ((err != 0) && is_quad)) { nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "err = %d, is_quad = %s", err, is_quad ? "true" : "false"); return err; } if ((addr_type == CTXSW_ADDR_TYPE_SYS) || (addr_type == CTXSW_ADDR_TYPE_BE)) { /* Find the offset in the FECS segment. 
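 * The FECS segment starts sys_priv_offset * 256 bytes into the context
 * image; the register's position inside the segment comes from the
 * compressed SYS/BE register list.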
*/ offset_to_segment = sys_priv_offset * 256U; err = gr_gk20a_process_context_buffer_priv_segment(g, addr_type, addr, 0, 0, 0, 0, &offset); if (err != 0) { return err; } *priv_offset = (offset_to_segment + offset); return 0; } if ((gpc_num + 1U) > num_gpcs) { nvgpu_err(g, "GPC %d not in this context buffer.", gpc_num); return -EINVAL; } /* Parse the GPCCS local header(s).*/ for (i = 0; i < num_gpcs; i++) { context += g->ops.gr.ctxsw_prog.hw_get_gpccs_header_size(); if (!g->ops.gr.ctxsw_prog.check_local_header_magic(context)) { nvgpu_err(g, "Invalid GPCCS local header: magic value"); return -EINVAL; } gpc_priv_offset = g->ops.gr.ctxsw_prog.get_local_priv_register_ctl_offset(context); err = gr_gk20a_determine_ppc_configuration(g, context, &num_ppcs, &ppc_mask, ®_list_ppc_count); if (err != 0) { nvgpu_err(g, "determine ppc configuration failed"); return err; } num_tpcs = g->ops.gr.ctxsw_prog.get_num_tpcs(context); if ((i == gpc_num) && ((tpc_num + 1U) > num_tpcs)) { nvgpu_err(g, "GPC %d TPC %d not in this context buffer.", gpc_num, tpc_num); return -EINVAL; } /* Find the offset in the GPCCS segment.*/ if (i == gpc_num) { nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "gpc_priv_offset 0x%#08x", gpc_priv_offset); offset_to_segment = gpc_priv_offset * 256U; err = g->ops.gr.get_offset_in_gpccs_segment(g, addr_type, num_tpcs, num_ppcs, reg_list_ppc_count, &offset_in_segment); if (err != 0) { return -EINVAL; } offset_to_segment += offset_in_segment; nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "offset_to_segment 0x%#08x", offset_to_segment); err = gr_gk20a_process_context_buffer_priv_segment(g, addr_type, addr, i, num_tpcs, num_ppcs, ppc_mask, &offset); if (err != 0) { return -EINVAL; } *priv_offset = offset_to_segment + offset; return 0; } } return -EINVAL; } static int map_cmp(const void *a, const void *b) { const struct ctxsw_buf_offset_map_entry *e1; const struct ctxsw_buf_offset_map_entry *e2; e1 = (const struct ctxsw_buf_offset_map_entry *)a; e2 = (const struct ctxsw_buf_offset_map_entry *)b; if (e1->addr < e2->addr) { return -1; } if (e1->addr > e2->addr) { return 1; } return 0; } static int add_ctxsw_buffer_map_entries_pmsys(struct ctxsw_buf_offset_map_entry *map, struct netlist_aiv_list *regs, u32 *count, u32 *offset, u32 max_cnt, u32 base, u32 mask) { u32 idx; u32 cnt = *count; u32 off = *offset; if ((cnt + regs->count) > max_cnt) { return -EINVAL; } for (idx = 0; idx < regs->count; idx++) { if ((base + (regs->l[idx].addr & mask)) < 0xFFFU) { map[cnt].addr = base + (regs->l[idx].addr & mask) + NV_PCFG_BASE; } else { map[cnt].addr = base + (regs->l[idx].addr & mask); } map[cnt++].offset = off; off += 4U; } *count = cnt; *offset = off; return 0; } static int add_ctxsw_buffer_map_entries_pmgpc(struct gk20a *g, struct ctxsw_buf_offset_map_entry *map, struct netlist_aiv_list *regs, u32 *count, u32 *offset, u32 max_cnt, u32 base, u32 mask) { u32 idx; u32 cnt = *count; u32 off = *offset; if ((cnt + regs->count) > max_cnt) { return -EINVAL; } /* NOTE: The PPC offsets get added to the pm_gpc list if numPpc <= 1 * To handle the case of PPC registers getting added into GPC, the below * code specifically checks for any PPC offsets and adds them using * proper mask */ for (idx = 0; idx < regs->count; idx++) { /* Check if the address is PPC address */ if (pri_is_ppc_addr_shared(g, regs->l[idx].addr & mask)) { u32 ppc_in_gpc_base = nvgpu_get_litter_value(g, GPU_LIT_PPC_IN_GPC_BASE); u32 ppc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_PPC_IN_GPC_STRIDE); /* Use PPC mask instead of the GPC mask provided 
*/ u32 ppcmask = ppc_in_gpc_stride - 1U; map[cnt].addr = base + ppc_in_gpc_base + (regs->l[idx].addr & ppcmask); } else { map[cnt].addr = base + (regs->l[idx].addr & mask); } map[cnt++].offset = off; off += 4U; } *count = cnt; *offset = off; return 0; } static int add_ctxsw_buffer_map_entries(struct ctxsw_buf_offset_map_entry *map, struct netlist_aiv_list *regs, u32 *count, u32 *offset, u32 max_cnt, u32 base, u32 mask) { u32 idx; u32 cnt = *count; u32 off = *offset; if ((cnt + regs->count) > max_cnt) { return -EINVAL; } for (idx = 0; idx < regs->count; idx++) { map[cnt].addr = base + (regs->l[idx].addr & mask); map[cnt++].offset = off; off += 4U; } *count = cnt; *offset = off; return 0; } /* Helper function to add register entries to the register map for all * subunits */ static int add_ctxsw_buffer_map_entries_subunits( struct ctxsw_buf_offset_map_entry *map, struct netlist_aiv_list *regs, u32 *count, u32 *offset, u32 max_cnt, u32 base, u32 num_units, u32 stride, u32 mask) { u32 unit; u32 idx; u32 cnt = *count; u32 off = *offset; if ((cnt + (regs->count * num_units)) > max_cnt) { return -EINVAL; } /* Data is interleaved for units in ctxsw buffer */ for (idx = 0; idx < regs->count; idx++) { for (unit = 0; unit < num_units; unit++) { map[cnt].addr = base + (regs->l[idx].addr & mask) + (unit * stride); map[cnt++].offset = off; off += 4U; } } *count = cnt; *offset = off; return 0; } int gr_gk20a_add_ctxsw_reg_pm_fbpa(struct gk20a *g, struct ctxsw_buf_offset_map_entry *map, struct netlist_aiv_list *regs, u32 *count, u32 *offset, u32 max_cnt, u32 base, u32 num_fbpas, u32 stride, u32 mask) { return add_ctxsw_buffer_map_entries_subunits(map, regs, count, offset, max_cnt, base, num_fbpas, stride, mask); } static int add_ctxsw_buffer_map_entries_gpcs(struct gk20a *g, struct ctxsw_buf_offset_map_entry *map, u32 *count, u32 *offset, u32 max_cnt) { u32 num_gpcs = nvgpu_gr_config_get_gpc_count(g->gr.config); u32 num_ppcs, num_tpcs, gpc_num, base; u32 gpc_base = nvgpu_get_litter_value(g, GPU_LIT_GPC_BASE); u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE); u32 ppc_in_gpc_base = nvgpu_get_litter_value(g, GPU_LIT_PPC_IN_GPC_BASE); u32 ppc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_PPC_IN_GPC_STRIDE); u32 tpc_in_gpc_base = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_BASE); u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE); for (gpc_num = 0; gpc_num < num_gpcs; gpc_num++) { num_tpcs = nvgpu_gr_config_get_gpc_tpc_count(g->gr.config, gpc_num); base = gpc_base + (gpc_stride * gpc_num) + tpc_in_gpc_base; if (add_ctxsw_buffer_map_entries_subunits(map, &g->netlist_vars->ctxsw_regs.pm_tpc, count, offset, max_cnt, base, num_tpcs, tpc_in_gpc_stride, (tpc_in_gpc_stride - 1U)) != 0) { return -EINVAL; } num_ppcs = nvgpu_gr_config_get_gpc_ppc_count(g->gr.config, gpc_num); base = gpc_base + (gpc_stride * gpc_num) + ppc_in_gpc_base; if (add_ctxsw_buffer_map_entries_subunits(map, &g->netlist_vars->ctxsw_regs.pm_ppc, count, offset, max_cnt, base, num_ppcs, ppc_in_gpc_stride, (ppc_in_gpc_stride - 1U)) != 0) { return -EINVAL; } base = gpc_base + (gpc_stride * gpc_num); if (add_ctxsw_buffer_map_entries_pmgpc(g, map, &g->netlist_vars->ctxsw_regs.pm_gpc, count, offset, max_cnt, base, (gpc_stride - 1U)) != 0) { return -EINVAL; } base = NV_XBAR_MXBAR_PRI_GPC_GNIC_STRIDE * gpc_num; if (add_ctxsw_buffer_map_entries(map, &g->netlist_vars->ctxsw_regs.pm_ucgpc, count, offset, max_cnt, base, ~U32(0U)) != 0) { return -EINVAL; } base = (g->ops.gr.get_pmm_per_chiplet_offset() * gpc_num); if 
(add_ctxsw_buffer_map_entries(map, &g->netlist_vars->ctxsw_regs.perf_gpc, count, offset, max_cnt, base, ~U32(0U)) != 0) { return -EINVAL; } base = (NV_PERF_PMMGPCROUTER_STRIDE * gpc_num); if (add_ctxsw_buffer_map_entries(map, &g->netlist_vars->ctxsw_regs.gpc_router, count, offset, max_cnt, base, ~U32(0U)) != 0) { return -EINVAL; } /* Counter Aggregation Unit, if available */ if (g->netlist_vars->ctxsw_regs.pm_cau.count != 0U) { base = gpc_base + (gpc_stride * gpc_num) + tpc_in_gpc_base; if (add_ctxsw_buffer_map_entries_subunits(map, &g->netlist_vars->ctxsw_regs.pm_cau, count, offset, max_cnt, base, num_tpcs, tpc_in_gpc_stride, (tpc_in_gpc_stride - 1U)) != 0) { return -EINVAL; } } *offset = ALIGN(*offset, 256); } return 0; } int gr_gk20a_add_ctxsw_reg_perf_pma(struct ctxsw_buf_offset_map_entry *map, struct netlist_aiv_list *regs, u32 *count, u32 *offset, u32 max_cnt, u32 base, u32 mask) { return add_ctxsw_buffer_map_entries(map, regs, count, offset, max_cnt, base, mask); } /* * PM CTXSW BUFFER LAYOUT : *|---------------------------------------------|0x00 <----PM CTXSW BUFFER BASE *| | *| LIST_compressed_pm_ctx_reg_SYS |Space allocated: numRegs words *|---------------------------------------------| *| | *| LIST_compressed_nv_perf_ctx_reg_SYS |Space allocated: numRegs words *|---------------------------------------------| *| | *| LIST_compressed_nv_perf_ctx_reg_sysrouter|Space allocated: numRegs words *|---------------------------------------------| *| | *| LIST_compressed_nv_perf_ctx_reg_PMA |Space allocated: numRegs words *|---------------------------------------------| *| PADDING for 256 byte alignment | *|---------------------------------------------|<----256 byte aligned *| LIST_compressed_nv_perf_fbp_ctx_regs | *| |Space allocated: numRegs * n words (for n FB units) *|---------------------------------------------| *| LIST_compressed_nv_perf_fbprouter_ctx_regs | *| |Space allocated: numRegs * n words (for n FB units) *|---------------------------------------------| *| LIST_compressed_pm_fbpa_ctx_regs | *| |Space allocated: numRegs * n words (for n FB units) *|---------------------------------------------| *| LIST_compressed_pm_rop_ctx_regs | *|---------------------------------------------| *| LIST_compressed_pm_ltc_ctx_regs | *| LTC0 LTS0 | *| LTC1 LTS0 |Space allocated: numRegs * n words (for n LTC units) *| LTCn LTS0 | *| LTC0 LTS1 | *| LTC1 LTS1 | *| LTCn LTS1 | *| LTC0 LTSn | *| LTC1 LTSn | *| LTCn LTSn | *|---------------------------------------------| *| PADDING for 256 byte alignment | *|---------------------------------------------|<----256 byte aligned *| GPC0 REG0 TPC0 |Each GPC has space allocated to accommodate *| REG0 TPC1 | all the GPC/TPC register lists *| Lists in each GPC region: REG0 TPCn |Per GPC allocated space is always 256 byte aligned *| LIST_pm_ctx_reg_TPC REG1 TPC0 | *| * numTpcs REG1 TPC1 | *| LIST_pm_ctx_reg_PPC REG1 TPCn | *| * numPpcs REGn TPC0 | *| LIST_pm_ctx_reg_GPC REGn TPC1 | *| List_pm_ctx_reg_uc_GPC REGn TPCn | *| LIST_nv_perf_ctx_reg_GPC | *| LIST_nv_perf_gpcrouter_ctx_reg | *| LIST_nv_perf_ctx_reg_CAU | *| ---- |-- *| GPC1 . | *| . 
|<---- *|---------------------------------------------| *= = *| GPCn | *= = *|---------------------------------------------| */ static int gr_gk20a_create_hwpm_ctxsw_buffer_offset_map(struct gk20a *g) { u32 hwpm_ctxsw_buffer_size = g->gr.ctx_vars.pm_ctxsw_image_size; u32 hwpm_ctxsw_reg_count_max; u32 map_size; u32 i, count = 0; u32 offset = 0; int ret; struct ctxsw_buf_offset_map_entry *map; u32 ltc_stride = nvgpu_get_litter_value(g, GPU_LIT_LTC_STRIDE); u32 num_fbpas = nvgpu_get_litter_value(g, GPU_LIT_NUM_FBPAS); u32 fbpa_stride = nvgpu_get_litter_value(g, GPU_LIT_FBPA_STRIDE); u32 num_ltc = g->ops.gr.get_max_ltc_per_fbp(g) * g->gr.num_fbps; if (hwpm_ctxsw_buffer_size == 0U) { nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "no PM Ctxsw buffer memory in context buffer"); return -EINVAL; } hwpm_ctxsw_reg_count_max = hwpm_ctxsw_buffer_size >> 2; map_size = hwpm_ctxsw_reg_count_max * (u32)sizeof(*map); map = nvgpu_big_zalloc(g, map_size); if (map == NULL) { return -ENOMEM; } /* Add entries from _LIST_pm_ctx_reg_SYS */ if (add_ctxsw_buffer_map_entries_pmsys(map, &g->netlist_vars->ctxsw_regs.pm_sys, &count, &offset, hwpm_ctxsw_reg_count_max, 0, ~U32(0U)) != 0) { goto cleanup; } /* Add entries from _LIST_nv_perf_ctx_reg_SYS */ if (add_ctxsw_buffer_map_entries(map, &g->netlist_vars->ctxsw_regs.perf_sys, &count, &offset, hwpm_ctxsw_reg_count_max, 0, ~U32(0U)) != 0) { goto cleanup; } /* Add entries from _LIST_nv_perf_sysrouter_ctx_reg*/ if (add_ctxsw_buffer_map_entries(map, &g->netlist_vars->ctxsw_regs.perf_sys_router, &count, &offset, hwpm_ctxsw_reg_count_max, 0, ~U32(0U)) != 0) { goto cleanup; } /* Add entries from _LIST_nv_perf_pma_ctx_reg*/ ret = g->ops.gr.add_ctxsw_reg_perf_pma(map, &g->netlist_vars->ctxsw_regs.perf_pma, &count, &offset, hwpm_ctxsw_reg_count_max, 0, ~U32(0U)); if (ret != 0) { goto cleanup; } offset = ALIGN(offset, 256); /* Add entries from _LIST_nv_perf_fbp_ctx_regs */ if (add_ctxsw_buffer_map_entries_subunits(map, &g->netlist_vars->ctxsw_regs.fbp, &count, &offset, hwpm_ctxsw_reg_count_max, 0, g->gr.num_fbps, g->ops.gr.get_pmm_per_chiplet_offset(), ~U32(0U)) != 0) { goto cleanup; } /* Add entries from _LIST_nv_perf_fbprouter_ctx_regs */ if (add_ctxsw_buffer_map_entries_subunits(map, &g->netlist_vars->ctxsw_regs.fbp_router, &count, &offset, hwpm_ctxsw_reg_count_max, 0, g->gr.num_fbps, NV_PERF_PMM_FBP_ROUTER_STRIDE, ~U32(0U)) != 0) { goto cleanup; } /* Add entries from _LIST_nv_pm_fbpa_ctx_regs */ ret = g->ops.gr.add_ctxsw_reg_pm_fbpa(g, map, &g->netlist_vars->ctxsw_regs.pm_fbpa, &count, &offset, hwpm_ctxsw_reg_count_max, 0, num_fbpas, fbpa_stride, ~U32(0U)); if (ret != 0) { goto cleanup; } /* Add entries from _LIST_nv_pm_rop_ctx_regs */ if (add_ctxsw_buffer_map_entries(map, &g->netlist_vars->ctxsw_regs.pm_rop, &count, &offset, hwpm_ctxsw_reg_count_max, 0, ~U32(0U)) != 0) { goto cleanup; } /* Add entries from _LIST_compressed_nv_pm_ltc_ctx_regs */ if (add_ctxsw_buffer_map_entries_subunits(map, &g->netlist_vars->ctxsw_regs.pm_ltc, &count, &offset, hwpm_ctxsw_reg_count_max, 0, num_ltc, ltc_stride, ~U32(0U)) != 0) { goto cleanup; } offset = ALIGN(offset, 256); /* Add GPC entries */ if (add_ctxsw_buffer_map_entries_gpcs(g, map, &count, &offset, hwpm_ctxsw_reg_count_max) != 0) { goto cleanup; } if (offset > hwpm_ctxsw_buffer_size) { nvgpu_err(g, "offset > buffer size"); goto cleanup; } sort(map, count, sizeof(*map), map_cmp, NULL); g->gr.ctx_vars.hwpm_ctxsw_buffer_offset_map = map; g->gr.ctx_vars.hwpm_ctxsw_buffer_offset_map_count = count; nvgpu_log_info(g, "Reg Addr => HWPM Ctxt switch 
buffer offset"); for (i = 0; i < count; i++) { nvgpu_log_info(g, "%08x => %08x", map[i].addr, map[i].offset); } return 0; cleanup: nvgpu_err(g, "Failed to create HWPM buffer offset map"); nvgpu_big_free(g, map); return -EINVAL; } /* * This function will return the 32 bit offset for a priv register if it is * present in the PM context buffer. */ static int gr_gk20a_find_priv_offset_in_pm_buffer(struct gk20a *g, u32 addr, u32 *priv_offset) { struct gr_gk20a *gr = &g->gr; int err = 0; u32 count; struct ctxsw_buf_offset_map_entry *map, *result, map_key; nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr); /* Create map of pri address and pm offset if necessary */ if (gr->ctx_vars.hwpm_ctxsw_buffer_offset_map == NULL) { err = gr_gk20a_create_hwpm_ctxsw_buffer_offset_map(g); if (err != 0) { return err; } } *priv_offset = 0; map = gr->ctx_vars.hwpm_ctxsw_buffer_offset_map; count = gr->ctx_vars.hwpm_ctxsw_buffer_offset_map_count; map_key.addr = addr; result = nvgpu_bsearch(&map_key, map, count, sizeof(*map), map_cmp); if (result != NULL) { *priv_offset = result->offset; } else { nvgpu_err(g, "Lookup failed for address 0x%x", addr); err = -EINVAL; } return err; } bool gk20a_is_channel_ctx_resident(struct channel_gk20a *ch) { u32 curr_gr_ctx; u32 curr_gr_tsgid; struct gk20a *g = ch->g; struct channel_gk20a *curr_ch; bool ret = false; struct tsg_gk20a *tsg; curr_gr_ctx = gk20a_readl(g, gr_fecs_current_ctx_r()); /* when contexts are unloaded from GR, the valid bit is reset * but the instance pointer information remains intact. So the * valid bit must be checked to be absolutely certain that a * valid context is currently resident. */ if (gr_fecs_current_ctx_valid_v(curr_gr_ctx) == 0U) { return false; } curr_ch = gk20a_gr_get_channel_from_ctx(g, curr_gr_ctx, &curr_gr_tsgid); nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "curr_gr_chid=%d curr_tsgid=%d, ch->tsgid=%d" " ch->chid=%d", (curr_ch != NULL) ? 
curr_ch->chid : U32_MAX, curr_gr_tsgid, ch->tsgid, ch->chid); if (curr_ch == NULL) { return false; } if (ch->chid == curr_ch->chid) { ret = true; } tsg = tsg_gk20a_from_ch(ch); if ((tsg != NULL) && (tsg->tsgid == curr_gr_tsgid)) { ret = true; } gk20a_channel_put(curr_ch); return ret; } int __gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch, struct nvgpu_dbg_reg_op *ctx_ops, u32 num_ops, u32 num_ctx_wr_ops, u32 num_ctx_rd_ops, bool ch_is_curr_ctx) { struct gk20a *g = ch->g; struct tsg_gk20a *tsg; struct nvgpu_gr_ctx *gr_ctx; bool gr_ctx_ready = false; bool pm_ctx_ready = false; struct nvgpu_mem *current_mem = NULL; u32 i, j, offset, v; struct gr_gk20a *gr = &g->gr; u32 sm_per_tpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_SM_PER_TPC); u32 max_offsets = nvgpu_gr_config_get_max_gpc_count(gr->config) * nvgpu_gr_config_get_max_tpc_per_gpc_count(gr->config) * sm_per_tpc; u32 *offsets = NULL; u32 *offset_addrs = NULL; u32 ctx_op_nr, num_ctx_ops[2] = {num_ctx_wr_ops, num_ctx_rd_ops}; int err = 0, pass; nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "wr_ops=%d rd_ops=%d", num_ctx_wr_ops, num_ctx_rd_ops); tsg = tsg_gk20a_from_ch(ch); if (tsg == NULL) { return -EINVAL; } gr_ctx = tsg->gr_ctx; if (ch_is_curr_ctx) { for (pass = 0; pass < 2; pass++) { ctx_op_nr = 0; for (i = 0; (ctx_op_nr < num_ctx_ops[pass]) && (i < num_ops); ++i) { /* only do ctx ops and only on the right pass */ if ((ctx_ops[i].type == REGOP(TYPE_GLOBAL)) || (((pass == 0) && reg_op_is_read(ctx_ops[i].op)) || ((pass == 1) && !reg_op_is_read(ctx_ops[i].op)))) { continue; } /* if this is a quad access, setup for special access*/ if ((ctx_ops[i].type == REGOP(TYPE_GR_CTX_QUAD)) && (g->ops.gr.access_smpc_reg != NULL)) { g->ops.gr.access_smpc_reg(g, ctx_ops[i].quad, ctx_ops[i].offset); } offset = ctx_ops[i].offset; if (pass == 0) { /* write pass */ v = gk20a_readl(g, offset); v &= ~ctx_ops[i].and_n_mask_lo; v |= ctx_ops[i].value_lo; gk20a_writel(g, offset, v); nvgpu_log(g, gpu_dbg_gpu_dbg, "direct wr: offset=0x%x v=0x%x", offset, v); if (ctx_ops[i].op == REGOP(WRITE_64)) { v = gk20a_readl(g, offset + 4U); v &= ~ctx_ops[i].and_n_mask_hi; v |= ctx_ops[i].value_hi; gk20a_writel(g, offset + 4U, v); nvgpu_log(g, gpu_dbg_gpu_dbg, "direct wr: offset=0x%x v=0x%x", offset + 4U, v); } } else { /* read pass */ ctx_ops[i].value_lo = gk20a_readl(g, offset); nvgpu_log(g, gpu_dbg_gpu_dbg, "direct rd: offset=0x%x v=0x%x", offset, ctx_ops[i].value_lo); if (ctx_ops[i].op == REGOP(READ_64)) { ctx_ops[i].value_hi = gk20a_readl(g, offset + 4U); nvgpu_log(g, gpu_dbg_gpu_dbg, "direct rd: offset=0x%x v=0x%x", offset, ctx_ops[i].value_lo); } else { ctx_ops[i].value_hi = 0; } } ctx_op_nr++; } } goto cleanup; } /* they're the same size, so just use one alloc for both */ offsets = nvgpu_kzalloc(g, 2U * sizeof(u32) * max_offsets); if (offsets == NULL) { err = -ENOMEM; goto cleanup; } offset_addrs = offsets + max_offsets; err = nvgpu_gr_ctx_patch_write_begin(g, gr_ctx, false); if (err != 0) { goto cleanup; } err = g->ops.mm.l2_flush(g, true); if (err != 0) { nvgpu_err(g, "l2_flush failed"); goto cleanup; } /* write to appropriate place in context image, * first have to figure out where that really is */ /* first pass is writes, second reads */ for (pass = 0; pass < 2; pass++) { ctx_op_nr = 0; for (i = 0; (ctx_op_nr < num_ctx_ops[pass]) && (i < num_ops); ++i) { u32 num_offsets; /* only do ctx ops and only on the right pass */ if ((ctx_ops[i].type == REGOP(TYPE_GLOBAL)) || (((pass == 0) && reg_op_is_read(ctx_ops[i].op)) || ((pass == 1) && !reg_op_is_read(ctx_ops[i].op)))) { 
continue; } err = gr_gk20a_get_ctx_buffer_offsets(g, ctx_ops[i].offset, max_offsets, offsets, offset_addrs, &num_offsets, ctx_ops[i].type == REGOP(TYPE_GR_CTX_QUAD), ctx_ops[i].quad); if (err == 0) { if (!gr_ctx_ready) { gr_ctx_ready = true; } current_mem = &gr_ctx->mem; } else { err = gr_gk20a_get_pm_ctx_buffer_offsets(g, ctx_ops[i].offset, max_offsets, offsets, offset_addrs, &num_offsets); if (err != 0) { nvgpu_log(g, gpu_dbg_gpu_dbg, "ctx op invalid offset: offset=0x%x", ctx_ops[i].offset); ctx_ops[i].status = REGOP(STATUS_INVALID_OFFSET); continue; } if (!pm_ctx_ready) { /* Make sure ctx buffer was initialized */ if (!nvgpu_mem_is_valid(&gr_ctx->pm_ctx.mem)) { nvgpu_err(g, "Invalid ctx buffer"); err = -EINVAL; goto cleanup; } pm_ctx_ready = true; } current_mem = &gr_ctx->pm_ctx.mem; } /* if this is a quad access, setup for special access*/ if ((ctx_ops[i].type == REGOP(TYPE_GR_CTX_QUAD)) && (g->ops.gr.access_smpc_reg != NULL)) { g->ops.gr.access_smpc_reg(g, ctx_ops[i].quad, ctx_ops[i].offset); } for (j = 0; j < num_offsets; j++) { /* sanity check gr ctxt offsets, * don't write outside, worst case */ if ((current_mem == &gr_ctx->mem) && (offsets[j] >= g->gr.ctx_vars.golden_image_size)) { continue; } if (pass == 0) { /* write pass */ v = nvgpu_mem_rd(g, current_mem, offsets[j]); v &= ~ctx_ops[i].and_n_mask_lo; v |= ctx_ops[i].value_lo; nvgpu_mem_wr(g, current_mem, offsets[j], v); nvgpu_log(g, gpu_dbg_gpu_dbg, "context wr: offset=0x%x v=0x%x", offsets[j], v); if (ctx_ops[i].op == REGOP(WRITE_64)) { v = nvgpu_mem_rd(g, current_mem, offsets[j] + 4U); v &= ~ctx_ops[i].and_n_mask_hi; v |= ctx_ops[i].value_hi; nvgpu_mem_wr(g, current_mem, offsets[j] + 4U, v); nvgpu_log(g, gpu_dbg_gpu_dbg, "context wr: offset=0x%x v=0x%x", offsets[j] + 4U, v); } if (current_mem == &gr_ctx->mem) { /* check to see if we need to add a special WAR for some of the SMPC perf regs */ gr_gk20a_ctx_patch_smpc(g, ch, offset_addrs[j], v, gr_ctx); } } else { /* read pass */ ctx_ops[i].value_lo = nvgpu_mem_rd(g, current_mem, offsets[0]); nvgpu_log(g, gpu_dbg_gpu_dbg, "context rd: offset=0x%x v=0x%x", offsets[0], ctx_ops[i].value_lo); if (ctx_ops[i].op == REGOP(READ_64)) { ctx_ops[i].value_hi = nvgpu_mem_rd(g, current_mem, offsets[0] + 4U); nvgpu_log(g, gpu_dbg_gpu_dbg, "context rd: offset=0x%x v=0x%x", offsets[0] + 4U, ctx_ops[i].value_hi); } else { ctx_ops[i].value_hi = 0; } } } ctx_op_nr++; } } cleanup: if (offsets != NULL) { nvgpu_kfree(g, offsets); } if (gr_ctx->patch_ctx.mem.cpu_va != NULL) { nvgpu_gr_ctx_patch_write_end(g, gr_ctx, gr_ctx_ready); } return err; } int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch, struct nvgpu_dbg_reg_op *ctx_ops, u32 num_ops, u32 num_ctx_wr_ops, u32 num_ctx_rd_ops, bool *is_curr_ctx) { struct gk20a *g = ch->g; int err, tmp_err; bool ch_is_curr_ctx; /* disable channel switching. * at that point the hardware state can be inspected to * determine if the context we're interested in is current. */ err = gr_gk20a_disable_ctxsw(g); if (err != 0) { nvgpu_err(g, "unable to stop gr ctxsw"); /* this should probably be ctx-fatal... 
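 * If ctxsw cannot be stopped, the residency check below could race
 * with a context switch and the register ops might touch whichever
 * context happens to be resident, so we give up here instead.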
*/ return err; } ch_is_curr_ctx = gk20a_is_channel_ctx_resident(ch); if (is_curr_ctx != NULL) { *is_curr_ctx = ch_is_curr_ctx; } nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "is curr ctx=%d", ch_is_curr_ctx); err = __gr_gk20a_exec_ctx_ops(ch, ctx_ops, num_ops, num_ctx_wr_ops, num_ctx_rd_ops, ch_is_curr_ctx); tmp_err = gr_gk20a_enable_ctxsw(g); if (tmp_err != 0) { nvgpu_err(g, "unable to restart ctxsw!"); err = tmp_err; } return err; } void gr_gk20a_commit_global_pagepool(struct gk20a *g, struct nvgpu_gr_ctx *gr_ctx, u64 addr, u32 size, bool patch) { BUG_ON(u64_hi32(addr) != 0U); nvgpu_gr_ctx_patch_write(g, gr_ctx, gr_scc_pagepool_base_r(), gr_scc_pagepool_base_addr_39_8_f((u32)addr), patch); nvgpu_gr_ctx_patch_write(g, gr_ctx, gr_scc_pagepool_r(), gr_scc_pagepool_total_pages_f(size) | gr_scc_pagepool_valid_true_f(), patch); nvgpu_gr_ctx_patch_write(g, gr_ctx, gr_gpcs_gcc_pagepool_base_r(), gr_gpcs_gcc_pagepool_base_addr_39_8_f((u32)addr), patch); nvgpu_gr_ctx_patch_write(g, gr_ctx, gr_gpcs_gcc_pagepool_r(), gr_gpcs_gcc_pagepool_total_pages_f(size), patch); nvgpu_gr_ctx_patch_write(g, gr_ctx, gr_pd_pagepool_r(), gr_pd_pagepool_total_pages_f(size) | gr_pd_pagepool_valid_true_f(), patch); } void gk20a_init_gr(struct gk20a *g) { nvgpu_cond_init(&g->gr.init_wq); } int gk20a_gr_wait_for_sm_lock_down(struct gk20a *g, u32 gpc, u32 tpc, u32 sm, u32 global_esr_mask, bool check_errors) { bool locked_down; bool no_error_pending; u32 delay = GR_IDLE_CHECK_DEFAULT; bool mmu_debug_mode_enabled = g->ops.fb.is_debug_mode_enabled(g); u32 offset = gk20a_gr_gpc_offset(g, gpc) + gk20a_gr_tpc_offset(g, tpc); u32 dbgr_status0 = 0, dbgr_control0 = 0; u64 warps_valid = 0, warps_paused = 0, warps_trapped = 0; struct nvgpu_timeout timeout; u32 warp_esr; nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, "GPC%d TPC%d SM%d: locking down SM", gpc, tpc, sm); nvgpu_timeout_init(g, &timeout, gk20a_get_gr_idle_timeout(g), NVGPU_TIMER_CPU_TIMER); /* wait for the sm to lock down */ do { u32 global_esr = g->ops.gr.get_sm_hww_global_esr(g, gpc, tpc, sm); dbgr_status0 = gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_status0_r() + offset); warp_esr = g->ops.gr.get_sm_hww_warp_esr(g, gpc, tpc, sm); locked_down = (gr_gpc0_tpc0_sm_dbgr_status0_locked_down_v(dbgr_status0) == gr_gpc0_tpc0_sm_dbgr_status0_locked_down_true_v()); no_error_pending = check_errors && (gr_gpc0_tpc0_sm_hww_warp_esr_error_v(warp_esr) == gr_gpc0_tpc0_sm_hww_warp_esr_error_none_v()) && ((global_esr & ~global_esr_mask) == 0U); if (locked_down || no_error_pending) { nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, "GPC%d TPC%d SM%d: locked down SM", gpc, tpc, sm); return 0; } /* if an mmu fault is pending and mmu debug mode is not * enabled, the sm will never lock down. 
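 * Bail out early with -EFAULT in that case rather than spinning
 * until the timeout expires.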
*/ if (!mmu_debug_mode_enabled && (g->ops.mm.mmu_fault_pending(g))) { nvgpu_err(g, "GPC%d TPC%d: mmu fault pending," " SM%d will never lock down!", gpc, tpc, sm); return -EFAULT; } nvgpu_usleep_range(delay, delay * 2U); delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX); } while (nvgpu_timeout_expired(&timeout) == 0); dbgr_control0 = gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_control0_r() + offset); /* 64 bit read */ warps_valid = (u64)gk20a_readl(g, gr_gpc0_tpc0_sm_warp_valid_mask_1_r() + offset) << 32; warps_valid |= gk20a_readl(g, gr_gpc0_tpc0_sm_warp_valid_mask_r() + offset); /* 64 bit read */ warps_paused = (u64)gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_bpt_pause_mask_1_r() + offset) << 32; warps_paused |= gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_bpt_pause_mask_r() + offset); /* 64 bit read */ warps_trapped = (u64)gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_bpt_trap_mask_1_r() + offset) << 32; warps_trapped |= gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_bpt_trap_mask_r() + offset); nvgpu_err(g, "GPC%d TPC%d: timed out while trying to lock down SM", gpc, tpc); nvgpu_err(g, "STATUS0(0x%x)=0x%x CONTROL0=0x%x VALID_MASK=0x%llx PAUSE_MASK=0x%llx TRAP_MASK=0x%llx", gr_gpc0_tpc0_sm_dbgr_status0_r() + offset, dbgr_status0, dbgr_control0, warps_valid, warps_paused, warps_trapped); return -ETIMEDOUT; } void gk20a_gr_suspend_single_sm(struct gk20a *g, u32 gpc, u32 tpc, u32 sm, u32 global_esr_mask, bool check_errors) { int err; u32 dbgr_control0; u32 offset = gk20a_gr_gpc_offset(g, gpc) + gk20a_gr_tpc_offset(g, tpc); /* if an SM debugger isn't attached, skip suspend */ if (!g->ops.gr.sm_debugger_attached(g)) { nvgpu_err(g, "SM debugger not attached, skipping suspend!"); return; } nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "suspending gpc:%d, tpc:%d, sm%d", gpc, tpc, sm); /* assert stop trigger. */ dbgr_control0 = gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_control0_r() + offset); dbgr_control0 |= gr_gpcs_tpcs_sm_dbgr_control0_stop_trigger_enable_f(); gk20a_writel(g, gr_gpc0_tpc0_sm_dbgr_control0_r() + offset, dbgr_control0); err = g->ops.gr.wait_for_sm_lock_down(g, gpc, tpc, sm, global_esr_mask, check_errors); if (err != 0) { nvgpu_err(g, "SuspendSm failed"); return; } } void gk20a_gr_suspend_all_sms(struct gk20a *g, u32 global_esr_mask, bool check_errors) { struct gr_gk20a *gr = &g->gr; u32 gpc, tpc, sm; int err; u32 dbgr_control0; u32 sm_per_tpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_SM_PER_TPC); /* if an SM debugger isn't attached, skip suspend */ if (!g->ops.gr.sm_debugger_attached(g)) { nvgpu_err(g, "SM debugger not attached, skipping suspend!"); return; } nvgpu_log(g, gpu_dbg_fn | gpu_dbg_gpu_dbg, "suspending all sms"); /* assert stop trigger. uniformity assumption: all SMs will have * the same state in dbg_control0. */ dbgr_control0 = gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_control0_r()); dbgr_control0 |= gr_gpcs_tpcs_sm_dbgr_control0_stop_trigger_enable_f(); /* broadcast write */ gk20a_writel(g, gr_gpcs_tpcs_sm_dbgr_control0_r(), dbgr_control0); for (gpc = 0; gpc < nvgpu_gr_config_get_gpc_count(gr->config); gpc++) { for (tpc = 0; tpc < nvgpu_gr_config_get_gpc_tpc_count(gr->config, gpc); tpc++) { for (sm = 0; sm < sm_per_tpc; sm++) { err = g->ops.gr.wait_for_sm_lock_down(g, gpc, tpc, sm, global_esr_mask, check_errors); if (err != 0) { nvgpu_err(g, "SuspendAllSms failed"); return; } } } } } void gk20a_gr_resume_single_sm(struct gk20a *g, u32 gpc, u32 tpc, u32 sm) { u32 dbgr_control0; u32 offset; /* * The following requires some clarification. 
Despite the fact that both * RUN_TRIGGER and STOP_TRIGGER have the word "TRIGGER" in their * names, only one is actually a trigger, and that is the STOP_TRIGGER. * Merely writing a 1(_TASK) to the RUN_TRIGGER is not sufficient to * resume the gpu - the _STOP_TRIGGER must explicitly be set to 0 * (_DISABLE) as well. * Advice from the arch group: Disable the stop trigger first, as a * separate operation, in order to ensure that the trigger has taken * effect, before enabling the run trigger. */ offset = gk20a_gr_gpc_offset(g, gpc) + gk20a_gr_tpc_offset(g, tpc); /*De-assert stop trigger */ dbgr_control0 = gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_control0_r() + offset); dbgr_control0 = set_field(dbgr_control0, gr_gpcs_tpcs_sm_dbgr_control0_stop_trigger_m(), gr_gpcs_tpcs_sm_dbgr_control0_stop_trigger_disable_f()); gk20a_writel(g, gr_gpc0_tpc0_sm_dbgr_control0_r() + offset, dbgr_control0); /* Run trigger */ dbgr_control0 |= gr_gpcs_tpcs_sm_dbgr_control0_run_trigger_task_f(); gk20a_writel(g, gr_gpc0_tpc0_sm_dbgr_control0_r() + offset, dbgr_control0); } void gk20a_gr_resume_all_sms(struct gk20a *g) { u32 dbgr_control0; /* * The following requires some clarification. Despite the fact that both * RUN_TRIGGER and STOP_TRIGGER have the word "TRIGGER" in their * names, only one is actually a trigger, and that is the STOP_TRIGGER. * Merely writing a 1(_TASK) to the RUN_TRIGGER is not sufficient to * resume the gpu - the _STOP_TRIGGER must explicitly be set to 0 * (_DISABLE) as well. * Advice from the arch group: Disable the stop trigger first, as a * separate operation, in order to ensure that the trigger has taken * effect, before enabling the run trigger. */ /*De-assert stop trigger */ dbgr_control0 = gk20a_readl(g, gr_gpcs_tpcs_sm_dbgr_control0_r()); dbgr_control0 &= ~gr_gpcs_tpcs_sm_dbgr_control0_stop_trigger_enable_f(); gk20a_writel(g, gr_gpcs_tpcs_sm_dbgr_control0_r(), dbgr_control0); /* Run trigger */ dbgr_control0 |= gr_gpcs_tpcs_sm_dbgr_control0_run_trigger_task_f(); gk20a_writel(g, gr_gpcs_tpcs_sm_dbgr_control0_r(), dbgr_control0); } int gr_gk20a_set_sm_debug_mode(struct gk20a *g, struct channel_gk20a *ch, u64 sms, bool enable) { struct nvgpu_dbg_reg_op *ops; unsigned int i = 0, sm_id; int err; u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE); u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE); ops = nvgpu_kcalloc(g, g->gr.no_of_sm, sizeof(*ops)); if (ops == NULL) { return -ENOMEM; } for (sm_id = 0; sm_id < g->gr.no_of_sm; sm_id++) { u32 gpc, tpc; u32 tpc_offset, gpc_offset, reg_offset, reg_mask, reg_val; if ((sms & BIT64(sm_id)) == 0ULL) { continue; } gpc = g->gr.sm_to_cluster[sm_id].gpc_index; tpc = g->gr.sm_to_cluster[sm_id].tpc_index; tpc_offset = tpc_in_gpc_stride * tpc; gpc_offset = gpc_stride * gpc; reg_offset = tpc_offset + gpc_offset; ops[i].op = REGOP(WRITE_32); ops[i].type = REGOP(TYPE_GR_CTX); ops[i].offset = gr_gpc0_tpc0_sm_dbgr_control0_r() + reg_offset; reg_mask = 0; reg_val = 0; if (enable) { reg_mask |= gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_m(); reg_val |= gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_on_f(); reg_mask |= gr_gpc0_tpc0_sm_dbgr_control0_stop_on_any_warp_m(); reg_val |= gr_gpc0_tpc0_sm_dbgr_control0_stop_on_any_warp_disable_f(); reg_mask |= gr_gpc0_tpc0_sm_dbgr_control0_stop_on_any_sm_m(); reg_val |= gr_gpc0_tpc0_sm_dbgr_control0_stop_on_any_sm_disable_f(); } else { reg_mask |= gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_m(); reg_val |= gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_off_f(); } ops[i].and_n_mask_lo = reg_mask; 
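		/* and_n_mask_lo selects the DBGR_CONTROL0 bits to clear;
		 * value_lo is then OR-ed in by the reg-op write pass
		 * (read-modify-write).
		 */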
ops[i].value_lo = reg_val; i++; } err = gr_gk20a_exec_ctx_ops(ch, ops, i, i, 0, NULL); if (err != 0) { nvgpu_err(g, "Failed to access register"); } nvgpu_kfree(g, ops); return err; } /* * gr_gk20a_suspend_context() * This API should be called with dbg_session lock held * and ctxsw disabled * Returns bool value indicating if context was resident * or not */ bool gr_gk20a_suspend_context(struct channel_gk20a *ch) { struct gk20a *g = ch->g; bool ctx_resident = false; if (gk20a_is_channel_ctx_resident(ch)) { g->ops.gr.suspend_all_sms(g, 0, false); ctx_resident = true; } else { gk20a_disable_channel_tsg(g, ch); } return ctx_resident; } bool gr_gk20a_resume_context(struct channel_gk20a *ch) { struct gk20a *g = ch->g; bool ctx_resident = false; if (gk20a_is_channel_ctx_resident(ch)) { g->ops.gr.resume_all_sms(g); ctx_resident = true; } else { gk20a_enable_channel_tsg(g, ch); } return ctx_resident; } int gr_gk20a_suspend_contexts(struct gk20a *g, struct dbg_session_gk20a *dbg_s, int *ctx_resident_ch_fd) { int local_ctx_resident_ch_fd = -1; bool ctx_resident; struct channel_gk20a *ch; struct dbg_session_channel_data *ch_data; int err = 0; nvgpu_mutex_acquire(&g->dbg_sessions_lock); err = gr_gk20a_disable_ctxsw(g); if (err != 0) { nvgpu_err(g, "unable to stop gr ctxsw"); goto clean_up; } nvgpu_mutex_acquire(&dbg_s->ch_list_lock); nvgpu_list_for_each_entry(ch_data, &dbg_s->ch_list, dbg_session_channel_data, ch_entry) { ch = g->fifo.channel + ch_data->chid; ctx_resident = gr_gk20a_suspend_context(ch); if (ctx_resident) { local_ctx_resident_ch_fd = ch_data->channel_fd; } } nvgpu_mutex_release(&dbg_s->ch_list_lock); err = gr_gk20a_enable_ctxsw(g); if (err != 0) { nvgpu_err(g, "unable to restart ctxsw!"); } *ctx_resident_ch_fd = local_ctx_resident_ch_fd; clean_up: nvgpu_mutex_release(&g->dbg_sessions_lock); return err; } int gr_gk20a_resume_contexts(struct gk20a *g, struct dbg_session_gk20a *dbg_s, int *ctx_resident_ch_fd) { int local_ctx_resident_ch_fd = -1; bool ctx_resident; struct channel_gk20a *ch; int err = 0; struct dbg_session_channel_data *ch_data; nvgpu_mutex_acquire(&g->dbg_sessions_lock); err = gr_gk20a_disable_ctxsw(g); if (err != 0) { nvgpu_err(g, "unable to stop gr ctxsw"); goto clean_up; } nvgpu_list_for_each_entry(ch_data, &dbg_s->ch_list, dbg_session_channel_data, ch_entry) { ch = g->fifo.channel + ch_data->chid; ctx_resident = gr_gk20a_resume_context(ch); if (ctx_resident) { local_ctx_resident_ch_fd = ch_data->channel_fd; } } err = gr_gk20a_enable_ctxsw(g); if (err != 0) { nvgpu_err(g, "unable to restart ctxsw!"); } *ctx_resident_ch_fd = local_ctx_resident_ch_fd; clean_up: nvgpu_mutex_release(&g->dbg_sessions_lock); return err; } int gr_gk20a_trigger_suspend(struct gk20a *g) { int err = 0; u32 dbgr_control0; /* assert stop trigger. uniformity assumption: all SMs will have * the same state in dbg_control0. */ dbgr_control0 = gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_control0_r()); dbgr_control0 |= gr_gpcs_tpcs_sm_dbgr_control0_stop_trigger_enable_f(); /* broadcast write */ gk20a_writel(g, gr_gpcs_tpcs_sm_dbgr_control0_r(), dbgr_control0); return err; } int gr_gk20a_wait_for_pause(struct gk20a *g, struct nvgpu_warpstate *w_state) { int err = 0; struct gr_gk20a *gr = &g->gr; u32 gpc, tpc, sm, sm_id; u32 global_mask; /* Wait for the SMs to reach full stop. This condition is: * 1) All SMs with valid warps must be in the trap handler (SM_IN_TRAP_MODE) * 2) All SMs in the trap handler must have equivalent VALID and PAUSED warp * masks. 
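 *
 * Typically this is driven as: gr_gk20a_trigger_suspend() to assert the
 * stop trigger, gr_gk20a_wait_for_pause() to wait for SM lock-down and
 * snapshot warp state into w_state, then gr_gk20a_resume_from_pause()
 * once inspection is complete.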
*/ global_mask = g->ops.gr.get_sm_no_lock_down_hww_global_esr_mask(g); /* Lock down all SMs */ for (sm_id = 0; sm_id < gr->no_of_sm; sm_id++) { gpc = g->gr.sm_to_cluster[sm_id].gpc_index; tpc = g->gr.sm_to_cluster[sm_id].tpc_index; sm = g->gr.sm_to_cluster[sm_id].sm_index; err = g->ops.gr.lock_down_sm(g, gpc, tpc, sm, global_mask, false); if (err != 0) { nvgpu_err(g, "sm did not lock down!"); return err; } } /* Read the warp status */ g->ops.gr.bpt_reg_info(g, w_state); return 0; } int gr_gk20a_resume_from_pause(struct gk20a *g) { int err = 0; u32 reg_val; /* Clear the pause mask to tell the GPU we want to resume everyone */ gk20a_writel(g, gr_gpcs_tpcs_sm_dbgr_bpt_pause_mask_r(), 0); /* explicitly re-enable forwarding of SM interrupts upon any resume */ reg_val = gk20a_readl(g, gr_gpc0_tpc0_tpccs_tpc_exception_en_r()); reg_val |= gr_gpc0_tpc0_tpccs_tpc_exception_en_sm_enabled_f(); gk20a_writel(g, gr_gpcs_tpcs_tpccs_tpc_exception_en_r(), reg_val); /* Now resume all SMs: write a 0 to the stop trigger * then a 1 to the run trigger */ g->ops.gr.resume_all_sms(g); return err; } int gr_gk20a_clear_sm_errors(struct gk20a *g) { int ret = 0; u32 gpc, tpc, sm; struct gr_gk20a *gr = &g->gr; u32 global_esr; u32 sm_per_tpc = nvgpu_get_litter_value(g, GPU_LIT_NUM_SM_PER_TPC); for (gpc = 0; gpc < nvgpu_gr_config_get_gpc_count(gr->config); gpc++) { /* check if any tpc has an exception */ for (tpc = 0; tpc < nvgpu_gr_config_get_gpc_tpc_count(gr->config, gpc); tpc++) { for (sm = 0; sm < sm_per_tpc; sm++) { global_esr = g->ops.gr.get_sm_hww_global_esr(g, gpc, tpc, sm); /* clearing hwws also causes tpc and gpc * exceptions to be cleared */ g->ops.gr.clear_sm_hww(g, gpc, tpc, sm, global_esr); } } } return ret; } u32 gr_gk20a_tpc_enabled_exceptions(struct gk20a *g) { struct gr_gk20a *gr = &g->gr; u32 sm_id, tpc_exception_en = 0; u32 offset, regval, tpc_offset, gpc_offset; u32 gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_GPC_STRIDE); u32 tpc_in_gpc_stride = nvgpu_get_litter_value(g, GPU_LIT_TPC_IN_GPC_STRIDE); for (sm_id = 0; sm_id < gr->no_of_sm; sm_id++) { tpc_offset = tpc_in_gpc_stride * g->gr.sm_to_cluster[sm_id].tpc_index; gpc_offset = gpc_stride * g->gr.sm_to_cluster[sm_id].gpc_index; offset = tpc_offset + gpc_offset; regval = gk20a_readl(g, gr_gpc0_tpc0_tpccs_tpc_exception_en_r() + offset); /* Each bit represents the corresponding enablement state; bit 0 corresponds to SM0 */ tpc_exception_en |= gr_gpc0_tpc0_tpccs_tpc_exception_en_sm_v(regval) << sm_id; } return tpc_exception_en; } u32 gk20a_gr_get_sm_hww_warp_esr(struct gk20a *g, u32 gpc, u32 tpc, u32 sm) { u32 offset = gk20a_gr_gpc_offset(g, gpc) + gk20a_gr_tpc_offset(g, tpc); u32 hww_warp_esr = gk20a_readl(g, gr_gpc0_tpc0_sm_hww_warp_esr_r() + offset); return hww_warp_esr; } u32 gk20a_gr_get_sm_hww_global_esr(struct gk20a *g, u32 gpc, u32 tpc, u32 sm) { u32 offset = gk20a_gr_gpc_offset(g, gpc) + gk20a_gr_tpc_offset(g, tpc); u32 hww_global_esr = gk20a_readl(g, gr_gpc0_tpc0_sm_hww_global_esr_r() + offset); return hww_global_esr; } u32 gk20a_gr_get_sm_no_lock_down_hww_global_esr_mask(struct gk20a *g) { /* * These three interrupts don't require locking down the SM. They can * be handled by usermode clients as they aren't fatal. Additionally, * usermode clients may wish to allow some warps to execute while others * are at breakpoints, as opposed to fatal errors where all warps should * halt.
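 *
 * The value returned here is used as the global_esr_mask argument to
 * gk20a_gr_wait_for_sm_lock_down(), where bits covered by the mask do
 * not count as a pending error ((global_esr & ~global_esr_mask) == 0U).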
*/ u32 global_esr_mask = gr_gpc0_tpc0_sm_hww_global_esr_bpt_int_pending_f() | gr_gpc0_tpc0_sm_hww_global_esr_bpt_pause_pending_f() | gr_gpc0_tpc0_sm_hww_global_esr_single_step_complete_pending_f(); return global_esr_mask; } /* invalidate channel lookup tlb */ void gk20a_gr_flush_channel_tlb(struct gr_gk20a *gr) { nvgpu_spinlock_acquire(&gr->ch_tlb_lock); (void) memset(gr->chid_tlb, 0, sizeof(struct gr_channel_map_tlb_entry) * GR_CHANNEL_MAP_TLB_SIZE); nvgpu_spinlock_release(&gr->ch_tlb_lock); } u32 gk20a_gr_get_fecs_ctx_state_store_major_rev_id(struct gk20a *g) { return nvgpu_readl(g, gr_fecs_ctx_state_store_major_rev_id_r()); } u32 gr_gk20a_fecs_falcon_base_addr(void) { return gr_fecs_irqsset_r(); } u32 gr_gk20a_gpccs_falcon_base_addr(void) { return gr_gpcs_gpccs_irqsset_r(); }
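/*
 * Illustrative sketch (not part of the driver): reading one GR context
 * register for a channel through the reg-op interface above. Assumes a
 * READ_32 reg-op analogous to the WRITE_32/READ_64 ops used in this file,
 * and a channel 'ch' already bound to the context of interest.
 *
 *	struct nvgpu_dbg_reg_op op = {
 *		.op     = REGOP(READ_32),
 *		.type   = REGOP(TYPE_GR_CTX),
 *		.offset = gr_gpc0_tpc0_sm_dbgr_control0_r(),
 *	};
 *	bool is_curr_ctx;
 *	int err = gr_gk20a_exec_ctx_ops(ch, &op, 1, 0, 1, &is_curr_ctx);
 *
 * On success, op.value_lo holds the register value, read either directly
 * from the hardware (context resident) or from the saved context image.
 */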