gpu: nvgpu: Add doxygen documentation in tsg.h

- Add doxygen documentation.
- Remove unused fields of nvgpu_tsg struct:
-- timeslice_timeout
-- timeslice_scale
- Remove unused functions:
-- nvgpu_tsg_set_runlist_interleave
- nvgpu_tsg_post_event_id is not supported in the safety build.
  This function is moved under the CONFIG_NVGPU_CHANNEL_TSG_CONTROL
  compiler flag.
- Move the functions below under CONFIG_NVGPU_KERNEL_MODE_SUBMIT:
  nvgpu_tsg_ctxsw_timeout_debug_dump_state
  nvgpu_tsg_set_ctxsw_timeout_accumulated_ms
- Rename:
  gk20a_is_channel_active -> nvgpu_tsg_is_channel_active
  release_used_tsg -> nvgpu_tsg_release_used_tsg
- Declare nvgpu_tsg_unbind_channel_common static.
- Fix build issue when CONFIG_NVGPU_CHANNEL_TSG_CONTROL is disabled:
  remove the CONFIG_NVGPU_CHANNEL_TSG_CONTROL guard from
  nvgpu_gr_setup_set_preemption_mode as it is needed in the safety
  build. By default, the compute preemption mode is set to WFI; CUDA
  changes it to CTA during context init.

JIRA NVGPU-3595

Change-Id: I8ff6cabc8b892c691d951c37cdc0721e820a0297
Signed-off-by: Seema Khowala <seemaj@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/2151489
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>

@@ -65,7 +65,8 @@ struct nvgpu_tsg *nvgpu_tsg_get_from_id(struct gk20a *g, u32 tsgid)
}
-static bool gk20a_is_channel_active(struct gk20a *g, struct nvgpu_channel *ch)
+static bool nvgpu_tsg_is_channel_active(struct gk20a *g,
+		struct nvgpu_channel *ch)
{
struct nvgpu_fifo *f = &g->fifo;
struct nvgpu_runlist_info *runlist;
@@ -99,7 +100,7 @@ int nvgpu_tsg_bind_channel(struct nvgpu_tsg *tsg, struct nvgpu_channel *ch)
}
/* channel cannot be bound to TSG if it is already active */
-	if (gk20a_is_channel_active(tsg->g, ch)) {
+	if (nvgpu_tsg_is_channel_active(tsg->g, ch)) {
return -EINVAL;
}
@@ -135,42 +136,7 @@ int nvgpu_tsg_bind_channel(struct nvgpu_tsg *tsg, struct nvgpu_channel *ch)
return err;
}
-/* The caller must ensure that channel belongs to a tsg */
-int nvgpu_tsg_unbind_channel(struct nvgpu_tsg *tsg, struct nvgpu_channel *ch)
-{
-	struct gk20a *g = ch->g;
-	int err;
-	nvgpu_log_fn(g, "unbind tsg:%u ch:%u\n", tsg->tsgid, ch->chid);
-	err = nvgpu_tsg_unbind_channel_common(tsg, ch);
-	if (err != 0) {
-		nvgpu_err(g, "Channel %d unbind failed, tearing down TSG %d",
-			ch->chid, tsg->tsgid);
-		nvgpu_tsg_abort(g, tsg, true);
-		/* If channel unbind fails, channel is still part of runlist */
-		if (nvgpu_channel_update_runlist(ch, false) != 0) {
-			nvgpu_err(g,
-				"remove ch %u from runlist failed", ch->chid);
-		}
-		nvgpu_rwsem_down_write(&tsg->ch_list_lock);
-		nvgpu_list_del(&ch->ch_entry);
-		ch->tsgid = NVGPU_INVALID_TSG_ID;
-		nvgpu_rwsem_up_write(&tsg->ch_list_lock);
-	}
-	if (g->ops.tsg.unbind_channel != NULL) {
-		err = g->ops.tsg.unbind_channel(tsg, ch);
-	}
-	nvgpu_ref_put(&tsg->refcount, nvgpu_tsg_release);
-	return 0;
-}
-int nvgpu_tsg_unbind_channel_common(struct nvgpu_tsg *tsg,
+static int nvgpu_tsg_unbind_channel_common(struct nvgpu_tsg *tsg,
struct nvgpu_channel *ch)
{
struct gk20a *g = ch->g;
@@ -251,6 +217,41 @@ fail_enable_tsg:
return err;
}
+/* The caller must ensure that channel belongs to a tsg */
+int nvgpu_tsg_unbind_channel(struct nvgpu_tsg *tsg, struct nvgpu_channel *ch)
+{
+	struct gk20a *g = ch->g;
+	int err;
+	nvgpu_log_fn(g, "unbind tsg:%u ch:%u\n", tsg->tsgid, ch->chid);
+	err = nvgpu_tsg_unbind_channel_common(tsg, ch);
+	if (err != 0) {
+		nvgpu_err(g, "Channel %d unbind failed, tearing down TSG %d",
+			ch->chid, tsg->tsgid);
+		nvgpu_tsg_abort(g, tsg, true);
+		/* If channel unbind fails, channel is still part of runlist */
+		if (nvgpu_channel_update_runlist(ch, false) != 0) {
+			nvgpu_err(g,
+				"remove ch %u from runlist failed", ch->chid);
+		}
+		nvgpu_rwsem_down_write(&tsg->ch_list_lock);
+		nvgpu_list_del(&ch->ch_entry);
+		ch->tsgid = NVGPU_INVALID_TSG_ID;
+		nvgpu_rwsem_up_write(&tsg->ch_list_lock);
+	}
+	if (g->ops.tsg.unbind_channel != NULL) {
+		err = g->ops.tsg.unbind_channel(tsg, ch);
+	}
+	nvgpu_ref_put(&tsg->refcount, nvgpu_tsg_release);
+	return 0;
+}
int nvgpu_tsg_unbind_channel_check_hw_state(struct nvgpu_tsg *tsg,
struct nvgpu_channel *ch)
{
@@ -405,6 +406,7 @@ bool nvgpu_tsg_mark_error(struct gk20a *g,
}
+#ifdef CONFIG_NVGPU_KERNEL_MODE_SUBMIT
void nvgpu_tsg_set_ctxsw_timeout_accumulated_ms(struct nvgpu_tsg *tsg, u32 ms)
{
struct nvgpu_channel *ch = NULL;
@@ -437,6 +439,7 @@ bool nvgpu_tsg_ctxsw_timeout_debug_dump_state(struct nvgpu_tsg *tsg)
return verbose;
}
+#endif
void nvgpu_tsg_set_error_notifier(struct gk20a *g, struct nvgpu_tsg *tsg,
u32 error_notifier)
@@ -608,14 +611,15 @@ void nvgpu_tsg_disable_sched(struct gk20a *g, struct nvgpu_tsg *tsg)
RUNLIST_DISABLED);
}
-static void release_used_tsg(struct nvgpu_fifo *f, struct nvgpu_tsg *tsg)
+static void nvgpu_tsg_release_used_tsg(struct nvgpu_fifo *f,
+		struct nvgpu_tsg *tsg)
{
nvgpu_mutex_acquire(&f->tsg_inuse_mutex);
f->tsg[tsg->tsgid].in_use = false;
nvgpu_mutex_release(&f->tsg_inuse_mutex);
}
-static struct nvgpu_tsg *gk20a_tsg_acquire_unused_tsg(struct nvgpu_fifo *f)
+static struct nvgpu_tsg *nvgpu_tsg_acquire_unused_tsg(struct nvgpu_fifo *f)
{
struct nvgpu_tsg *tsg = NULL;
unsigned int tsgid;
@@ -696,14 +700,14 @@ struct nvgpu_tsg *nvgpu_tsg_open(struct gk20a *g, pid_t pid)
struct nvgpu_tsg *tsg;
int err;
-	tsg = gk20a_tsg_acquire_unused_tsg(&g->fifo);
+	tsg = nvgpu_tsg_acquire_unused_tsg(&g->fifo);
if (tsg == NULL) {
return NULL;
}
err = nvgpu_tsg_open_common(g, tsg, pid);
if (err != 0) {
-		release_used_tsg(&g->fifo, tsg);
+		nvgpu_tsg_release_used_tsg(&g->fifo, tsg);
nvgpu_err(g, "tsg %d open failed %d", tsg->tsgid, err);
return NULL;
}
@@ -765,7 +769,7 @@ void nvgpu_tsg_release(struct nvgpu_ref *ref)
nvgpu_mutex_release(&tsg->event_id_list_lock);
nvgpu_tsg_release_common(g, tsg);
-	release_used_tsg(&g->fifo, tsg);
+	nvgpu_tsg_release_used_tsg(&g->fifo, tsg);
nvgpu_log(g, gpu_dbg_fn, "tsg released %d", tsg->tsgid);
}


@@ -679,11 +679,9 @@ struct gpu_ops {
u64 zcull_va,
u32 mode);
#endif
-#ifdef CONFIG_NVGPU_CHANNEL_TSG_CONTROL
int (*set_preemption_mode)(struct nvgpu_channel *ch,
u32 graphics_preempt_mode,
u32 compute_preempt_mode);
-#endif
} setup;
#ifdef CONFIG_NVGPU_GRAPHICS
struct {


@@ -40,10 +40,8 @@ void nvgpu_gr_setup_free_gr_ctx(struct gk20a *g,
struct vm_gk20a *vm, struct nvgpu_gr_ctx *gr_ctx);
void nvgpu_gr_setup_free_subctx(struct nvgpu_channel *c);
-#ifdef CONFIG_NVGPU_CHANNEL_TSG_CONTROL
int nvgpu_gr_setup_set_preemption_mode(struct nvgpu_channel *ch,
u32 graphics_preempt_mode,
u32 compute_preempt_mode);
-#endif
#endif /* NVGPU_GR_SETUP_H */
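With the guard removed, nvgpu_gr_setup_set_preemption_mode is callable in the
safety build, and a client can move the compute preemption mode off the
default WFI as the commit message describes. A minimal sketch, assuming the
NVGPU_PREEMPTION_MODE_COMPUTE_CTA define from nvgpu's preemption headers, an
include path of <nvgpu/gr/setup.h>, and that a graphics mode of 0 leaves that
setting unchanged:

#include <nvgpu/gr/setup.h>

/* Hypothetical caller: request CTA compute preemption for a channel. */
static int example_request_cta_preempt(struct nvgpu_channel *ch)
{
	/* Graphics mode 0 is assumed to mean "leave unchanged". */
	return nvgpu_gr_setup_set_preemption_mode(ch, 0U,
			NVGPU_PREEMPTION_MODE_COMPUTE_CTA);
}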


@@ -21,6 +21,11 @@
*/
#ifndef NVGPU_TSG_H
#define NVGPU_TSG_H
/**
* @file
*
* Abstract interface for TSG related functionality.
*/
#include <nvgpu/lock.h>
#include <nvgpu/kref.h>
@@ -28,6 +33,9 @@
#include <nvgpu/list.h>
#include <nvgpu/cond.h>
/**
* Software defined invalid TSG id value.
*/
#define NVGPU_INVALID_TSG_ID (U32_MAX)
#define NVGPU_TSG_TIMESLICE_LOW_PRIORITY_US 1300U
@@ -35,6 +43,9 @@
#define NVGPU_TSG_TIMESLICE_HIGH_PRIORITY_US 5200U
#define NVGPU_TSG_TIMESLICE_MIN_US 1000U
#define NVGPU_TSG_TIMESLICE_MAX_US 50000U
/**
* Default TSG timeslice value in microseconds. Currently it is 1024 us.
*/
#define NVGPU_TSG_TIMESLICE_DEFAULT_US (128U << 3U)
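
The default works out to 128 << 3 = 1024 us, which sits inside the MIN/MAX
bounds above. A hypothetical helper (not part of nvgpu, and assuming the u32
type from nvgpu's type headers) showing how the bounds are meant to constrain
a user-requested timeslice:

/* Hypothetical: clamp a requested timeslice to the documented bounds. */
static inline u32 example_clamp_timeslice_us(u32 requested_us)
{
	if (requested_us < NVGPU_TSG_TIMESLICE_MIN_US) {
		return NVGPU_TSG_TIMESLICE_MIN_US;
	}
	if (requested_us > NVGPU_TSG_TIMESLICE_MAX_US) {
		return NVGPU_TSG_TIMESLICE_MAX_US;
	}
	return requested_us;
}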
struct gk20a;
@@ -42,8 +53,11 @@ struct nvgpu_channel;
struct nvgpu_gr_ctx;
struct nvgpu_channel_hw_state;
#ifdef CONFIG_NVGPU_CHANNEL_TSG_CONTROL
enum nvgpu_event_id_type;
#endif
/** Format for reporting SM errors read from h/w registers */
struct nvgpu_tsg_sm_error_state {
u32 hww_global_esr;
u32 hww_warp_esr;
@@ -52,40 +66,125 @@ struct nvgpu_tsg_sm_error_state {
u32 hww_warp_esr_report_mask;
};
/**
* Fields corresponding to TSG's s/w context.
*/
struct nvgpu_tsg {
/** Pointer to GPU driver struct. */
struct gk20a *g;
/** Points to TSG's virtual memory */
struct vm_gk20a *vm;
/**
* Starting with Volta, when a Channel/TSG is set up, a recovery buffer
* region must be allocated in BAR2, to allow the engine to save methods
* if it faults. The virtual memory address of this buffer is set by s/w
* in the channel instance block.
* S/w allocates one #nvgpu_mem struct per PBDMA (#nvgpu_fifo.num_pbdma),
* which is then used to allocate and map memory from the BAR2 VM. The
* size of the actual method buffer is chip specific and is calculated
* by s/w during TSG init.
*/
struct nvgpu_mem *eng_method_buffers;
/** Allocated during TSG open and freed during TSG release */
struct nvgpu_gr_ctx *gr_ctx;
/**
* This ref is initialized during TSG s/w setup.
* It is ref_get whenever a channel is bound to the TSG and ref_put
* whenever a channel is unbound from the TSG.
*/
struct nvgpu_ref refcount;
/** List of channels bound to a tsgid */
struct nvgpu_list_node ch_list;
/**
* Ioctls using this field are not supported in the safety build.
* Refer to the NVGPU_FEATURE_CHANNEL_TSG_CONTROL config.
*/
struct nvgpu_list_node event_id_list;
/**
* Read write type of semaphore lock used for accessing/modifying
* #ch_list.
*/
struct nvgpu_rwsem ch_list_lock;
/**
* Mutex used to access/modify #event_id_list.
* Ioctls using this field are not supported in the safety build.
* Refer to the NVGPU_FEATURE_CHANNEL_TSG_CONTROL config.
*/
struct nvgpu_mutex event_id_list_lock;
/**
* Total number of channels bound to the TSG. This can change at run
* time as channels are bound to or unbound from the TSG.
*/
u32 num_active_channels;
/**
* Timeslice value for the TSG, in microseconds.
* All channels in a TSG share the same runlist timeslice
* which specifies how long a single context runs on an engine
* or PBDMA before being swapped for a different context.
* The timeslice period is set in the TSG header of runlist entry
* defined by h/w.
* The timeslice period should normally not be set to zero. A timeslice
* of zero will be treated as a timeslice period of 1 ns (Bug 1652173).
* The runlist timeslice period begins after the context has been
* loaded on a PBDMA but is paused while the channel has an outstanding
* context load to an engine. Time spent switching a context into an
* engine is not part of the runlist timeslice.
*/
unsigned int timeslice_us;
-	unsigned int timeslice_timeout;
-	unsigned int timeslice_scale;
/**
* See include/nvgpu/runlist.h and
* refer #NVGPU_FIFO_RUNLIST_INTERLEAVE_LEVEL_LOW.
*/
u32 interleave_level;
/** TSG identifier. Ranges from 0 to #nvgpu_fifo.num_channels - 1. */
u32 tsgid;
/**
* There is a maximum number of runlists defined by the h/w. Usually it
* is one runlist per engine (graphics and grcopy share a runlist).
* The runlist_id specifies the h/w runlist to which a runlist in
* memory is being submitted. Each runlist serves a specific set of
* engines. Refer top.h.
*/
u32 runlist_id;
/** tgid (OS specific) of the process that opened the TSG. */
pid_t tgid;
/**
* Number of active TPCs as requested by userspace.
* This is used while requesting for dynamic TPC PG (power gating).
* TPC PG is specific to chip.
*/
u32 num_active_tpcs;
/** Set to non-zero if dynamic TPC PG is requested to be enabled. */
u8 tpc_pg_enabled;
/**
* Set to true if dynamic TPC PG is enabled and #num_active_tpcs is
* non-zero.
*/
bool tpc_num_initialized;
/**
* Set to true if the tsgid is acquired, else false.
* Protected by #nvgpu_fifo.tsg_inuse_mutex, which must be held when
* checking or updating whether the tsgid is in use.
*/
bool in_use;
/** A non-abortable TSG is used for vidmem clears. */
bool abortable;
-	/* MMU debug mode enabled if mmu_debug_mode_refcnt > 0 */
+	/** MMU debug mode enabled if mmu_debug_mode_refcnt > 0 */
u32 mmu_debug_mode_refcnt;
/**
* Pointer to store SM errors read from h/w registers.
* Check #nvgpu_tsg_sm_error_state.
*/
struct nvgpu_tsg_sm_error_state *sm_error_states;
#ifdef CONFIG_NVGPU_DEBUGGER
@@ -97,43 +196,290 @@ struct nvgpu_tsg {
};
int nvgpu_tsg_open_common(struct gk20a *g, struct nvgpu_tsg *tsg, pid_t pid);
/**
* @brief Open and initialize an unused TSG.
*
* @param g[in] The GPU driver struct.
* @param pid[in] The PID of the process.
*
* - Acquire unused TSG.
* - Set s/w context of the acquired TSG.
*
* @return Pointer to the TSG struct. See #nvgpu_tsg.
* @retval NULL if there is no unused TSG.
* @retval NULL if setting up the s/w context for the opened TSG failed;
*         in this case the acquired TSG is released back to the pool of
*         unused TSGs.
*/
struct nvgpu_tsg *nvgpu_tsg_open(struct gk20a *g, pid_t pid);
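
A minimal usage sketch, assuming (as the driver code earlier in this change
does) that the caller drops the open reference with nvgpu_ref_put and
nvgpu_tsg_release once it is done with the TSG:

static int example_open_tsg(struct gk20a *g, pid_t pid)
{
	struct nvgpu_tsg *tsg = nvgpu_tsg_open(g, pid);

	if (tsg == NULL) {
		return -ENOMEM; /* no unused TSG, or s/w context init failed */
	}

	/* ... bind channels and use the TSG here ... */

	/* Drop the reference; the TSG is recycled once fully unused. */
	nvgpu_ref_put(&tsg->refcount, nvgpu_tsg_release);
	return 0;
}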
/**
* @brief Clean up resources used by the TSG. This is needed when releasing a TSG.
*
* @param g[in] The GPU driver struct.
* @param tsg[in] Pointer to TSG struct.
*
* - Call the HAL to release the TSG if it is set; it is non-NULL for vgpu only.
* - Call nvgpu_free_gr_ctx_struct to free #nvgpu_tsg.gr_ctx.
* - Set #nvgpu_tsg.gr_ctx to NULL.
* - If #nvgpu_tsg.vm is non-NULL, do #nvgpu_vm_put for this vm and set
* it to NULL (Unhook TSG from VM).
* - If #nvgpu_tsg.sm_error_states is non-NULL, free allocated memory and set
* it to NULL.
*/
void nvgpu_tsg_release_common(struct gk20a *g, struct nvgpu_tsg *tsg);
/**
* @brief Release TSG to the pool of free TSGs.
*
* @param ref[in] Pointer to #nvgpu_tsg.refcount.
*
* - Get pointer to the #nvgpu_tsg using #ref.
* - Call HAL to free #nvgpu_tsg.gr_ctx if this memory pointer is non-NULL
* and valid and also #nvgpu_tsg.vm is non-NULL.
* - Unhook all events created on the TSG being released.
* -- Acquire #nvgpu_tsg.event_id_list_lock.
* -- While #nvgpu_tsg.event_id_list is non-empty,
* --- Delete #nvgpu_tsg.event_id_list.next.
* -- Release #nvgpu_tsg.event_id_list_lock.
* - Call #nvgpu_tsg_release_common.
* - Set #nvgpu_tsg.in_use to false so that tsg can be made available
* to the pool of unused tsgs.
*/
void nvgpu_tsg_release(struct nvgpu_ref *ref);
/**
* @brief Initialize s/w context for TSGs.
*
* @param g[in] The GPU driver struct.
*
* Initialize s/w context for TSGs:
* - Allocate a zero-initialized kernel memory area for
*   #nvgpu_fifo.num_channels #nvgpu_fifo.tsg structs. This area is
*   indexed by tsgid from 0 to #nvgpu_fifo.num_channels - 1.
* - Upon successful allocation of memory, initialize memory area assigned to
* each TSG with s/w defaults.
*
* @return 0 for successful init.
* @retval -ENOMEM if kernel memory could not be allocated to support TSG
* s/w context.
*/
int nvgpu_tsg_setup_sw(struct gk20a *g);
/**
* @brief De-initialize s/w context for TSGs.
*
* @param g[in] The GPU driver struct.
*
* De-initialize s/w context for TSGs:
* - Destroy the s/w context for every tsgid from 0 to
*   #nvgpu_fifo.num_channels - 1.
* - De-allocate kernel memory area allocated to support s/w context of
* #nvgpu_fifo.num_channels number of TSGs.
*/
void nvgpu_tsg_cleanup_sw(struct gk20a *g);
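
A sketch of the expected pairing, assuming a probe/remove style flow where
cleanup runs in reverse order of setup (the wrapper name is hypothetical):

static int example_fifo_sw_init(struct gk20a *g)
{
	int err = nvgpu_tsg_setup_sw(g);

	if (err != 0) {
		return err; /* -ENOMEM: TSG s/w context allocation failed */
	}

	/* ... set up other fifo s/w state; on failure unwind with ... */
	/* nvgpu_tsg_cleanup_sw(g); */
	return 0;
}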
/**
* @brief Get pointer to #nvgpu_tsg for the tsgid of the given Channel.
*
* @param ch[in] Pointer to Channel struct.
*
* Validate tsgid of the given channel. If tsgid is not equal to
* #NVGPU_INVALID_TSG_ID, get pointer to area of memory, reserved for s/w
* context of TSG and indexed by tsgid.
*
* @return Pointer to #nvgpu_tsg struct.
* @retval NULL if tsgid of the given channel is #NVGPU_INVALID_TSG_ID.
* @note This does not check if tsgid is < num_channels.
*/
struct nvgpu_tsg *nvgpu_tsg_from_ch(struct nvgpu_channel *ch);
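
A sketch of the usual lookup pattern, assuming the NULL return is the only
signal that the channel is not bound to any TSG (the caller name is
hypothetical; nvgpu_tsg_set_ctx_mmu_error is declared later in this header):

static void example_report_mmu_fault(struct gk20a *g,
		struct nvgpu_channel *ch)
{
	struct nvgpu_tsg *tsg = nvgpu_tsg_from_ch(ch);

	if (tsg == NULL) {
		return; /* channel is not bound to any TSG */
	}

	/* Fault handling applies TSG-wide, e.g. the mmu error notifier. */
	nvgpu_tsg_set_ctx_mmu_error(g, tsg);
}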
/**
* @brief Disable all the channels bound to a TSG.
*
* @param tsg[in] Pointer to TSG struct.
*
* Disable all the channels bound to a TSG so that h/w scheduler does not
* schedule these channels.
*/
void nvgpu_tsg_disable(struct nvgpu_tsg *tsg);
/**
* @brief Bind a channel to the TSG.
*
* @param tsg[in] Pointer to TSG struct.
* @param ch[in] Pointer to Channel struct.
*
* - Make sure channel is not already bound to a TSG.
* - Make sure channel is not part of any runlists.
* - Set runlist id of TSG to channel's runlist_id if runlist_id of TSG
* is set to #NVGPU_INVALID_TSG_ID.
* - Call HAL to bind channel to TSG.
* - Add channel to TSG's list of channels. See #nvgpu_tsg.ch_list
* - Set #nvgpu_channel.tsgid to #nvgpu_tsg.tsgid.
* - Set #nvgpu_channel.unserviceable to false to mark that channel is
* serviceable.
* - Bind Engine Method Buffers (This may not be required for all the chips).
* - Get #nvgpu_tsg.refcount to prevent TSG from being freed till channel/s are
* bound to this TSG.
*
* @return 0 for successful bind
* @retval -EINVAL if channel is already bound to a TSG.
* @retval -EINVAL if channel is already active. This is done by checking if
* bit corresponding to chid is set in the
* #nvgpu_runlist_info.active_channels of any of the supported
* #nvgpu_fifo.num_runlists.
* @retval -EINVAL if runlist_id of the channel and tsg do not match.
*/
int nvgpu_tsg_bind_channel(struct nvgpu_tsg *tsg,
struct nvgpu_channel *ch);
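
A hypothetical caller handling the documented -EINVAL cases:

static int example_bind_channel(struct nvgpu_tsg *tsg,
		struct nvgpu_channel *ch)
{
	int err = nvgpu_tsg_bind_channel(tsg, ch);

	if (err != 0) {
		/*
		 * -EINVAL: channel already bound, already active on a
		 * runlist, or its runlist_id does not match the TSG's.
		 */
		return err;
	}
	return 0;
}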
/**
* @brief Get pointer to #nvgpu_tsg for the tsgid.
*
* @param g[in] The GPU driver struct.
* @param tsgid[in] Id of the TSG.
*
* Get pointer to area of memory, reserved for s/w context of TSG
* and indexed by tsgid.
*
* @return Pointer to #nvgpu_tsg struct.
*/
struct nvgpu_tsg *nvgpu_tsg_get_from_id(struct gk20a *g, u32 tsgid);
/**
* @brief Validate tsgid and get pointer to #nvgpu_tsg for this tsgid.
*
* @param g[in] The GPU driver struct.
* @param tsgid[in] Id of the TSG.
*
* If tsgid is not equal to #NVGPU_INVALID_TSG_ID, get pointer to area of
* memory, reserved for s/w context of TSG and indexed by tsgid.
*
* @return Pointer to #nvgpu_tsg struct.
* @retval NULL if tsgid is #NVGPU_INVALID_TSG_ID.
*/
struct nvgpu_tsg *nvgpu_tsg_check_and_get_from_id(struct gk20a *g, u32 tsgid);
/**
* @brief Unbind a channel from the TSG it is bound to.
*
* @param tsg[in] Pointer to TSG struct.
* @param ch[in] Pointer to Channel struct.
*
* Unbind channel from TSG:
* - Check if channel being unbound has become unserviceable.
* - Disable TSG.
* - Preempt TSG.
* - Check hw state of the channel.
* - Remove channel from its runlist.
* - Remove channel from TSG's channel list.
* - Set tsgid of the channel to #NVGPU_INVALID_TSG_ID.
* - Disable channel so that it is not picked up by h/w scheduler.
* - Enable TSG if it is still serviceable. TSG becomes unserviceable
* if channel being unbound has become unserviceable.
* - Do clean up for aborting channel.
* If an error occurred during previous steps:
* - Call #nvgpu_tsg_abort to abort the tsg.
* - Call #nvgpu_channel_update_runlist to remove the channel from the runlist.
* - Acquire #nvgpu_tsg.ch_list_lock of the tsg and delete channel from
* #nvgpu_tsg.ch_list.
* - Set #nvgpu_channel.tsgid to #NVGPU_INVALID_TSG_ID
* - Release #nvgpu_tsg.ch_list_lock of the tsg.
Call the HAL to unbind the channel from the TSG if it is set. This HAL
is vgpu specific and does not apply to non-vgpu.
* Release #nvgpu_tsg.refcount and call #nvgpu_tsg_release if refcount
* becomes 0.
*
* @return 0
* @note The caller must make sure that the channel requested to be
* unbound is actually bound to the TSG.
*/
int nvgpu_tsg_unbind_channel(struct nvgpu_tsg *tsg, struct nvgpu_channel *ch);
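
A sketch honouring the precondition in the note above; the tsgid check is the
caller's responsibility and is not part of nvgpu_tsg_unbind_channel itself:

static int example_unbind_channel(struct nvgpu_tsg *tsg,
		struct nvgpu_channel *ch)
{
	if (ch->tsgid != tsg->tsgid) {
		return -EINVAL; /* precondition: ch must belong to tsg */
	}
	return nvgpu_tsg_unbind_channel(tsg, ch);
}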
-int nvgpu_tsg_unbind_channel_common(struct nvgpu_tsg *tsg,
-		struct nvgpu_channel *ch);
/**
* @brief Check h/w channel status before unbinding Channel.
*
* @param tsg[in] Pointer to TSG struct.
* @param ch[in] Pointer to Channel struct.
*
* - Call HAL to read chip specific h/w channel status register into hw_state
* local variable.
* - If next bit is not set in hw_state,
* -- Call HAL (if supported for the chip) to check ctx_reload bit in hw_state.
--- If set, move the ctx_reload h/w state to another channel's h/w
    status register. The new channel must be different from the channel
    requested to be unbound.
* -- Call HAL (if supported for the chip) to check eng_faulted bit in hw_state.
* --- If set, clear the CE method buffer in #ASYNC_CE_RUNQUE index of
* #nvgpu_tsg.eng_method_buffers of the tsg that the channel being unbound
* is bound to.
*
* @return 0 in case of success and < 0 in case of failure.
* @retval -EINVAL if next bit is set in hw_state.
*/
int nvgpu_tsg_unbind_channel_check_hw_state(struct nvgpu_tsg *tsg,
struct nvgpu_channel *ch);
/**
* @brief Find another channel in the TSG and force ctx reload if
* h/w channel status of the channel is set to ctx_reload.
*
* @param tsg[in] Pointer to TSG struct.
* @param ch[in] Pointer to Channel struct.
* @param hw_state[in] Pointer to nvgpu_channel_hw_state struct.
*
* Find another channel in the TSG and force ctx reload.
*
* @note If there is only one channel in this TSG, the function will not find
* another channel to force ctx reload.
*/
void nvgpu_tsg_unbind_channel_check_ctx_reload(struct nvgpu_tsg *tsg,
struct nvgpu_channel *ch,
struct nvgpu_channel_hw_state *hw_state);
#ifdef CONFIG_NVGPU_CHANNEL_TSG_CONTROL
int nvgpu_tsg_force_reset_ch(struct nvgpu_channel *ch,
u32 err_code, bool verbose);
-#endif
void nvgpu_tsg_post_event_id(struct nvgpu_tsg *tsg,
enum nvgpu_event_id_type event_id);
+#endif
/**
* @brief Set mmu fault error notifier for all the channels bound to a TSG.
*
* @param g[in] The GPU driver struct.
* @param tsg[in] Pointer to TSG struct.
*
* Set mmu fault error notifier for all the channels bound to the TSG.
*/
void nvgpu_tsg_set_ctx_mmu_error(struct gk20a *g,
struct nvgpu_tsg *tsg);
/**
* @brief Mark error for all the referenceable channels of tsg's channel list.
*
* @param g[in] The GPU driver struct.
* @param tsg[in] Pointer to TSG struct.
*
* - Set verbose local variable to false.
* - Acquire #nvgpu_tsg.ch_list_lock of the tsg.
* - For each entry of the channels in #nvgpu_tsg.ch_list of the tsg,
* -- Get reference to the channel.
* -- If channel is referenceable,
* --- Call #nvgpu_channel_mark_error and set verbose local variable to true
* if return value of this function is true.
* --- Put reference to the channel.
* - Release #nvgpu_tsg.ch_list_lock of the tsg.
*
* @return verbose bool flag, used to decide whether the driver needs to
*         dump debug info.
*/
bool nvgpu_tsg_mark_error(struct gk20a *g, struct nvgpu_tsg *tsg);
#ifdef CONFIG_NVGPU_KERNEL_MODE_SUBMIT
bool nvgpu_tsg_check_ctxsw_timeout(struct nvgpu_tsg *tsg,
bool *debug_dump, u32 *ms);
#endif
-int nvgpu_tsg_set_runlist_interleave(struct nvgpu_tsg *tsg, u32 level);
#ifdef CONFIG_NVGPU_CHANNEL_TSG_SCHEDULING
int nvgpu_tsg_set_timeslice(struct nvgpu_tsg *tsg, u32 timeslice_us);
u32 nvgpu_tsg_get_timeslice(struct nvgpu_tsg *tsg);
@@ -141,9 +487,56 @@ int nvgpu_tsg_set_priority(struct gk20a *g, struct nvgpu_tsg *tsg,
u32 priority);
int nvgpu_tsg_set_interleave(struct nvgpu_tsg *tsg, u32 level);
#endif
/**
* @brief Get the default TSG timeslice in us as defined by the nvgpu driver.
*
* @param g[in] The GPU driver struct.
*
* Get TSG timeslice value in microseconds. This is the default timeslice
* value in us as defined by s/w.
*
* @return S/w defined default TSG timeslice value in us.
*/
u32 nvgpu_tsg_default_timeslice_us(struct gk20a *g);
/**
* @brief Enable h/w runlist scheduler corresponding to the runlist_id
* of the TSG.
*
* @param g[in] The GPU driver struct.
* @param tsg[in] Pointer to the TSG struct.
*
* Enable h/w runlist scheduler for #nvgpu_tsg.runlist_id.
*/
void nvgpu_tsg_enable_sched(struct gk20a *g, struct nvgpu_tsg *tsg);
/**
* @brief Disable h/w runlist scheduler corresponding to the runlist_id
* of the TSG.
*
* @param g[in] The GPU driver struct.
* @param tsg[in] Pointer to the TSG struct.
*
* Disable h/w runlist scheduler for #nvgpu_tsg.runlist_id.
*/
void nvgpu_tsg_disable_sched(struct gk20a *g, struct nvgpu_tsg *tsg);
/**
* @brief Allocate zero initialized memory to store SM errors.
*
* @param g[in] The GPU driver struct.
* @param tsg[in] Pointer to the TSG struct.
* @param num_sm[in] Total number of SMs supported by h/w.
*
* Allocate zero initialized memory to store SM errors for all the SMs
* supported by h/w.
*
* @return 0 in case of success, < 0 in case of failure.
* @retval -EINVAL if memory is already allocated to store
* SM error states.
* @retval -ENOMEM if memory could not be allocated to store
* SM error states.
*/
int nvgpu_tsg_alloc_sm_error_states_mem(struct gk20a *g,
struct nvgpu_tsg *tsg,
u32 num_sm);
@@ -152,6 +545,7 @@ int nvgpu_tsg_set_sm_exception_type_mask(struct nvgpu_channel *ch,
u32 exception_mask);
#endif
#ifdef CONFIG_NVGPU_CHANNEL_TSG_CONTROL
struct gk20a_event_id_data {
struct gk20a *g;
@@ -172,12 +566,59 @@ gk20a_event_id_data_from_event_id_node(struct nvgpu_list_node *node)
return (struct gk20a_event_id_data *)
((uintptr_t)node - offsetof(struct gk20a_event_id_data, event_id_node));
};
#endif
/**
* @brief Set error notifier for all the channels bound to a TSG.
*
* @param g[in] The GPU driver struct.
* @param tsg[in] Pointer to TSG struct.
* @param error_notifier[in] Error notifier defined by s/w.
*
* Set error notifier for all the channels bound to the tsg.
* See include/nvgpu/error_notifier.h.
*/
void nvgpu_tsg_set_error_notifier(struct gk20a *g, struct nvgpu_tsg *tsg,
u32 error_notifier);
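
A sketch combining the notifier with nvgpu_tsg_mark_error during recovery.
NVGPU_ERR_NOTIFIER_FIFO_ERROR_MMU_ERR_FLT is assumed to come from
include/nvgpu/error_notifier.h; verify the exact define in your tree:

static void example_notify_mmu_fault(struct gk20a *g, struct nvgpu_tsg *tsg)
{
	/* Assumed define; see include/nvgpu/error_notifier.h. */
	nvgpu_tsg_set_error_notifier(g, tsg,
			NVGPU_ERR_NOTIFIER_FIFO_ERROR_MMU_ERR_FLT);

	/* mark_error reports whether a debug dump is warranted. */
	if (nvgpu_tsg_mark_error(g, tsg)) {
		nvgpu_err(g, "tsg %u marked in error", tsg->tsgid);
	}
}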
#ifdef CONFIG_NVGPU_KERNEL_MODE_SUBMIT
bool nvgpu_tsg_ctxsw_timeout_debug_dump_state(struct nvgpu_tsg *tsg);
void nvgpu_tsg_set_ctxsw_timeout_accumulated_ms(struct nvgpu_tsg *tsg, u32 ms);
#endif
/**
* @brief Abort all the channels bound to the TSG.
*
* @param g[in] The GPU driver struct.
* @param tsg[in] Pointer to TSG struct.
* @param preempt[in] Flag to ask for preempting TSG.
*
* - Disable all the channels bound to the #tsg so that h/w does not schedule
* them.
* - Preempt #tsg if #preempt flag is set. This is to offload all the channels
* bound to the #tsg from the pbdma/engines.
* - Set #nvgpu_channel.unserviceable of all the channels bound to the #tsg
* to let s/w know of bad state of channels.
* - Do s/w formalities so that channels bound to the #tsg are ready to be
* closed by userspace.
*/
void nvgpu_tsg_abort(struct gk20a *g, struct nvgpu_tsg *tsg, bool preempt);
/**
* @brief Clear h/w bits PBDMA_FAULTED and ENG_FAULTED in CCSR channel h/w
* register for all the channels bound to the TSG.
*
* @param g[in] The GPU driver struct.
* @param tsg[in] Pointer to TSG struct.
* @param eng[in] Flag to ask for clearing ENG_FAULTED h/w bit.
* @param pbdma[in] Flag to ask for clearing PBDMA_FAULTED h/w bit.
*
* If the chip supports the PBDMA_FAULTED and ENG_FAULTED h/w bits and
* #tsg is non-NULL, clear the PBDMA_FAULTED bit in the CCSR channel h/w
* register if #pbdma is set, and clear the ENG_FAULTED bit if #eng is
* set. For chips that do not support these h/w bits, or if #tsg is NULL,
* just return.
*/
void nvgpu_tsg_reset_faulted_eng_pbdma(struct gk20a *g, struct nvgpu_tsg *tsg,
bool eng, bool pbdma);
#ifdef CONFIG_NVGPU_DEBUGGER


@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -23,7 +23,9 @@
#include <nvgpu/tsg.h>
#include <nvgpu/gk20a.h>
+#ifdef CONFIG_NVGPU_CHANNEL_TSG_CONTROL
void nvgpu_tsg_post_event_id(struct nvgpu_tsg *tsg,
enum nvgpu_event_id_type event_id)
{
}
+#endif