Mirror of git://nv-tegra.nvidia.com/linux-nv-oot.git (synced 2025-12-22 09:11:26 +03:00)
nvscic2c: Implement Read-After-Write ordering WAR
- Ordering between message/data and host1x syncpoints is not
  enforced strictly.
- Out of the possible WARs, implement dummy PCIe Read between
  data/message write and notifications towards peer / remote
  post-fences.
- WAR (IPC/messaging mode): For any UMD produced data towards peer,
  before notification is triggered, issue a dummy PCIe read via CPU.
- WAR (streaming mode): DMA flush-ranges (data), wait for the DMA
  interrupt; on success, issue dummy PCIe reads via CPU on remote
  post-fences and issue CPU PCIe writes on each remote post-fence.
  To achieve this, CPU-map every imported sync object.

NVIPC-974

Change-Id: Id6711d372c0a35e13e399ffbbcd8efcabf147c56
Signed-off-by: Arihant Jejani <ajejani@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nv-oot/+/2912894
Reviewed-by: svcacv <svcacv@nvidia.com>
Reviewed-by: Janardhan Reddy A <jreddya@nvidia.com>
Reviewed-by: Vipin Kumar <vipink@nvidia.com>
GVS: Gerrit_Virtual_Submit <buildbot_gerritrpt@nvidia.com>
Committed by: mobile promotions
Parent: 1285186621
Commit: 0e6d4c804f
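The essence of the WAR, distilled from the hunks below: PCIe memory writes are posted, so a data write and the subsequent syncpoint notification may be observed out of order by the peer; a read on the same path is non-posted and cannot complete until prior posted writes have reached the peer, so reading before notifying restores the required order. A minimal sketch of the pattern, with `peer_syncpt` as an assumed stand-in for the driver's `syncpt->peer_mem.pva` mapping:

#include <linux/io.h>

/*
 * Minimal, illustrative sketch of the Read-After-Write ordering WAR.
 * The dummy read is non-posted: its completion pushes all previously
 * posted writes to the peer before the notification write is issued.
 */
static void notify_peer_ordered(void __iomem *peer_syncpt)
{
	/* Non-posted read flushes posted data writes towards the peer. */
	(void)readl(peer_syncpt);

	/* Any 4-byte write increments the remote syncpoint shim by 1. */
	writel(0x1, peer_syncpt);
}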
@@ -490,10 +490,21 @@ ioctl_notify_remote_impl(struct endpoint_t *endpoint)
 		ret = pci_client_raise_irq(endpoint->pci_client_h, PCI_EPC_IRQ_MSI,
 					   endpoint->msi_irq);
 	} else {
 		/*
-		 * increment peer's syncpoint. Write of any 4-byte value
-		 * increments remote's syncpoint shim by 1.
-		 */
+		 * Ordering between message/data and host1x syncpoints is not
+		 * enforced strictly. Out of the possible WARs, implement dummy
+		 * PCIe Read before any syncpoint notifications towards peer.
+		 *
+		 * For any writes from UMD which require notification, issuing a
+		 * dummy PCIe read here shall suffice for all cases where UMD writes
+		 * data and requires notification via syncpoint.
+		 */
+		(void)readl(syncpt->peer_mem.pva);
+
+		/*
+		 * increment peer's syncpoint. Write of any 4-byte value
+		 * increments remote's syncpoint shim by 1.
+		 */
 		writel(0x1, syncpt->peer_mem.pva);
 	}
 
@@ -84,13 +84,12 @@ struct copy_request {
 
 	/*
 	 * actual number of edma-desc per the submit-copy request.
-	 * Shall include (num_flush_range + num_remote_post_fences (eDMAed))
+	 * Shall include (num_flush_range).
 	 */
 	u64 num_edma_desc;
 	/*
 	 * space for num_edma_desc considering worst-case allocation:
-	 * (max_flush_ranges + max_post_fences), assuming submit-copy could have
-	 * all the post-fences for remote signalling by eDMA.
+	 * (max_flush_ranges).
 	 */
 	struct tegra_pcie_edma_desc *edma_desc;
 
@@ -226,7 +225,7 @@ signal_remote_post_fences(struct copy_request *cr);
 
 static int
 prepare_edma_desc(enum drv_mode_t drv_mode, struct copy_req_params *params,
-		  struct tegra_pcie_edma_desc *desc, u64 *num_desc, enum peer_cpu_t);
+		  struct tegra_pcie_edma_desc *desc, u64 *num_desc);
 
 static edma_xfer_status_t
 schedule_edma_xfer(void *edma_h, void *priv, u64 num_desc,
@@ -501,8 +500,20 @@ ioctl_import_obj(struct stream_ext_ctx_t *ctx,
 	}
 
 	peer_cpu = pci_client_get_peer_cpu(ctx->pci_client_h);
-	if (peer_cpu == NVCPU_X86_64)
-		stream_obj->import_obj_map = ioremap(stream_obj->aper, PAGE_SIZE);
+	if (peer_cpu == NVCPU_X86_64) {
+		stream_obj->import_obj_map = ioremap(stream_obj->aper,
+						     PAGE_SIZE);
+	} else {
+		if (stream_obj->import_type == STREAM_OBJ_TYPE_SYNC) {
+			stream_obj->import_obj_map = ioremap(stream_obj->aper,
+							     PAGE_SIZE);
+			if (WARN_ON(!stream_obj->import_obj_map)) {
+				fput(filep);
+				return -ENOMEM;
+			}
+		}
+	}
 
 	fput(filep);
 
 	args->out.handle = handle;
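With this change the import path CPU-maps imported sync objects for Tegra peers as well (previously only x86 peers were mapped), because the completion path now issues CPU reads and writes on each remote post-fence. A hedged sketch of the mapping half of the lifecycle, using a hypothetical `sync_obj` type mirroring the fields used in the hunk; the unmap half is assumed, since the corresponding release hunk is not part of this diff:

#include <linux/io.h>
#include <linux/mm.h>

/* Hypothetical container mirroring the fields used in this diff. */
struct sync_obj {
	phys_addr_t aper;              /* PCIe aperture of the object. */
	void __iomem *import_obj_map;  /* CPU mapping, set on import. */
};

/* Map on import so the eDMA callback can touch it from the CPU. */
static int sync_obj_map(struct sync_obj *obj)
{
	obj->import_obj_map = ioremap(obj->aper, PAGE_SIZE);
	if (WARN_ON(!obj->import_obj_map))
		return -ENOMEM;
	return 0;
}

/* Counterpart unmap; assumed to run when the object is released. */
static void sync_obj_unmap(struct sync_obj *obj)
{
	if (obj->import_obj_map) {
		iounmap(obj->import_obj_map);
		obj->import_obj_map = NULL;
	}
}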
@@ -582,9 +593,9 @@ ioctl_submit_copy_request(struct stream_ext_ctx_t *ctx,
 		goto reclaim_cr;
 
 	cr->peer_cpu = pci_client_get_peer_cpu(ctx->pci_client_h);
-	/* generate eDMA descriptors from flush_ranges, remote_post_fences.*/
+	/* generate eDMA descriptors from flush_ranges.*/
 	ret = prepare_edma_desc(ctx->drv_mode, &ctx->cr_params, cr->edma_desc,
-				&cr->num_edma_desc, cr->peer_cpu);
+				&cr->num_edma_desc);
 	if (ret) {
 		release_copy_request_handles(cr);
 		goto reclaim_cr;
@@ -944,13 +955,9 @@ callback_edma_xfer(void *priv, edma_xfer_status_t status,
 	struct copy_request *cr = (struct copy_request *)priv;
 
 	mutex_lock(&cr->ctx->free_lock);
-	/* increment num_local_fences.*/
+	/* increment post fences: local and remote.*/
 	if (status == EDMA_XFER_SUCCESS) {
-		/* X86 remote end fences are signaled through CPU */
-		if (cr->peer_cpu == NVCPU_X86_64)
-			signal_remote_post_fences(cr);
-
-		/* Signal local fences for Tegra*/
+		signal_remote_post_fences(cr);
 		signal_local_post_fences(cr);
 	}
 
@@ -967,16 +974,14 @@ callback_edma_xfer(void *priv, edma_xfer_status_t status,
 
 static int
 prepare_edma_desc(enum drv_mode_t drv_mode, struct copy_req_params *params,
-		  struct tegra_pcie_edma_desc *desc, u64 *num_desc, enum peer_cpu_t peer_cpu)
+		  struct tegra_pcie_edma_desc *desc, u64 *num_desc)
 {
 	u32 i = 0;
 	int ret = 0;
 	u32 iter = 0;
-	s32 handle = -1;
 	struct file *filep = NULL;
 	struct stream_ext_obj *stream_obj = NULL;
 	struct nvscic2c_pcie_flush_range *flush_range = NULL;
-	phys_addr_t dummy_addr = 0x0;
 
 	*num_desc = 0;
 	for (i = 0; i < params->num_flush_ranges; i++) {
@@ -985,7 +990,6 @@ prepare_edma_desc(enum drv_mode_t drv_mode, struct copy_req_params *params,
 		filep = fget(flush_range->src_handle);
 		stream_obj = filep->private_data;
 		desc[iter].src = (stream_obj->vmap.iova + flush_range->offset);
-		dummy_addr = stream_obj->vmap.iova;
 		fput(filep);
 
 		filep = fget(flush_range->dst_handle);
@@ -1000,27 +1004,6 @@ prepare_edma_desc(enum drv_mode_t drv_mode, struct copy_req_params *params,
 		desc[iter].sz = flush_range->size;
 		iter++;
 	}
-	/* With Orin as remote end, the remote fence signaling is done using DMA
-	 * With X86 as remote end, the remote fence signaling is done using CPU
-	 */
-	if (peer_cpu == NVCPU_ORIN) {
-		for (i = 0; i < params->num_remote_post_fences; i++) {
-			handle = params->remote_post_fences[i];
-			desc[iter].src = dummy_addr;
-
-			filep = fget(handle);
-			stream_obj = filep->private_data;
-			if (drv_mode == DRV_MODE_EPC)
-				desc[iter].dst = stream_obj->aper;
-			else
-				desc[iter].dst = stream_obj->vmap.iova;
-
-			fput(filep);
-
-			desc[iter].sz = 4;
-			iter++;
-		}
-	}
 	*num_desc += iter;
 	return ret;
 }
@@ -1052,19 +1035,34 @@ signal_remote_post_fences(struct copy_request *cr)
 {
 	u32 i = 0;
 	struct stream_ext_obj *stream_obj = NULL;
-	/* Dummy read operation is done on the imported buffer object to ensure
-	 * coherence of data on Vidmem of GA100 dGPU, which is connected as an EP to X86.
-	 * This is needed as Ampere architecture doesn't support coherence of Write after
-	 * Write operation and the dummy read of 4 bytes ensures the data is reconciled in
-	 * vid-memory when the consumer waiting on a sysmem semaphore is unblocked.
-	 */
-	for (i = 0; i < cr->num_remote_buf_objs; i++) {
-		stream_obj = cr->remote_buf_objs[i];
-		(void)readl(stream_obj->import_obj_map);
-	}
-	for (i = 0; i < cr->num_remote_post_fences; i++) {
-		stream_obj = cr->remote_post_fences[i];
-		writeq(cr->remote_post_fence_values[i], stream_obj->import_obj_map);
-	}
+	/* X86 remote end fences are signaled through CPU */
+	if (cr->peer_cpu == NVCPU_X86_64) {
+		/* Dummy read operation is done on the imported buffer object
+		 * to ensure coherence of data on Vidmem of GA100 dGPU, which is
+		 * connected as an EP to X86. This is needed as Ampere architecture
+		 * doesn't support coherence of Write after Write operation and the
+		 * dummy read of 4 bytes ensures the data is reconciled in vid-memory
+		 * when the consumer waiting on a sysmem semaphore is unblocked.
+		 */
+		for (i = 0; i < cr->num_remote_buf_objs; i++) {
+			stream_obj = cr->remote_buf_objs[i];
+			(void)readl(stream_obj->import_obj_map);
+		}
+		for (i = 0; i < cr->num_remote_post_fences; i++) {
+			stream_obj = cr->remote_post_fences[i];
+			writeq(cr->remote_post_fence_values[i], stream_obj->import_obj_map);
+		}
+	} else {
+		for (i = 0; i < cr->num_remote_post_fences; i++) {
+			stream_obj = cr->remote_post_fences[i];
+			/*
+			 * Issue dummy pcie read to ensure all data is visible
+			 * to remote SoC before notification is delivered.
+			 */
+			(void)readl(stream_obj->import_obj_map);
+			writel(0x1, stream_obj->import_obj_map);
+		}
+	}
 }
 
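Taken together, the streaming-mode sequence after this change is: eDMA copies only the flush ranges; when the transfer completes successfully, the callback signals remote fences from the CPU (dummy read, then write) before signalling local fences. A condensed, illustrative sketch of the Tegra-peer path, with a hypothetical helper name (the real logic is split across callback_edma_xfer and signal_remote_post_fences above):

#include <linux/io.h>

/*
 * Condensed flow of the streaming-mode WAR for a Tegra peer
 * (illustrative; field names assumed from the hunks above).
 */
static void on_edma_success(struct copy_request *cr)
{
	u32 i;

	for (i = 0; i < cr->num_remote_post_fences; i++) {
		void __iomem *fence =
			cr->remote_post_fences[i]->import_obj_map;

		/* Non-posted read: pushes the eDMA'd data to the peer. */
		(void)readl(fence);
		/* A 4-byte write then signals the remote post-fence. */
		writel(0x1, fence);
	}
}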
@@ -1388,13 +1386,9 @@ allocate_copy_request(struct stream_ext_ctx_t *ctx,
 		goto err;
 	}
 
-	/*
-	 * edma_desc shall include flush_range + worst-case all post-fences
-	 * (all max_post_fences could be remote_post_fence which need be eDMAd).
-	 */
+	/* edma_desc shall include flush_range.*/
 	cr->edma_desc = kzalloc((sizeof(*cr->edma_desc) *
-				 (ctx->cr_limits.max_flush_ranges +
-				  ctx->cr_limits.max_post_fences)),
+				 ctx->cr_limits.max_flush_ranges),
 				GFP_KERNEL);
 	if (WARN_ON(!cr->edma_desc)) {
 		ret = -ENOMEM;