nvscic2c: Implement Read-After-Write ordering WAR

- Ordering between message/data writes and host1x syncpoint updates is
  not strictly enforced.
- Out of the possible WARs, implement a dummy PCIe read between the
  data/message write and the notification towards the peer / remote
  post-fences (see the sketch after this list).
- WAR (IPC/messaging mode): for any UMD-produced data towards the peer,
  issue a dummy PCIe read via CPU before the notification is triggered.
- WAR (streaming mode): eDMA the flush-ranges (data), wait for the eDMA
  completion interrupt and, on success, issue a dummy PCIe read via CPU
  on each remote post-fence followed by a CPU PCIe write on that
  post-fence. To achieve this, CPU-map every imported sync object.
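
For reference, the WAR boils down to the ordering pattern sketched below.
This is an illustrative fragment only; the helper name and the data/
notification addresses are hypothetical and not part of this change:

	/*
	 * Minimal sketch of the WAR, assuming 'data_dst' and 'syncpt_shim'
	 * are ioremap()ed PCIe apertures of the peer. MMIO writes over PCIe
	 * are posted, so the data write and the later notification write may
	 * become visible to the peer out of order; a non-posted read in
	 * between forces the earlier posted writes to complete first.
	 */
	static void notify_peer_sketch(void __iomem *data_dst,
				       void __iomem *syncpt_shim, u32 val)
	{
		writel(val, data_dst);      /* data/message write (posted) */
		(void)readl(data_dst);      /* dummy PCIe read: drains posted writes */
		writel(0x1, syncpt_shim);   /* notification: syncpoint shim increment */
	}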

NVIPC-974

Change-Id: Id6711d372c0a35e13e399ffbbcd8efcabf147c56
Signed-off-by: Arihant Jejani <ajejani@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nv-oot/+/2912894
Reviewed-by: svcacv <svcacv@nvidia.com>
Reviewed-by: Janardhan Reddy A <jreddya@nvidia.com>
Reviewed-by: Vipin Kumar <vipink@nvidia.com>
GVS: Gerrit_Virtual_Submit <buildbot_gerritrpt@nvidia.com>

@@ -490,6 +490,17 @@ ioctl_notify_remote_impl(struct endpoint_t *endpoint)
 		ret = pci_client_raise_irq(endpoint->pci_client_h, PCI_EPC_IRQ_MSI,
 					   endpoint->msi_irq);
 	} else {
+		/*
+		 * Ordering between message/data and host1x syncpoints is not
+		 * enforced strictly. Out of the possible WARs, implement dummy
+		 * PCIe Read before any syncpoint notifications towards peer.
+		 *
+		 * For any writes from UMD which require notification, issuing a
+		 * dummy PCIe read here shall suffice for all cases where UMD writes
+		 * data and requires notification via syncpoint.
+		 */
+		(void)readl(syncpt->peer_mem.pva);
+
 		/*
 		 * increment peer's syncpoint. Write of any 4-byte value
 		 * increments remote's syncpoint shim by 1.

@@ -84,13 +84,12 @@ struct copy_request {
 	/*
 	 * actual number of edma-desc per the submit-copy request.
-	 * Shall include (num_flush_range + num_remote_post_fences (eDMAed))
+	 * Shall include (num_flush_range).
 	 */
 	u64 num_edma_desc;
 	/*
 	 * space for num_edma_desc considering worst-case allocation:
-	 * (max_flush_ranges + max_post_fences), assuming submit-copy could have
-	 * all the post-fences for remote signalling by eDMA.
+	 * (max_flush_ranges).
 	 */
 	struct tegra_pcie_edma_desc *edma_desc;
@@ -226,7 +225,7 @@ signal_remote_post_fences(struct copy_request *cr);
 static int
 prepare_edma_desc(enum drv_mode_t drv_mode, struct copy_req_params *params,
-		  struct tegra_pcie_edma_desc *desc, u64 *num_desc, enum peer_cpu_t);
+		  struct tegra_pcie_edma_desc *desc, u64 *num_desc);
 static edma_xfer_status_t
 schedule_edma_xfer(void *edma_h, void *priv, u64 num_desc,
@@ -501,8 +500,20 @@ ioctl_import_obj(struct stream_ext_ctx_t *ctx,
 	}
 
 	peer_cpu = pci_client_get_peer_cpu(ctx->pci_client_h);
-	if (peer_cpu == NVCPU_X86_64)
-		stream_obj->import_obj_map = ioremap(stream_obj->aper, PAGE_SIZE);
+	if (peer_cpu == NVCPU_X86_64) {
+		stream_obj->import_obj_map = ioremap(stream_obj->aper,
+						     PAGE_SIZE);
+	} else {
+		if (stream_obj->import_type == STREAM_OBJ_TYPE_SYNC) {
+			stream_obj->import_obj_map = ioremap(stream_obj->aper,
+							     PAGE_SIZE);
+			if (WARN_ON(!stream_obj->import_obj_map)) {
+				fput(filep);
+				return -ENOMEM;
+			}
+		}
+	}
 
 	fput(filep);
 	args->out.handle = handle;
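
As context for the hunk above: imported sync objects now also get a CPU
mapping for non-x86 (Tegra) peers, so that the remote post-fences can later
be signalled from the CPU. A minimal lifecycle sketch, with error handling
trimmed and the iounmap() on teardown assumed rather than shown in this diff:

	/* At import (sync object, Tegra peer): CPU-map the peer aperture. */
	stream_obj->import_obj_map = ioremap(stream_obj->aper, PAGE_SIZE);
	if (!stream_obj->import_obj_map)
		return -ENOMEM;

	/* At signal time: dummy read orders prior posted writes, then signal. */
	(void)readl(stream_obj->import_obj_map);
	writel(0x1, stream_obj->import_obj_map);

	/* At teardown (assumed, not part of this diff): drop the mapping. */
	iounmap(stream_obj->import_obj_map);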
@@ -582,9 +593,9 @@ ioctl_submit_copy_request(struct stream_ext_ctx_t *ctx,
 		goto reclaim_cr;
 
 	cr->peer_cpu = pci_client_get_peer_cpu(ctx->pci_client_h);
-	/* generate eDMA descriptors from flush_ranges, remote_post_fences.*/
+	/* generate eDMA descriptors from flush_ranges.*/
 	ret = prepare_edma_desc(ctx->drv_mode, &ctx->cr_params, cr->edma_desc,
-				&cr->num_edma_desc, cr->peer_cpu);
+				&cr->num_edma_desc);
 	if (ret) {
 		release_copy_request_handles(cr);
 		goto reclaim_cr;
@@ -944,13 +955,9 @@ callback_edma_xfer(void *priv, edma_xfer_status_t status,
 	struct copy_request *cr = (struct copy_request *)priv;
 
 	mutex_lock(&cr->ctx->free_lock);
-	/* increment num_local_fences.*/
+	/* increment post fences: local and remote.*/
 	if (status == EDMA_XFER_SUCCESS) {
-		/* X86 remote end fences are signaled through CPU */
-		if (cr->peer_cpu == NVCPU_X86_64)
-			signal_remote_post_fences(cr);
-		/* Signal local fences for Tegra*/
+		signal_remote_post_fences(cr);
 		signal_local_post_fences(cr);
 	}
@@ -967,16 +974,14 @@ callback_edma_xfer(void *priv, edma_xfer_status_t status,
 static int
 prepare_edma_desc(enum drv_mode_t drv_mode, struct copy_req_params *params,
-		  struct tegra_pcie_edma_desc *desc, u64 *num_desc, enum peer_cpu_t peer_cpu)
+		  struct tegra_pcie_edma_desc *desc, u64 *num_desc)
 {
 	u32 i = 0;
 	int ret = 0;
 	u32 iter = 0;
-	s32 handle = -1;
 	struct file *filep = NULL;
 	struct stream_ext_obj *stream_obj = NULL;
 	struct nvscic2c_pcie_flush_range *flush_range = NULL;
-	phys_addr_t dummy_addr = 0x0;
 
 	*num_desc = 0;
 	for (i = 0; i < params->num_flush_ranges; i++) {
@@ -985,7 +990,6 @@ prepare_edma_desc(enum drv_mode_t drv_mode, struct copy_req_params *params,
 		filep = fget(flush_range->src_handle);
 		stream_obj = filep->private_data;
 		desc[iter].src = (stream_obj->vmap.iova + flush_range->offset);
-		dummy_addr = stream_obj->vmap.iova;
 		fput(filep);
 
 		filep = fget(flush_range->dst_handle);
@@ -1000,27 +1004,6 @@ prepare_edma_desc(enum drv_mode_t drv_mode, struct copy_req_params *params,
 		desc[iter].sz = flush_range->size;
 		iter++;
 	}
-
-	/* With Orin as remote end, the remote fence signaling is done using DMA
-	 * With X86 as remote end, the remote fence signaling is done using CPU
-	 */
-	if (peer_cpu == NVCPU_ORIN) {
-		for (i = 0; i < params->num_remote_post_fences; i++) {
-			handle = params->remote_post_fences[i];
-			desc[iter].src = dummy_addr;
-			filep = fget(handle);
-			stream_obj = filep->private_data;
-			if (drv_mode == DRV_MODE_EPC)
-				desc[iter].dst = stream_obj->aper;
-			else
-				desc[iter].dst = stream_obj->vmap.iova;
-			fput(filep);
-			desc[iter].sz = 4;
-			iter++;
-		}
-	}
 
 	*num_desc += iter;
 	return ret;
 }
@@ -1052,11 +1035,15 @@ signal_remote_post_fences(struct copy_request *cr)
 {
 	u32 i = 0;
 	struct stream_ext_obj *stream_obj = NULL;
-	/* Dummy read operation is done on the imported buffer object to ensure
-	 * coherence of data on Vidmem of GA100 dGPU, which is connected as an EP to X86.
-	 * This is needed as Ampere architecture doesn't support coherence of Write after
-	 * Write operation and the dummy read of 4 bytes ensures the data is reconciled in
-	 * vid-memory when the consumer waiting on a sysmem semaphore is unblocked.
-	 */
+
+	/* X86 remote end fences are signaled through CPU */
+	if (cr->peer_cpu == NVCPU_X86_64) {
+		/* Dummy read operation is done on the imported buffer object
+		 * to ensure coherence of data on Vidmem of GA100 dGPU, which is
+		 * connected as an EP to X86. This is needed as Ampere architecture
+		 * doesn't support coherence of Write after Write operation and the
+		 * dummy read of 4 bytes ensures the data is reconciled in vid-memory
+		 * when the consumer waiting on a sysmem semaphore is unblocked.
+		 */
 	for (i = 0; i < cr->num_remote_buf_objs; i++) {
 		stream_obj = cr->remote_buf_objs[i];
@@ -1066,6 +1053,17 @@ signal_remote_post_fences(struct copy_request *cr)
 		stream_obj = cr->remote_post_fences[i];
 		writeq(cr->remote_post_fence_values[i], stream_obj->import_obj_map);
 	}
+	} else {
+		for (i = 0; i < cr->num_remote_post_fences; i++) {
+			stream_obj = cr->remote_post_fences[i];
+			/*
+			 * Issue dummy pcie read to ensure all data is visible
+			 * to remote SoC before notification is delivered.
+			 */
+			(void)readl(stream_obj->import_obj_map);
+			writel(0x1, stream_obj->import_obj_map);
+		}
+	}
 }
 
 static int
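
Putting the streaming-mode pieces together, the order of operations after
this change is: eDMA the flush-ranges, wait for the eDMA completion callback,
and only on EDMA_XFER_SUCCESS signal the remote post-fences from the CPU. A
consolidating sketch of that last step (illustrative only; it mirrors the
non-x86 peer branch of signal_remote_post_fences() above):

	/*
	 * Runs from the eDMA completion callback once the flush-range
	 * transfer has succeeded: per remote post-fence, a dummy PCIe read
	 * drains the posted data writes before the 4-byte fence write that
	 * increments the remote syncpoint shim.
	 */
	for (i = 0; i < cr->num_remote_post_fences; i++) {
		stream_obj = cr->remote_post_fences[i];
		(void)readl(stream_obj->import_obj_map);
		writel(0x1, stream_obj->import_obj_map);
	}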
@@ -1388,13 +1386,9 @@ allocate_copy_request(struct stream_ext_ctx_t *ctx,
 		goto err;
 	}
 
-	/*
-	 * edma_desc shall include flush_range + worst-case all post-fences
-	 * (all max_post_fences could be remote_post_fence which need be eDMAd).
-	 */
+	/* edma_desc shall include flush_range.*/
 	cr->edma_desc = kzalloc((sizeof(*cr->edma_desc) *
-				(ctx->cr_limits.max_flush_ranges +
-				 ctx->cr_limits.max_post_fences)),
+				ctx->cr_limits.max_flush_ranges),
 				GFP_KERNEL);
 	if (WARN_ON(!cr->edma_desc)) {
 		ret = -ENOMEM;