diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile index 01401926..db1c6623 100644 --- a/drivers/pci/Makefile +++ b/drivers/pci/Makefile @@ -2,3 +2,4 @@ # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. obj-m += controller/ +obj-m += endpoint/ diff --git a/drivers/pci/endpoint/Makefile b/drivers/pci/endpoint/Makefile new file mode 100644 index 00000000..546aa755 --- /dev/null +++ b/drivers/pci/endpoint/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +obj-m += functions/ diff --git a/drivers/pci/endpoint/functions/Makefile b/drivers/pci/endpoint/functions/Makefile new file mode 100644 index 00000000..3d33f64b --- /dev/null +++ b/drivers/pci/endpoint/functions/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +obj-m += pci-epf-dma-test.o diff --git a/drivers/pci/endpoint/functions/pci-epf-dma-test.c b/drivers/pci/endpoint/functions/pci-epf-dma-test.c new file mode 100644 index 00000000..e2a13692 --- /dev/null +++ b/drivers/pci/endpoint/functions/pci-epf-dma-test.c @@ -0,0 +1,1766 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * PCIe DMA EPF test framework for Tegra PCIe + * + * Copyright (C) 2021-2022 NVIDIA Corporation. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "pci-epf-wrapper.h" + +static struct pcie_epf_dma *gepfnv; + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(5, 14, 0)) +#define PCI_EPF_CORE_DEINIT +#endif + +struct pcie_epf_dma { + struct pci_epf_header header; + struct device *fdev; + struct device *cdev; + void *bar0_virt; + struct dentry *debugfs; + void __iomem *dma_base; + int irq; + + u32 dma_size; + u32 stress_count; + u32 async_count; + + struct task_struct *wr0_task; + struct task_struct *wr1_task; + struct task_struct *wr2_task; + struct task_struct *wr3_task; + struct task_struct *rd0_task; + struct task_struct *rd1_task; + u8 task_done; + wait_queue_head_t task_wq; + void *cookie; + + wait_queue_head_t wr_wq[DMA_WR_CHNL_NUM]; + wait_queue_head_t rd_wq[DMA_RD_CHNL_NUM]; + unsigned long wr_busy; + unsigned long rd_busy; + ktime_t wr_start_time[DMA_WR_CHNL_NUM]; + ktime_t wr_end_time[DMA_WR_CHNL_NUM]; + ktime_t rd_start_time[DMA_RD_CHNL_NUM]; + ktime_t rd_end_time[DMA_RD_CHNL_NUM]; + u32 wr_cnt[DMA_WR_CHNL_NUM + DMA_RD_CHNL_NUM]; + u32 rd_cnt[DMA_WR_CHNL_NUM + DMA_RD_CHNL_NUM]; + bool pcs[DMA_WR_CHNL_NUM + DMA_RD_CHNL_NUM]; + bool async_dma; + ktime_t edma_start_time[DMA_WR_CHNL_NUM]; + u64 tsz; + u32 edma_ch; + u32 prev_edma_ch; + u32 nents; + struct tegra_pcie_edma_desc *ll_desc; + struct edmalib_common edma; +}; + +struct edma_desc { + dma_addr_t src; + dma_addr_t dst; + size_t sz; +}; + +static irqreturn_t pcie_dma_epf_wr0_msi(int irq, void *arg) +{ + struct pcie_epf_dma *epfnv = (struct pcie_epf_dma *)arg; + int bit = 0; + + epfnv->wr_busy &= ~BIT(bit); + wake_up(&epfnv->wr_wq[bit]); + + return IRQ_HANDLED; +} + +static irqreturn_t pcie_dma_epf_wr1_msi(int irq, void *arg) +{ + struct pcie_epf_dma *epfnv = (struct pcie_epf_dma *)arg; + int bit = 1; + + epfnv->wr_busy &= ~BIT(bit); + wake_up(&epfnv->wr_wq[bit]); + + return IRQ_HANDLED; +} + +static irqreturn_t pcie_dma_epf_wr2_msi(int irq, void *arg) +{ + struct pcie_epf_dma *epfnv = (struct pcie_epf_dma *)arg; + int bit = 2; + + epfnv->wr_busy &= ~BIT(bit); + wake_up(&epfnv->wr_wq[bit]); + + return IRQ_HANDLED; +} + +static irqreturn_t 
pcie_dma_epf_wr3_msi(int irq, void *arg) +{ + struct pcie_epf_dma *epfnv = (struct pcie_epf_dma *)arg; + int bit = 3; + + epfnv->wr_busy &= ~BIT(bit); + wake_up(&epfnv->wr_wq[bit]); + + return IRQ_HANDLED; +} + +static irqreturn_t pcie_dma_epf_rd0_msi(int irq, void *arg) +{ + struct pcie_epf_dma *epfnv = (struct pcie_epf_dma *)arg; + int bit = 0; + + epfnv->rd_busy &= ~BIT(bit); + wake_up(&epfnv->rd_wq[bit]); + + return IRQ_HANDLED; +} + +static irqreturn_t pcie_dma_epf_rd1_msi(int irq, void *arg) +{ + struct pcie_epf_dma *epfnv = (struct pcie_epf_dma *)arg; + int bit = 1; + + epfnv->rd_busy &= ~BIT(bit); + wake_up(&epfnv->rd_wq[bit]); + + return IRQ_HANDLED; +} + +void pcie_async_dma_handler(struct pcie_epf_dma *epfnv) +{ + struct pcie_epf_bar0 *epf_bar0 = (struct pcie_epf_bar0 *) + epfnv->bar0_virt; + u64 llp_base, llp_iova; + u32 llp_idx, ridx; + int i; + + for (i = 0; i < DMA_WR_CHNL_NUM; i++) { + llp_iova = dma_channel_rd(epfnv->dma_base, i, + DMA_LLP_HIGH_OFF_WRCH); + llp_iova = (llp_iova << 32); + llp_iova |= dma_channel_rd(epfnv->dma_base, i, + DMA_LLP_LOW_OFF_WRCH); + llp_base = epf_bar0->ep_phy_addr + DMA_LL_WR_OFFSET(i); + llp_idx = ((llp_iova - llp_base) / sizeof(struct dma_ll)); + llp_idx = llp_idx % DMA_ASYNC_LL_SIZE; + + if (!llp_idx) + continue; + + ridx = epfnv->rd_cnt[i] % DMA_ASYNC_LL_SIZE; + + while (llp_idx != ridx) { + epfnv->rd_cnt[i]++; + ridx = epfnv->rd_cnt[i] % DMA_ASYNC_LL_SIZE; + } + } + + for (i = 0; i < DMA_RD_CHNL_NUM; i++) { + llp_iova = dma_channel_rd(epfnv->dma_base, i, + DMA_LLP_HIGH_OFF_RDCH); + llp_iova = (llp_iova << 32); + llp_iova |= dma_channel_rd(epfnv->dma_base, i, + DMA_LLP_LOW_OFF_RDCH); + llp_base = epf_bar0->ep_phy_addr + DMA_LL_RD_OFFSET(i); + llp_idx = ((llp_iova - llp_base) / sizeof(struct dma_ll)); + llp_idx = llp_idx % DMA_ASYNC_LL_SIZE; + + if (!llp_idx) + continue; + + ridx = epfnv->rd_cnt[DMA_WR_CHNL_NUM + i] % DMA_ASYNC_LL_SIZE; + + while (llp_idx != ridx) { + epfnv->rd_cnt[DMA_WR_CHNL_NUM + i]++; + ridx = epfnv->rd_cnt[DMA_WR_CHNL_NUM + i] % + DMA_ASYNC_LL_SIZE; + } + } +} + +static irqreturn_t pcie_dma_epf_irq(int irq, void *arg) +{ + return IRQ_WAKE_THREAD; +} + +static irqreturn_t pcie_dma_epf_irq_handler(int irq, void *arg) +{ + struct pcie_epf_dma *epfnv = (struct pcie_epf_dma *)arg; + int bit = 0; + u32 val; + + val = dma_common_rd(epfnv->dma_base, DMA_WRITE_INT_STATUS_OFF); + for_each_set_bit(bit, &epfnv->wr_busy, DMA_WR_CHNL_NUM) { + if (BIT(bit) & val) { + dma_common_wr(epfnv->dma_base, BIT(bit), + DMA_WRITE_INT_CLEAR_OFF); + epfnv->wr_end_time[bit] = ktime_get(); + epfnv->wr_busy &= ~(BIT(bit)); + wake_up(&epfnv->wr_wq[bit]); + } + } + + val = dma_common_rd(epfnv->dma_base, DMA_READ_INT_STATUS_OFF); + for_each_set_bit(bit, &epfnv->rd_busy, DMA_RD_CHNL_NUM) { + if (BIT(bit) & val) { + dma_common_wr(epfnv->dma_base, BIT(bit), + DMA_READ_INT_CLEAR_OFF); + epfnv->rd_end_time[bit] = ktime_get(); + epfnv->rd_busy &= ~(BIT(bit)); + wake_up(&epfnv->rd_wq[bit]); + } + } + + if (epfnv->async_dma) { + val = dma_common_rd(epfnv->dma_base, DMA_WRITE_INT_STATUS_OFF); + dma_common_wr(epfnv->dma_base, val, DMA_WRITE_INT_CLEAR_OFF); + val = dma_common_rd(epfnv->dma_base, DMA_READ_INT_STATUS_OFF); + dma_common_wr(epfnv->dma_base, val, DMA_READ_INT_CLEAR_OFF); + pcie_async_dma_handler(epfnv); + } + + return IRQ_HANDLED; +} + +static int edma_init(struct pcie_epf_dma *epfnv, bool lie) +{ + u32 val; + int i; + + /* Enable LIE or RIE for all write channels */ + if (lie) { + val = dma_common_rd(epfnv->dma_base, DMA_WRITE_INT_MASK_OFF); + val &= 
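/*
 * Editor's illustration, not part of the patch: every MSI and "intr" handler
 * above completes a transfer the same way, by clearing the channel's bit in
 * wr_busy/rd_busy and waking the matching wait queue, while the submit paths
 * further down set the bit and then sleep on it with a timeout.  A minimal
 * standalone sketch of that handshake (hypothetical example_* names, same
 * kernel primitives):
 *
 *      #include <linux/bits.h>
 *      #include <linux/errno.h>
 *      #include <linux/jiffies.h>
 *      #include <linux/wait.h>
 *
 *      struct example_ch_wait {
 *              unsigned long busy;             // one bit per DMA channel
 *              wait_queue_head_t wq[4];
 *      };
 *
 *      static void example_init(struct example_ch_wait *w)
 *      {
 *              int i;
 *
 *              w->busy = 0;
 *              for (i = 0; i < 4; i++)
 *                      init_waitqueue_head(&w->wq[i]);
 *      }
 *
 *      // what each handler above does for its channel
 *      static void example_complete(struct example_ch_wait *w, int ch)
 *      {
 *              w->busy &= ~BIT(ch);
 *              wake_up(&w->wq[ch]);
 *      }
 *
 *      // submit side: the bit is set before the doorbell is rung
 *      static int example_wait_done(struct example_ch_wait *w, int ch)
 *      {
 *              if (!wait_event_timeout(w->wq[ch], !(w->busy & BIT(ch)),
 *                                      msecs_to_jiffies(5000)))
 *                      return -ETIMEDOUT;
 *              return 0;
 *      }
 */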
~0xf; + val &= ~(0xf << 16); + dma_common_wr(epfnv->dma_base, val, DMA_WRITE_INT_MASK_OFF); + } else { + val = dma_common_rd(epfnv->dma_base, DMA_WRITE_INT_MASK_OFF); + val |= 0xf; + val |= (0xf << 16); + dma_common_wr(epfnv->dma_base, val, DMA_WRITE_INT_MASK_OFF); + } + + val = DMA_CH_CONTROL1_OFF_WRCH_LIE; + if (!lie) + val |= DMA_CH_CONTROL1_OFF_WRCH_RIE; + for (i = 0; i < DMA_WR_CHNL_NUM; i++) + dma_channel_wr(epfnv->dma_base, i, val, + DMA_CH_CONTROL1_OFF_WRCH); + + /* Enable LIE or RIE for all read channels */ + if (lie) { + val = dma_common_rd(epfnv->dma_base, DMA_READ_INT_MASK_OFF); + val &= ~0x3; + val &= ~(0x3 << 16); + dma_common_wr(epfnv->dma_base, val, DMA_READ_INT_MASK_OFF); + } else { + val = dma_common_rd(epfnv->dma_base, DMA_READ_INT_MASK_OFF); + val |= 0x3; + val |= (0x3 << 16); + dma_common_wr(epfnv->dma_base, val, DMA_READ_INT_MASK_OFF); + } + + val = DMA_CH_CONTROL1_OFF_RDCH_LIE; + if (!lie) + val |= DMA_CH_CONTROL1_OFF_RDCH_RIE; + for (i = 0; i < DMA_RD_CHNL_NUM; i++) + dma_channel_wr(epfnv->dma_base, i, val, + DMA_CH_CONTROL1_OFF_RDCH); + + dma_common_wr(epfnv->dma_base, WRITE_ENABLE, DMA_WRITE_ENGINE_EN_OFF); + dma_common_wr(epfnv->dma_base, READ_ENABLE, DMA_READ_ENGINE_EN_OFF); + + return 0; +} + +static void edma_deinit(struct pcie_epf_dma *epfnv) +{ + u32 val; + + /* Mask channel interrupts */ + val = dma_common_rd(epfnv->dma_base, DMA_WRITE_INT_MASK_OFF); + val |= 0xf; + val |= (0xf << 16); + dma_common_wr(epfnv->dma_base, val, DMA_WRITE_INT_MASK_OFF); + + val = dma_common_rd(epfnv->dma_base, DMA_READ_INT_MASK_OFF); + val |= 0x3; + val |= (0x3 << 16); + dma_common_wr(epfnv->dma_base, val, DMA_READ_INT_MASK_OFF); + + dma_common_wr(epfnv->dma_base, WRITE_DISABLE, DMA_WRITE_ENGINE_EN_OFF); + dma_common_wr(epfnv->dma_base, READ_DISABLE, DMA_READ_ENGINE_EN_OFF); +} + +static int edma_ll_init(struct pcie_epf_dma *epfnv) +{ + u32 val; + int i; + + /* Enable linked list mode and set CCS */ + val = DMA_CH_CONTROL1_OFF_WRCH_LLE | DMA_CH_CONTROL1_OFF_WRCH_CCS; + for (i = 0; i < DMA_WR_CHNL_NUM; i++) + dma_channel_wr(epfnv->dma_base, i, val, + DMA_CH_CONTROL1_OFF_WRCH); + + val = DMA_CH_CONTROL1_OFF_RDCH_LLE | DMA_CH_CONTROL1_OFF_WRCH_CCS; + for (i = 0; i < DMA_RD_CHNL_NUM; i++) + dma_channel_wr(epfnv->dma_base, i, val, + DMA_CH_CONTROL1_OFF_RDCH); + + return 0; +} + +static void edma_ll_deinit(struct pcie_epf_dma *epfnv) +{ + u32 val; + int i; + + /* Disable linked list mode and clear CCS */ + for (i = 0; i < DMA_WR_CHNL_NUM; i++) { + val = dma_channel_rd(epfnv->dma_base, i, + DMA_CH_CONTROL1_OFF_WRCH); + val &= ~(DMA_CH_CONTROL1_OFF_WRCH_LLE | + DMA_CH_CONTROL1_OFF_WRCH_CCS); + dma_channel_wr(epfnv->dma_base, i, val, + DMA_CH_CONTROL1_OFF_WRCH); + } + + for (i = 0; i < DMA_RD_CHNL_NUM; i++) { + val = dma_channel_rd(epfnv->dma_base, i, + DMA_CH_CONTROL1_OFF_RDCH); + val &= ~(DMA_CH_CONTROL1_OFF_RDCH_LLE | + DMA_CH_CONTROL1_OFF_RDCH_CCS); + dma_channel_wr(epfnv->dma_base, i, val, + DMA_CH_CONTROL1_OFF_RDCH); + } +} + +static int edma_submit_direct_tx(struct pcie_epf_dma *epfnv, + struct edma_desc *desc, int ch) +{ + int ret = 0; + + epfnv->wr_busy |= 1 << ch; + + /* Populate desc in DMA registers */ + dma_channel_wr(epfnv->dma_base, ch, desc->sz, + DMA_TRANSFER_SIZE_OFF_WRCH); + dma_channel_wr(epfnv->dma_base, ch, lower_32_bits(desc->src), + DMA_SAR_LOW_OFF_WRCH); + dma_channel_wr(epfnv->dma_base, ch, upper_32_bits(desc->src), + DMA_SAR_HIGH_OFF_WRCH); + dma_channel_wr(epfnv->dma_base, ch, lower_32_bits(desc->dst), + DMA_DAR_LOW_OFF_WRCH); + dma_channel_wr(epfnv->dma_base, 
ch, upper_32_bits(desc->dst), + DMA_DAR_HIGH_OFF_WRCH); + + epfnv->wr_start_time[ch] = ktime_get(); + dma_common_wr(epfnv->dma_base, ch, DMA_WRITE_DOORBELL_OFF); + + /* Wait 5 sec to get DMA done interrupt */ + ret = wait_event_timeout(epfnv->wr_wq[ch], + !(epfnv->wr_busy & (1 << ch)), + msecs_to_jiffies(5000)); + if (ret == 0) { + dev_err(epfnv->fdev, "%s: DD WR CH: %d TO\n", __func__, ch); + ret = -ETIMEDOUT; + } + + return ret; +} + +static int edma_submit_direct_rx(struct pcie_epf_dma *epfnv, + struct edma_desc *desc, int ch) +{ + int ret = 0; + + epfnv->rd_busy |= 1 << ch; + + /* Populate desc in DMA registers */ + dma_channel_wr(epfnv->dma_base, ch, desc->sz, + DMA_TRANSFER_SIZE_OFF_RDCH); + dma_channel_wr(epfnv->dma_base, ch, lower_32_bits(desc->src), + DMA_SAR_LOW_OFF_RDCH); + dma_channel_wr(epfnv->dma_base, ch, upper_32_bits(desc->src), + DMA_SAR_HIGH_OFF_RDCH); + dma_channel_wr(epfnv->dma_base, ch, lower_32_bits(desc->dst), + DMA_DAR_LOW_OFF_RDCH); + dma_channel_wr(epfnv->dma_base, ch, upper_32_bits(desc->dst), + DMA_DAR_HIGH_OFF_RDCH); + + epfnv->rd_start_time[ch] = ktime_get(); + dma_common_wr(epfnv->dma_base, ch, DMA_READ_DOORBELL_OFF); + + ret = wait_event_timeout(epfnv->rd_wq[ch], + !(epfnv->rd_busy & (1 << ch)), + msecs_to_jiffies(5000)); + if (ret == 0) { + dev_err(epfnv->fdev, "%s: DD RD CH: %d TO\n", + __func__, ch); + ret = -ETIMEDOUT; + } + + return ret; +} + +static int edma_submit_direct_txrx(struct pcie_epf_dma *epfnv, + struct edma_desc *desc_wr, + struct edma_desc *desc_rd) +{ + int ret = 0, i; + + /* Configure all DMA write and read channels */ + epfnv->wr_busy = DMA_WR_CHNL_MASK; + epfnv->rd_busy = DMA_RD_CHNL_MASK; + + for (i = 0; i < DMA_WR_CHNL_NUM; i++) { + dma_channel_wr(epfnv->dma_base, i, desc_wr[i].sz, + DMA_TRANSFER_SIZE_OFF_WRCH); + dma_channel_wr(epfnv->dma_base, i, + lower_32_bits(desc_wr[i].src), + DMA_SAR_LOW_OFF_WRCH); + dma_channel_wr(epfnv->dma_base, i, + upper_32_bits(desc_wr[i].src), + DMA_SAR_HIGH_OFF_WRCH); + dma_channel_wr(epfnv->dma_base, i, + lower_32_bits(desc_wr[i].dst), + DMA_DAR_LOW_OFF_WRCH); + dma_channel_wr(epfnv->dma_base, i, + upper_32_bits(desc_wr[i].dst), + DMA_DAR_HIGH_OFF_WRCH); + } + + for (i = 0; i < DMA_RD_CHNL_NUM; i++) { + dma_channel_wr(epfnv->dma_base, i, desc_rd[i].sz, + DMA_TRANSFER_SIZE_OFF_RDCH); + dma_channel_wr(epfnv->dma_base, i, + lower_32_bits(desc_rd[i].src), + DMA_SAR_LOW_OFF_RDCH); + dma_channel_wr(epfnv->dma_base, i, + upper_32_bits(desc_rd[i].src), + DMA_SAR_HIGH_OFF_RDCH); + dma_channel_wr(epfnv->dma_base, i, + lower_32_bits(desc_rd[i].dst), + DMA_DAR_LOW_OFF_RDCH); + dma_channel_wr(epfnv->dma_base, i, + upper_32_bits(desc_rd[i].dst), + DMA_DAR_HIGH_OFF_RDCH); + } + + for (i = 0; i < DMA_WR_CHNL_NUM; i++) { + dma_common_wr(epfnv->dma_base, i, DMA_WRITE_DOORBELL_OFF); + if (i < DMA_RD_CHNL_NUM) + dma_common_wr(epfnv->dma_base, i, + DMA_READ_DOORBELL_OFF); + } + + for (i = 0; i < DMA_WR_CHNL_NUM; i++) { + ret = wait_event_timeout(epfnv->wr_wq[i], + !(epfnv->wr_busy & (1 << i)), + msecs_to_jiffies(5000)); + if (ret == 0) { + dev_err(epfnv->fdev, "%s: DD WR CH: %d TO\n", + __func__, i); + ret = -ETIMEDOUT; + goto fail; + } + } + + for (i = 0; i < DMA_RD_CHNL_NUM; i++) { + ret = wait_event_timeout(epfnv->rd_wq[i], + !(epfnv->rd_busy & (1 << i)), + msecs_to_jiffies(5000)); + if (ret == 0) { + dev_err(epfnv->fdev, "%s: DD RD CH: %d TO\n", + __func__, i); + ret = -ETIMEDOUT; + goto fail; + } + } + +fail: + return ret; +} + +static int edma_submit_sync_tx(struct pcie_epf_dma *epfnv, + struct edma_desc *desc, + 
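/*
 * Editor's illustration, not part of the patch: a direct-mode transfer is
 * described by a single struct edma_desc and handed to one of the
 * edma_submit_direct_*() helpers above, which program size/SAR/DAR, ring the
 * doorbell and sleep until the done interrupt clears the busy bit.  A
 * hypothetical caller, assuming edma_init() has already run and the RP side
 * has filled rp_phy_addr in the BAR0 header:
 *
 *      static int example_direct_write(struct pcie_epf_dma *epfnv, int ch)
 *      {
 *              struct pcie_epf_bar0 *bar0 = epfnv->bar0_virt;
 *              struct edma_desc d = {
 *                      .src = bar0->ep_phy_addr + BAR0_DMA_BUF_OFFSET,
 *                      .dst = bar0->rp_phy_addr + BAR0_DMA_BUF_OFFSET,
 *                      .sz  = SZ_64K,
 *              };
 *
 *              if (ch >= DMA_WR_CHNL_NUM || !bar0->rp_phy_addr)
 *                      return -EINVAL;
 *
 *              return edma_submit_direct_tx(epfnv, &d, ch);
 *      }
 */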
int nents, int ch, bool lie) +{ + struct pcie_epf_bar0 *epf_bar0 = (struct pcie_epf_bar0 *) + epfnv->bar0_virt; + dma_addr_t ll_phy_addr = epf_bar0->ep_phy_addr + DMA_LL_WR_OFFSET(ch); + struct dma_ll *dma_ll_virt; + int i, ret; + + epfnv->wr_busy |= 1 << ch; + + /* Program DMA LL base address in DMA LL pointer register */ + dma_channel_wr(epfnv->dma_base, ch, lower_32_bits(ll_phy_addr), + DMA_LLP_LOW_OFF_WRCH); + dma_channel_wr(epfnv->dma_base, ch, upper_32_bits(ll_phy_addr), + DMA_LLP_HIGH_OFF_WRCH); + + /* Populate DMA descriptors in LL */ + dma_ll_virt = (struct dma_ll *) + (epfnv->bar0_virt + DMA_LL_WR_OFFSET(ch)); + for (i = 0; i < nents; i++) { + dma_ll_virt->size = desc[i].sz; + dma_ll_virt->src_low = lower_32_bits(desc[i].src); + dma_ll_virt->src_high = upper_32_bits(desc[i].src); + dma_ll_virt->dst_low = lower_32_bits(desc[i].dst); + dma_ll_virt->dst_high = upper_32_bits(desc[i].dst); + dma_ll_virt->ele.cb = 1; + dma_ll_virt++; + } + /* Set LIE or RIE in last element */ + dma_ll_virt--; + dma_ll_virt->ele.lie = 1; + if (!lie) + dma_ll_virt->ele.rie = 1; + + epfnv->wr_start_time[ch] = ktime_get(); + dma_common_wr(epfnv->dma_base, ch, DMA_WRITE_DOORBELL_OFF); + + ret = wait_event_timeout(epfnv->wr_wq[ch], + !(epfnv->wr_busy & (1 << ch)), + msecs_to_jiffies(5000)); + if (ret == 0) { + dev_err(epfnv->fdev, "%s: LL WR CH: %d TO\n", __func__, ch); + ret = -ETIMEDOUT; + } + + return ret; +} + +static int edma_submit_sync_rx(struct pcie_epf_dma *epfnv, + struct edma_desc *desc, + int nents, int ch, bool lie) +{ + struct pcie_epf_bar0 *epf_bar0 = (struct pcie_epf_bar0 *) + epfnv->bar0_virt; + dma_addr_t ll_phy_addr = epf_bar0->ep_phy_addr + DMA_LL_RD_OFFSET(ch); + struct dma_ll *dma_ll_virt; + int i, ret; + + epfnv->rd_busy |= 1 << ch; + + /* Program DMA LL base address in DMA LL pointer register */ + dma_channel_wr(epfnv->dma_base, ch, lower_32_bits(ll_phy_addr), + DMA_LLP_LOW_OFF_RDCH); + dma_channel_wr(epfnv->dma_base, ch, upper_32_bits(ll_phy_addr), + DMA_LLP_HIGH_OFF_RDCH); + + /* Populate DMA descriptors in LL */ + dma_ll_virt = (struct dma_ll *) + (epfnv->bar0_virt + DMA_LL_RD_OFFSET(ch)); + for (i = 0; i < nents; i++) { + dma_ll_virt->size = desc[i].sz; + dma_ll_virt->src_low = lower_32_bits(desc[i].src); + dma_ll_virt->src_high = upper_32_bits(desc[i].src); + dma_ll_virt->dst_low = lower_32_bits(desc[i].dst); + dma_ll_virt->dst_high = upper_32_bits(desc[i].dst); + dma_ll_virt->ele.cb = 1; + dma_ll_virt++; + } + /* Set LIE or RIE in last element */ + dma_ll_virt--; + dma_ll_virt->ele.lie = 1; + if (!lie) + dma_ll_virt->ele.rie = 1; + + epfnv->rd_start_time[ch] = ktime_get(); + dma_common_wr(epfnv->dma_base, ch, DMA_READ_DOORBELL_OFF); + + ret = wait_event_timeout(epfnv->rd_wq[ch], + !(epfnv->rd_busy & (1 << ch)), + msecs_to_jiffies(5000)); + if (ret == 0) { + dev_err(epfnv->fdev, "%s: LL RD CH: %d TO\n", __func__, ch); + ret = -ETIMEDOUT; + } + + return ret; +} + +static int edma_submit_sync_txrx(struct pcie_epf_dma *epfnv, + struct edma_desc *desc_wr, + struct edma_desc *desc_rd, int nents) +{ + struct pcie_epf_bar0 *epf_bar0 = (struct pcie_epf_bar0 *) + epfnv->bar0_virt; + dma_addr_t phy_addr = epf_bar0->ep_phy_addr; + struct dma_ll *dma_ll_virt; + int i, j, ret; + + epfnv->wr_busy = DMA_WR_CHNL_MASK; + epfnv->rd_busy = DMA_RD_CHNL_MASK; + + for (i = 0; i < DMA_WR_CHNL_NUM; i++) { + dma_channel_wr(epfnv->dma_base, i, + lower_32_bits(phy_addr + DMA_LL_WR_OFFSET(i)), + DMA_LLP_LOW_OFF_WRCH); + dma_channel_wr(epfnv->dma_base, i, + upper_32_bits(phy_addr + DMA_LL_WR_OFFSET(i)), + 
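/*
 * Editor's note, not part of the patch: in the linked-list submit paths the
 * descriptor list itself lives inside BAR0 at DMA_LL_WR_OFFSET(ch) /
 * DMA_LL_RD_OFFSET(ch), and its EP physical address is programmed into the
 * channel's DMA_LLP_* registers, so the engine fetches struct dma_ll elements
 * from EP memory.  Every element has ele.cb set; only the last one gets
 * ele.lie (plus ele.rie when a remote rather than a local interrupt is
 * wanted), so a single done interrupt fires per list.
 */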
DMA_LLP_HIGH_OFF_WRCH); + + dma_ll_virt = (struct dma_ll *) + (epfnv->bar0_virt + DMA_LL_WR_OFFSET(i)); + for (j = i * nents; j < ((i + 1) * nents); j++) { + dma_ll_virt->size = desc_wr[j].sz; + dma_ll_virt->src_low = lower_32_bits(desc_wr[j].src); + dma_ll_virt->src_high = upper_32_bits(desc_wr[j].src); + dma_ll_virt->dst_low = lower_32_bits(desc_wr[j].dst); + dma_ll_virt->dst_high = upper_32_bits(desc_wr[j].dst); + dma_ll_virt->ele.cb = 1; + dma_ll_virt++; + } + dma_ll_virt--; + dma_ll_virt->ele.lie = 1; + } + + for (i = 0; i < DMA_RD_CHNL_NUM; i++) { + dma_channel_wr(epfnv->dma_base, i, + lower_32_bits(phy_addr + DMA_LL_RD_OFFSET(i)), + DMA_LLP_LOW_OFF_RDCH); + dma_channel_wr(epfnv->dma_base, i, + upper_32_bits(phy_addr + DMA_LL_RD_OFFSET(i)), + DMA_LLP_HIGH_OFF_RDCH); + + dma_ll_virt = (struct dma_ll *) + (epfnv->bar0_virt + DMA_LL_RD_OFFSET(i)); + for (j = i * nents; j < ((i + 1) * nents); j++) { + dma_ll_virt->size = desc_rd[j].sz; + dma_ll_virt->src_low = lower_32_bits(desc_rd[j].src); + dma_ll_virt->src_high = upper_32_bits(desc_rd[j].src); + dma_ll_virt->dst_low = lower_32_bits(desc_rd[j].dst); + dma_ll_virt->dst_high = upper_32_bits(desc_rd[j].dst); + dma_ll_virt->ele.cb = 1; + dma_ll_virt++; + } + dma_ll_virt--; + dma_ll_virt->ele.lie = 1; + } + + for (i = 0; i < DMA_WR_CHNL_NUM; i++) { + dma_common_wr(epfnv->dma_base, i, DMA_WRITE_DOORBELL_OFF); + if (i < DMA_RD_CHNL_NUM) + dma_common_wr(epfnv->dma_base, i, + DMA_READ_DOORBELL_OFF); + } + + for (i = 0; i < DMA_WR_CHNL_NUM; i++) { + ret = wait_event_timeout(epfnv->wr_wq[i], + !(epfnv->wr_busy & (1 << i)), + msecs_to_jiffies(5000)); + if (ret == 0) { + dev_err(epfnv->fdev, "%s: LL WR CH: %d TO\n", + __func__, i); + ret = -ETIMEDOUT; + goto fail; + } + } + + for (i = 0; i < DMA_RD_CHNL_NUM; i++) { + ret = wait_event_timeout(epfnv->rd_wq[i], + !(epfnv->rd_busy & (1 << i)), + msecs_to_jiffies(5000)); + if (ret == 0) { + dev_err(epfnv->fdev, "%s: LL RD CH: %d TO\n", + __func__, i); + ret = -ETIMEDOUT; + goto fail; + } + } + +fail: + return ret; +} + +/* debugfs to measure direct and LL DMA read/write perf on channel 0 */ +static int perf_test(struct seq_file *s, void *data) +{ + struct pcie_epf_dma *epfnv = (struct pcie_epf_dma *) + dev_get_drvdata(s->private); + struct pcie_epf_bar0 *epf_bar0 = (struct pcie_epf_bar0 *) + epfnv->bar0_virt; + struct edma_desc desc; + struct edma_desc ll_desc[DMA_LL_DEFAULT_SIZE]; + dma_addr_t ep_dma_addr = epf_bar0->ep_phy_addr + BAR0_DMA_BUF_OFFSET; + dma_addr_t rp_dma_addr = epf_bar0->rp_phy_addr + BAR0_DMA_BUF_OFFSET; + long long time; + int ch = 0, nents = DMA_LL_MIN_SIZE, i, ret; + + if (!rp_dma_addr) { + dev_err(epfnv->fdev, "RP DMA address is null\n"); + return 0; + } + + edma_init(epfnv, 1); + + /* Direct DMA perf test with size BAR0_DMA_BUF_SIZE */ + desc.src = ep_dma_addr; + desc.dst = rp_dma_addr; + desc.sz = BAR0_DMA_BUF_SIZE; + ret = edma_submit_direct_tx(epfnv, &desc, ch); + if (ret < 0) { + dev_err(epfnv->fdev, "%s: DD WR, SZ: %lu B CH: %d failed\n", + __func__, desc.sz, ch); + goto fail; + } + + time = ktime_to_ns(epfnv->wr_end_time[ch]) - + ktime_to_ns(epfnv->wr_start_time[ch]); + dev_info(epfnv->fdev, "%s: DD WR, CH: %d SZ: %lu B, time: %lld ns\n", + __func__, ch, desc.sz, time); + + desc.src = rp_dma_addr; + desc.dst = ep_dma_addr; + desc.sz = BAR0_DMA_BUF_SIZE; + ret = edma_submit_direct_rx(epfnv, &desc, ch); + if (ret < 0) { + dev_err(epfnv->fdev, "%s: DD RD, SZ: %lu B CH: %d failed\n", + __func__, desc.sz, ch); + goto fail; + } + time = ktime_to_ns(epfnv->rd_end_time[ch]) - + 
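/*
 * Editor's illustration, not part of the patch: perf_test() only prints the
 * transfer size and the ktime delta in nanoseconds.  To turn that pair into
 * a rate, bits * 1000 / ns gives Mbit/s, which is the same arithmetic the
 * EDMA_PERF macro in tegra-pcie-edma-test-common.h uses.  A hypothetical
 * helper:
 *
 *      #include <linux/math64.h>
 *
 *      static u64 example_rate_mbps(u64 bytes, u64 time_ns)
 *      {
 *              if (!time_ns)
 *                      return 0;
 *              return div64_u64(bytes * 8 * 1000ULL, time_ns);
 *      }
 */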
ktime_to_ns(epfnv->rd_start_time[ch]); + dev_info(epfnv->fdev, "%s: DD RD, CH: %d SZ: %lu B, time: %lld ns\n", + __func__, ch, desc.sz, time); + + /* Clean DMA LL */ + memset(epfnv->bar0_virt + DMA_LL_WR_OFFSET(0), 0, 6 * DMA_LL_SIZE); + edma_ll_init(epfnv); + + /* LL DMA perf test with size BAR0_DMA_BUF_SIZE and one desc */ + for (i = 0; i < nents; i++) { + ll_desc[i].src = ep_dma_addr + (i * BAR0_DMA_BUF_SIZE); + ll_desc[i].dst = rp_dma_addr + (i * BAR0_DMA_BUF_SIZE); + ll_desc[i].sz = BAR0_DMA_BUF_SIZE; + } + + ret = edma_submit_sync_tx(epfnv, ll_desc, nents, ch, 1); + if (ret < 0) { + dev_err(epfnv->fdev, "%s: LL WR, SZ: %u B CH: %d failed\n", + __func__, BAR0_DMA_BUF_SIZE * nents, ch); + goto fail; + } + time = ktime_to_ns(epfnv->wr_end_time[ch]) - + ktime_to_ns(epfnv->wr_start_time[ch]); + dev_info(epfnv->fdev, "%s: LL WR, CH: %d N: %d SZ: %d B, time: %lld ns\n", + __func__, ch, nents, BAR0_DMA_BUF_SIZE, time); + + for (i = 0; i < nents; i++) { + ll_desc[i].src = rp_dma_addr + (i * BAR0_DMA_BUF_SIZE); + ll_desc[i].dst = ep_dma_addr + (i * BAR0_DMA_BUF_SIZE); + ll_desc[i].sz = BAR0_DMA_BUF_SIZE; + } + + ret = edma_submit_sync_rx(epfnv, ll_desc, nents, ch, 1); + if (ret < 0) { + dev_err(epfnv->fdev, "%s: LL RD, SZ: %u B CH: %d failed\n", + __func__, BAR0_DMA_BUF_SIZE * nents, ch); + goto fail; + } + time = ktime_to_ns(epfnv->rd_end_time[ch]) - + ktime_to_ns(epfnv->rd_start_time[ch]); + dev_info(epfnv->fdev, "%s: LL RD, CH: %d N: %d SZ: %d B, time: %lld ns\n", + __func__, ch, nents, BAR0_DMA_BUF_SIZE, time); + + edma_ll_deinit(epfnv); + edma_deinit(epfnv); + +fail: + return 0; +} + +/* debugfs to stress direct and LL DMA on all wr & rd channels */ +static int stress_test(struct seq_file *s, void *data) +{ + struct pcie_epf_dma *epfnv = (struct pcie_epf_dma *) + dev_get_drvdata(s->private); + struct pcie_epf_bar0 *epf_bar0 = (struct pcie_epf_bar0 *) + epfnv->bar0_virt; + struct edma_desc desc_wr[DMA_WR_CHNL_NUM], desc_rd[DMA_RD_CHNL_NUM]; + struct edma_desc ll_desc_wr[DMA_WR_CHNL_NUM * DMA_LL_DEFAULT_SIZE]; + struct edma_desc ll_desc_rd[DMA_RD_CHNL_NUM * DMA_LL_DEFAULT_SIZE]; + dma_addr_t ep_dma_addr = epf_bar0->ep_phy_addr + BAR0_DMA_BUF_OFFSET; + dma_addr_t rp_dma_addr = epf_bar0->rp_phy_addr + BAR0_DMA_BUF_OFFSET; + int i, j, ret, nents = DMA_LL_DEFAULT_SIZE; + + if (!rp_dma_addr) { + dev_err(epfnv->fdev, "RP DMA address is null\n"); + return 0; + } + + edma_init(epfnv, 1); + + /* Direct DMA stress test with rand size < DMA_DD_BUF_SIZE */ + for (j = 0; j < DMA_WR_CHNL_NUM; j++) { + desc_wr[j].src = ep_dma_addr + (j * DMA_DD_BUF_SIZE); + desc_wr[j].dst = rp_dma_addr + (j * DMA_DD_BUF_SIZE); + } + + for (j = 0; j < DMA_RD_CHNL_NUM; j++) { + desc_rd[j].src = rp_dma_addr + + ((j + DMA_WR_CHNL_NUM) * DMA_DD_BUF_SIZE); + desc_rd[j].dst = ep_dma_addr + + ((j + DMA_WR_CHNL_NUM) * DMA_DD_BUF_SIZE); + } + + for (i = 0; i < epfnv->stress_count; i++) { + for (j = 0; j < DMA_WR_CHNL_NUM; j++) + desc_wr[j].sz = + (get_random_u32() % DMA_DD_BUF_SIZE) + 1; + for (j = 0; j < DMA_RD_CHNL_NUM; j++) + desc_rd[j].sz = + (get_random_u32() % DMA_DD_BUF_SIZE) + 1; + ret = edma_submit_direct_txrx(epfnv, desc_wr, desc_rd); + if (ret < 0) { + dev_err(epfnv->fdev, "%s: DD stress failed\n", + __func__); + goto fail; + } + dev_info(epfnv->fdev, "%s: DD stress test iteration %d done\n", + __func__, i); + } + dev_info(epfnv->fdev, "%s: DD stress: all CH, rand SZ, count: %d success\n", + __func__, epfnv->stress_count); + + /* Clean DMA LL */ + memset(epfnv->bar0_virt + DMA_LL_WR_OFFSET(0), 0, 6 * DMA_LL_SIZE); + 
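/*
 * Editor's note, not part of the patch: the 6 * DMA_LL_SIZE span cleared
 * above covers all six linked lists at once: DMA_LL_RD_OFFSET(i) is defined
 * as DMA_LL_WR_OFFSET(4) + i * DMA_LL_SIZE, so the four write-channel and
 * two read-channel lists sit back to back starting at DMA_LL_WR_OFFSET(0).
 */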
edma_ll_init(epfnv); + + /* LL DMA stress test with rand size < DMA_LL_BUF_SIZE per desc */ + for (i = 0; i < DMA_WR_CHNL_NUM * nents; i++) { + ll_desc_wr[i].src = ep_dma_addr + (i * DMA_LL_BUF_SIZE); + ll_desc_wr[i].dst = rp_dma_addr + (i * DMA_LL_BUF_SIZE); + } + for (i = 0; i < DMA_RD_CHNL_NUM * nents; i++) { + ll_desc_rd[i].src = rp_dma_addr + + ((i + DMA_WR_CHNL_NUM) * DMA_LL_BUF_SIZE); + ll_desc_rd[i].dst = ep_dma_addr + + ((i + DMA_WR_CHNL_NUM) * DMA_LL_BUF_SIZE); + } + + for (i = 0; i < epfnv->stress_count; i++) { + for (j = 0; j < DMA_WR_CHNL_NUM * nents; j++) + ll_desc_wr[j].sz = + (get_random_u32() % DMA_LL_BUF_SIZE) + 1; + for (j = 0; j < DMA_RD_CHNL_NUM * nents; j++) + ll_desc_rd[j].sz = + (get_random_u32() % DMA_LL_BUF_SIZE) + 1; + ret = edma_submit_sync_txrx(epfnv, ll_desc_wr, ll_desc_rd, + nents); + if (ret < 0) { + dev_err(epfnv->fdev, "%s: DMA LL stress failed\n", + __func__); + goto fail; + } + dev_info(epfnv->fdev, "%s: LL stress test iteration %d done\n", + __func__, i); + } + dev_info(epfnv->fdev, "%s: LL stress: all CH, rand SZ, count: %d success\n", + __func__, epfnv->stress_count); + + edma_ll_deinit(epfnv); + edma_deinit(epfnv); + +fail: + return 0; +} + +/* debugfs to perform eDMA lib transfers and do CRC check */ +static int edmalib_test(struct seq_file *s, void *data) +{ + struct pcie_epf_dma *epfnv = (struct pcie_epf_dma *) + dev_get_drvdata(s->private); + struct pcie_epf_bar0 *epf_bar0 = (struct pcie_epf_bar0 *) + epfnv->bar0_virt; + + if (!epf_bar0->rp_phy_addr) { + dev_err(epfnv->fdev, "RP DMA address is null\n"); + return -1; + } + + epfnv->edma.src_dma_addr = epf_bar0->ep_phy_addr + BAR0_DMA_BUF_OFFSET; + epfnv->edma.dst_dma_addr = epf_bar0->rp_phy_addr + BAR0_DMA_BUF_OFFSET; + epfnv->edma.fdev = epfnv->fdev; + epfnv->edma.bar0_virt = epfnv->bar0_virt; + epfnv->edma.src_virt = epfnv->bar0_virt + BAR0_DMA_BUF_OFFSET; + epfnv->edma.dma_base = epfnv->dma_base; + epfnv->edma.dma_size = epfnv->dma_size; + epfnv->edma.stress_count = epfnv->stress_count; + epfnv->edma.edma_ch = epfnv->edma_ch; + epfnv->edma.nents = epfnv->nents; + epfnv->edma.of_node = epfnv->cdev->of_node; + + return edmalib_common_test(&epfnv->edma); +} + +/* debugfs to perform direct & LL DMA and do CRC check */ +static int sanity_test(struct seq_file *s, void *data) +{ + struct pcie_epf_dma *epfnv = (struct pcie_epf_dma *) + dev_get_drvdata(s->private); + struct pcie_epf_bar0 *epf_bar0 = (struct pcie_epf_bar0 *) + epfnv->bar0_virt; + struct edma_desc desc; + struct edma_desc ll_desc[DMA_LL_DEFAULT_SIZE]; + dma_addr_t ep_dma_addr = epf_bar0->ep_phy_addr + BAR0_DMA_BUF_OFFSET; + dma_addr_t rp_dma_addr = epf_bar0->rp_phy_addr + BAR0_DMA_BUF_OFFSET; + int nents = DMA_LL_DEFAULT_SIZE; + int i, j, ret; + u32 crc; + + if (epfnv->dma_size > MAX_DMA_ELE_SIZE) { + dev_err(epfnv->fdev, "%s: dma_size should be <= 0x%x\n", + __func__, MAX_DMA_ELE_SIZE); + goto fail; + } + + if (!rp_dma_addr) { + dev_err(epfnv->fdev, "RP DMA address is null\n"); + return 0; + } + + edma_init(epfnv, 0); + + /* Direct DMA of size epfnv->dma_size */ + for (i = 0; i < DMA_WR_CHNL_NUM; i++) { + desc.src = ep_dma_addr; + desc.dst = rp_dma_addr; + desc.sz = epfnv->dma_size; + epf_bar0->wr_data[i].size = desc.sz; + /* generate random bytes to transfer */ + get_random_bytes(epfnv->bar0_virt + BAR0_DMA_BUF_OFFSET, + desc.sz); + ret = edma_submit_direct_tx(epfnv, &desc, i); + if (ret < 0) { + dev_err(epfnv->fdev, "%s: DD WR CH: %d failed\n", + __func__, i); + goto fail; + } + crc = crc32_le(~0, epfnv->bar0_virt + 
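/*
 * Editor's note, not part of the patch: the sanity checks in sanity_test()
 * compare a local crc32_le(~0, buf, size) over the EP buffer against the crc
 * field of wr_data[]/rd_data[] in the shared BAR0 header; the root-port side
 * test driver is presumably the one that computes and writes those crc
 * values over the same data, which is what turns the comparison into an
 * end-to-end integrity check.
 */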
BAR0_DMA_BUF_OFFSET, + desc.sz); + if (crc != epf_bar0->wr_data[i].crc) { + dev_err(epfnv->fdev, "%s: DD WR, SZ: %lu B CH: %d CRC failed\n", + __func__, desc.sz, i); + goto fail; + } + dev_info(epfnv->fdev, "%s: DD WR, SZ: %lu B CH: %d success\n", + __func__, desc.sz, i); + } + + for (i = 0; i < DMA_RD_CHNL_NUM; i++) { + desc.src = rp_dma_addr; + desc.dst = ep_dma_addr; + desc.sz = epfnv->dma_size; + epf_bar0->rd_data[i].size = desc.sz; + /* Clear memory to receive data */ + memset(epfnv->bar0_virt + BAR0_DMA_BUF_OFFSET, 0, desc.sz); + ret = edma_submit_direct_rx(epfnv, &desc, i); + if (ret < 0) { + dev_err(epfnv->fdev, "%s: DD RD CH: %d failed\n", + __func__, i); + goto fail; + } + crc = crc32_le(~0, epfnv->bar0_virt + BAR0_DMA_BUF_OFFSET, + desc.sz); + if (crc != epf_bar0->rd_data[i].crc) { + dev_err(epfnv->fdev, "%s: DD RD, SZ: %lu B CH: %d CRC failed\n", + __func__, desc.sz, i); + goto fail; + } + dev_info(epfnv->fdev, "%s: DD RD, SZ: %lu B CH: %d success\n", + __func__, desc.sz, i); + } + + /* Clean DMA LL all 6 channels */ + memset(epfnv->bar0_virt + DMA_LL_WR_OFFSET(0), 0, 6 * DMA_LL_SIZE); + edma_ll_init(epfnv); + + /* LL DMA with size epfnv->dma_size per desc */ + for (i = 0; i < DMA_WR_CHNL_NUM; i++) { + for (j = 0; j < nents; j++) { + ll_desc[j].src = ep_dma_addr + (j * epfnv->dma_size); + ll_desc[j].dst = rp_dma_addr + (j * epfnv->dma_size); + ll_desc[j].sz = epfnv->dma_size; + } + epf_bar0->wr_data[i].size = epfnv->dma_size * nents; + /* generate random bytes to transfer */ + get_random_bytes(epfnv->bar0_virt + BAR0_DMA_BUF_OFFSET, + epf_bar0->wr_data[i].size); + + ret = edma_submit_sync_tx(epfnv, ll_desc, nents, i, 0); + if (ret < 0) { + dev_err(epfnv->fdev, "%s: LL WR CH: %d failed\n", + __func__, i); + goto fail; + } + crc = crc32_le(~0, epfnv->bar0_virt + BAR0_DMA_BUF_OFFSET, + epfnv->dma_size * nents); + if (crc != epf_bar0->wr_data[i].crc) { + dev_err(epfnv->fdev, "%s: LL WR, SZ: %u B CH: %d CRC failed\n", + __func__, epfnv->dma_size, i); + goto fail; + } + dev_info(epfnv->fdev, "%s: LL WR, SZ: %u B CH: %d success\n", + __func__, epfnv->dma_size, i); + } + + for (i = 0; i < DMA_RD_CHNL_NUM; i++) { + for (j = 0; j < nents; j++) { + ll_desc[j].src = rp_dma_addr + (j * epfnv->dma_size); + ll_desc[j].dst = ep_dma_addr + (j * epfnv->dma_size); + ll_desc[j].sz = epfnv->dma_size; + } + epf_bar0->rd_data[i].size = epfnv->dma_size * nents; + /* Clear memory to receive data */ + memset(epfnv->bar0_virt + BAR0_DMA_BUF_OFFSET, 0, + epf_bar0->rd_data[i].size); + + ret = edma_submit_sync_rx(epfnv, ll_desc, nents, i, 0); + if (ret < 0) { + dev_err(epfnv->fdev, "%s: LL RD failed\n", __func__); + goto fail; + } + crc = crc32_le(~0, epfnv->bar0_virt + BAR0_DMA_BUF_OFFSET, + epfnv->dma_size * nents); + if (crc != epf_bar0->rd_data[i].crc) { + dev_err(epfnv->fdev, "%s: LL RD, SZ: %u B CH: %d CRC failed\n", + __func__, epfnv->dma_size, i); + goto fail; + } + dev_info(epfnv->fdev, "%s: LL RD, SZ: %u B CH: %d success\n", + __func__, epfnv->dma_size, i); + } + + edma_ll_deinit(epfnv); + edma_deinit(epfnv); + +fail: + return 0; +} + +#ifdef THREAD_ON_CPU +static int async_dma_test_fn(struct pcie_epf_dma *epfnv, int ch) +{ + struct pcie_epf_bar0 *epf_bar0 = (struct pcie_epf_bar0 *) + epfnv->bar0_virt; + dma_addr_t ep_dma_addr, rp_dma_addr, phy_addr; + struct dma_ll *dma_ll_virt; + u32 nents = epfnv->async_count, count = 0, idx, i; + + ep_dma_addr = epf_bar0->ep_phy_addr + BAR0_DMA_BUF_OFFSET + + (ch * DMA_ASYNC_LL_SIZE * SZ_64K); + rp_dma_addr = epf_bar0->rp_phy_addr + BAR0_DMA_BUF_OFFSET + + (ch 
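/*
 * Editor's note, not part of the patch: async_dma_test_fn() treats the
 * channel's linked list as a ring of DMA_ASYNC_LL_SIZE data elements plus one
 * trailing link element (ele.llp/ele.tcb set) that points back to the head.
 * pcs[ch] appears to act as the producer cycle state: each element is armed
 * by writing ele.cb with the current cycle value only after its other fields
 * are filled in (hence the wmb() below), and the value is flipped every time
 * the producer wraps.  Consumer progress is not counted per interrupt;
 * pcie_async_dma_handler() instead reads the channel's DMA_LLP_* registers to
 * see how far the engine has advanced and bumps rd_cnt[] to match.
 */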
* DMA_ASYNC_LL_SIZE * SZ_64K); + + epfnv->wr_cnt[ch] = 0; + epfnv->rd_cnt[ch] = 0; + + dma_ll_virt = (struct dma_ll *) + (epfnv->bar0_virt + DMA_LL_WR_OFFSET(ch)); + phy_addr = epf_bar0->ep_phy_addr + DMA_LL_WR_OFFSET(ch); + dma_ll_virt[DMA_ASYNC_LL_SIZE].src_low = lower_32_bits(phy_addr); + dma_ll_virt[DMA_ASYNC_LL_SIZE].src_high = upper_32_bits(phy_addr); + dma_ll_virt[DMA_ASYNC_LL_SIZE].ele.llp = 1; + dma_ll_virt[DMA_ASYNC_LL_SIZE].ele.tcb = 1; + epfnv->pcs[ch] = 1; + dma_ll_virt[DMA_ASYNC_LL_SIZE].ele.cb = !epfnv->pcs[ch]; + + for (i = 0; i < nents; i++) { + while ((epfnv->wr_cnt[ch] - epfnv->rd_cnt[ch] + 2) >= + DMA_ASYNC_LL_SIZE) { + msleep(100); + if (++count == 100) { + dev_info(epfnv->fdev, "%s: CH: %d LL is full wr_cnt: %u rd_cnt: %u\n", + __func__, ch, epfnv->wr_cnt[ch], + epfnv->rd_cnt[ch]); + goto fail; + } + } + count = 0; + + idx = i % DMA_ASYNC_LL_SIZE; + + dma_ll_virt[idx].size = SZ_64K; + if (ch < DMA_WR_CHNL_NUM) { + phy_addr = ep_dma_addr + + (idx % DMA_ASYNC_LL_SIZE) * SZ_64K; + dma_ll_virt[idx].src_low = lower_32_bits(phy_addr); + dma_ll_virt[idx].src_high = upper_32_bits(phy_addr); + phy_addr = rp_dma_addr + + (idx % DMA_ASYNC_LL_SIZE) * SZ_64K; + dma_ll_virt[idx].dst_low = lower_32_bits(phy_addr); + dma_ll_virt[idx].dst_high = upper_32_bits(phy_addr); + } else { + phy_addr = rp_dma_addr + + (idx % DMA_ASYNC_LL_SIZE) * SZ_64K; + dma_ll_virt[idx].src_low = lower_32_bits(phy_addr); + dma_ll_virt[idx].src_high = upper_32_bits(phy_addr); + phy_addr = ep_dma_addr + + (idx % DMA_ASYNC_LL_SIZE) * SZ_64K; + dma_ll_virt[idx].dst_low = lower_32_bits(phy_addr); + dma_ll_virt[idx].dst_high = upper_32_bits(phy_addr); + } + dma_ll_virt[idx].ele.lie = 1; + /* + * DMA desc should not be touched after CB bit set, add + * write barrier to stop desc writes going out of order + * wrt CB bit set. 
+ */ + wmb(); + dma_ll_virt[idx].ele.cb = epfnv->pcs[ch]; + if (idx == (DMA_ASYNC_LL_SIZE - 1)) { + epfnv->pcs[ch] = !epfnv->pcs[ch]; + dma_ll_virt[idx + 1].ele.cb = epfnv->pcs[ch]; + } + if (ch < DMA_WR_CHNL_NUM) + dma_common_wr(epfnv->dma_base, ch, + DMA_WRITE_DOORBELL_OFF); + else + dma_common_wr(epfnv->dma_base, ch - DMA_WR_CHNL_NUM, + DMA_READ_DOORBELL_OFF); + epfnv->wr_cnt[ch]++; + /* Print status for very 10000 iterations */ + if ((i % 10000) == 0) + dev_info(epfnv->fdev, "%s: CH: %u async DMA test itr: %u done, wr_cnt: %u rd_cnt: %u\n", + __func__, ch, i, epfnv->wr_cnt[ch], + epfnv->rd_cnt[ch]); + } + count = 0; + while (epfnv->wr_cnt[ch] != epfnv->rd_cnt[ch]) { + msleep(20); + if (++count == 100) { + dev_info(epfnv->fdev, "%s: CH: %d async DMA test failed, wr_cnt: %u rd_cnt: %u\n", + __func__, ch, epfnv->wr_cnt[ch], + epfnv->rd_cnt[ch]); + goto fail; + } + } + dev_info(epfnv->fdev, "%s: CH: %d async DMA success\n", __func__, ch); + +fail: + epfnv->wr_cnt[ch] = 0; + epfnv->rd_cnt[ch] = 0; + + return 0; +} + +static int async_wr0_work(void *data) +{ + struct pcie_epf_dma *epfnv = data; + + async_dma_test_fn(epfnv, 0); + + epfnv->task_done++; + wake_up(&epfnv->task_wq); + + return 0; +} + +static int async_wr1_work(void *data) +{ + struct pcie_epf_dma *epfnv = data; + + async_dma_test_fn(epfnv, 1); + + epfnv->task_done++; + wake_up(&epfnv->task_wq); + + return 0; +} + +static int async_wr2_work(void *data) +{ + struct pcie_epf_dma *epfnv = data; + + async_dma_test_fn(epfnv, 2); + + epfnv->task_done++; + wake_up(&epfnv->task_wq); + + return 0; +} + +static int async_wr3_work(void *data) +{ + struct pcie_epf_dma *epfnv = data; + + async_dma_test_fn(epfnv, 3); + + epfnv->task_done++; + wake_up(&epfnv->task_wq); + + return 0; +} + +static int async_rd0_work(void *data) +{ + struct pcie_epf_dma *epfnv = data; + + async_dma_test_fn(epfnv, 4); + + epfnv->task_done++; + wake_up(&epfnv->task_wq); + + return 0; +} + +static int async_rd1_work(void *data) +{ + struct pcie_epf_dma *epfnv = data; + + async_dma_test_fn(epfnv, 5); + + epfnv->task_done++; + wake_up(&epfnv->task_wq); + + return 0; +} +#endif +static int async_dma_test(struct seq_file *s, void *data) +{ + struct pcie_epf_dma *epfnv = (struct pcie_epf_dma *) + dev_get_drvdata(s->private); + struct pcie_epf_bar0 *epf_bar0 = (struct pcie_epf_bar0 *) + epfnv->bar0_virt; + dma_addr_t phy_addr; + int i; + + epfnv->task_done = 0; + epfnv->async_dma = true; + + edma_init(epfnv, 1); + /* Clean DMA LL all 6 channels */ + memset(epfnv->bar0_virt + DMA_LL_WR_OFFSET(0), 0, 6 * DMA_LL_SIZE); + edma_ll_init(epfnv); + + /* Program DMA LL base address in DMA LL pointer register */ + for (i = 0; i < DMA_WR_CHNL_NUM; i++) { + phy_addr = epf_bar0->ep_phy_addr + DMA_LL_WR_OFFSET(i); + dma_channel_wr(epfnv->dma_base, i, lower_32_bits(phy_addr), + DMA_LLP_LOW_OFF_WRCH); + dma_channel_wr(epfnv->dma_base, i, upper_32_bits(phy_addr), + DMA_LLP_HIGH_OFF_WRCH); + } + + for (i = 0; i < DMA_RD_CHNL_NUM; i++) { + phy_addr = epf_bar0->ep_phy_addr + DMA_LL_RD_OFFSET(i); + dma_channel_wr(epfnv->dma_base, i, lower_32_bits(phy_addr), + DMA_LLP_LOW_OFF_RDCH); + dma_channel_wr(epfnv->dma_base, i, upper_32_bits(phy_addr), + DMA_LLP_HIGH_OFF_RDCH); + } + +#ifdef THREAD_ON_CPU + epfnv->wr0_task = kthread_create_on_cpu(async_wr0_work, epfnv, 0, + "async_wr0_work"); + if (IS_ERR(epfnv->wr0_task)) { + dev_err(epfnv->fdev, "failed to create async_wr0 thread\n"); + goto wr0_task; + } + epfnv->wr1_task = kthread_create_on_cpu(async_wr1_work, epfnv, 1, + "async_wr1_work"); + if 
(IS_ERR(epfnv->wr1_task)) { + dev_err(epfnv->fdev, "failed to create async_wr1 thread\n"); + goto wr1_task; + } + epfnv->wr2_task = kthread_create_on_cpu(async_wr2_work, epfnv, 2, + "async_wr2_work"); + if (IS_ERR(epfnv->wr2_task)) { + dev_err(epfnv->fdev, "failed to create async_wr2 thread\n"); + goto wr2_task; + } + epfnv->wr3_task = kthread_create_on_cpu(async_wr3_work, epfnv, 3, + "async_wr3_work"); + if (IS_ERR(epfnv->wr3_task)) { + dev_err(epfnv->fdev, "failed to create async_wr3 thread\n"); + goto wr3_task; + } + epfnv->rd0_task = kthread_create_on_cpu(async_rd0_work, epfnv, 4, + "async_rd0_work"); + if (IS_ERR(epfnv->rd0_task)) { + dev_err(epfnv->fdev, "failed to create async_rd0 thread\n"); + goto rd0_task; + } + epfnv->rd1_task = kthread_create_on_cpu(async_rd1_work, epfnv, 5, + "async_rd1_work"); + if (IS_ERR(epfnv->rd1_task)) { + dev_err(epfnv->fdev, "failed to create async_rd1 thread\n"); + goto rd1_task; + } +#endif + init_waitqueue_head(&epfnv->task_wq); + + wake_up_process(epfnv->wr0_task); + wake_up_process(epfnv->wr1_task); + wake_up_process(epfnv->wr2_task); + wake_up_process(epfnv->wr3_task); + wake_up_process(epfnv->rd0_task); + wake_up_process(epfnv->rd1_task); + + wait_event(epfnv->task_wq, + (epfnv->task_done == (DMA_WR_CHNL_NUM + DMA_RD_CHNL_NUM))); + dev_info(epfnv->fdev, "%s: Async DMA test done\n", __func__); + + edma_ll_deinit(epfnv); + edma_deinit(epfnv); + + epfnv->async_dma = false; + epfnv->task_done = 0; + + return 0; + +#ifdef THREAD_ON_CPU +rd1_task: + kthread_stop(epfnv->rd0_task); +rd0_task: + kthread_stop(epfnv->wr3_task); +wr3_task: + kthread_stop(epfnv->wr2_task); +wr2_task: + kthread_stop(epfnv->wr1_task); +wr1_task: + kthread_stop(epfnv->wr0_task); +wr0_task: +#endif + epfnv->async_dma = false; + epfnv->task_done = 0; + + return 0; +} + +static void init_debugfs(struct pcie_epf_dma *epfnv) +{ + debugfs_create_devm_seqfile(epfnv->fdev, "perf_test", epfnv->debugfs, + perf_test); + + debugfs_create_devm_seqfile(epfnv->fdev, "stress_test", epfnv->debugfs, + stress_test); + + debugfs_create_devm_seqfile(epfnv->fdev, "sanity_test", epfnv->debugfs, + sanity_test); + + debugfs_create_devm_seqfile(epfnv->fdev, "async_dma_test", + epfnv->debugfs, async_dma_test); + debugfs_create_devm_seqfile(epfnv->fdev, "edmalib_test", epfnv->debugfs, + edmalib_test); + + debugfs_create_u32("dma_size", 0644, epfnv->debugfs, &epfnv->dma_size); + epfnv->dma_size = SZ_1M; + epfnv->edma.st_as_ch = -1; + + debugfs_create_u32("edma_ch", 0644, epfnv->debugfs, &epfnv->edma_ch); + /* Enable ASYNC for ch 0 as default and other channels. Usage: + * BITS 0-3 - Async mode or sync mode for corresponding WR channels + * BITS 4-7 - Enable/disable corresponding WR channel + * BITS 8-9 - Async mode or sync mode for corresponding RD channels + * BITS 10-11 - Enable/disable or corresponding RD channels + * Bit 12 - Abort testing. 
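 *
 * Editor's note, not part of the patch: the default programmed below is 0xF1,
 * i.e. bits 4-7 = 0xF enable all four WR channels and bits 0-3 = 0x1 run WR
 * channel 0 in async mode with the rest sync, while the RD channel bits stay
 * clear.  The abort and remote-test switches checked by EDMA_ABORT_TEST_EN
 * and REMOTE_EDMA_TEST_EN in tegra-pcie-edma-test-common.h key off bits 30
 * and 31 (0x40000000 and 0x80000000) of this same value.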
+ */ + epfnv->edma_ch = 0xF1; + + debugfs_create_u32("nents", 0644, epfnv->debugfs, &epfnv->nents); + /* Set DMA_LL_DEFAULT_SIZE as default nents, Max NUM_EDMA_DESC */ + epfnv->nents = DMA_LL_DEFAULT_SIZE; + + debugfs_create_u32("stress_count", 0644, epfnv->debugfs, + &epfnv->stress_count); + epfnv->stress_count = DEFAULT_STRESS_COUNT; + + debugfs_create_u32("async_count", 0644, epfnv->debugfs, + &epfnv->async_count); + epfnv->async_count = 4096; +} + +static void pcie_dma_epf_write_msi_msg(struct msi_desc *desc, + struct msi_msg *msg) +{ + /* TODO get rid of global variable gepfnv */ + struct pcie_epf_bar0 *epf_bar0 = (struct pcie_epf_bar0 *) + gepfnv->bar0_virt; + struct device *cdev = msi_desc_to_dev(desc); + int idx = desc->platform.msi_index; + + epf_bar0->msi_data[idx] = msg->data; + dev_info(cdev, "%s: MSI idx: %d data: %d\n", __func__, idx, msg->data); +} + +static int pcie_dma_epf_core_init(struct pci_epf *epf) +{ + struct pci_epc *epc = epf->epc; + struct device *fdev = &epf->dev; + struct pci_epf_bar *epf_bar; + int ret; + + ret = lpci_epc_write_header(epc, epf->func_no, epf->header); + if (ret < 0) { + dev_err(fdev, "Failed to write PCIe header: %d\n", ret); + return ret; + } + + epf_bar = &epf->bar[BAR_0]; + ret = lpci_epc_set_bar(epc, epf->func_no, epf_bar); + if (ret < 0) { + dev_err(fdev, "PCIe set BAR0 failed: %d\n", ret); + return ret; + } + + dev_info(fdev, "BAR0 phy_addr: %llx size: %lx\n", + epf_bar->phys_addr, epf_bar->size); + + ret = lpci_epc_set_msi(epc, epf->func_no, epf->msi_interrupts); + if (ret) { + dev_err(fdev, "pci_epc_set_msi() failed: %d\n", ret); + return ret; + } + + return 0; +} + +static int pcie_dma_epf_msi_init(struct pci_epf *epf) +{ + struct pcie_epf_dma *epfnv = epf_get_drvdata(epf); + struct pci_epc *epc = epf->epc; + struct device *cdev = epc->dev.parent; + struct device *fdev = &epf->dev; + struct msi_desc *desc; + int ret; + + /* LL DMA in sanity test will not work without MSI for EP */ + if (!cdev->msi_domain) { + dev_info(fdev, "msi_domain absent, no interrupts\n"); + return 0; + } + ret = platform_msi_domain_alloc_irqs(cdev, + DMA_WR_CHNL_NUM + DMA_RD_CHNL_NUM, + pcie_dma_epf_write_msi_msg); + if (ret < 0) { + dev_err(fdev, "failed to allocate MSIs\n"); + return ret; + } + + for_each_msi_entry(desc, cdev) { + switch (desc->platform.msi_index) { + case 0: + ret = request_irq(desc->irq, pcie_dma_epf_wr0_msi, 0, + "pcie_dma_wr0", epfnv); + if (ret < 0) + dev_err(fdev, "failed to register wr0 irq\n"); + break; + case 1: + ret = request_irq(desc->irq, pcie_dma_epf_wr1_msi, 0, + "pcie_dma_wr1", epfnv); + if (ret < 0) + dev_err(fdev, "failed to register wr1 irq\n"); + break; + case 2: + ret = request_irq(desc->irq, pcie_dma_epf_wr2_msi, 0, + "pcie_dma_wr2", epfnv); + if (ret < 0) + dev_err(fdev, "failed to register wr2 irq\n"); + break; + case 3: + ret = request_irq(desc->irq, pcie_dma_epf_wr3_msi, 0, + "pcie_dma_wr3", epfnv); + if (ret < 0) + dev_err(fdev, "failed to register wr3 irq\n"); + break; + case 4: + ret = request_irq(desc->irq, pcie_dma_epf_rd0_msi, 0, + "pcie_dma_rd0", epfnv); + if (ret < 0) + dev_err(fdev, "failed to register rd0 irq\n"); + break; + case 5: + ret = request_irq(desc->irq, pcie_dma_epf_rd1_msi, 0, + "pcie_dma_rd1", epfnv); + if (ret < 0) + dev_err(fdev, "failed to register rd1 irq\n"); + break; + default: + dev_err(fdev, "Unknown MSI irq: %d\n", desc->irq); + continue; + } + } + + return 0; +} + +static void pcie_dma_epf_msi_deinit(struct pci_epf *epf) +{ + struct pcie_epf_dma *epfnv = epf_get_drvdata(epf); + struct 
pci_epc *epc = epf->epc; + struct device *cdev = epc->dev.parent; + struct device *fdev = &epf->dev; + struct msi_desc *desc; + + /* LL DMA in sanity test will not work without MSI for EP */ + if (!cdev->msi_domain) { + dev_info(fdev, "msi_domain absent, no interrupts\n"); + return; + } + + for_each_msi_entry(desc, cdev) + free_irq(desc->irq, epfnv); + + platform_msi_domain_free_irqs(cdev); +} + +static int pcie_dma_epf_notifier(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct pci_epf *epf = container_of(nb, struct pci_epf, nb); + int ret; + + switch (val) { + case CORE_INIT: + ret = pcie_dma_epf_core_init(epf); + if (ret < 0) + return NOTIFY_BAD; + break; + + case LINK_UP: + break; + + default: + dev_err(&epf->dev, "Invalid notifier event\n"); + return NOTIFY_BAD; + } + + return NOTIFY_OK; +} + +#ifdef PCI_EPF_CORE_DEINIT +static int pcie_dma_epf_block_notifier(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct pci_epf *epf = container_of(nb, struct pci_epf, block_nb); + struct pcie_epf_dma *epfnv = epf_get_drvdata(epf); + void *cookie = epfnv->edma.cookie; + struct pcie_epf_bar0 *epf_bar0 = (struct pcie_epf_bar0 *) epfnv->bar0_virt; + + switch (val) { + case CORE_DEINIT: + epfnv->edma.cookie = NULL; + epf_bar0->rp_phy_addr = 0; + tegra_pcie_edma_deinit(cookie); + break; + + default: + dev_err(&epf->dev, "Invalid blocking notifier event\n"); + return NOTIFY_BAD; + } + + return NOTIFY_OK; +} +#endif + +static void pcie_dma_epf_unbind(struct pci_epf *epf) +{ + struct pcie_epf_dma *epfnv = epf_get_drvdata(epf); + struct pci_epc *epc = epf->epc; + struct pci_epf_bar *epf_bar = &epf->bar[BAR_0]; + void *cookie = epfnv->edma.cookie; + struct pcie_epf_bar0 *epf_bar0 = (struct pcie_epf_bar0 *) epfnv->bar0_virt; + + epfnv->edma.cookie = NULL; + epf_bar0->rp_phy_addr = 0; + tegra_pcie_edma_deinit(cookie); + + pcie_dma_epf_msi_deinit(epf); + pci_epc_stop(epc); + lpci_epc_clear_bar(epc, epf->func_no, epf_bar); + lpci_epf_free_space(epf, epfnv->bar0_virt, BAR_0); +} + +static int pcie_dma_epf_bind(struct pci_epf *epf) +{ + struct pci_epc *epc = epf->epc; + struct pcie_epf_dma *epfnv = epf_get_drvdata(epf); + struct device *fdev = &epf->dev; + struct device *cdev = epc->dev.parent; + struct platform_device *pdev = of_find_device_by_node(cdev->of_node); + struct pci_epf_bar *epf_bar = &epf->bar[BAR_0]; + struct pcie_epf_bar0 *epf_bar0; + struct resource *res; + char *name; + int ret, i; + + epfnv->fdev = fdev; + epfnv->cdev = cdev; + + epfnv->bar0_virt = lpci_epf_alloc_space(epf, BAR0_SIZE, BAR_0, SZ_64K); + if (!epfnv->bar0_virt) { + dev_err(fdev, "Failed to allocate memory for BAR0\n"); + return -ENOMEM; + } + get_random_bytes(epfnv->bar0_virt, BAR0_SIZE); + memset(epfnv->bar0_virt, 0, BAR0_HEADER_SIZE); + + /* Update BAR header with EP DMA PHY addr */ + epf_bar0 = (struct pcie_epf_bar0 *)epfnv->bar0_virt; + epf_bar0->ep_phy_addr = epf_bar->phys_addr; + /* Set BAR0 mem type as 64-bit */ + epf_bar->flags |= PCI_BASE_ADDRESS_MEM_TYPE_64 | + PCI_BASE_ADDRESS_MEM_PREFETCH; + + epf->nb.notifier_call = pcie_dma_epf_notifier; + pci_epc_register_notifier(epc, &epf->nb); + +#ifdef PCI_EPF_CORE_DEINIT + epf->block_nb.notifier_call = pcie_dma_epf_block_notifier; + pci_epc_register_block_notifier(epc, &epf->block_nb); +#endif + + res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "atu_dma"); + if (!res) { + dev_err(fdev, "missing atu_dma resource in DT\n"); + ret = PTR_ERR(res); + goto fail_atu_dma; + } + + epfnv->dma_base = devm_ioremap(fdev, res->start + 
DMA_OFFSET, + resource_size(res) - DMA_OFFSET); + if (IS_ERR(epfnv->dma_base)) { + ret = PTR_ERR(epfnv->dma_base); + dev_err(fdev, "dma region map failed: %d\n", ret); + goto fail_atu_dma; + } + + epfnv->irq = platform_get_irq_byname(pdev, "intr"); + if (!epfnv->irq) { + dev_err(fdev, "failed to get intr interrupt\n"); + ret = -ENODEV; + goto fail_atu_dma; + } + + name = devm_kasprintf(fdev, GFP_KERNEL, "%s_epf_dma_test", pdev->name); + if (!name) { + ret = -ENOMEM; + goto fail_atu_dma; + } + + ret = devm_request_threaded_irq(fdev, epfnv->irq, pcie_dma_epf_irq, + pcie_dma_epf_irq_handler, + IRQF_SHARED, + name, epfnv); + if (ret < 0) { + dev_err(fdev, "failed to request \"intr\" irq\n"); + goto fail_atu_dma; + } + + ret = pcie_dma_epf_msi_init(epf); + if (ret < 0) { + dev_err(fdev, "failed to init platform msi: %d\n", ret); + goto fail_atu_dma; + } + + for (i = 0; i < DMA_WR_CHNL_NUM; i++) { + init_waitqueue_head(&epfnv->wr_wq[i]); + init_waitqueue_head(&epfnv->edma.wr_wq[i]); + } + + for (i = 0; i < DMA_RD_CHNL_NUM; i++) { + init_waitqueue_head(&epfnv->rd_wq[i]); + init_waitqueue_head(&epfnv->edma.rd_wq[i]); + } + + epfnv->debugfs = debugfs_create_dir(name, NULL); + init_debugfs(epfnv); + + return 0; + +fail_atu_dma: + lpci_epf_free_space(epf, epfnv->bar0_virt, BAR_0); + + return ret; +} + +static const struct pci_epf_device_id pcie_dma_epf_ids[] = { + { + .name = "tegra_pcie_dma_epf", + }, + {}, +}; + +static int pcie_dma_epf_probe(struct pci_epf *epf) +{ + struct device *dev = &epf->dev; + struct pcie_epf_dma *epfnv; + + epfnv = devm_kzalloc(dev, sizeof(*epfnv), GFP_KERNEL); + if (!epfnv) + return -ENOMEM; + + epfnv->edma.ll_desc = devm_kzalloc(dev, sizeof(*epfnv->ll_desc) * NUM_EDMA_DESC, + GFP_KERNEL); + if (!epfnv) + return -ENOMEM; + + gepfnv = epfnv; + epf_set_drvdata(epf, epfnv); + + epfnv->header.vendorid = PCI_VENDOR_ID_NVIDIA; + epfnv->header.deviceid = 0x1AD6; + epfnv->header.baseclass_code = PCI_BASE_CLASS_MEMORY; + epfnv->header.interrupt_pin = PCI_INTERRUPT_INTA; + epf->header = &epfnv->header; + + return 0; +} + +static struct pci_epf_ops ops = { + .unbind = pcie_dma_epf_unbind, + .bind = pcie_dma_epf_bind, +}; + +static struct pci_epf_driver test_driver = { + .driver.name = "pcie_dma_epf", + .probe = pcie_dma_epf_probe, + .id_table = pcie_dma_epf_ids, + .ops = &ops, + .owner = THIS_MODULE, +}; + +static int __init pcie_dma_epf_init(void) +{ + int ret; + + ret = pci_epf_register_driver(&test_driver); + if (ret < 0) { + pr_err("Failed to register PCIe DMA EPF driver: %d\n", ret); + return ret; + } + + return 0; +} +module_init(pcie_dma_epf_init); + +static void __exit pcie_dma_epf_exit(void) +{ + pci_epf_unregister_driver(&test_driver); +} +module_exit(pcie_dma_epf_exit); + +MODULE_DESCRIPTION("TEGRA PCIe DMA EPF driver"); +MODULE_AUTHOR("Om Prakash Singh "); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/pci/endpoint/functions/pci-epf-wrapper.h b/drivers/pci/endpoint/functions/pci-epf-wrapper.h new file mode 100644 index 00000000..8856a979 --- /dev/null +++ b/drivers/pci/endpoint/functions/pci-epf-wrapper.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * PCIe EDMA Framework + * + * Copyright (C) 2022 NVIDIA Corporation. All rights reserved. 
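 *
 * Editor's note, not part of the patch: the lpci_* wrappers below exist so
 * the same driver source builds against both pci_epc/pci_epf APIs.  On
 * kernels newer than 5.14 they insert the extra virtual-function-number
 * argument (always 0 here) into the pci_epc_* calls and pass
 * PRIMARY_INTERFACE to pci_epf_alloc_space()/pci_epf_free_space(); on older
 * kernels they collapse to the plain calls.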
+ */ + +#ifndef PCI_EPF_WRAPPER_H +#define PCI_EPF_WRAPPER_H + +#include + +#if (LINUX_VERSION_CODE > KERNEL_VERSION(5, 14, 0)) +#define lpci_epc_write_header(A, B, C) pci_epc_write_header(A, B, 0, C) +#define lpci_epc_raise_irq(A, B, C, D) pci_epc_raise_irq(A, B, 0, C, D) +#define lpci_epc_clear_bar(A, B, C) pci_epc_clear_bar(A, B, 0, C) +#define lpci_epc_set_msi(A, B, C) pci_epc_set_msi(A, B, 0, C) +#define lpci_epc_set_bar(A, B, C) pci_epc_set_bar(A, B, 0, C) +#define lpci_epc_unmap_addr(A, B, C) pci_epc_unmap_addr(A, B, 0, C) +#define lpci_epf_free_space(A, B, C) pci_epf_free_space(A, B, C, PRIMARY_INTERFACE) +#define lpci_epf_alloc_space(A, B, C, D) pci_epf_alloc_space(A, B, C, D, PRIMARY_INTERFACE) +#else +#define lpci_epc_write_header(A, B, C) pci_epc_write_header(A, B, C) +#define lpci_epc_raise_irq(A, B, C, D) pci_epc_raise_irq(A, B, C, D) +#define lpci_epc_clear_bar(A, B, C) pci_epc_clear_bar(A, B, C) +#define lpci_epc_set_msi(A, B, C) pci_epc_set_msi(A, B, C) +#define lpci_epc_set_bar(A, B, C) pci_epc_set_bar(A, B, C) +#define lpci_epc_unmap_addr(A, B, C) pci_epc_unmap_addr(A, B, C) +#define lpci_epf_free_space(A, B, C) pci_epf_free_space(A, B, C) +#define lpci_epf_alloc_space(A, B, C, D) pci_epf_alloc_space(A, B, C, D) +#endif /* LINUX_VERSION_CODE */ + +#endif /* PCI_EPF_WRAPPER_H */ diff --git a/include/linux/pcie_dma.h b/include/linux/pcie_dma.h new file mode 100644 index 00000000..2c5bedda --- /dev/null +++ b/include/linux/pcie_dma.h @@ -0,0 +1,186 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * PCIe DMA test framework for Tegra PCIe + * + * Copyright (C) 2021-2022 NVIDIA Corporation. All rights reserved. + */ + +#ifndef PCIE_DMA_H +#define PCIE_DMA_H + +/* Update DMA_DD_BUF_SIZE and DMA_LL_BUF_SIZE when changing BAR0_SIZE */ +#define BAR0_SIZE SZ_256M + +/* Header includes RP/EP DMA addresses, EP MSI, LL, etc. 
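 *
 * Editor's note, not part of the patch, summarising the macros below: the
 * first 1 MB of BAR0 is the control region, holding struct pcie_epf_bar0 at
 * offset 0, one 4 KB linked list per channel from offset 4 KB (four WR lists
 * followed by two RD lists) and the MSI area at 64 KB; the remaining
 * 1 MB .. 256 MB range (BAR0_DMA_BUF_OFFSET / BAR0_DMA_BUF_SIZE) is the DMA
 * data buffer.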
*/ +#define BAR0_HEADER_OFFSET 0x0 +#define BAR0_HEADER_SIZE SZ_1M +#define DMA_LL_OFFSET SZ_4K +#define DMA_LL_SIZE SZ_4K /* 4K size of LL serve 170 desc */ +#define DMA_LL_WR_OFFSET(i) (DMA_LL_OFFSET + ((i) * DMA_LL_SIZE)) +#define DMA_LL_RD_OFFSET(i) (DMA_LL_WR_OFFSET(4) + ((i) * DMA_LL_SIZE)) +#define DMA_LL_MIN_SIZE 1 +#define DMA_LL_DEFAULT_SIZE 8 +#define DMA_ASYNC_LL_SIZE 160 + +#define BAR0_MSI_OFFSET SZ_64K + +/* DMA'able memory range */ +#define BAR0_DMA_BUF_OFFSET SZ_1M +#define BAR0_DMA_BUF_SIZE (BAR0_SIZE - SZ_1M) +#define DMA_DD_BUF_SIZE SZ_32M +#define DMA_LL_BUF_SIZE SZ_4M + +/* Each DMA LL channel gets DMA_DD_BUF_SIZE and each desc DMA_LL_BUF_SIZE */ +#define DMA_LL_WR_BUF(i) (BAR0_DMA_BUF_OFFSET + (i) * DMA_DD_BUF_SIZE) +#define DMA_LL_RD_BUF(i) (DMA_LL_WR_BUF(4) + (i) * DMA_DD_BUF_SIZE) + +#define DEFAULT_STRESS_COUNT 10 + +#define MAX_DMA_ELE_SIZE SZ_16M + +/* DMA base offset starts at 0x20000 from ATU_DMA base */ +#define DMA_OFFSET 0x20000 + +#define DMA_RD_CHNL_NUM 2 +#define DMA_RD_CHNL_MASK 0x3 +#define DMA_WR_CHNL_NUM 4 +#define DMA_WR_CHNL_MASK 0xf + +/* DMA common registers */ +#define DMA_WRITE_ENGINE_EN_OFF 0xC +#define WRITE_ENABLE BIT(0) +#define WRITE_DISABLE 0x0 + +#define DMA_WRITE_DOORBELL_OFF 0x10 +#define DMA_WRITE_DOORBELL_OFF_WR_STOP BIT(31) + +#define DMA_READ_ENGINE_EN_OFF 0x2C +#define READ_ENABLE BIT(0) +#define READ_DISABLE 0x0 + +#define DMA_READ_DOORBELL_OFF 0x30 +#define DMA_READ_DOORBELL_OFF_RD_STOP BIT(31) + +#define DMA_WRITE_INT_STATUS_OFF 0x4C +#define DMA_WRITE_INT_MASK_OFF 0x54 +#define DMA_WRITE_INT_CLEAR_OFF 0x58 +#define DMA_WRITE_INT_DONE_MASK 0xF +#define DMA_WRITE_INT_ABORT_MASK 0xF0000 + +#define DMA_WRITE_ERR_STATUS_OFF 0x5C + +#define DMA_WRITE_DONE_IMWR_LOW_OFF 0x60 +#define DMA_WRITE_DONE_IMWR_HIGH_OFF 0x64 +#define DMA_WRITE_ABORT_IMWR_LOW_OFF 0x68 +#define DMA_WRITE_ABORT_IMWR_HIGH_OFF 0x6C + +#define DMA_WRITE_IMWR_DATA_OFF_BASE 0x70 + +#define DMA_READ_INT_STATUS_OFF 0xA0 +#define DMA_READ_INT_MASK_OFF 0xA8 +#define DMA_READ_INT_CLEAR_OFF 0xAC +#define DMA_READ_INT_DONE_MASK 0xF +#define DMA_READ_INT_ABORT_MASK 0xF0000 + +#define DMA_READ_DONE_IMWR_LOW_OFF 0xCC +#define DMA_READ_DONE_IMWR_HIGH_OFF 0xD0 +#define DMA_READ_ABORT_IMWR_LOW_OFF 0xD4 +#define DMA_READ_ABORT_IMWR_HIGH_OFF 0xD8 + +#define DMA_READ_IMWR_DATA_OFF_BASE 0xDC + +/* DMA channel specific registers */ +#define DMA_CH_CONTROL1_OFF_WRCH 0x0 +#define DMA_CH_CONTROL1_OFF_WRCH_LLE BIT(9) +#define DMA_CH_CONTROL1_OFF_WRCH_CCS BIT(8) +#define DMA_CH_CONTROL1_OFF_WRCH_RIE BIT(4) +#define DMA_CH_CONTROL1_OFF_WRCH_LIE BIT(3) +#define DMA_CH_CONTROL1_OFF_WRCH_LLP BIT(2) +#define DMA_TRANSFER_SIZE_OFF_WRCH 0x8 +#define DMA_SAR_LOW_OFF_WRCH 0xC +#define DMA_SAR_HIGH_OFF_WRCH 0x10 +#define DMA_DAR_LOW_OFF_WRCH 0x14 +#define DMA_DAR_HIGH_OFF_WRCH 0x18 +#define DMA_LLP_LOW_OFF_WRCH 0x1C +#define DMA_LLP_HIGH_OFF_WRCH 0x20 + +#define DMA_CH_CONTROL1_OFF_RDCH 0x100 +#define DMA_CH_CONTROL1_OFF_RDCH_LLE BIT(9) +#define DMA_CH_CONTROL1_OFF_RDCH_CCS BIT(8) +#define DMA_CH_CONTROL1_OFF_RDCH_RIE BIT(4) +#define DMA_CH_CONTROL1_OFF_RDCH_LIE BIT(3) +#define DMA_CH_CONTROL1_OFF_RDCH_LLP BIT(2) +#define DMA_TRANSFER_SIZE_OFF_RDCH 0x108 +#define DMA_SAR_LOW_OFF_RDCH 0x10C +#define DMA_SAR_HIGH_OFF_RDCH 0x110 +#define DMA_DAR_LOW_OFF_RDCH 0x114 +#define DMA_DAR_HIGH_OFF_RDCH 0x118 +#define DMA_LLP_LOW_OFF_RDCH 0x11C +#define DMA_LLP_HIGH_OFF_RDCH 0x120 + +struct sanity_data { + u32 size; + u32 crc; +}; + +/* First 1MB of BAR0 is reserved for control data */ +struct pcie_epf_bar0 { 
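	/*
	 * Editor's note, not part of the patch: the EP side fills ep_phy_addr
	 * at bind time (the physical address backing BAR0) and msi_data[]
	 * from its platform-MSI write callback, while rp_phy_addr and the
	 * crc fields of wr_data[]/rd_data[] are expected to be written by
	 * the root-port side test driver through BAR0.
	 */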
+ /* RP system memory allocated for EP DMA operations */ + u64 rp_phy_addr; + /* EP system memory allocated as BAR */ + u64 ep_phy_addr; + /* MSI data for RP -> EP interrupts */ + u32 msi_data[DMA_WR_CHNL_NUM + DMA_RD_CHNL_NUM]; + struct sanity_data wr_data[DMA_WR_CHNL_NUM]; + struct sanity_data rd_data[DMA_RD_CHNL_NUM]; +}; + +struct dma_ll_ctrl { + u32 cb:1; + u32 tcb:1; + u32 llp:1; + u32 lie:1; + u32 rie:1; +}; + +struct dma_ll { + volatile struct dma_ll_ctrl ele; + u32 size; + u32 src_low; + u32 src_high; + u32 dst_low; + u32 dst_high; +}; + +static inline void dma_common_wr16(void __iomem *p, u32 val, u32 offset) +{ + writew(val, offset + p); +} + +static inline u16 dma_common_rd16(void __iomem *p, u32 offset) +{ + return readw(offset + p); +} + +static inline void dma_common_wr(void __iomem *p, u32 val, u32 offset) +{ + writel(val, offset + p); +} + +static inline u32 dma_common_rd(void __iomem *p, u32 offset) +{ + return readl(offset + p); +} + +static inline void dma_channel_wr(void __iomem *p, u8 channel, u32 val, + u32 offset) +{ + writel(val, (0x200 * (channel + 1)) + offset + p); +} + +static inline u32 dma_channel_rd(void __iomem *p, u8 channel, u32 offset) +{ + return readl((0x200 * (channel + 1)) + offset + p); +} + +#endif diff --git a/include/linux/tegra-pcie-edma-test-common.h b/include/linux/tegra-pcie-edma-test-common.h new file mode 100644 index 00000000..fe7785f6 --- /dev/null +++ b/include/linux/tegra-pcie-edma-test-common.h @@ -0,0 +1,291 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * PCIe DMA EPF Library for Tegra PCIe + * + * Copyright (C) 2022 NVIDIA Corporation. All rights reserved. + */ + +#ifndef TEGRA_PCIE_EDMA_TEST_COMMON_H +#define TEGRA_PCIE_EDMA_TEST_COMMON_H + +#include +#include +#include + +#define EDMA_ABORT_TEST_EN (edma->edma_ch & 0x40000000) +#define IS_EDMA_CH_ENABLED(i) (edma->edma_ch & ((BIT(i) << 4))) +#define IS_EDMA_CH_ASYNC(i) (edma->edma_ch & BIT(i)) +#define REMOTE_EDMA_TEST_EN (edma->edma_ch & 0x80000000) +#define EDMA_PERF (edma->tsz / (diff / 1000)) +#define EDMA_CPERF ((edma->tsz * (edma->nents / edma->nents_per_ch)) / (diff / 1000)) + +#define EDMA_PRIV_CH_OFF 16 +#define EDMA_PRIV_LR_OFF 20 +#define EDMA_PRIV_XF_OFF 21 + +struct edmalib_common { + struct device *fdev; + void *bar0_virt; + void *src_virt; + void __iomem *dma_base; + u32 dma_size; + dma_addr_t src_dma_addr; + dma_addr_t dst_dma_addr; + dma_addr_t bar0_phy; + u32 stress_count; + void *cookie; + struct device_node *of_node; + wait_queue_head_t wr_wq[DMA_WR_CHNL_NUM]; + wait_queue_head_t rd_wq[DMA_RD_CHNL_NUM]; + unsigned long wr_busy; + unsigned long rd_busy; + ktime_t edma_start_time[DMA_WR_CHNL_NUM]; + u64 tsz; + u32 edma_ch; + u32 prev_edma_ch; + u32 nents; + struct tegra_pcie_edma_desc *ll_desc; + int priv_iter[DMA_WR_CHNL_NUM]; + struct pcie_tegra_edma_remote_info edma_remote; + u32 nents_per_ch; + u32 st_as_ch; + u32 ls_as_ch; +}; + +static struct edmalib_common *l_edma; + +static void edma_final_complete(void *priv, edma_xfer_status_t status, + struct tegra_pcie_edma_desc *desc) +{ + struct edmalib_common *edma = l_edma; + int cb = *(int *)priv; + u32 ch = (cb >> EDMA_PRIV_CH_OFF) & 0xF; + edma_xfer_type_t xfer_type = (cb >> EDMA_PRIV_XF_OFF) & 0x1; + char *xfer_str[2] = {"WR", "RD"}; + u32 l_r = (cb >> EDMA_PRIV_LR_OFF) & 0x1; + char *l_r_str[2] = {"local", "remote"}; + u64 diff = ktime_to_ns(ktime_get()) - ktime_to_ns(edma->edma_start_time[ch]); + u64 cdiff = ktime_to_ns(ktime_get()) - ktime_to_ns(edma->edma_start_time[edma->st_as_ch]); + + cb = cb & 
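/*
 * Editor's note, not part of the patch: the priv word being unpacked here is
 * built in edmalib_common_test() as
 *   iteration | (xfer_type << EDMA_PRIV_XF_OFF) | (local_or_remote << EDMA_PRIV_LR_OFF) | (channel << EDMA_PRIV_CH_OFF)
 * so bits 0-15 carry the iteration count, bits 16-19 the channel, bit 20
 * local vs. remote and bit 21 write vs. read.
 */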
+        if (EDMA_ABORT_TEST_EN && status == EDMA_XFER_SUCCESS)
+                dma_common_wr(edma->dma_base, DMA_WRITE_DOORBELL_OFF_WR_STOP | (ch + 1),
+                              DMA_WRITE_DOORBELL_OFF);
+
+        dev_info(edma->fdev, "%s: %s-%s-Async complete for chan %d with status %d. Total desc %d of Sz %d Bytes done in time %llu nsec. Perf is %llu Mbps\n",
+                 __func__, xfer_str[xfer_type], l_r_str[l_r], ch, status, edma->nents_per_ch*(cb+1),
+                 edma->dma_size, diff, EDMA_PERF);
+
+        if (ch == edma->ls_as_ch)
+                dev_info(edma->fdev, "%s: All Async channels done. Cumulative Perf %llu Mbps, time %llu nsec\n",
+                         __func__, EDMA_CPERF, cdiff);
+}
+
+static void edma_complete(void *priv, edma_xfer_status_t status, struct tegra_pcie_edma_desc *desc)
+{
+        struct edmalib_common *edma = l_edma;
+        int cb = *(int *)priv;
+        u32 ch = (cb >> 16);
+
+        if (BIT(ch) & edma->wr_busy) {
+                edma->wr_busy &= ~(BIT(ch));
+                wake_up(&edma->wr_wq[ch]);
+        }
+
+        if (status == 0)
+                dev_dbg(edma->fdev, "%s: status %d, cb %d\n", __func__, status, cb);
+}
+
+/* debugfs helper to perform eDMA lib transfers and do a CRC check */
+static int edmalib_common_test(struct edmalib_common *edma)
+{
+        struct tegra_pcie_edma_desc *ll_desc = edma->ll_desc;
+        dma_addr_t src_dma_addr = edma->src_dma_addr;
+        dma_addr_t dst_dma_addr = edma->dst_dma_addr;
+        u32 nents = edma->nents, num_chans = 0, nents_per_ch = 0, nent_id = 0, chan_count;
+        u32 i, j, k, max_size, db_off, num_descriptors;
+        edma_xfer_status_t ret;
+        struct tegra_pcie_edma_init_info info = {};
+        struct tegra_pcie_edma_chans_info *chan_info;
+        struct tegra_pcie_edma_xfer_info tx_info = {};
+        u64 diff;
+        edma_xfer_type_t xfer_type;
+        char *xfer_str[2] = {"WR", "RD"};
+        u32 l_r;
+        char *l_r_str[2] = {"local", "remote"};
+
+        if (!edma->stress_count) {
+                tegra_pcie_edma_deinit(edma->cookie);
+                edma->cookie = NULL;
+                return 0;
+        }
+
+        l_edma = edma;
+
+        if (EDMA_ABORT_TEST_EN) {
+                edma->edma_ch &= ~0xFF;
+                /* only channels 0 and 2 are ASYNC; the async transfer on channel 0 gets aborted */
+                edma->edma_ch |= 0xF5;
+        }
+
+        if (edma->cookie && edma->prev_edma_ch != edma->edma_ch) {
+                edma->st_as_ch = -1;
+                dev_info(edma->fdev, "edma_ch changed from 0x%x to 0x%x, deinit\n",
+                         edma->prev_edma_ch, edma->edma_ch);
+                tegra_pcie_edma_deinit(edma->cookie);
+                edma->cookie = NULL;
+        }
+
+        info.np = edma->of_node;
+
+        if (REMOTE_EDMA_TEST_EN) {
+                num_descriptors = 1024;
+                info.rx[0].desc_phy_base = edma->bar0_phy + SZ_512K;
+                info.rx[0].desc_iova = 0xf0000000 + SZ_512K;
+                info.rx[1].desc_phy_base = edma->bar0_phy + SZ_512K + SZ_256K;
+                info.rx[1].desc_iova = 0xf0000000 + SZ_512K + SZ_256K;
+                info.edma_remote = &edma->edma_remote;
+                chan_count = DMA_RD_CHNL_NUM;
+                chan_info = &info.rx[0];
+                xfer_type = EDMA_XFER_READ;
+                db_off = DMA_WRITE_DOORBELL_OFF;
+                l_r = 1;
+        } else {
+                chan_count = DMA_WR_CHNL_NUM;
+                num_descriptors = 4096;
+                chan_info = &info.tx[0];
+                xfer_type = EDMA_XFER_WRITE;
+                db_off = DMA_READ_DOORBELL_OFF;
+                l_r = 0;
+        }
+
+        for (i = 0; i < chan_count; i++) {
+                struct tegra_pcie_edma_chans_info *ch = chan_info + i;
+
+                ch->ch_type = IS_EDMA_CH_ASYNC(i) ? EDMA_CHAN_XFER_ASYNC :
+                              EDMA_CHAN_XFER_SYNC;
+                if (IS_EDMA_CH_ENABLED(i)) {
+                        if (edma->st_as_ch == -1)
+                                edma->st_as_ch = i;
+                        edma->ls_as_ch = i;
+                        ch->num_descriptors = num_descriptors;
+                        num_chans++;
+                } else
+                        ch->num_descriptors = 0;
+        }
+
+        max_size = (BAR0_DMA_BUF_SIZE - BAR0_DMA_BUF_OFFSET) / 2;
+        if (((edma->dma_size * nents) > max_size) || (nents > NUM_EDMA_DESC)) {
+                dev_err(edma->fdev, "%s: total dma size across all nents(%d), max_nents(%d), dma_size(%d) must be <= 0x%x\n",
+                        __func__, nents, NUM_EDMA_DESC, edma->dma_size, max_size);
+                return 0;
+        }
+
+        nents_per_ch = nents / num_chans;
+        if (nents_per_ch == 0) {
+                dev_err(edma->fdev, "%s: nents(%d) < enabled channels(%d)\n",
+                        __func__, nents, num_chans);
+                return 0;
+        }
+
+        for (j = 0; j < nents; j++) {
+                ll_desc->src = src_dma_addr + (j * edma->dma_size);
+                ll_desc->dst = dst_dma_addr + (j * edma->dma_size);
+                dev_dbg(edma->fdev, "src %llx, dst %llx at %d\n", ll_desc->src, ll_desc->dst, j);
+                ll_desc->sz = edma->dma_size;
+                ll_desc++;
+        }
+        ll_desc = edma->ll_desc;
+
+        edma->tsz = (u64)edma->stress_count * (nents_per_ch) * (u64)edma->dma_size * 8UL;
+
+        if (!edma->cookie && (edma->prev_edma_ch != edma->edma_ch)) {
+                dev_info(edma->fdev, "%s: re-init edma lib, prev_ch(%x) != current chans(%x)\n",
+                         __func__, edma->prev_edma_ch, edma->edma_ch);
+                edma->cookie = tegra_pcie_edma_initialize(&info);
+                edma->prev_edma_ch = edma->edma_ch;
+        }
+
+        edma->nents_per_ch = nents_per_ch;
+
+        /* generate random bytes to transfer */
+        get_random_bytes(edma->src_virt, edma->dma_size * nents_per_ch);
+        dev_info(edma->fdev, "%s: EDMA LIB %s started for %d chans, size %d Bytes, iterations: %d of descriptors %d\n",
+                 __func__, xfer_str[xfer_type], num_chans, edma->dma_size, edma->stress_count,
+                 nents_per_ch);
+
+        /* LL DMA with size edma->dma_size per desc */
+        for (i = 0; i < chan_count; i++) {
+                int ch = i;
+                struct tegra_pcie_edma_chans_info *ch_info = chan_info + i;
+
+                if (ch_info->num_descriptors == 0)
+                        continue;
+
+                edma->edma_start_time[i] = ktime_get();
+                tx_info.desc = &ll_desc[nent_id++ * nents_per_ch];
+                for (k = 0; k < edma->stress_count; k++) {
+                        tx_info.channel_num = ch;
+                        tx_info.type = xfer_type;
+                        tx_info.nents = nents_per_ch;
+                        if (ch_info->ch_type == EDMA_CHAN_XFER_ASYNC) {
+                                if (k == edma->stress_count - 1)
+                                        tx_info.complete = edma_final_complete;
+                                else
+                                        tx_info.complete = edma_complete;
+                        }
+                        edma->priv_iter[ch] = k | (xfer_type << EDMA_PRIV_XF_OFF) |
+                                              (l_r << EDMA_PRIV_LR_OFF) | (ch << EDMA_PRIV_CH_OFF);
+                        tx_info.priv = &edma->priv_iter[ch];
+                        ret = tegra_pcie_edma_submit_xfer(edma->cookie, &tx_info);
+                        if (ret == EDMA_XFER_FAIL_NOMEM) {
+                                /* no descriptor space: wait for a completion (up to 500 ms) and retry */
+                                dev_dbg(edma->fdev, "%s: EDMA_XFER_FAIL_NOMEM stress count %d on channel %d iter %d\n",
+                                        __func__, edma->stress_count, i, k);
+                                ret = wait_event_timeout(edma->wr_wq[i],
+                                                         !(edma->wr_busy & (1 << i)),
+                                                         msecs_to_jiffies(500));
+                                /* sleep a little longer to avoid repeated wait/wake cycles */
+                                msleep(100);
+                                if (ret == 0) {
+                                        dev_err(edma->fdev, "%s: %d timed out\n", __func__, i);
+                                        ret = -ETIMEDOUT;
+                                        goto fail;
+                                }
+                                k--;
+                                continue;
+                        } else if ((ret != EDMA_XFER_SUCCESS) && (ret != EDMA_XFER_FAIL_NOMEM)) {
+                                dev_err(edma->fdev, "%s: LL %d, SZ: %u B CH: %d failed at iter %d, ret: %d\n",
+                                        __func__, xfer_type, edma->dma_size, ch, k, ret);
+                                if (EDMA_ABORT_TEST_EN) {
+                                        msleep(5000);
+                                        break;
+                                }
+                                goto fail;
+                        }
+                        dev_dbg(edma->fdev, "%s: LL EDMA LIB %d, SZ: %u B CH: %d iter %d\n",
+                                __func__, xfer_type, edma->dma_size, ch, k);
+                }
+                if (EDMA_ABORT_TEST_EN && i == 0) {
+                        msleep(edma->stress_count);
+                        dma_common_wr(edma->dma_base, DMA_WRITE_DOORBELL_OFF_WR_STOP,
+                                      db_off);
+                }
+                diff = ktime_to_ns(ktime_get()) - ktime_to_ns(edma->edma_start_time[i]);
+                if (ch_info->ch_type == EDMA_CHAN_XFER_SYNC)
+                        dev_info(edma->fdev, "%s: EDMA LIB %s-%s-SYNC done for %d iter on channel %d. Total Size %llu bytes, time %llu nsec. Perf is %llu Mbps\n",
+                                 __func__, xfer_str[xfer_type], l_r_str[l_r], edma->stress_count, i,
+                                 edma->tsz, diff, EDMA_PERF);
+        }
+        dev_info(edma->fdev, "%s: EDMA LIB submit done\n", __func__);
+
+        return 0;
+fail:
+        if (ret != EDMA_XFER_DEINIT) {
+                tegra_pcie_edma_deinit(edma->cookie);
+                edma->cookie = NULL;
+        }
+        return -1;
+}
+
+#endif /* TEGRA_PCIE_EDMA_TEST_COMMON_H */
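
For reference, the dma_channel_wr()/dma_channel_rd() helpers in pci-epf-wrapper.h address the per-channel register file at a 0x200 stride from the DMA base, so the DMA_*_OFF_WRCH/RDCH offsets above are relative to each channel's window. The sketch below is not part of the patch; program_wr_single() is a hypothetical name and the exact CONTROL1 bits needed for a plain single-block write may differ on real hardware.

static void program_wr_single(void __iomem *dma_base, u8 ch,
                              u64 src, u64 dst, u32 size)
{
        /* channel registers sit at 0x200 * (ch + 1): write channel 0's
         * DMA_CH_CONTROL1_OFF_WRCH is at 0x200, channel 1's at 0x400, ...
         */
        dma_common_wr(dma_base, WRITE_ENABLE, DMA_WRITE_ENGINE_EN_OFF);

        dma_channel_wr(dma_base, ch, DMA_CH_CONTROL1_OFF_WRCH_LIE,
                       DMA_CH_CONTROL1_OFF_WRCH);
        dma_channel_wr(dma_base, ch, size, DMA_TRANSFER_SIZE_OFF_WRCH);
        dma_channel_wr(dma_base, ch, lower_32_bits(src), DMA_SAR_LOW_OFF_WRCH);
        dma_channel_wr(dma_base, ch, upper_32_bits(src), DMA_SAR_HIGH_OFF_WRCH);
        dma_channel_wr(dma_base, ch, lower_32_bits(dst), DMA_DAR_LOW_OFF_WRCH);
        dma_channel_wr(dma_base, ch, upper_32_bits(dst), DMA_DAR_HIGH_OFF_WRCH);

        /* ring the write doorbell with the channel number to start the transfer */
        dma_common_wr(dma_base, ch, DMA_WRITE_DOORBELL_OFF);
}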
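
The edma_ch control word drives IS_EDMA_CH_ENABLED()/IS_EDMA_CH_ASYNC() plus the abort and remote test modes, and each submission packs its completion context into priv_iter using the EDMA_PRIV_* shifts. A minimal decoding sketch, assuming only the bit layout implied by those macros (decode_edma_ch() is hypothetical, not part of the series):

/*
 *   edma_ch bits [3:0]  - channel N runs ASYNC when bit N is set
 *   edma_ch bits [7:4]  - channel N is enabled when bit (N + 4) is set
 *   edma_ch bit  30     - abort test enable
 *   edma_ch bit  31     - remote (read channel) test enable
 *
 *   priv_iter = iteration | (xfer_type << 21) | (local/remote << 20) | (channel << 16)
 */
static void decode_edma_ch(u32 edma_ch)
{
        int i;

        for (i = 0; i < DMA_WR_CHNL_NUM; i++)
                pr_info("ch%d: enabled=%d async=%d\n", i,
                        !!(edma_ch & (BIT(i) << 4)), !!(edma_ch & BIT(i)));

        pr_info("abort-test=%d remote-test=%d\n",
                !!(edma_ch & 0x40000000), !!(edma_ch & 0x80000000));
}

This is why the abort test writes 0xF5: all four channels enabled (bits [7:4]), channels 0 and 2 async (bits 0 and 2), with the async transfer on channel 0 subsequently stopped via the doorbell stop bit.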
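
edmalib_common_test() assumes the caller has populated struct edmalib_common before invoking it. A rough setup sketch follows; the sizes, the 0xF1 channel mask, and run_edma_test() are chosen purely for illustration, and the DMA buffers, of_node, and dma_base mapping are left to the real EPF/client driver:

static int run_edma_test(struct edmalib_common *edma, struct device *dev)
{
        int i;

        edma->fdev = dev;
        edma->dma_size = SZ_64K;      /* bytes per descriptor */
        edma->nents = 64;             /* descriptors, split across enabled channels */
        edma->stress_count = 10;      /* iterations per channel */
        edma->edma_ch = 0xF1;         /* all 4 channels enabled, channel 0 async */

        edma->ll_desc = devm_kcalloc(dev, edma->nents,
                                     sizeof(*edma->ll_desc), GFP_KERNEL);
        if (!edma->ll_desc)
                return -ENOMEM;

        for (i = 0; i < DMA_WR_CHNL_NUM; i++)
                init_waitqueue_head(&edma->wr_wq[i]);
        for (i = 0; i < DMA_RD_CHNL_NUM; i++)
                init_waitqueue_head(&edma->rd_wq[i]);

        /* the caller is also expected to set of_node, dma_base, src_virt,
         * src_dma_addr (local buffer) and dst_dma_addr (mapped peer address,
         * i.e. BAR or RP memory) before this point
         */
        return edmalib_common_test(edma);
}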