misc: mods: update MODS driver from Perforce

Bug 1949265

Change-Id: If7e55bcbf181d0b230a792ff0f557000482598df
Signed-off-by: Chris Dragan <kdragan@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/1517878
GVS: Gerrit_Virtual_Submit
Reviewed-by: Laxman Dewangan <ldewangan@nvidia.com>
Author:    Chris Dragan
Date:      2017-07-12 01:25:41 -07:00
Committer: Laxman Dewangan
Parent:    324b7be6e5
Commit:    9f0eb7f789
6 changed files with 190 additions and 471 deletions

View File

@@ -24,7 +24,7 @@
/* Driver version */
#define MODS_DRIVER_VERSION_MAJOR 3
-#define MODS_DRIVER_VERSION_MINOR 73
+#define MODS_DRIVER_VERSION_MINOR 75
#define MODS_DRIVER_VERSION ((MODS_DRIVER_VERSION_MAJOR << 8) | \
((MODS_DRIVER_VERSION_MINOR/10) << 4) | \
(MODS_DRIVER_VERSION_MINOR%10))
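The version macro packs the major number into the high byte and each decimal digit of the minor number into its own nibble (BCD), so this bump changes the packed value from 0x373 to (3 << 8) | (7 << 4) | 5 = 0x375. A compile-time illustration of the encoding (not part of the driver source):

/* Illustration only: MODS driver version 3.75 packs to 0x375. */
#include <assert.h>
static_assert(((3 << 8) | ((75 / 10) << 4) | (75 % 10)) == 0x375,
	      "version 3.75 encodes as 0x375");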
@@ -957,6 +957,13 @@ struct MODS_GET_ATS_ADDRESS_RANGE {
__s32 numa_memory_node;
};
/* MODS_ESC_SET_NVLINK_SYSMEM_TRAINED */
struct MODS_SET_NVLINK_SYSMEM_TRAINED {
/* IN */
struct mods_pci_dev_2 pci_device;
__u8 trained;
};
#pragma pack(pop)
/* ************************************************************************* */
@@ -1184,5 +1191,8 @@ struct MODS_GET_ATS_ADDRESS_RANGE {
#define MODS_ESC_GET_ATS_ADDRESS_RANGE \
_IOWR(MODS_IOC_MAGIC, 101, \
struct MODS_GET_ATS_ADDRESS_RANGE)
#define MODS_ESC_SET_NVLINK_SYSMEM_TRAINED \
_IOW(MODS_IOC_MAGIC, 102, \
struct MODS_SET_NVLINK_SYSMEM_TRAINED)
#endif /* _MODS_H_ */
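Because MODS_ESC_SET_NVLINK_SYSMEM_TRAINED is declared with _IOW, user space only passes the struct in; nothing is copied back. A hypothetical user-space sketch of invoking the new escape (the /dev/mods node path and the helper name are assumptions, not taken from this commit):

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include "mods.h"

/* Hypothetical helper: record that a device's NvLink sysmem links are
 * trained (trained = 1) or untrained (trained = 0). */
static int set_nvlink_trained(struct mods_pci_dev_2 pci_device, __u8 trained)
{
	struct MODS_SET_NVLINK_SYSMEM_TRAINED arg = {
		.pci_device = pci_device,
		.trained = trained,
	};
	int fd = open("/dev/mods", O_RDWR); /* assumed device node */

	if (fd < 0)
		return -1;
	if (ioctl(fd, MODS_ESC_SET_NVLINK_SYSMEM_TRAINED, &arg) != 0) {
		perror("MODS_ESC_SET_NVLINK_SYSMEM_TRAINED");
		close(fd);
		return -1;
	}
	close(fd);
	return 0;
}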

View File

@@ -30,6 +30,10 @@
#include "mods_config.h"
#include "mods.h"
#ifdef MODS_HAS_SET_MEMORY_HEADER
#include <asm/set_memory.h>
#endif
#ifndef true
#define true 1
#define false 0
@@ -63,8 +67,9 @@ struct mods_file_private_data {
struct list_head *mods_alloc_list;
struct list_head *mods_mapping_list;
struct list_head *mods_pci_res_map_list;
-#if defined(MODS_HAS_SET_PPC_TCE_BYPASS)
+#if defined(CONFIG_PPC64)
struct list_head *mods_ppc_tce_bypass_list;
struct list_head *mods_nvlink_sysmem_trained_list;
#endif
wait_queue_head_t interrupt_event;
struct en_dev_entry *enabled_devices;
@@ -154,7 +159,7 @@ int mods_check_debug_level(int mask);
int mods_get_multi_instance(void);
void mods_set_multi_instance(int mi);
-#if defined(MODS_HAS_SET_PPC_TCE_BYPASS)
+#if defined(CONFIG_PPC64)
void mods_set_ppc_tce_bypass(int bypass);
int mods_get_ppc_tce_bypass(void);
@@ -164,6 +169,16 @@ struct PPC_TCE_BYPASS {
u64 dma_mask;
struct list_head list;
};
int mods_is_nvlink_sysmem_trained(struct file *fp,
struct pci_dev *dev);
/* NvLink Trained tracking */
struct NVL_TRAINED {
struct pci_dev *dev;
u8 trained;
struct list_head list;
};
#endif
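mods_is_nvlink_sysmem_trained() is only declared in this hunk; its body is not part of the excerpt. A plausible sketch of how it would consult the per-client NVL_TRAINED list, mirroring the list-walking style the driver uses for PPC_TCE_BYPASS (assumed, not confirmed by this diff):

int mods_is_nvlink_sysmem_trained(struct file *fp,
				  struct pci_dev *dev)
{
	MODS_PRIV private_data = fp->private_data;
	struct list_head *iter;

	list_for_each(iter, private_data->mods_nvlink_sysmem_trained_list) {
		struct NVL_TRAINED *p =
			list_entry(iter, struct NVL_TRAINED, list);

		if (p->dev == dev)
			return p->trained;
	}
	/* A device that was never recorded is treated as untrained. */
	return false;
}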
#define IRQ_MAX (256+PCI_IRQ_MAX)
@@ -359,11 +374,16 @@ const char *mods_get_prot_str(u32 mem_type);
int mods_unregister_all_alloc(struct file *fp);
struct MODS_MEM_INFO *mods_find_alloc(struct file *fp, u64 phys_addr);
-#if defined(MODS_HAS_SET_PPC_TCE_BYPASS)
+#if defined(CONFIG_PPC64)
/* ppc64 */
int mods_unregister_all_ppc_tce_bypass(struct file *fp);
int mods_unregister_all_nvlink_sysmem_trained(struct file *fp);
#endif
#ifdef CONFIG_PCI
int mods_enable_device(struct mods_file_private_data *priv,
struct pci_dev *pdev);
int mods_unregister_all_pci_res_mappings(struct file *fp);
#define MODS_UNREGISTER_PCI_MAP(fp) mods_unregister_all_pci_res_mappings(fp)
#else
@@ -397,18 +417,21 @@ int esc_mods_virtual_to_phys(struct file *fp,
int esc_mods_phys_to_virtual(struct file *fp,
struct MODS_PHYSICAL_TO_VIRTUAL *p);
int esc_mods_memory_barrier(struct file *fp);
-#if defined(MODS_HAS_SET_PPC_TCE_BYPASS)
-int esc_mods_set_ppc_tce_bypass(struct file *fp,
-struct MODS_SET_PPC_TCE_BYPASS *p);
-int esc_mods_get_ats_address_range(struct file *fp,
-struct MODS_GET_ATS_ADDRESS_RANGE *p);
-#endif
int esc_mods_dma_map_memory(struct file *fp,
struct MODS_DMA_MAP_MEMORY *p);
int esc_mods_dma_unmap_memory(struct file *fp,
struct MODS_DMA_MAP_MEMORY *p);
+#if defined(CONFIG_PPC64)
+/* ppc64 */
+int esc_mods_set_ppc_tce_bypass(struct file *fp,
+struct MODS_SET_PPC_TCE_BYPASS *p);
+int esc_mods_get_ats_address_range(struct file *fp,
+struct MODS_GET_ATS_ADDRESS_RANGE *p);
+int esc_mods_set_nvlink_sysmem_trained(struct file *fp,
+struct MODS_SET_NVLINK_SYSMEM_TRAINED *p);
+#endif
/* acpi */
#ifdef CONFIG_ACPI
int esc_mods_eval_acpi_method(struct file *fp,

View File

@@ -67,7 +67,7 @@ static struct nv_device *get_dev(void)
}
#ifdef CONFIG_PCI
-static int mods_enable_device(struct mods_file_private_data *priv,
+int mods_enable_device(struct mods_file_private_data *priv,
struct pci_dev *pdev)
{
int ret = -1;
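The helper loses its static qualifier so that the PCI find routines added later in this commit can enable a device as soon as they hand it back to the caller. Only the first line of the body is visible in this hunk; a rough sketch of what such a helper typically does, assuming en_dev_entry carries dev/next fields (both names are assumptions):

int mods_enable_device(struct mods_file_private_data *priv,
		       struct pci_dev *pdev)
{
	struct en_dev_entry *entry = priv->enabled_devices;
	int ret;

	/* Skip devices this client has already enabled. */
	for (; entry; entry = entry->next)
		if (entry->dev == pdev)
			return 0;

	ret = pci_enable_device(pdev);
	if (ret)
		return ret;

	/* Track the device so it can be disabled when the file closes. */
	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
	if (entry) {
		entry->dev = pdev;
		entry->next = priv->enabled_devices;
		priv->enabled_devices = entry;
	}
	return 0;
}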

View File

@@ -111,7 +111,7 @@ struct pci_driver mods_pci_driver = {
static int debug = -0x80000000;
static int multi_instance = MODS_MULTI_INSTANCE_DEFAULT_VALUE;
-#if defined(MODS_HAS_SET_PPC_TCE_BYPASS)
+#if defined(CONFIG_PPC64)
static int ppc_tce_bypass = MODS_PPC_TCE_BYPASS_DEFAULT;
void mods_set_ppc_tce_bypass(int bypass)
@@ -246,7 +246,7 @@ module_param(multi_instance, int, 0644);
MODULE_PARM_DESC(multi_instance,
"allows more than one client to simultaneously open the driver");
-#if defined(MODS_HAS_SET_PPC_TCE_BYPASS)
+#if defined(CONFIG_PPC64)
module_param(ppc_tce_bypass, int, 0644);
MODULE_PARM_DESC(ppc_tce_bypass,
"PPC TCE bypass (0=sys default, 1=force bypass, 2=force non bypass)");
@@ -524,8 +524,9 @@ static int mods_krnl_open(struct inode *ip, struct file *fp)
struct list_head *mods_alloc_list;
struct list_head *mods_mapping_list;
struct list_head *mods_pci_res_map_list;
-#if defined(MODS_HAS_SET_PPC_TCE_BYPASS)
+#if defined(CONFIG_PPC64)
struct list_head *mods_ppc_tce_bypass_list;
struct list_head *mods_nvlink_sysmem_trained_list;
#endif
struct mods_file_private_data *private_data;
int id = 0;
@@ -556,7 +557,7 @@ static int mods_krnl_open(struct inode *ip, struct file *fp)
return -ENOMEM;
}
-#if defined(MODS_HAS_SET_PPC_TCE_BYPASS)
+#if defined(CONFIG_PPC64)
mods_ppc_tce_bypass_list =
kmalloc(sizeof(struct list_head), GFP_KERNEL | __GFP_NORETRY);
if (unlikely(!mods_ppc_tce_bypass_list)) {
@@ -566,6 +567,17 @@ static int mods_krnl_open(struct inode *ip, struct file *fp)
LOG_EXT();
return -ENOMEM;
}
mods_nvlink_sysmem_trained_list =
kmalloc(sizeof(struct list_head), GFP_KERNEL | __GFP_NORETRY);
if (unlikely(!mods_nvlink_sysmem_trained_list)) {
kfree(mods_alloc_list);
kfree(mods_mapping_list);
kfree(mods_pci_res_map_list);
kfree(mods_ppc_tce_bypass_list);
LOG_EXT();
return -ENOMEM;
}
#endif
private_data = kmalloc(sizeof(*private_data),
@@ -574,8 +586,9 @@ static int mods_krnl_open(struct inode *ip, struct file *fp)
kfree(mods_alloc_list);
kfree(mods_mapping_list);
kfree(mods_pci_res_map_list);
-#if defined(MODS_HAS_SET_PPC_TCE_BYPASS)
+#if defined(CONFIG_PPC64)
kfree(mods_ppc_tce_bypass_list);
kfree(mods_nvlink_sysmem_trained_list);
#endif
LOG_EXT();
return -ENOMEM;
@@ -587,8 +600,9 @@ static int mods_krnl_open(struct inode *ip, struct file *fp)
kfree(mods_alloc_list);
kfree(mods_mapping_list);
kfree(mods_pci_res_map_list);
-#if defined(MODS_HAS_SET_PPC_TCE_BYPASS)
+#if defined(CONFIG_PPC64)
kfree(mods_ppc_tce_bypass_list);
kfree(mods_nvlink_sysmem_trained_list);
#endif
kfree(private_data);
LOG_EXT();
@@ -604,9 +618,12 @@ static int mods_krnl_open(struct inode *ip, struct file *fp)
private_data->mods_alloc_list = mods_alloc_list;
private_data->mods_mapping_list = mods_mapping_list;
private_data->mods_pci_res_map_list = mods_pci_res_map_list;
-#if defined(MODS_HAS_SET_PPC_TCE_BYPASS)
+#if defined(CONFIG_PPC64)
INIT_LIST_HEAD(mods_ppc_tce_bypass_list);
INIT_LIST_HEAD(mods_nvlink_sysmem_trained_list);
private_data->mods_ppc_tce_bypass_list = mods_ppc_tce_bypass_list;
private_data->mods_nvlink_sysmem_trained_list
= mods_nvlink_sysmem_trained_list;
#endif
private_data->enabled_devices = 0;
private_data->mem_type.dma_addr = 0;
@@ -649,10 +666,14 @@ static int mods_krnl_close(struct inode *ip, struct file *fp)
if (ret)
mods_error_printk("failed to free pci mappings\n");
-#if defined(MODS_HAS_SET_PPC_TCE_BYPASS)
+#if defined(CONFIG_PPC64)
ret = mods_unregister_all_ppc_tce_bypass(fp);
if (ret)
mods_error_printk("failed to restore dma bypass\n");
ret = mods_unregister_all_nvlink_sysmem_trained(fp);
if (ret)
mods_error_printk("failed to free nvlink trained\n");
#endif
mods_disable_all_devices(private_data);
@@ -660,8 +681,9 @@ static int mods_krnl_close(struct inode *ip, struct file *fp)
kfree(private_data->mods_alloc_list);
kfree(private_data->mods_mapping_list);
kfree(private_data->mods_pci_res_map_list);
-#if defined(MODS_HAS_SET_PPC_TCE_BYPASS)
+#if defined(CONFIG_PPC64)
kfree(private_data->mods_ppc_tce_bypass_list);
kfree(private_data->mods_nvlink_sysmem_trained_list);
#endif
kfree(private_data);
@@ -1187,7 +1209,7 @@ static long mods_krnl_ioctl(struct file *fp,
esc_mods_phys_to_virtual, MODS_PHYSICAL_TO_VIRTUAL);
break;
-#if defined(MODS_HAS_SET_PPC_TCE_BYPASS)
+#if defined(CONFIG_PPC64)
case MODS_ESC_SET_PPC_TCE_BYPASS:
MODS_IOCTL(MODS_ESC_SET_PPC_TCE_BYPASS,
esc_mods_set_ppc_tce_bypass,
@@ -1199,6 +1221,11 @@ static long mods_krnl_ioctl(struct file *fp,
esc_mods_get_ats_address_range,
MODS_GET_ATS_ADDRESS_RANGE);
break;
case MODS_ESC_SET_NVLINK_SYSMEM_TRAINED:
MODS_IOCTL(MODS_ESC_SET_NVLINK_SYSMEM_TRAINED,
esc_mods_set_nvlink_sysmem_trained,
MODS_SET_NVLINK_SYSMEM_TRAINED);
break;
#endif
case MODS_ESC_DMA_MAP_MEMORY:

View File

@@ -36,6 +36,9 @@ static int mods_post_alloc(struct MODS_PHYS_CHUNK *pt,
static void mods_pre_free(struct MODS_PHYS_CHUNK *pt,
struct MODS_MEM_INFO *p_mem_info);
static u64 mods_compress_nvlink_addr(struct pci_dev *dev, u64 addr);
static u64 mods_expand_nvlink_addr(struct pci_dev *dev, u64 addr47);
/****************************
* DMA MAP HELPER FUNCTIONS *
****************************/
@@ -47,6 +50,8 @@ static void mods_dma_unmap_page(struct MODS_DMA_MAP *p_dma_map,
if (!pm->pt)
return;
pm->map_addr = mods_expand_nvlink_addr(p_dma_map->dev, pm->map_addr);
pci_unmap_page(p_dma_map->dev,
pm->map_addr,
(1U<<pm->pt->order)*PAGE_SIZE,
@@ -155,6 +160,9 @@ static void mods_dma_map_pages(struct MODS_MEM_INFO *p_mem_info,
(1U << pt->order) * PAGE_SIZE,
DMA_BIDIRECTIONAL);
pm->map_addr = mods_compress_nvlink_addr(p_dma_map->dev,
pm->map_addr);
mods_debug_printk(DEBUG_MEM_DETAILED,
"%s : Mapped map_addr=0x%llx, dma_addr=0x%llx on dev %x:%x:%x.%x\n",
__func__,
@@ -742,6 +750,11 @@ int esc_mods_device_alloc_pages_2(struct file *fp,
p_mem_info->dev = dev;
#if defined(MODS_HAS_DEV_TO_NUMA_NODE)
p_mem_info->numa_node = dev_to_node(&dev->dev);
#endif
#if defined(MODS_HAS_PNV_PCI_GET_NPU_DEV)
if (!mods_is_nvlink_sysmem_trained(fp, dev) &&
pnv_pci_get_npu_dev(dev, 0))
p_mem_info->numa_node = 0;
#endif
mods_debug_printk(DEBUG_MEM_DETAILED,
"affinity %x:%x.%x node %d\n",
@@ -1178,349 +1191,6 @@ int esc_mods_memory_barrier(struct file *fp)
#endif
}
#if defined(MODS_HAS_SET_PPC_TCE_BYPASS)
static struct PPC_TCE_BYPASS *mods_find_ppc_tce_bypass(struct file *fp,
struct pci_dev *dev)
{
MODS_PRIV private_data = fp->private_data;
struct list_head *plist_head;
struct list_head *plist_iter;
struct PPC_TCE_BYPASS *p_ppc_tce_bypass;
plist_head = private_data->mods_ppc_tce_bypass_list;
list_for_each(plist_iter, plist_head) {
p_ppc_tce_bypass = list_entry(plist_iter,
struct PPC_TCE_BYPASS,
list);
if (dev == p_ppc_tce_bypass->dev)
return p_ppc_tce_bypass;
}
/* The device has never had its dma mask changed */
return NULL;
}
static int mods_register_ppc_tce_bypass(struct file *fp,
struct pci_dev *dev,
u64 original_mask)
{
MODS_PRIV private_data = fp->private_data;
struct PPC_TCE_BYPASS *p_ppc_tce_bypass;
/* only register the first time in order to restore the true actual dma
* mask
*/
if (mods_find_ppc_tce_bypass(fp, dev) != NULL) {
mods_debug_printk(DEBUG_MEM,
"TCE bypass already registered on dev %x:%x:%x.%x\n",
pci_domain_nr(dev->bus),
dev->bus->number,
PCI_SLOT(dev->devfn),
PCI_FUNC(dev->devfn));
return OK;
}
if (unlikely(mutex_lock_interruptible(&private_data->mtx)))
return -EINTR;
p_ppc_tce_bypass = kmalloc(sizeof(struct PPC_TCE_BYPASS),
GFP_KERNEL | __GFP_NORETRY);
if (unlikely(!p_ppc_tce_bypass)) {
mods_error_printk("failed to allocate TCE bypass struct\n");
LOG_EXT();
return -ENOMEM;
}
p_ppc_tce_bypass->dev = dev;
p_ppc_tce_bypass->dma_mask = original_mask;
list_add(&p_ppc_tce_bypass->list,
private_data->mods_ppc_tce_bypass_list);
mods_debug_printk(DEBUG_MEM,
"Registered TCE bypass on dev %x:%x:%x.%x\n",
pci_domain_nr(dev->bus),
dev->bus->number,
PCI_SLOT(dev->devfn),
PCI_FUNC(dev->devfn));
mutex_unlock(&private_data->mtx);
return OK;
}
static int mods_unregister_ppc_tce_bypass(struct file *fp, struct pci_dev *dev)
{
struct PPC_TCE_BYPASS *p_ppc_tce_bypass;
MODS_PRIV private_data = fp->private_data;
struct list_head *head = private_data->mods_ppc_tce_bypass_list;
struct list_head *iter;
LOG_ENT();
if (unlikely(mutex_lock_interruptible(&private_data->mtx)))
return -EINTR;
list_for_each(iter, head) {
p_ppc_tce_bypass =
list_entry(iter, struct PPC_TCE_BYPASS, list);
if (p_ppc_tce_bypass->dev == dev) {
int ret = 0;
list_del(iter);
mutex_unlock(&private_data->mtx);
ret = pci_set_dma_mask(p_ppc_tce_bypass->dev,
p_ppc_tce_bypass->dma_mask);
dma_set_coherent_mask(&p_ppc_tce_bypass->dev->dev,
dev->dma_mask);
mods_debug_printk(DEBUG_MEM,
"Restored dma_mask on dev %x:%x:%x.%x to %llx\n",
pci_domain_nr(p_ppc_tce_bypass->dev->bus),
p_ppc_tce_bypass->dev->bus->number,
PCI_SLOT(p_ppc_tce_bypass->dev->devfn),
PCI_FUNC(p_ppc_tce_bypass->dev->devfn),
p_ppc_tce_bypass->dma_mask);
kfree(p_ppc_tce_bypass);
LOG_EXT();
return ret;
}
}
mutex_unlock(&private_data->mtx);
mods_error_printk(
"Failed to unregister TCE bypass on dev %x:%x:%x.%x\n",
pci_domain_nr(dev->bus),
dev->bus->number,
PCI_SLOT(dev->devfn),
PCI_FUNC(dev->devfn));
LOG_EXT();
return -EINVAL;
}
int mods_unregister_all_ppc_tce_bypass(struct file *fp)
{
MODS_PRIV private_data = fp->private_data;
struct list_head *head = private_data->mods_ppc_tce_bypass_list;
struct list_head *iter;
struct list_head *tmp;
list_for_each_safe(iter, tmp, head) {
struct PPC_TCE_BYPASS *p_ppc_tce_bypass;
int ret;
p_ppc_tce_bypass =
list_entry(iter, struct PPC_TCE_BYPASS, list);
ret = mods_unregister_ppc_tce_bypass(fp, p_ppc_tce_bypass->dev);
if (ret)
return ret;
}
return OK;
}
int esc_mods_set_ppc_tce_bypass(struct file *fp,
struct MODS_SET_PPC_TCE_BYPASS *p)
{
int ret = OK;
dma_addr_t dma_addr;
unsigned int devfn = PCI_DEVFN(p->pci_device.device,
p->pci_device.function);
struct pci_dev *dev = MODS_PCI_GET_SLOT(p->pci_device.domain,
p->pci_device.bus,
devfn);
u64 original_dma_mask;
u32 bypass_mode = p->mode;
u32 cur_bypass_mode = MODS_PPC_TCE_BYPASS_OFF;
u64 dma_mask = DMA_BIT_MASK(64);
LOG_ENT();
if (!dev) {
mods_error_printk(
"PCI device not found %x:%x:%x.%x\n",
p->pci_device.domain,
p->pci_device.bus,
p->pci_device.device,
p->pci_device.function);
LOG_EXT();
return -EINVAL;
}
original_dma_mask = dev->dma_mask;
if (bypass_mode == MODS_PPC_TCE_BYPASS_DEFAULT)
bypass_mode = mods_get_ppc_tce_bypass();
if (original_dma_mask == DMA_BIT_MASK(64))
cur_bypass_mode = MODS_PPC_TCE_BYPASS_ON;
/*
* Linux on IBM POWER8 offers 2 different DMA set-ups, sometimes
* referred to as "windows".
*
* The "default window" provides a 2GB region of PCI address space
* located below the 32-bit line. The IOMMU is used to provide a
* "rich" mapping--any page in system memory can be mapped at an
* arbitrary address within this window. The mappings are dynamic
* and pass in and out of being as pci_map*()/pci_unmap*() calls
* are made.
*
* Dynamic DMA Windows (sometimes "Huge DDW", also PPC TCE Bypass "ON")
* provides a linear
* mapping of the system's entire physical address space at some
* fixed offset above the 59-bit line. IOMMU is still used, and
* pci_map*()/pci_unmap*() are still required, but mappings are
* static. They're effectively set up in advance, and any given
* system page will always map to the same PCI bus address. I.e.
* physical 0x00000000xxxxxxxx => PCI 0x08000000xxxxxxxx
*
* Linux on POWER8 will only provide the DDW-style full linear
* mapping when the driver claims support for 64-bit DMA addressing
* (a pre-requisite because the PCI addresses used in this case will
* be near the top of the 64-bit range). The linear mapping
* is not available in all system configurations.
*
* Detect whether the linear mapping is present by claiming
* 64-bit support and then mapping physical page 0. For historical
* reasons, Linux on POWER8 will never map a page to PCI address 0x0.
* In the "default window" case page 0 will be mapped to some
* non-zero address below the 32-bit line. In the
* DDW/linear-mapping case, it will be mapped to address 0 plus
* some high-order offset.
*
* If the linear mapping is present and sane then return the offset
* as the starting address for all DMA mappings.
*/
if ((bypass_mode != MODS_PPC_TCE_BYPASS_DEFAULT) &&
(cur_bypass_mode != bypass_mode)) {
/* Set DMA mask appropriately here */
if (bypass_mode == MODS_PPC_TCE_BYPASS_OFF)
dma_mask = p->device_dma_mask;
if (pci_set_dma_mask(dev, dma_mask) != 0) {
mods_error_printk(
"pci_set_dma_mask failed on dev %x:%x:%x.%x\n",
p->pci_device.domain,
p->pci_device.bus,
p->pci_device.device,
p->pci_device.function);
LOG_EXT();
return -EINVAL;
}
}
dma_addr = pci_map_single(dev, NULL, 1, DMA_BIDIRECTIONAL);
if (pci_dma_mapping_error(dev, dma_addr)) {
pci_set_dma_mask(dev, original_dma_mask);
mods_error_printk(
"pci_map_single failed on dev %x:%x:%x.%x\n",
p->pci_device.domain,
p->pci_device.bus,
p->pci_device.device,
p->pci_device.function);
LOG_EXT();
return -EINVAL;
}
pci_unmap_single(dev, dma_addr, 1, DMA_BIDIRECTIONAL);
if (bypass_mode == MODS_PPC_TCE_BYPASS_ON) {
bool bBypassFailed = false;
/*
* From IBM: "For IODA2, native DMA bypass or KVM TCE-based
* implementation of full 64-bit DMA support will establish a
* window in address-space with the high 14 bits being constant
* and the bottom up-to-50 bits varying with the mapping."
*
* Unfortunately, we don't have any good interfaces or
* definitions from the kernel to get information about the DMA
* offset assigned by OS. However, we have been told that the
* offset will be defined by the top 14 bits of the address,
* and bits 40-49 will not vary for any DMA mappings until 1TB
* of system memory is surpassed; this limitation is essential
* for us to function properly since our current GPUs only
* support 40 physical address bits. We are in a fragile place
* where we need to tell the OS that we're capable of 64-bit
* addressing, while relying on the assumption that the top 24
* bits will not vary in this case.
*
* The way we try to compute the window, then, is mask the trial
* mapping against the DMA capabilities of the device. That way,
* devices with greater addressing capabilities will only take
* the bits it needs to define the window.
*/
if ((dma_addr & DMA_BIT_MASK(32)) != 0) {
/*
* Huge DDW not available - page 0 mapped to non-zero
* address below the 32-bit line.
*/
mods_warning_printk(
"Enabling PPC TCE bypass mode failed due to platform on device %x:%x:%x.%x\n",
p->pci_device.domain,
p->pci_device.bus,
p->pci_device.device,
p->pci_device.function);
bBypassFailed = true;
} else if ((dma_addr & original_dma_mask) != 0) {
/*
* The physical window straddles our addressing limit
* boundary, e.g., for an adapter that can address up to
* 1TB, the window crosses the 40-bit limit so that the
* lower end of the range has different bits 63:40 than
* the higher end of the range. We can only handle a
* single, static value for bits 63:40, so we must fall
* back here.
*/
mods_warning_printk(
"Enabling PPC TCE bypass mode failed due to memory size on device %x:%x:%x.%x\n",
p->pci_device.domain,
p->pci_device.bus,
p->pci_device.device,
p->pci_device.function);
bBypassFailed = true;
}
if (bBypassFailed)
pci_set_dma_mask(dev, original_dma_mask);
}
mods_debug_printk(DEBUG_MEM,
"%s ppc tce bypass on device %x:%x:%x.%x with dma mask 0x%llx\n",
(dev->dma_mask == DMA_BIT_MASK(64)) ? "Enabled" : "Disabled",
p->pci_device.domain,
p->pci_device.bus,
p->pci_device.device,
p->pci_device.function,
dev->dma_mask);
p->dma_base_address = dma_addr & ~(p->device_dma_mask);
mods_debug_printk(DEBUG_MEM,
"dma base address 0x%0llx on device %x:%x:%x.%x\n",
p->dma_base_address,
p->pci_device.domain,
p->pci_device.bus,
p->pci_device.device,
p->pci_device.function);
/* Update the coherent mask to match */
dma_set_coherent_mask(&dev->dev, dev->dma_mask);
if (original_dma_mask != dev->dma_mask)
ret = mods_register_ppc_tce_bypass(fp, dev, original_dma_mask);
LOG_EXT();
return ret;
}
#endif
int esc_mods_dma_map_memory(struct file *fp,
struct MODS_DMA_MAP_MEMORY *p)
{
@@ -1806,3 +1476,56 @@ static void mods_pre_free(struct MODS_PHYS_CHUNK *pt,
kunmap(pt->p_page + i);
}
}
/*
* Starting on Power9 systems, DMA addresses for NVLink are no longer
* the same as used over PCIE.
*
* Power9 supports a 56-bit Real Address. This address range is compressed
* when accessed over NvLink to allow the GPU to access all of memory using
* its 47-bit Physical address.
*
* If there is an NPU device present on the system, it implies that NvLink
* sysmem links are present and we need to apply the required address
* conversion for NvLink within the driver. This is intended to be temporary
* to ease the transition to kernel APIs to handle NvLink DMA mappings
* via the NPU device.
*
* Note, a deviation from the documented compression scheme is that the
upper address bits (i.e. bits 56-63), instead of being set to zero, are
preserved during NvLink address compression so the original PCIE DMA
* address can be reconstructed on expansion. These bits can be safely
* ignored on NvLink since they are truncated by the GPU.
*/
static u64 mods_compress_nvlink_addr(struct pci_dev *dev, u64 addr)
{
u64 addr47 = addr;
/* Note, one key difference from the documented compression scheme
* is that BIT59 used for TCE bypass mode on PCIe is preserved during
* NVLink address compression to allow for the resulting DMA address to
* be used transparently on PCIe.
*/
#if defined(MODS_HAS_PNV_PCI_GET_NPU_DEV)
if (pnv_pci_get_npu_dev(dev, 0)) {
addr47 = addr & (1LLU << 59);
addr47 |= ((addr >> 45) & 0x3) << 43;
addr47 |= ((addr >> 49) & 0x3) << 45;
addr47 |= addr & ((1LLU << 43) - 1);
}
#endif
return addr47;
}
static u64 mods_expand_nvlink_addr(struct pci_dev *dev, u64 addr47)
{
u64 addr = addr47;
#if defined(MODS_HAS_PNV_PCI_GET_NPU_DEV)
if (pnv_pci_get_npu_dev(dev, 0)) {
addr = addr47 & ((1LLU << 43) - 1);
addr |= (addr47 & (3ULL << 43)) << 2;
addr |= (addr47 & (3ULL << 45)) << 4;
addr |= addr47 & ~((1ULL << 56) - 1);
}
#endif
return addr;
}
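The two helpers are exact inverses over the bits NvLink actually carries (bits 0-42, 45-46, 49-50, plus the preserved upper bits). A standalone round-trip check of the bit shuffling above, with the pnv_pci_get_npu_dev() gate dropped so it can run anywhere (illustration only, not driver code):

#include <assert.h>
#include <stdint.h>

static uint64_t compress47(uint64_t addr)
{
	uint64_t addr47 = addr & (1ULL << 59);

	addr47 |= ((addr >> 45) & 0x3) << 43;
	addr47 |= ((addr >> 49) & 0x3) << 45;
	addr47 |= addr & ((1ULL << 43) - 1);
	return addr47;
}

static uint64_t expand47(uint64_t addr47)
{
	uint64_t addr = addr47 & ((1ULL << 43) - 1);

	addr |= (addr47 & (3ULL << 43)) << 2;
	addr |= (addr47 & (3ULL << 45)) << 4;
	addr |= addr47 & ~((1ULL << 56) - 1);
	return addr;
}

int main(void)
{
	/* Bits 0-42, 45-46, 49-50 and 59 all survive the round trip. */
	uint64_t addr = (1ULL << 59) | (1ULL << 50) | (1ULL << 45) |
			((1ULL << 43) - 1);

	assert(expand47(compress47(addr)) == addr);
	return 0;
}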

View File

@@ -107,6 +107,7 @@ int mods_unregister_all_pci_res_mappings(struct file *fp)
int esc_mods_find_pci_dev_2(struct file *pfile,
struct MODS_FIND_PCI_DEVICE_2 *p)
{
MODS_PRIV private_data = pfile->private_data;
struct pci_dev *dev;
int index = 0;
@@ -124,6 +125,16 @@ int esc_mods_find_pci_dev_2(struct file *pfile,
p->pci_device.bus = dev->bus->number;
p->pci_device.device = PCI_SLOT(dev->devfn);
p->pci_device.function = PCI_FUNC(dev->devfn);
/* Enable device on the PCI bus */
if (mods_enable_device(private_data, dev)) {
mods_error_printk(
"unable to enable dev %04x:%02x:%02x.%x\n",
(unsigned int)p->pci_device.domain,
(unsigned int)p->pci_device.bus,
(unsigned int)p->pci_device.device,
(unsigned int)p->pci_device.function);
return -EINVAL;
}
return OK;
}
dev = pci_get_device(p->vendor_id, p->device_id, dev);
@@ -136,6 +147,7 @@ int esc_mods_find_pci_dev_2(struct file *pfile,
int esc_mods_find_pci_dev(struct file *pfile,
struct MODS_FIND_PCI_DEVICE *p)
{
MODS_PRIV private_data = pfile->private_data;
struct pci_dev *dev;
int index = 0;
@@ -152,6 +164,15 @@ int esc_mods_find_pci_dev(struct file *pfile,
p->bus_number = dev->bus->number;
p->device_number = PCI_SLOT(dev->devfn);
p->function_number = PCI_FUNC(dev->devfn);
/* Enable device on the PCI bus */
if (mods_enable_device(private_data, dev)) {
mods_error_printk(
"unable to enable dev %02x:%02x.%x\n",
(unsigned int)p->bus_number,
(unsigned int)p->device_number,
(unsigned int)p->function_number);
return -EINVAL;
}
return OK;
}
/* Only return devices in the first domain, but don't assume
@@ -168,6 +189,7 @@ int esc_mods_find_pci_dev(struct file *pfile,
int esc_mods_find_pci_class_code_2(struct file *pfile,
struct MODS_FIND_PCI_CLASS_CODE_2 *p)
{
MODS_PRIV private_data = pfile->private_data;
struct pci_dev *dev;
int index = 0;
@@ -182,6 +204,16 @@ int esc_mods_find_pci_class_code_2(struct file *pfile,
p->pci_device.bus = dev->bus->number;
p->pci_device.device = PCI_SLOT(dev->devfn);
p->pci_device.function = PCI_FUNC(dev->devfn);
/* Enable device on the PCI bus */
if (mods_enable_device(private_data, dev)) {
mods_error_printk(
"unable to enable dev %04x:%02x:%02x.%x\n",
(unsigned int)p->pci_device.domain,
(unsigned int)p->pci_device.bus,
(unsigned int)p->pci_device.device,
(unsigned int)p->pci_device.function);
return -EINVAL;
}
return OK;
}
dev = pci_get_class(p->class_code, dev);
@@ -194,6 +226,7 @@ int esc_mods_find_pci_class_code_2(struct file *pfile,
int esc_mods_find_pci_class_code(struct file *pfile,
struct MODS_FIND_PCI_CLASS_CODE *p)
{
MODS_PRIV private_data = pfile->private_data;
struct pci_dev *dev;
int index = 0;
@@ -207,6 +240,15 @@ int esc_mods_find_pci_class_code(struct file *pfile,
p->bus_number = dev->bus->number;
p->device_number = PCI_SLOT(dev->devfn);
p->function_number = PCI_FUNC(dev->devfn);
/* Enable device on the PCI bus */
if (mods_enable_device(private_data, dev)) {
mods_error_printk(
"unable to enable dev %02x:%02x.%x\n",
(unsigned int)p->bus_number,
(unsigned int)p->device_number,
(unsigned int)p->function_number);
return -EINVAL;
}
return OK;
}
/* Only return devices in the first domain, but don't assume
@@ -817,109 +859,3 @@ int esc_mods_pci_unmap_resource(struct file *fp,
return OK;
#endif
}
#if defined(MODS_HAS_SET_PPC_TCE_BYPASS)
int esc_mods_get_ats_address_range(struct file *fp,
struct MODS_GET_ATS_ADDRESS_RANGE *p)
{
unsigned int devfn;
struct pci_dev *dev;
struct pci_dev *npu_dev;
struct device_node *mem_node = NULL;
const __u32 *val32;
const __u64 *val64;
int len;
int ret = -EINVAL;
LOG_ENT();
mods_debug_printk(DEBUG_PCICFG,
"get ats addr, dev %04x:%x:%02x:%x, npu index %d\n",
(int)p->pci_device.domain,
(int)p->pci_device.bus,
(int)p->pci_device.device,
(int)p->pci_device.function,
(int)p->npu_index);
devfn = PCI_DEVFN(p->pci_device.device, p->pci_device.function);
dev = MODS_PCI_GET_SLOT(p->pci_device.domain, p->pci_device.bus, devfn);
if (dev == NULL) {
mods_error_printk("PCI device %04x:%x:%02x.%x not found\n",
p->pci_device.domain,
p->pci_device.bus,
p->pci_device.device,
p->pci_device.function);
goto exit;
}
npu_dev = pnv_pci_get_npu_dev(dev, p->npu_index);
if (npu_dev == NULL) {
mods_error_printk("NPU device for %04x:%x:%02x.%x not found\n",
p->pci_device.domain,
p->pci_device.bus,
p->pci_device.device,
p->pci_device.function);
goto exit;
}
p->npu_device.domain = pci_domain_nr(npu_dev->bus);
p->npu_device.bus = npu_dev->bus->number;
p->npu_device.device = PCI_SLOT(npu_dev->devfn);
p->npu_device.function = PCI_FUNC(npu_dev->devfn);
mods_debug_printk(DEBUG_PCICFG,
"Found NPU device %04x:%x:%02x.%x\n",
p->npu_device.domain,
p->npu_device.bus,
p->npu_device.device,
p->npu_device.function);
val32 = (const __u32 *)of_get_property(npu_dev->dev.of_node,
"memory-region",
&len);
if (!val32 || len < 4) {
mods_error_printk("Property memory-region for NPU not found\n");
goto exit;
}
mem_node = of_find_node_by_phandle(be32_to_cpu(*val32));
if (!mem_node) {
mods_error_printk("Node memory-region for NPU not found\n");
goto exit;
}
p->numa_memory_node = of_node_to_nid(mem_node);
if (p->numa_memory_node == NUMA_NO_NODE) {
mods_error_printk("NUMA node for NPU not found\n");
goto exit;
}
val64 = (const __u64 *)of_get_property(npu_dev->dev.of_node,
"ibm,device-tgt-addr",
&len);
if (!val64 || len < 8) {
mods_error_printk(
"Property ibm,device-tgt-addr for NPU not found\n");
goto exit;
}
p->phys_addr = be64_to_cpu(*val64);
val64 = (const __u64 *)of_get_property(mem_node, "reg", &len);
if (!val64 || len < 16) {
mods_error_printk("Property reg for memory region not found\n");
goto exit;
}
p->guest_addr = be64_to_cpu(val64[0]);
p->aperture_size = be64_to_cpu(val64[1]);
ret = OK;
exit:
if (mem_node)
of_node_put(mem_node);
LOG_EXT();
return ret;
}
#endif