From 9f0eb7f789625d90864c0f91b4b05ce75e0bbba8 Mon Sep 17 00:00:00 2001 From: Chris Dragan Date: Wed, 12 Jul 2017 01:25:41 -0700 Subject: [PATCH] misc: mods: update MODS driver from Perforce Bug 1949265 Change-Id: If7e55bcbf181d0b230a792ff0f557000482598df Signed-off-by: Chris Dragan Reviewed-on: https://git-master.nvidia.com/r/1517878 GVS: Gerrit_Virtual_Submit Reviewed-by: Laxman Dewangan --- drivers/misc/mods/mods.h | 12 +- drivers/misc/mods/mods_internal.h | 43 +++- drivers/misc/mods/mods_irq.c | 2 +- drivers/misc/mods/mods_krnl.c | 47 +++- drivers/misc/mods/mods_mem.c | 409 +++++------------------------- drivers/misc/mods/mods_pci.c | 148 +++-------- 6 files changed, 190 insertions(+), 471 deletions(-) diff --git a/drivers/misc/mods/mods.h b/drivers/misc/mods/mods.h index 1d59872b..5f4a66f7 100644 --- a/drivers/misc/mods/mods.h +++ b/drivers/misc/mods/mods.h @@ -24,7 +24,7 @@ /* Driver version */ #define MODS_DRIVER_VERSION_MAJOR 3 -#define MODS_DRIVER_VERSION_MINOR 73 +#define MODS_DRIVER_VERSION_MINOR 75 #define MODS_DRIVER_VERSION ((MODS_DRIVER_VERSION_MAJOR << 8) | \ ((MODS_DRIVER_VERSION_MINOR/10) << 4) | \ (MODS_DRIVER_VERSION_MINOR%10)) @@ -957,6 +957,13 @@ struct MODS_GET_ATS_ADDRESS_RANGE { __s32 numa_memory_node; }; +/* MODS_ESC_SET_NVLINK_SYSMEM_TRAINED */ +struct MODS_SET_NVLINK_SYSMEM_TRAINED { + /* IN */ + struct mods_pci_dev_2 pci_device; + __u8 trained; +}; + #pragma pack(pop) /* ************************************************************************* */ @@ -1184,5 +1191,8 @@ struct MODS_GET_ATS_ADDRESS_RANGE { #define MODS_ESC_GET_ATS_ADDRESS_RANGE \ _IOWR(MODS_IOC_MAGIC, 101, \ struct MODS_GET_ATS_ADDRESS_RANGE) +#define MODS_ESC_SET_NVLINK_SYSMEM_TRAINED \ + _IOW(MODS_IOC_MAGIC, 102, \ + struct MODS_SET_NVLINK_SYSMEM_TRAINED) #endif /* _MODS_H_ */ diff --git a/drivers/misc/mods/mods_internal.h b/drivers/misc/mods/mods_internal.h index 3f8180cc..e2d96ad3 100644 --- a/drivers/misc/mods/mods_internal.h +++ b/drivers/misc/mods/mods_internal.h @@ 
-30,6 +30,10 @@ #include "mods_config.h" #include "mods.h" +#ifdef MODS_HAS_SET_MEMORY_HEADER +#include <asm/set_memory.h> +#endif + #ifndef true #define true 1 #define false 0 @@ -63,8 +67,9 @@ struct mods_file_private_data { struct list_head *mods_alloc_list; struct list_head *mods_mapping_list; struct list_head *mods_pci_res_map_list; -#if defined(MODS_HAS_SET_PPC_TCE_BYPASS) +#if defined(CONFIG_PPC64) struct list_head *mods_ppc_tce_bypass_list; + struct list_head *mods_nvlink_sysmem_trained_list; #endif wait_queue_head_t interrupt_event; struct en_dev_entry *enabled_devices; @@ -154,7 +159,7 @@ int mods_check_debug_level(int mask); int mods_get_multi_instance(void); void mods_set_multi_instance(int mi); -#if defined(MODS_HAS_SET_PPC_TCE_BYPASS) +#if defined(CONFIG_PPC64) void mods_set_ppc_tce_bypass(int bypass); int mods_get_ppc_tce_bypass(void); @@ -164,6 +169,16 @@ struct PPC_TCE_BYPASS { u64 dma_mask; struct list_head list; }; + +int mods_is_nvlink_sysmem_trained(struct file *fp, + struct pci_dev *dev); + +/* NvLink Trained tracking */ +struct NVL_TRAINED { + struct pci_dev *dev; + u8 trained; + struct list_head list; +}; #endif #define IRQ_MAX (256+PCI_IRQ_MAX) @@ -359,11 +374,16 @@ const char *mods_get_prot_str(u32 mem_type); int mods_unregister_all_alloc(struct file *fp); struct MODS_MEM_INFO *mods_find_alloc(struct file *fp, u64 phys_addr); -#if defined(MODS_HAS_SET_PPC_TCE_BYPASS) +#if defined(CONFIG_PPC64) +/* ppc64 */ int mods_unregister_all_ppc_tce_bypass(struct file *fp); + +int mods_unregister_all_nvlink_sysmem_trained(struct file *fp); #endif #ifdef CONFIG_PCI +int mods_enable_device(struct mods_file_private_data *priv, + struct pci_dev *pdev); int mods_unregister_all_pci_res_mappings(struct file *fp); #define MODS_UNREGISTER_PCI_MAP(fp) mods_unregister_all_pci_res_mappings(fp) #else @@ -397,18 +417,21 @@ int esc_mods_virtual_to_phys(struct file *fp, int esc_mods_phys_to_virtual(struct file *fp, struct MODS_PHYSICAL_TO_VIRTUAL *p); int esc_mods_memory_barrier(struct 
file *fp); -#if defined(MODS_HAS_SET_PPC_TCE_BYPASS) -int esc_mods_set_ppc_tce_bypass(struct file *fp, - struct MODS_SET_PPC_TCE_BYPASS *p); -int esc_mods_get_ats_address_range(struct file *fp, - struct MODS_GET_ATS_ADDRESS_RANGE *p); -#endif - int esc_mods_dma_map_memory(struct file *fp, struct MODS_DMA_MAP_MEMORY *p); int esc_mods_dma_unmap_memory(struct file *fp, struct MODS_DMA_MAP_MEMORY *p); +#if defined(CONFIG_PPC64) +/* ppc64 */ +int esc_mods_set_ppc_tce_bypass(struct file *fp, + struct MODS_SET_PPC_TCE_BYPASS *p); +int esc_mods_get_ats_address_range(struct file *fp, + struct MODS_GET_ATS_ADDRESS_RANGE *p); +int esc_mods_set_nvlink_sysmem_trained(struct file *fp, + struct MODS_SET_NVLINK_SYSMEM_TRAINED *p); +#endif + /* acpi */ #ifdef CONFIG_ACPI int esc_mods_eval_acpi_method(struct file *fp, diff --git a/drivers/misc/mods/mods_irq.c b/drivers/misc/mods/mods_irq.c index 2871594f..62b9d4d7 100644 --- a/drivers/misc/mods/mods_irq.c +++ b/drivers/misc/mods/mods_irq.c @@ -67,7 +67,7 @@ static struct nv_device *get_dev(void) } #ifdef CONFIG_PCI -static int mods_enable_device(struct mods_file_private_data *priv, +int mods_enable_device(struct mods_file_private_data *priv, struct pci_dev *pdev) { int ret = -1; diff --git a/drivers/misc/mods/mods_krnl.c b/drivers/misc/mods/mods_krnl.c index 15f78666..52644c4f 100644 --- a/drivers/misc/mods/mods_krnl.c +++ b/drivers/misc/mods/mods_krnl.c @@ -111,7 +111,7 @@ struct pci_driver mods_pci_driver = { static int debug = -0x80000000; static int multi_instance = MODS_MULTI_INSTANCE_DEFAULT_VALUE; -#if defined(MODS_HAS_SET_PPC_TCE_BYPASS) +#if defined(CONFIG_PPC64) static int ppc_tce_bypass = MODS_PPC_TCE_BYPASS_DEFAULT; void mods_set_ppc_tce_bypass(int bypass) @@ -246,7 +246,7 @@ module_param(multi_instance, int, 0644); MODULE_PARM_DESC(multi_instance, "allows more than one client to simultaneously open the driver"); -#if defined(MODS_HAS_SET_PPC_TCE_BYPASS) +#if defined(CONFIG_PPC64) module_param(ppc_tce_bypass, int, 0644); 
MODULE_PARM_DESC(ppc_tce_bypass, "PPC TCE bypass (0=sys default, 1=force bypass, 2=force non bypass)"); @@ -524,8 +524,9 @@ static int mods_krnl_open(struct inode *ip, struct file *fp) struct list_head *mods_alloc_list; struct list_head *mods_mapping_list; struct list_head *mods_pci_res_map_list; -#if defined(MODS_HAS_SET_PPC_TCE_BYPASS) +#if defined(CONFIG_PPC64) struct list_head *mods_ppc_tce_bypass_list; + struct list_head *mods_nvlink_sysmem_trained_list; #endif struct mods_file_private_data *private_data; int id = 0; @@ -556,7 +557,7 @@ static int mods_krnl_open(struct inode *ip, struct file *fp) return -ENOMEM; } -#if defined(MODS_HAS_SET_PPC_TCE_BYPASS) +#if defined(CONFIG_PPC64) mods_ppc_tce_bypass_list = kmalloc(sizeof(struct list_head), GFP_KERNEL | __GFP_NORETRY); if (unlikely(!mods_ppc_tce_bypass_list)) { @@ -566,6 +567,17 @@ static int mods_krnl_open(struct inode *ip, struct file *fp) LOG_EXT(); return -ENOMEM; } + + mods_nvlink_sysmem_trained_list = + kmalloc(sizeof(struct list_head), GFP_KERNEL | __GFP_NORETRY); + if (unlikely(!mods_nvlink_sysmem_trained_list)) { + kfree(mods_alloc_list); + kfree(mods_mapping_list); + kfree(mods_pci_res_map_list); + kfree(mods_ppc_tce_bypass_list); + LOG_EXT(); + return -ENOMEM; + } #endif private_data = kmalloc(sizeof(*private_data), @@ -574,8 +586,9 @@ static int mods_krnl_open(struct inode *ip, struct file *fp) kfree(mods_alloc_list); kfree(mods_mapping_list); kfree(mods_pci_res_map_list); -#if defined(MODS_HAS_SET_PPC_TCE_BYPASS) +#if defined(CONFIG_PPC64) kfree(mods_ppc_tce_bypass_list); + kfree(mods_nvlink_sysmem_trained_list); #endif LOG_EXT(); return -ENOMEM; @@ -587,8 +600,9 @@ static int mods_krnl_open(struct inode *ip, struct file *fp) kfree(mods_alloc_list); kfree(mods_mapping_list); kfree(mods_pci_res_map_list); -#if defined(MODS_HAS_SET_PPC_TCE_BYPASS) +#if defined(CONFIG_PPC64) kfree(mods_ppc_tce_bypass_list); + kfree(mods_nvlink_sysmem_trained_list); #endif kfree(private_data); LOG_EXT(); @@ -604,9 
+618,12 @@ static int mods_krnl_open(struct inode *ip, struct file *fp) private_data->mods_alloc_list = mods_alloc_list; private_data->mods_mapping_list = mods_mapping_list; private_data->mods_pci_res_map_list = mods_pci_res_map_list; -#if defined(MODS_HAS_SET_PPC_TCE_BYPASS) +#if defined(CONFIG_PPC64) INIT_LIST_HEAD(mods_ppc_tce_bypass_list); + INIT_LIST_HEAD(mods_nvlink_sysmem_trained_list); private_data->mods_ppc_tce_bypass_list = mods_ppc_tce_bypass_list; + private_data->mods_nvlink_sysmem_trained_list + = mods_nvlink_sysmem_trained_list; #endif private_data->enabled_devices = 0; private_data->mem_type.dma_addr = 0; @@ -649,10 +666,14 @@ static int mods_krnl_close(struct inode *ip, struct file *fp) if (ret) mods_error_printk("failed to free pci mappings\n"); -#if defined(MODS_HAS_SET_PPC_TCE_BYPASS) +#if defined(CONFIG_PPC64) ret = mods_unregister_all_ppc_tce_bypass(fp); if (ret) mods_error_printk("failed to restore dma bypass\n"); + + ret = mods_unregister_all_nvlink_sysmem_trained(fp); + if (ret) + mods_error_printk("failed to free nvlink trained\n"); #endif mods_disable_all_devices(private_data); @@ -660,8 +681,9 @@ static int mods_krnl_close(struct inode *ip, struct file *fp) kfree(private_data->mods_alloc_list); kfree(private_data->mods_mapping_list); kfree(private_data->mods_pci_res_map_list); -#if defined(MODS_HAS_SET_PPC_TCE_BYPASS) +#if defined(CONFIG_PPC64) kfree(private_data->mods_ppc_tce_bypass_list); + kfree(private_data->mods_nvlink_sysmem_trained_list); #endif kfree(private_data); @@ -1187,7 +1209,7 @@ static long mods_krnl_ioctl(struct file *fp, esc_mods_phys_to_virtual, MODS_PHYSICAL_TO_VIRTUAL); break; -#if defined(MODS_HAS_SET_PPC_TCE_BYPASS) +#if defined(CONFIG_PPC64) case MODS_ESC_SET_PPC_TCE_BYPASS: MODS_IOCTL(MODS_ESC_SET_PPC_TCE_BYPASS, esc_mods_set_ppc_tce_bypass, @@ -1199,6 +1221,11 @@ static long mods_krnl_ioctl(struct file *fp, esc_mods_get_ats_address_range, MODS_GET_ATS_ADDRESS_RANGE); break; + case 
MODS_ESC_SET_NVLINK_SYSMEM_TRAINED: + MODS_IOCTL(MODS_ESC_SET_NVLINK_SYSMEM_TRAINED, + esc_mods_set_nvlink_sysmem_trained, + MODS_SET_NVLINK_SYSMEM_TRAINED); + break; #endif case MODS_ESC_DMA_MAP_MEMORY: diff --git a/drivers/misc/mods/mods_mem.c b/drivers/misc/mods/mods_mem.c index 430583b5..e883fe94 100644 --- a/drivers/misc/mods/mods_mem.c +++ b/drivers/misc/mods/mods_mem.c @@ -36,6 +36,9 @@ static int mods_post_alloc(struct MODS_PHYS_CHUNK *pt, static void mods_pre_free(struct MODS_PHYS_CHUNK *pt, struct MODS_MEM_INFO *p_mem_info); +static u64 mods_compress_nvlink_addr(struct pci_dev *dev, u64 addr); +static u64 mods_expand_nvlink_addr(struct pci_dev *dev, u64 addr47); + /**************************** * DMA MAP HELPER FUNCTIONS * ****************************/ @@ -47,6 +50,8 @@ static void mods_dma_unmap_page(struct MODS_DMA_MAP *p_dma_map, if (!pm->pt) return; + pm->map_addr = mods_expand_nvlink_addr(p_dma_map->dev, pm->map_addr); + pci_unmap_page(p_dma_map->dev, pm->map_addr, (1U<<pm->pt->order)*PAGE_SIZE, @@ -155,6 +160,9 @@ static void mods_dma_map_pages(struct MODS_MEM_INFO *p_mem_info, (1U << pt->order) * PAGE_SIZE, DMA_BIDIRECTIONAL); + pm->map_addr = mods_compress_nvlink_addr(p_dma_map->dev, + pm->map_addr); + mods_debug_printk(DEBUG_MEM_DETAILED, "%s : Mapped map_addr=0x%llx, dma_addr=0x%llx on dev %x:%x:%x.%x\n", __func__, @@ -742,6 +750,11 @@ int esc_mods_device_alloc_pages_2(struct file *fp, p_mem_info->dev = dev; #if defined(MODS_HAS_DEV_TO_NUMA_NODE) p_mem_info->numa_node = dev_to_node(&dev->dev); +#endif +#if defined(MODS_HAS_PNV_PCI_GET_NPU_DEV) + if (!mods_is_nvlink_sysmem_trained(fp, dev) && + pnv_pci_get_npu_dev(dev, 0)) + p_mem_info->numa_node = 0; #endif mods_debug_printk(DEBUG_MEM_DETAILED, "affinity %x:%x.%x node %d\n", @@ -1178,349 +1191,6 @@ int esc_mods_memory_barrier(struct file *fp) #endif } -#if defined(MODS_HAS_SET_PPC_TCE_BYPASS) -static struct PPC_TCE_BYPASS *mods_find_ppc_tce_bypass(struct file *fp, - struct pci_dev *dev) -{ - MODS_PRIV 
private_data = fp->private_data; - struct list_head *plist_head; - struct list_head *plist_iter; - struct PPC_TCE_BYPASS *p_ppc_tce_bypass; - - plist_head = private_data->mods_ppc_tce_bypass_list; - - list_for_each(plist_iter, plist_head) { - p_ppc_tce_bypass = list_entry(plist_iter, - struct PPC_TCE_BYPASS, - list); - if (dev == p_ppc_tce_bypass->dev) - return p_ppc_tce_bypass; - } - - /* The device has never had its dma mask changed */ - return NULL; -} - -static int mods_register_ppc_tce_bypass(struct file *fp, - struct pci_dev *dev, - u64 original_mask) -{ - MODS_PRIV private_data = fp->private_data; - struct PPC_TCE_BYPASS *p_ppc_tce_bypass; - - /* only register the first time in order to restore the true actual dma - * mask - */ - if (mods_find_ppc_tce_bypass(fp, dev) != NULL) { - mods_debug_printk(DEBUG_MEM, - "TCE bypass already registered on dev %x:%x:%x.%x\n", - pci_domain_nr(dev->bus), - dev->bus->number, - PCI_SLOT(dev->devfn), - PCI_FUNC(dev->devfn)); - return OK; - } - - if (unlikely(mutex_lock_interruptible(&private_data->mtx))) - return -EINTR; - - p_ppc_tce_bypass = kmalloc(sizeof(struct PPC_TCE_BYPASS), - GFP_KERNEL | __GFP_NORETRY); - if (unlikely(!p_ppc_tce_bypass)) { - mods_error_printk("failed to allocate TCE bypass struct\n"); - LOG_EXT(); - return -ENOMEM; - } - - p_ppc_tce_bypass->dev = dev; - p_ppc_tce_bypass->dma_mask = original_mask; - - list_add(&p_ppc_tce_bypass->list, - private_data->mods_ppc_tce_bypass_list); - - mods_debug_printk(DEBUG_MEM, - "Registered TCE bypass on dev %x:%x:%x.%x\n", - pci_domain_nr(dev->bus), - dev->bus->number, - PCI_SLOT(dev->devfn), - PCI_FUNC(dev->devfn)); - mutex_unlock(&private_data->mtx); - return OK; -} - -static int mods_unregister_ppc_tce_bypass(struct file *fp, struct pci_dev *dev) -{ - struct PPC_TCE_BYPASS *p_ppc_tce_bypass; - MODS_PRIV private_data = fp->private_data; - struct list_head *head = private_data->mods_ppc_tce_bypass_list; - struct list_head *iter; - - LOG_ENT(); - - if 
(unlikely(mutex_lock_interruptible(&private_data->mtx))) - return -EINTR; - - list_for_each(iter, head) { - p_ppc_tce_bypass = - list_entry(iter, struct PPC_TCE_BYPASS, list); - - if (p_ppc_tce_bypass->dev == dev) { - int ret = 0; - - list_del(iter); - - mutex_unlock(&private_data->mtx); - - ret = pci_set_dma_mask(p_ppc_tce_bypass->dev, - p_ppc_tce_bypass->dma_mask); - dma_set_coherent_mask(&p_ppc_tce_bypass->dev->dev, - dev->dma_mask); - mods_debug_printk(DEBUG_MEM, - "Restored dma_mask on dev %x:%x:%x.%x to %llx\n", - pci_domain_nr(p_ppc_tce_bypass->dev->bus), - p_ppc_tce_bypass->dev->bus->number, - PCI_SLOT(p_ppc_tce_bypass->dev->devfn), - PCI_FUNC(p_ppc_tce_bypass->dev->devfn), - p_ppc_tce_bypass->dma_mask); - - kfree(p_ppc_tce_bypass); - - LOG_EXT(); - return ret; - } - } - - mutex_unlock(&private_data->mtx); - - mods_error_printk( - "Failed to unregister TCE bypass on dev %x:%x:%x.%x\n", - pci_domain_nr(dev->bus), - dev->bus->number, - PCI_SLOT(dev->devfn), - PCI_FUNC(dev->devfn)); - LOG_EXT(); - - return -EINVAL; - -} - -int mods_unregister_all_ppc_tce_bypass(struct file *fp) -{ - MODS_PRIV private_data = fp->private_data; - struct list_head *head = private_data->mods_ppc_tce_bypass_list; - struct list_head *iter; - struct list_head *tmp; - - list_for_each_safe(iter, tmp, head) { - struct PPC_TCE_BYPASS *p_ppc_tce_bypass; - int ret; - - p_ppc_tce_bypass = - list_entry(iter, struct PPC_TCE_BYPASS, list); - ret = mods_unregister_ppc_tce_bypass(fp, p_ppc_tce_bypass->dev); - if (ret) - return ret; - } - - return OK; -} - -int esc_mods_set_ppc_tce_bypass(struct file *fp, - struct MODS_SET_PPC_TCE_BYPASS *p) -{ - int ret = OK; - dma_addr_t dma_addr; - unsigned int devfn = PCI_DEVFN(p->pci_device.device, - p->pci_device.function); - struct pci_dev *dev = MODS_PCI_GET_SLOT(p->pci_device.domain, - p->pci_device.bus, - devfn); - u64 original_dma_mask; - u32 bypass_mode = p->mode; - u32 cur_bypass_mode = MODS_PPC_TCE_BYPASS_OFF; - u64 dma_mask = DMA_BIT_MASK(64); - - 
LOG_ENT(); - - if (!dev) { - mods_error_printk( - "PCI device not found %x:%x:%x.%x\n", - p->pci_device.domain, - p->pci_device.bus, - p->pci_device.device, - p->pci_device.function); - LOG_EXT(); - return -EINVAL; - } - - original_dma_mask = dev->dma_mask; - - if (bypass_mode == MODS_PPC_TCE_BYPASS_DEFAULT) - bypass_mode = mods_get_ppc_tce_bypass(); - - if (original_dma_mask == DMA_BIT_MASK(64)) - cur_bypass_mode = MODS_PPC_TCE_BYPASS_ON; - - - /* - * Linux on IBM POWER8 offers 2 different DMA set-ups, sometimes - * referred to as "windows". - * - * The "default window" provides a 2GB region of PCI address space - * located below the 32-bit line. The IOMMU is used to provide a - * "rich" mapping--any page in system memory can be mapped at an - * arbitrary address within this window. The mappings are dynamic - * and pass in and out of being as pci_map*()/pci_unmap*() calls - * are made. - * - * Dynamic DMA Windows (sometimes "Huge DDW", also PPC TCE Bypass "ON") - * provides a linear - * mapping of the system's entire physical address space at some - * fixed offset above the 59-bit line. IOMMU is still used, and - * pci_map*()/pci_unmap*() are still required, but mappings are - * static. They're effectively set up in advance, and any given - * system page will always map to the same PCI bus address. I.e. - * physical 0x00000000xxxxxxxx => PCI 0x08000000xxxxxxxx - * - * Linux on POWER8 will only provide the DDW-style full linear - * mapping when the driver claims support for 64-bit DMA addressing - * (a pre-requisite because the PCI addresses used in this case will - * be near the top of the 64-bit range). The linear mapping - * is not available in all system configurations. - * - * Detect whether the linear mapping is present by claiming - * 64-bit support and then mapping physical page 0. For historical - * reasons, Linux on POWER8 will never map a page to PCI address 0x0. 
- * In the "default window" case page 0 will be mapped to some - * non-zero address below the 32-bit line. In the - * DDW/linear-mapping case, it will be mapped to address 0 plus - * some high-order offset. - * - * If the linear mapping is present and sane then return the offset - * as the starting address for all DMA mappings. - */ - if ((bypass_mode != MODS_PPC_TCE_BYPASS_DEFAULT) && - (cur_bypass_mode != bypass_mode)) { - /* Set DMA mask appropriately here */ - if (bypass_mode == MODS_PPC_TCE_BYPASS_OFF) - dma_mask = p->device_dma_mask; - - if (pci_set_dma_mask(dev, dma_mask) != 0) { - mods_error_printk( - "pci_set_dma_mask failed on dev %x:%x:%x.%x\n", - p->pci_device.domain, - p->pci_device.bus, - p->pci_device.device, - p->pci_device.function); - LOG_EXT(); - return -EINVAL; - } - } - - dma_addr = pci_map_single(dev, NULL, 1, DMA_BIDIRECTIONAL); - if (pci_dma_mapping_error(dev, dma_addr)) { - pci_set_dma_mask(dev, original_dma_mask); - mods_error_printk( - "pci_map_single failed on dev %x:%x:%x.%x\n", - p->pci_device.domain, - p->pci_device.bus, - p->pci_device.device, - p->pci_device.function); - LOG_EXT(); - return -EINVAL; - } - pci_unmap_single(dev, dma_addr, 1, DMA_BIDIRECTIONAL); - - if (bypass_mode == MODS_PPC_TCE_BYPASS_ON) { - bool bBypassFailed = false; - - /* - * From IBM: "For IODA2, native DMA bypass or KVM TCE-based - * implementation of full 64-bit DMA support will establish a - * window in address-space with the high 14 bits being constant - * and the bottom up-to-50 bits varying with the mapping." - * - * Unfortunately, we don't have any good interfaces or - * definitions from the kernel to get information about the DMA - * offset assigned by OS. 
However, we have been told that the - * offset will be defined by the top 14 bits of the address, - * and bits 40-49 will not vary for any DMA mappings until 1TB - * of system memory is surpassed; this limitation is essential - * for us to function properly since our current GPUs only - * support 40 physical address bits. We are in a fragile place - * where we need to tell the OS that we're capable of 64-bit - * addressing, while relying on the assumption that the top 24 - * bits will not vary in this case. - * - * The way we try to compute the window, then, is mask the trial - * mapping against the DMA capabilities of the device. That way, - * devices with greater addressing capabilities will only take - * the bits it needs to define the window. - */ - if ((dma_addr & DMA_BIT_MASK(32)) != 0) { - /* - * Huge DDW not available - page 0 mapped to non-zero - * address below the 32-bit line. - */ - mods_warning_printk( - "Enabling PPC TCE bypass mode failed due to platform on device %x:%x:%x.%x\n", - p->pci_device.domain, - p->pci_device.bus, - p->pci_device.device, - p->pci_device.function); - bBypassFailed = true; - } else if ((dma_addr & original_dma_mask) != 0) { - /* - * The physical window straddles our addressing limit - * boundary, e.g., for an adapter that can address up to - * 1TB, the window crosses the 40-bit limit so that the - * lower end of the range has different bits 63:40 than - * the higher end of the range. We can only handle a - * single, static value for bits 63:40, so we must fall - * back here. - */ - mods_warning_printk( - "Enabling PPC TCE bypass mode failed due to memory size on device %x:%x:%x.%x\n", - p->pci_device.domain, - p->pci_device.bus, - p->pci_device.device, - p->pci_device.function); - bBypassFailed = true; - } - if (bBypassFailed) - pci_set_dma_mask(dev, original_dma_mask); - } - - mods_debug_printk(DEBUG_MEM, - "%s ppc tce bypass on device %x:%x:%x.%x with dma mask 0x%llx\n", - (dev->dma_mask == DMA_BIT_MASK(64)) ? 
"Enabled" : "Disabled", - p->pci_device.domain, - p->pci_device.bus, - p->pci_device.device, - p->pci_device.function, - dev->dma_mask); - - p->dma_base_address = dma_addr & ~(p->device_dma_mask); - - mods_debug_printk(DEBUG_MEM, - "dma base address 0x%0llx on device %x:%x:%x.%x\n", - p->dma_base_address, - p->pci_device.domain, - p->pci_device.bus, - p->pci_device.device, - p->pci_device.function); - - /* Update the coherent mask to match */ - dma_set_coherent_mask(&dev->dev, dev->dma_mask); - - if (original_dma_mask != dev->dma_mask) - ret = mods_register_ppc_tce_bypass(fp, dev, original_dma_mask); - - LOG_EXT(); - return ret; -} -#endif - int esc_mods_dma_map_memory(struct file *fp, struct MODS_DMA_MAP_MEMORY *p) { @@ -1806,3 +1476,56 @@ static void mods_pre_free(struct MODS_PHYS_CHUNK *pt, kunmap(pt->p_page + i); } } + +/* + * Starting on Power9 systems, DMA addresses for NVLink are no longer + * the same as used over PCIE. + * + * Power9 supports a 56-bit Real Address. This address range is compressed + * when accessed over NvLink to allow the GPU to access all of memory using + * its 47-bit Physical address. + * + * If there is an NPU device present on the system, it implies that NvLink + * sysmem links are present and we need to apply the required address + * conversion for NvLink within the driver. This is intended to be temporary + * to ease the transition to kernel APIs to handle NvLink DMA mappings + * via the NPU device. + * + * Note, a deviation from the documented compression scheme is that the + * upper address bits (i.e. bit 56-63) instead of being set to zero are + * preserved during NvLink address compression so the original PCIE DMA + * address can be reconstructed on expansion. These bits can be safely + * ignored on NvLink since they are truncated by the GPU. 
+ */ +static u64 mods_compress_nvlink_addr(struct pci_dev *dev, u64 addr) +{ + u64 addr47 = addr; + /* Note, one key difference from the documented compression scheme + * is that BIT59 used for TCE bypass mode on PCIe is preserved during + * NVLink address compression to allow for the resulting DMA address to + * be used transparently on PCIe. + */ +#if defined(MODS_HAS_PNV_PCI_GET_NPU_DEV) + if (pnv_pci_get_npu_dev(dev, 0)) { + addr47 = addr & (1LLU << 59); + addr47 |= ((addr >> 45) & 0x3) << 43; + addr47 |= ((addr >> 49) & 0x3) << 45; + addr47 |= addr & ((1LLU << 43) - 1); + } +#endif + return addr47; +} + +static u64 mods_expand_nvlink_addr(struct pci_dev *dev, u64 addr47) +{ + u64 addr = addr47; +#if defined(MODS_HAS_PNV_PCI_GET_NPU_DEV) + if (pnv_pci_get_npu_dev(dev, 0)) { + addr = addr47 & ((1LLU << 43) - 1); + addr |= (addr47 & (3ULL << 43)) << 2; + addr |= (addr47 & (3ULL << 45)) << 4; + addr |= addr47 & ~((1ULL << 56) - 1); + } +#endif + return addr; +} diff --git a/drivers/misc/mods/mods_pci.c b/drivers/misc/mods/mods_pci.c index f9eed9a0..126974ae 100644 --- a/drivers/misc/mods/mods_pci.c +++ b/drivers/misc/mods/mods_pci.c @@ -107,6 +107,7 @@ int mods_unregister_all_pci_res_mappings(struct file *fp) int esc_mods_find_pci_dev_2(struct file *pfile, struct MODS_FIND_PCI_DEVICE_2 *p) { + MODS_PRIV private_data = pfile->private_data; struct pci_dev *dev; int index = 0; @@ -124,6 +125,16 @@ int esc_mods_find_pci_dev_2(struct file *pfile, p->pci_device.bus = dev->bus->number; p->pci_device.device = PCI_SLOT(dev->devfn); p->pci_device.function = PCI_FUNC(dev->devfn); + /* Enable device on the PCI bus */ + if (mods_enable_device(private_data, dev)) { + mods_error_printk( + "unable to enable dev %04x:%02x:%02x.%x\n", + (unsigned int)p->pci_device.domain, + (unsigned int)p->pci_device.bus, + (unsigned int)p->pci_device.device, + (unsigned int)p->pci_device.function); + return -EINVAL; + } return OK; } dev = pci_get_device(p->vendor_id, p->device_id, dev); @@ -136,6 
+147,7 @@ int esc_mods_find_pci_dev_2(struct file *pfile, int esc_mods_find_pci_dev(struct file *pfile, struct MODS_FIND_PCI_DEVICE *p) { + MODS_PRIV private_data = pfile->private_data; struct pci_dev *dev; int index = 0; @@ -152,6 +164,15 @@ int esc_mods_find_pci_dev(struct file *pfile, p->bus_number = dev->bus->number; p->device_number = PCI_SLOT(dev->devfn); p->function_number = PCI_FUNC(dev->devfn); + /* Enable device on the PCI bus */ + if (mods_enable_device(private_data, dev)) { + mods_error_printk( + "unable to enable dev %02x:%02x.%x\n", + (unsigned int)p->bus_number, + (unsigned int)p->device_number, + (unsigned int)p->function_number); + return -EINVAL; + } return OK; } /* Only return devices in the first domain, but don't assume @@ -168,6 +189,7 @@ int esc_mods_find_pci_dev(struct file *pfile, int esc_mods_find_pci_class_code_2(struct file *pfile, struct MODS_FIND_PCI_CLASS_CODE_2 *p) { + MODS_PRIV private_data = pfile->private_data; struct pci_dev *dev; int index = 0; @@ -182,6 +204,16 @@ int esc_mods_find_pci_class_code_2(struct file *pfile, p->pci_device.bus = dev->bus->number; p->pci_device.device = PCI_SLOT(dev->devfn); p->pci_device.function = PCI_FUNC(dev->devfn); + /* Enable device on the PCI bus */ + if (mods_enable_device(private_data, dev)) { + mods_error_printk( + "unable to enable dev %04x:%02x:%02x.%x\n", + (unsigned int)p->pci_device.domain, + (unsigned int)p->pci_device.bus, + (unsigned int)p->pci_device.device, + (unsigned int)p->pci_device.function); + return -EINVAL; + } return OK; } dev = pci_get_class(p->class_code, dev); @@ -194,6 +226,7 @@ int esc_mods_find_pci_class_code_2(struct file *pfile, int esc_mods_find_pci_class_code(struct file *pfile, struct MODS_FIND_PCI_CLASS_CODE *p) { + MODS_PRIV private_data = pfile->private_data; struct pci_dev *dev; int index = 0; @@ -207,6 +240,15 @@ int esc_mods_find_pci_class_code(struct file *pfile, p->bus_number = dev->bus->number; p->device_number = PCI_SLOT(dev->devfn); p->function_number 
= PCI_FUNC(dev->devfn); + /* Enable device on the PCI bus */ + if (mods_enable_device(private_data, dev)) { + mods_error_printk( + "unable to enable dev %02x:%02x.%x\n", + (unsigned int)p->bus_number, + (unsigned int)p->device_number, + (unsigned int)p->function_number); + return -EINVAL; + } return OK; } /* Only return devices in the first domain, but don't assume @@ -817,109 +859,3 @@ int esc_mods_pci_unmap_resource(struct file *fp, return OK; #endif } - -#if defined(MODS_HAS_SET_PPC_TCE_BYPASS) -int esc_mods_get_ats_address_range(struct file *fp, - struct MODS_GET_ATS_ADDRESS_RANGE *p) -{ - unsigned int devfn; - struct pci_dev *dev; - struct pci_dev *npu_dev; - struct device_node *mem_node = NULL; - const __u32 *val32; - const __u64 *val64; - int len; - int ret = -EINVAL; - - LOG_ENT(); - - mods_debug_printk(DEBUG_PCICFG, - "get ats addr, dev %04x:%x:%02x:%x, npu index %d\n", - (int)p->pci_device.domain, - (int)p->pci_device.bus, - (int)p->pci_device.device, - (int)p->pci_device.function, - (int)p->npu_index); - - devfn = PCI_DEVFN(p->pci_device.device, p->pci_device.function); - dev = MODS_PCI_GET_SLOT(p->pci_device.domain, p->pci_device.bus, devfn); - if (dev == NULL) { - mods_error_printk("PCI device %04x:%x:%02x.%x not found\n", - p->pci_device.domain, - p->pci_device.bus, - p->pci_device.device, - p->pci_device.function); - goto exit; - } - - npu_dev = pnv_pci_get_npu_dev(dev, p->npu_index); - if (npu_dev == NULL) { - mods_error_printk("NPU device for %04x:%x:%02x.%x not found\n", - p->pci_device.domain, - p->pci_device.bus, - p->pci_device.device, - p->pci_device.function); - goto exit; - } - - p->npu_device.domain = pci_domain_nr(npu_dev->bus); - p->npu_device.bus = npu_dev->bus->number; - p->npu_device.device = PCI_SLOT(npu_dev->devfn); - p->npu_device.function = PCI_FUNC(npu_dev->devfn); - - mods_debug_printk(DEBUG_PCICFG, - "Found NPU device %04x:%x:%02x.%x\n", - p->npu_device.domain, - p->npu_device.bus, - p->npu_device.device, - 
p->npu_device.function); - - val32 = (const __u32 *)of_get_property(npu_dev->dev.of_node, - "memory-region", - &len); - if (!val32 || len < 4) { - mods_error_printk("Property memory-region for NPU not found\n"); - goto exit; - } - - mem_node = of_find_node_by_phandle(be32_to_cpu(*val32)); - if (!mem_node) { - mods_error_printk("Node memory-region for NPU not found\n"); - goto exit; - } - - p->numa_memory_node = of_node_to_nid(mem_node); - if (p->numa_memory_node == NUMA_NO_NODE) { - mods_error_printk("NUMA node for NPU not found\n"); - goto exit; - } - - val64 = (const __u64 *)of_get_property(npu_dev->dev.of_node, - "ibm,device-tgt-addr", - &len); - if (!val64 || len < 8) { - mods_error_printk( - "Property ibm,device-tgt-addr for NPU not found\n"); - goto exit; - } - - p->phys_addr = be64_to_cpu(*val64); - - val64 = (const __u64 *)of_get_property(mem_node, "reg", &len); - if (!val64 || len < 16) { - mods_error_printk("Property reg for memory region not found\n"); - goto exit; - } - - p->guest_addr = be64_to_cpu(val64[0]); - p->aperture_size = be64_to_cpu(val64[1]); - - ret = OK; - -exit: - if (mem_node) - of_node_put(mem_node); - LOG_EXT(); - return ret; -} -#endif