/*
 * SPDX-FileCopyrightText: Copyright (c) 2019-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include "nv-pci-table.h"
#include "nv-pci-types.h"
#include "nv-pci.h"
#include "nv-msi.h"
#include "nv-hypervisor.h"
#include "nv-reg.h"

#if defined(NV_VGPU_KVM_BUILD)
#include "nv-vgpu-vfio-interface.h"
#endif

/*
 * The angle-bracket header names below were lost when this file was
 * extracted; they are reconstructed from the kernel APIs used in this file
 * and should be treated as a best-effort assumption.
 */
#include <linux/pci.h>
#include <linux/of.h>
#include <linux/clk.h>
#include <linux/pm_runtime.h>
#include <linux/pm_opp.h>
#include <linux/delay.h>

#if defined(CONFIG_PM_DEVFREQ)
#include <linux/devfreq.h>
#if defined(CONFIG_DEVFREQ_THERMAL) \
    && defined(NV_DEVFREQ_DEV_PROFILE_HAS_IS_COOLING_DEVICE) \
    && defined(NV_THERMAL_ZONE_FOR_EACH_TRIP_PRESENT) \
    && defined(NV_THERMAL_BIND_CDEV_TO_TRIP_PRESENT) \
    && defined(NV_THERMAL_UNBIND_CDEV_FROM_TRIP_PRESENT)
#include <linux/thermal.h>
#define NV_HAS_COOLING_SUPPORTED 1
#else
#define NV_HAS_COOLING_SUPPORTED 0
#endif
#endif

#if defined(CONFIG_INTERCONNECT) \
    && defined(NV_ICC_SET_BW_PRESENT) \
    && defined(NV_DEVM_ICC_GET_PRESENT)
#include <linux/interconnect.h>
#define NV_HAS_ICC_SUPPORTED 1
#else
#define NV_HAS_ICC_SUPPORTED 0
#endif

#if defined(NV_SEQ_READ_ITER_PRESENT)
#include <linux/seq_file.h>
#include <linux/kernfs.h>
#endif

#include "detect-self-hosted.h"

#if !defined(NV_BUS_TYPE_HAS_IOMMU_OPS)
#include <linux/iommu.h>
#endif

#if NV_IS_EXPORT_SYMBOL_GPL_pci_ats_supported
#include <linux/pci-ats.h>
#endif

/*
 * Set the default devfreq suspend frequency to 315 MHz, balancing power
 * consumption against performance across the common scenarios.
*/ #define NV_PCI_TEGRA_DEVFREQ_SUSPEND_FREQ 315000000 extern int NVreg_GrdmaPciTopoCheckOverride; static void nv_check_and_exclude_gpu( nvidia_stack_t *sp, nv_state_t *nv ) { char *uuid_str; uuid_str = rm_get_gpu_uuid(sp, nv); if (uuid_str == NULL) { NV_DEV_PRINTF(NV_DBG_INFO, nv, "Unable to read UUID"); return; } if (nv_is_uuid_in_gpu_exclusion_list(uuid_str)) { NV_STATUS rm_status = rm_exclude_adapter(sp, nv); if (rm_status != NV_OK) { NV_DEV_PRINTF_STATUS(NV_DBG_ERRORS, nv, rm_status, "Failed to exclude GPU %s", uuid_str); goto done; } nv->flags |= NV_FLAG_EXCLUDE; NV_DEV_PRINTF(NV_DBG_INFO, nv, "Excluded GPU %s successfully\n", uuid_str); } done: os_free_mem(uuid_str); } static NvBool nv_treat_missing_irq_as_error(void) { #if defined(NV_LINUX_PCIE_MSI_SUPPORTED) return (nv_get_hypervisor_type() != OS_HYPERVISOR_HYPERV); #else return NV_TRUE; #endif } static void nv_get_pci_sysfs_config ( struct pci_dev *pci_dev, nv_linux_state_t *nvl ) { #if NV_FILESYSTEM_ACCESS_AVAILABLE char filename[50]; int ret; ret = snprintf(filename, sizeof(filename), "/sys/bus/pci/devices/%04x:%02x:%02x.0/config", NV_PCI_DOMAIN_NUMBER(pci_dev), NV_PCI_BUS_NUMBER(pci_dev), NV_PCI_SLOT_NUMBER(pci_dev)); if (ret > 0 && ret < sizeof(filename)) { struct file *file = filp_open(filename, O_RDONLY, 0); if (!IS_ERR(file)) { #if defined(NV_SEQ_READ_ITER_PRESENT) /* * Sanity check for confirming if file path is mounted over * sysfs file system. */ if ((file->f_inode != NULL) && (file->f_inode->i_sb != NULL) && (strcmp(file->f_inode->i_sb->s_id, "sysfs") == 0)) { struct seq_file *sf = file->private_data; /* * Sanity check for confirming if 'file->private_data' * actually points to 'struct seq_file'. */ if ((sf != NULL) && (sf->file == file) && (sf->op == NULL)) { struct kernfs_open_file *of = sf->private; /* * Sanity check for confirming if 'sf->private' * actually points to 'struct kernfs_open_file'. */ if ((of != NULL) && (of->file == file) && (of->seq_file == sf)) { nvl->sysfs_config_file = file; } } } if (nvl->sysfs_config_file == NULL) { filp_close(file, NULL); } #else nvl->sysfs_config_file = file; #endif } } #endif } static int nv_resize_pcie_bars(struct pci_dev *pci_dev) { #if defined(NV_PCI_REBAR_GET_POSSIBLE_SIZES_PRESENT) u16 cmd; int r, old_size, requested_size; unsigned long sizes; int ret = 0; #if NV_IS_EXPORT_SYMBOL_PRESENT_pci_find_host_bridge struct pci_host_bridge *host; #endif if (NVreg_EnableResizableBar == 0) { nv_printf(NV_DBG_INFO, "NVRM: resizable BAR disabled by regkey, skipping\n"); return 0; } // Check if BAR1 has PCIe rebar capabilities sizes = pci_rebar_get_possible_sizes(pci_dev, NV_GPU_BAR1); if (sizes == 0) { /* ReBAR not available. Nothing to do. 
*/ return 0; } /* Try to resize the BAR to the largest supported size */ requested_size = fls(sizes) - 1; /* Save the current size, just in case things go wrong */ old_size = pci_rebar_bytes_to_size(pci_resource_len(pci_dev, NV_GPU_BAR1)); if (old_size == requested_size) { nv_printf(NV_DBG_INFO, "NVRM: %04x:%02x:%02x.%x: BAR1 already at requested size.\n", NV_PCI_DOMAIN_NUMBER(pci_dev), NV_PCI_BUS_NUMBER(pci_dev), NV_PCI_SLOT_NUMBER(pci_dev), PCI_FUNC(pci_dev->devfn)); return 0; } #if NV_IS_EXPORT_SYMBOL_PRESENT_pci_find_host_bridge /* If the kernel will refuse us, don't even try to resize, but give an informative error */ host = pci_find_host_bridge(pci_dev->bus); if (host->preserve_config) { nv_printf(NV_DBG_INFO, "NVRM: Not resizing BAR because the firmware forbids moving windows.\n"); return 0; } #endif nv_printf(NV_DBG_INFO, "NVRM: %04x:%02x:%02x.%x: Attempting to resize BAR1.\n", NV_PCI_DOMAIN_NUMBER(pci_dev), NV_PCI_BUS_NUMBER(pci_dev), NV_PCI_SLOT_NUMBER(pci_dev), PCI_FUNC(pci_dev->devfn)); /* Disable memory decoding - required by the kernel APIs */ pci_read_config_word(pci_dev, PCI_COMMAND, &cmd); pci_write_config_word(pci_dev, PCI_COMMAND, cmd & ~PCI_COMMAND_MEMORY); /* Release BAR1 */ pci_release_resource(pci_dev, NV_GPU_BAR1); /* Release BAR3 - we don't want to resize it, it's in the same bridge, so we'll want to move it */ pci_release_resource(pci_dev, NV_GPU_BAR3); resize: /* Attempt to resize BAR1 to the largest supported size */ r = pci_resize_resource(pci_dev, NV_GPU_BAR1, requested_size); if (r) { if (r == -ENOSPC) { /* step through smaller sizes down to original size */ if (requested_size > old_size) { clear_bit(fls(sizes) - 1, &sizes); requested_size = fls(sizes) - 1; goto resize; } else { nv_printf(NV_DBG_ERRORS, "NVRM: No address space to allocate resized BAR1.\n"); } } else if (r == -EOPNOTSUPP) { nv_printf(NV_DBG_WARNINGS, "NVRM: BAR resize resource not supported.\n"); } else { nv_printf(NV_DBG_WARNINGS, "NVRM: BAR resizing failed with error `%d`.\n", r); } } /* Re-attempt assignment of PCIe resources */ pci_assign_unassigned_bus_resources(pci_dev->bus); if ((pci_resource_flags(pci_dev, NV_GPU_BAR1) & IORESOURCE_UNSET) || (pci_resource_flags(pci_dev, NV_GPU_BAR3) & IORESOURCE_UNSET)) { if (requested_size != old_size) { /* Try to get the BAR back with the original size */ requested_size = old_size; goto resize; } /* Something went horribly wrong and the kernel didn't manage to re-allocate BAR1. This is unlikely (because we had space before), but can happen. */ nv_printf(NV_DBG_ERRORS, "NVRM: FATAL: Failed to re-allocate BAR1.\n"); ret = -ENODEV; } /* Re-enable memory decoding */ pci_write_config_word(pci_dev, PCI_COMMAND, cmd); return ret; #else nv_printf(NV_DBG_INFO, "NVRM: Resizable BAR is not supported on this kernel version.\n"); return 0; #endif /* NV_PCI_REBAR_GET_POSSIBLE_SIZES_PRESENT */ } #if defined(CONFIG_ACPI_NUMA) && NV_IS_EXPORT_SYMBOL_PRESENT_pxm_to_node /* * Parse the SRAT table to look for numa node associated with the GPU. * * find_gpu_numa_nodes_in_srat() is strongly associated with * nv_init_coherent_link_info(). Hence matching the conditions wrapping. 
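 *
 * Background: an SRAT Generic Initiator Affinity entry associates a non-CPU
 * initiator (here the GPU, identified by PCI segment and bus in its
 * device_handle) with an ACPI proximity domain, and pxm_to_node() maps that
 * proximity domain to a Linux NUMA node id.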
*/ static NvU32 find_gpu_numa_nodes_in_srat(nv_linux_state_t *nvl) { NvU32 gi_dbdf, dev_dbdf, pxm_count = 0; struct acpi_table_header *table_header; struct acpi_subtable_header *subtable_header; unsigned long table_end, subtable_header_length; struct acpi_srat_generic_affinity *gi; NvU32 numa_node = NUMA_NO_NODE; if (NV_PCI_DEVFN(nvl->pci_dev) != 0) { nv_printf(NV_DBG_ERRORS, "NVRM: Failing to parse SRAT GI for %04x:%02x:%02x.%x " "since non-zero device function is not supported.\n", NV_PCI_DOMAIN_NUMBER(nvl->pci_dev), NV_PCI_BUS_NUMBER(nvl->pci_dev), NV_PCI_SLOT_NUMBER(nvl->pci_dev), PCI_FUNC(nvl->pci_dev->devfn)); return 0; } if (acpi_get_table(ACPI_SIG_SRAT, 0, &table_header)) { nv_printf(NV_DBG_INFO, "NVRM: Failed to parse the SRAT table.\n"); return 0; } table_end = (unsigned long)table_header + table_header->length; subtable_header = (struct acpi_subtable_header *) ((unsigned long)table_header + sizeof(struct acpi_table_srat)); subtable_header_length = subtable_header->length; dev_dbdf = NV_PCI_DOMAIN_NUMBER(nvl->pci_dev) << 16 | NV_PCI_BUS_NUMBER(nvl->pci_dev) << 8 | NV_PCI_DEVFN(nvl->pci_dev); /* * On baremetal and passthrough, there could be upto 8 generic initiators. * This is not a hack as a device can have any number of initiators hardware * supports. */ while (subtable_header_length && (((unsigned long)subtable_header) + subtable_header_length <= table_end)) { if (subtable_header->type == ACPI_SRAT_TYPE_GENERIC_AFFINITY) { NvU8 busAtByte2, busAtByte3; gi = (struct acpi_srat_generic_affinity *) subtable_header; busAtByte2 = gi->device_handle[2]; busAtByte3 = gi->device_handle[3]; // Device and function should be zero enforced by above check gi_dbdf = *((NvU16 *)(&gi->device_handle[0])) << 16 | (busAtByte2 != 0 ? busAtByte2 : busAtByte3) << 8; if (gi_dbdf == dev_dbdf) { numa_node = pxm_to_node(gi->proximity_domain); if (numa_node < MAX_NUMNODES) { pxm_count++; set_bit(numa_node, nvl->coherent_link_info.free_node_bitmap); } else { /* We shouldn't be here. This is a mis-configuration. */ nv_printf(NV_DBG_INFO, "NVRM: Invalid node-id found.\n"); pxm_count = 0; goto exit; } nv_printf(NV_DBG_INFO, "NVRM: matching SRAT GI entry: 0x%x 0x%x 0x%x 0x%x PXM: %d\n", gi->device_handle[3], gi->device_handle[2], gi->device_handle[1], gi->device_handle[0], gi->proximity_domain); if ((busAtByte2) == 0 && (busAtByte3) != 0) { /* * TODO: Remove this WAR once Hypervisor stack is updated * to fix this bug and after all CSPs have moved to using * the updated Hypervisor stack with fix. 
*/ nv_printf(NV_DBG_WARNINGS, "NVRM: PCIe bus value picked from byte 3 offset in SRAT GI entry: 0x%x 0x%x 0x%x 0x%x PXM: %d\n" "NVRM: Hypervisor stack is old and not following ACPI spec defined offset.\n" "NVRM: Please consider upgrading the Hypervisor stack as this workaround will be removed in future release.\n", gi->device_handle[3], gi->device_handle[2], gi->device_handle[1], gi->device_handle[0], gi->proximity_domain); } } } subtable_header = (struct acpi_subtable_header *) ((unsigned long) subtable_header + subtable_header_length); subtable_header_length = subtable_header->length; } exit: acpi_put_table(table_header); return pxm_count; } #endif // defined(CONFIG_ACPI_NUMA) && NV_IS_EXPORT_SYMBOL_PRESENT_pxm_to_node static void nv_init_coherent_link_info ( nv_state_t *nv ) { #if defined(CONFIG_ACPI_NUMA) && NV_IS_EXPORT_SYMBOL_PRESENT_pxm_to_node nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv); NvU64 pa = 0; NvU64 pxm_start = 0; NvU64 pxm_count = 0; NvU32 pxm; NvU32 gi_found = 0, node; if (!NVCPU_IS_AARCH64) return; if (!dev_is_pci(nvl->dev)) return; gi_found = find_gpu_numa_nodes_in_srat(nvl); if (!gi_found && (device_property_read_u64(nvl->dev, "nvidia,gpu-mem-pxm-start", &pxm_start) != 0 || device_property_read_u64(nvl->dev, "nvidia,gpu-mem-pxm-count", &pxm_count) != 0)) goto failed; if (device_property_read_u64(nvl->dev, "nvidia,gpu-mem-base-pa", &pa) == 0) { nvl->coherent_link_info.gpu_mem_pa = pa; } else { unsigned int gpu_bar1_offset, gpu_bar2_offset; /* * This implies that the DSD key for PXM start and count is present * while the one for Physical Address (PA) is absent. */ if (nv_get_hypervisor_type() == OS_HYPERVISOR_UNKNOWN) { /* Fail for the baremetal case */ goto failed; } /* * For the virtualization usecase on SHH, the coherent GPU memory * PA is exposed as BAR2 to the VM and the "nvidia,gpu-mem-base-pa" * is not present. Set the GPU memory PA to the BAR2 start address. * * In the case of passthrough, reserved memory portion of the coherent * GPU memory is exposed as BAR1 */ /* * Hopper+ uses 64-bit BARs, so GPU BAR2 should be at BAR4/5 and * GPU BAR1 is at BAR2/3 */ gpu_bar1_offset = 2; gpu_bar2_offset = 4; /* * cannot use nv->bars[] here as it is not populated correctly if BAR1 is * not present but BAR2 is, even though PCIe spec allows it. Not fixing * nv->bars[] since this is not a valid scenario with the actual HW and * possible only with this host emulated BAR scenario. */ if (!((NV_PCI_RESOURCE_VALID(nvl->pci_dev, gpu_bar2_offset)) && (NV_PCI_RESOURCE_FLAGS(nvl->pci_dev, gpu_bar2_offset) & PCI_BASE_ADDRESS_SPACE) == PCI_BASE_ADDRESS_SPACE_MEMORY)) { // BAR2 contains the cacheable part of the coherent FB region and must have. goto failed; } nvl->coherent_link_info.gpu_mem_pa = NV_PCI_RESOURCE_START(nvl->pci_dev, gpu_bar2_offset); if ((pci_devid_is_self_hosted_hopper(nv->pci_info.device_id)) && (NV_PCI_RESOURCE_VALID(nvl->pci_dev, gpu_bar1_offset)) && (NV_PCI_RESOURCE_FLAGS(nvl->pci_dev, gpu_bar1_offset) & PCI_BASE_ADDRESS_SPACE) == PCI_BASE_ADDRESS_SPACE_MEMORY) { // Present only in passthrough case for self-hosted hopper. nvl->coherent_link_info.rsvd_mem_pa = NV_PCI_RESOURCE_START(nvl->pci_dev, gpu_bar1_offset); // // Unset nv->bars[1] only for self-hosted Hopper as BAR1 in virtualization case // for hopper is used to convey RM reserved memory information and doesn't contain // the traditional GPU BAR2. Starting from Blackwell BAR1 will be the real BAR1. 
// memset(&nv->bars[1], 0, sizeof(nv->bars[1])); } // // Unset nv->bars[2] for all self-hosted systems as BAR2 in the virtualization case // is used only to convey the coherent GPU memory information and doesn't contain // the traditional GPU BAR2. This is to ensure the coherent FB addresses don't // inadvertently pass the IS_FB_OFFSET or IS_IMEM_OFFSET checks. // memset(&nv->bars[2], 0, sizeof(nv->bars[2])); } NV_DEV_PRINTF(NV_DBG_INFO, nv, "DSD properties: \n"); NV_DEV_PRINTF(NV_DBG_INFO, nv, "\tGPU memory PA: 0x%lx \n", nvl->coherent_link_info.gpu_mem_pa); if (pci_devid_is_self_hosted_hopper(nv->pci_info.device_id)) { NV_DEV_PRINTF(NV_DBG_INFO, nv, "\tGPU reserved memory PA: 0x%lx \n", nvl->coherent_link_info.rsvd_mem_pa); } if (!gi_found) { for (pxm = pxm_start; pxm < (pxm_start + pxm_count); pxm++) { node = pxm_to_node(pxm); if (node != NUMA_NO_NODE) { set_bit(node, nvl->coherent_link_info.free_node_bitmap); } } } for (node = 0; (node = find_next_bit(nvl->coherent_link_info.free_node_bitmap, MAX_NUMNODES, node)) != MAX_NUMNODES; node++) { NV_DEV_PRINTF(NV_DBG_INFO, nv, "\tNVRM: GPU memory NUMA node: %u\n", node); } if (NVreg_EnableUserNUMAManagement && !os_is_vgx_hyper()) { NV_ATOMIC_SET(nvl->numa_info.status, NV_IOCTL_NUMA_STATUS_OFFLINE); nvl->numa_info.use_auto_online = NV_TRUE; if (!bitmap_empty(nvl->coherent_link_info.free_node_bitmap, MAX_NUMNODES)) { nvl->numa_info.node_id = find_first_bit(nvl->coherent_link_info.free_node_bitmap, MAX_NUMNODES); } NV_DEV_PRINTF(NV_DBG_SETUP, nv, "GPU NUMA information: node id: %u PA: 0x%llx\n", nvl->numa_info.node_id, nvl->coherent_link_info.gpu_mem_pa); } else { NV_DEV_PRINTF(NV_DBG_SETUP, nv, "User-mode NUMA onlining disabled.\n"); } return; failed: NV_DEV_PRINTF(NV_DBG_SETUP, nv, "Cannot get coherent link info.\n"); #endif // defined(CONFIG_ACPI_NUMA) && NV_IS_EXPORT_SYMBOL_PRESENT_pxm_to_node return; } #if defined(CONFIG_PM_DEVFREQ) #define to_tegra_devfreq_dev(x) \ container_of(x, struct nv_pci_tegra_devfreq_dev, dev) struct nv_pci_tegra_devfreq_data { const char *clk_name; const char *icc_name; const unsigned int gpc_fuse_field; const TEGRASOC_DEVFREQ_CLK devfreq_clk; }; #if NV_HAS_COOLING_SUPPORTED struct nv_pci_tegra_thermal_data { const char *tz_name; const struct thermal_trip *passive_trip; struct list_head zones; }; #endif struct nv_pci_tegra_devfreq_dev { TEGRASOC_DEVFREQ_CLK devfreq_clk; int domain; struct device dev; struct list_head gpc_cluster; struct list_head nvd_cluster; struct nv_pci_tegra_devfreq_dev *gpc_master; struct nv_pci_tegra_devfreq_dev *nvd_master; struct clk *clk; struct devfreq *devfreq; bool boost_enabled; struct delayed_work boost_disable; #if NV_HAS_ICC_SUPPORTED struct icc_path *icc_path; #endif #if NV_HAS_COOLING_SUPPORTED struct list_head therm_zones; #endif }; static const struct nv_pci_tegra_devfreq_data gb10b_tegra_devfreq_table[] = { { .clk_name = "gpc0clk", .icc_name = "gpu-write", .gpc_fuse_field = BIT(0), .devfreq_clk = TEGRASOC_DEVFREQ_CLK_GPC, }, { .clk_name = "gpc1clk", .icc_name = "gpu-write", .gpc_fuse_field = BIT(1), .devfreq_clk = TEGRASOC_DEVFREQ_CLK_GPC, }, { .clk_name = "gpc2clk", .icc_name = "gpu-write", .gpc_fuse_field = BIT(2), .devfreq_clk = TEGRASOC_DEVFREQ_CLK_GPC, }, { .clk_name = "nvdclk", .icc_name = "video-write", .devfreq_clk = TEGRASOC_DEVFREQ_CLK_NVD, }, { .clk_name = "sysclk" }, { .clk_name = "uprocclk" }, }; static void nv_pci_gb10b_device_release(struct device *dev) { ; } static int nv_pci_gb10b_devfreq_target(struct device *dev, unsigned long *freq, u32 flags) { struct 
pci_dev *pdev = to_pci_dev(dev->parent); nv_linux_state_t *nvl = pci_get_drvdata(pdev); struct nv_pci_tegra_devfreq_dev *tdev = to_tegra_devfreq_dev(dev), *tptr; unsigned long rate; #if NV_HAS_ICC_SUPPORTED static const unsigned num_mss_ports = 8; static const unsigned mss_port_bandwidth = 32; static const unsigned gpu_bus_bandwidth = num_mss_ports * mss_port_bandwidth; u32 kBps; #endif // // GPU can be under suspending/suspended/resuming state defined the runtime // PM (RPM) framework. When device is under either of these states, DVFS based // on GPU load information should be disabled. // // Complete load-based DVFS cycle involve GPU load query through rmapi and // clock scaling through BPMP MRQ_CLK mailbox request, which will awake the // GPU and contradict the suspended state. // if (!pm_runtime_active(&pdev->dev)) { *freq = tdev->devfreq->scaling_min_freq; return 0; } clk_set_rate(tdev->clk, *freq); *freq = clk_get_rate(tdev->clk); #if NV_HAS_ICC_SUPPORTED if (tdev->icc_path != NULL) { kBps = Bps_to_icc(*freq * gpu_bus_bandwidth * 400 / 1000); if (tdev->boost_enabled) { kBps = UINT_MAX; } icc_set_bw(tdev->icc_path, kBps, 0); } #endif rate = 0; list_for_each_entry(tptr, &tdev->gpc_cluster, gpc_cluster) { if (tptr->gpc_master != NULL) { rate = max(rate, clk_get_rate(tptr->gpc_master->clk)); } if (tptr->nvd_master != NULL) { rate = max(rate, clk_get_rate(tptr->nvd_master->clk)); } if (tdev->boost_enabled && (tptr == nvl->sys_devfreq_dev || tptr == nvl->pwr_devfreq_dev)) { clk_set_rate(tptr->clk, ULONG_MAX); } else { clk_set_rate(tptr->clk, rate); } } rate = 0; list_for_each_entry(tptr, &tdev->nvd_cluster, nvd_cluster) { if (tptr->gpc_master != NULL) { rate = max(rate, clk_get_rate(tptr->gpc_master->clk)); } if (tptr->nvd_master != NULL) { rate = max(rate, clk_get_rate(tptr->nvd_master->clk)); } if (tdev->boost_enabled && (tptr == nvl->sys_devfreq_dev || tptr == nvl->pwr_devfreq_dev)) { clk_set_rate(tptr->clk, ULONG_MAX); } else { clk_set_rate(tptr->clk, rate); } } return 0; } static int nv_pci_tegra_devfreq_get_cur_freq(struct device *dev, unsigned long *freq) { struct nv_pci_tegra_devfreq_dev *tdev = to_tegra_devfreq_dev(dev); // // When GPU is suspended(railgated), the PM runtime suspend callback should // suspend all devfreq devices, and devfreq cycle should not be triggered. // // However, users are still able to change the devfreq governor from the // sysfs interface and indirectly invoke the update_devfreq function, which // will further call the get_dev_status callback function. // if (pm_runtime_active(dev->parent)) *freq = clk_get_rate(tdev->clk); else *freq = tdev->devfreq->scaling_min_freq; return 0; } static int nv_pci_tegra_devfreq_get_dev_status(struct device *dev, struct devfreq_dev_status *stat) { struct pci_dev *pdev = to_pci_dev(dev->parent); nv_linux_state_t *nvl = pci_get_drvdata(pdev); nv_state_t *nv = NV_STATE_PTR(nvl); nvidia_stack_t *sp = NULL; struct nv_pci_tegra_devfreq_dev *tdev = to_tegra_devfreq_dev(dev); unsigned int load = 0; int retval = 0; NV_STATUS status; // // GPU can be under suspending/suspended/resuming state defined the runtime // PM (RPM) framework. When device is under either of these states, DVFS based // on GPU load information should be disabled. // // Complete load-based DVFS cycle involve GPU load query through rmapi and // clock scaling through BPMP MRQ_CLK mailbox request, which will awake the // GPU and contradict the suspended state. 
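    //
    // (pm_runtime_active() returns true when the device's runtime-PM status
    // is RPM_ACTIVE, or when runtime PM is disabled for the device, so a
    // false result covers exactly the suspending/suspended/resuming window
    // described above.)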
// if (!pm_runtime_active(&pdev->dev)) { stat->total_time = 100; stat->busy_time = 0; stat->current_frequency = tdev->devfreq->scaling_min_freq; return 0; } retval = nv_kmem_cache_alloc_stack(&sp); if (retval != 0) { dev_warn(&pdev->dev, "fail to nv_kmem_cache_alloc_stack: %d\n", retval); return -ENOMEM; } // // Fetch the load value in percentage from the specified clock domain. If the // load information is unavailable, just consider the load as 100% so that the // devfreq core will scale the underlying clock to Fmax to prevent any // performance drop. // status = rm_pmu_perfmon_get_load(sp, nv, &load, tdev->devfreq_clk); if (status != NV_OK) { load = 100; } // Load calculation equals to (busy_time / total_time) in devfreq governors // and devfreq governors expect total_time and busy_time in the same unit stat->total_time = 100; stat->busy_time = load; stat->current_frequency = clk_get_rate(tdev->clk); nv_kmem_cache_free_stack(sp); return retval; } static void populate_opp_table(struct nv_pci_tegra_devfreq_dev *tdev) { unsigned long max_rate, min_rate, step, rate; long val; /* Get the max rate of the clock */ val = clk_round_rate(tdev->clk, ULONG_MAX); max_rate = (val < 0) ? ULONG_MAX : (unsigned long)val; /* Get the min rate of the clock */ val = clk_round_rate(tdev->clk, 0); min_rate = (val < 0) ? ULONG_MAX : (unsigned long)val; /* Get the step size of the clock */ step = (min_rate == ULONG_MAX) ? ULONG_MAX : (min_rate + 1); val = clk_round_rate(tdev->clk, step); if ((val < 0) || (val < min_rate)) { step = 0; } else { step = (unsigned long)val - min_rate; } /* Create the OPP table */ rate = min_rate; do { dev_pm_opp_add(&tdev->dev, rate, 0); rate += step; } while (rate <= max_rate && step); } static void nv_pci_tegra_devfreq_remove_opps(struct nv_pci_tegra_devfreq_dev *tdev) { #if defined(NV_DEVFREQ_HAS_FREQ_TABLE) unsigned long *freq_table = tdev->devfreq->freq_table; unsigned int max_state = tdev->devfreq->max_state; #else unsigned long *freq_table = tdev->devfreq->profile->freq_table; unsigned int max_state = tdev->devfreq->profile->max_state; #endif int i; for (i = 0; i < max_state; i++) { dev_pm_opp_remove(&tdev->dev, freq_table[i]); } } #if NV_HAS_COOLING_SUPPORTED static int nv_pci_tegra_thermal_get_passive_trip_cb(struct thermal_trip *trip, void *arg) { const struct thermal_trip **ptrip = arg; /* Return zero to continue the search */ if (trip->type != THERMAL_TRIP_PASSIVE) return 0; /* Return nonzero to terminate the search */ *ptrip = trip; return -1; } static int nv_pci_tegra_init_cooling_device(struct nv_pci_tegra_devfreq_dev *tdev) { struct device *pdev = tdev->dev.parent; const struct thermal_trip *passive_trip = NULL; struct devfreq *devfreq = tdev->devfreq; struct nv_pci_tegra_thermal_data *data; struct thermal_zone_device *tzdev; int i, err, val, n_strings, n_elems; u32 temp_min, temp_max; const char *tz_name; if (!devfreq->cdev) { nv_printf(NV_DBG_ERRORS, "NVRM: devfreq cooling cannot be found\n"); return -ENODEV; } if (!pdev->of_node) { nv_printf(NV_DBG_ERRORS, "NVRM: associated OF node cannot be found\n"); return -ENODEV; } val = of_property_count_strings(pdev->of_node, "nvidia,thermal-zones"); if (val == -EINVAL) { return 0; } else if (val < 0) { nv_printf(NV_DBG_ERRORS, "NVRM: nvidia,thermal-zones DT property format error\n"); return val; } n_strings = val; val = of_property_count_u32_elems(pdev->of_node, "nvidia,cooling-device"); if (val == -EINVAL) { return 0; } else if (val < 0) { nv_printf(NV_DBG_ERRORS, "NVRM: nvidia,cooling-device DT property format error\n"); 
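    /*
     * Shape being validated here (a sketch; names and values are
     * illustrative placeholders, not taken from a real device tree):
     *
     *   nvidia,thermal-zones  = "tz-a", "tz-b";
     *   nvidia,cooling-device = <min_a max_a>, <min_b max_b>;
     *
     * One string per zone and two u32s per string (later handed to
     * thermal_bind_cdev_to_trip() as the min/max limits), so the element
     * count must be exactly twice the string count.
     */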
        return val;
    }
    n_elems = val;

    if ((n_elems >> 1) != n_strings)
    {
        nv_printf(NV_DBG_ERRORS,
            "NVRM: the number of strings in nvidia,thermal-zones must be "
            "exactly half the number of elements in nvidia,cooling-device\n");
        return -EINVAL;
    }

    if ((n_elems & 1) != 0)
    {
        nv_printf(NV_DBG_ERRORS,
            "NVRM: the number of elements in nvidia,cooling-device must "
            "be an even number\n");
        return -EINVAL;
    }

    for (i = 0; i < n_strings; i++)
    {
        data = devm_kzalloc(pdev, sizeof(*data), GFP_KERNEL);
        if (data == NULL)
        {
            err = -ENOMEM;
            goto err_nv_pci_tegra_init_cooling_device;
        }

        of_property_read_string_index(pdev->of_node, "nvidia,thermal-zones",
                                      i, &tz_name);
        of_property_read_u32_index(pdev->of_node, "nvidia,cooling-device",
                                   (i << 1) + 0, &temp_min);
        of_property_read_u32_index(pdev->of_node, "nvidia,cooling-device",
                                   (i << 1) + 1, &temp_max);

        tzdev = thermal_zone_get_zone_by_name(tz_name);
        if (IS_ERR(tzdev))
        {
            nv_printf(NV_DBG_ERRORS,
                "NVRM: fail to get %s thermal_zone_device\n", tz_name);
            err = -ENODEV;
            goto err_nv_pci_tegra_init_cooling_device;
        }

        thermal_zone_for_each_trip(tzdev,
            nv_pci_tegra_thermal_get_passive_trip_cb, &passive_trip);
        if (passive_trip == NULL)
        {
            nv_printf(NV_DBG_ERRORS,
                "NVRM: fail to find passive_trip in %s thermal_zone_device\n",
                tz_name);
            err = -ENODEV;
            goto err_nv_pci_tegra_init_cooling_device;
        }

        val = thermal_bind_cdev_to_trip(tzdev, passive_trip, devfreq->cdev,
                                        temp_max, temp_min,
                                        THERMAL_WEIGHT_DEFAULT);
        if (val != 0)
        {
            nv_printf(NV_DBG_ERRORS,
                "NVRM: fail to bind devfreq cooling device with %s thermal_zone_device\n",
                tz_name);
            err = -ENODEV;
            goto err_nv_pci_tegra_init_cooling_device;
        }

        data->tz_name = tz_name;
        data->passive_trip = passive_trip;
        list_add_tail(&data->zones, &tdev->therm_zones);
    }

    return 0;

err_nv_pci_tegra_init_cooling_device:
    list_for_each_entry(data, &tdev->therm_zones, zones)
    {
        tzdev = thermal_zone_get_zone_by_name(data->tz_name);
        if (IS_ERR(tzdev))
        {
            continue;
        }
        thermal_unbind_cdev_from_trip(tzdev, data->passive_trip,
                                      devfreq->cdev);
    }

    return err;
}
#endif

static int nv_pci_gb10b_add_devfreq_device(struct nv_pci_tegra_devfreq_dev *tdev)
{
    struct devfreq_dev_profile *profile;
    int err;

    populate_opp_table(tdev);

    profile = devm_kzalloc(&tdev->dev, sizeof(*profile), GFP_KERNEL);
    if (profile == NULL)
    {
        err = -ENOMEM;
        goto err_nv_pci_gb10b_add_devfreq_device_opp;
    }

    profile->target = nv_pci_gb10b_devfreq_target;
    profile->get_cur_freq = nv_pci_tegra_devfreq_get_cur_freq;
    profile->get_dev_status = nv_pci_tegra_devfreq_get_dev_status;
    profile->initial_freq = clk_get_rate(tdev->clk);
    profile->polling_ms = 25;
#if NV_HAS_COOLING_SUPPORTED
    profile->is_cooling_device = true;
#endif

    tdev->devfreq = devm_devfreq_add_device(&tdev->dev, profile,
                                            DEVFREQ_GOV_PERFORMANCE, NULL);
    if (IS_ERR(tdev->devfreq))
    {
        err = PTR_ERR(tdev->devfreq);
        goto err_nv_pci_gb10b_add_devfreq_device_opp;
    }

#if NV_HAS_COOLING_SUPPORTED
    err = nv_pci_tegra_init_cooling_device(tdev);
    if (err)
    {
        goto err_nv_pci_gb10b_add_devfreq_device;
    }
#endif

    return 0;

#if NV_HAS_COOLING_SUPPORTED
err_nv_pci_gb10b_add_devfreq_device:
    devm_devfreq_remove_device(&tdev->dev, tdev->devfreq);
#endif
err_nv_pci_gb10b_add_devfreq_device_opp:
    nv_pci_tegra_devfreq_remove_opps(tdev);
    return err;
}

static int nv_pci_gb10b_register_devfreq(struct pci_dev *pdev)
{
    nv_linux_state_t *nvl = pci_get_drvdata(pdev);
    nv_state_t *nv = NV_STATE_PTR(nvl);
    struct pci_bus *pbus = pdev->bus;
    const struct nv_pci_tegra_devfreq_data *tdata;
    struct nv_pci_tegra_devfreq_dev *tdev;
#if NV_HAS_ICC_SUPPORTED
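    /*
     * The interconnect path acquired below is used by the devfreq target
     * callback to vote memory bandwidth in proportion to the clock:
     * kBps = Bps_to_icc(freq * gpu_bus_bandwidth * 400 / 1000), where
     * gpu_bus_bandwidth = 8 MSS ports * 32 bytes = 256 bytes per cycle and
     * the 400/1000 factor derates the peak to 40%. A worked example:
     * at freq = 1 GHz, 1e9 * 256 * 0.4 = 102.4 GB/s, i.e. a vote of
     * ~1.0e8 kBps.
     */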
struct icc_path *icc_path; #endif struct clk *clk; int i, err, node; u32 gpu_pg_mask; while (pbus->parent != NULL) { pbus = pbus->parent; } node = max(0, dev_to_node(to_pci_host_bridge(pbus->bridge)->dev.parent)); if (nv->tegra_pci_igpu_pg_mask == NV_TEGRA_PCI_IGPU_PG_MASK_DEFAULT) { gpu_pg_mask = 0; } else { gpu_pg_mask = nv->tegra_pci_igpu_pg_mask; nv_printf(NV_DBG_INFO, "NVRM: devfreq register receives gpu_pg_mask = %u\n", gpu_pg_mask); } for (i = 0; i < nvl->devfreq_table_size; i++) { tdata = &nvl->devfreq_table[i]; if (gpu_pg_mask && (gpu_pg_mask & tdata->gpc_fuse_field)) { continue; } #if NV_HAS_ICC_SUPPORTED if (tdata->icc_name != NULL) { icc_path = devm_of_icc_get(&pdev->dev, tdata->icc_name); if (IS_ERR_OR_NULL(icc_path)) { icc_path = NULL; } } #endif clk = devm_clk_get(&pdev->dev, tdata->clk_name); if (IS_ERR_OR_NULL(clk)) { continue; } tdev = devm_kzalloc(&pdev->dev, sizeof(*tdev), GFP_KERNEL); if (tdev == NULL) { return -ENOMEM; } INIT_LIST_HEAD(&tdev->gpc_cluster); INIT_LIST_HEAD(&tdev->nvd_cluster); #if NV_HAS_COOLING_SUPPORTED INIT_LIST_HEAD(&tdev->therm_zones); #endif #if NV_HAS_ICC_SUPPORTED tdev->icc_path = icc_path; #endif tdev->clk = clk; tdev->domain = node; tdev->devfreq_clk = tdata->devfreq_clk; tdev->dev.parent = &pdev->dev; tdev->dev.release = nv_pci_gb10b_device_release; dev_set_name(&tdev->dev, "%s-%d", tdata->clk_name, node); if (strstr(tdata->clk_name, "gpc")) { if (nvl->gpc_devfreq_dev != NULL) { list_add_tail(&tdev->gpc_cluster, &nvl->gpc_devfreq_dev->gpc_cluster); tdev->gpc_master = nvl->gpc_devfreq_dev; } else { nvl->gpc_devfreq_dev = tdev; dev_set_name(&tdev->dev, "gpu-gpc-%d", node); } } else if (strstr(tdata->clk_name, "nvd")) { nvl->nvd_devfreq_dev = tdev; dev_set_name(&tdev->dev, "gpu-nvd-%d", node); } else if (strstr(tdata->clk_name, "sys")) { nvl->sys_devfreq_dev = tdev; dev_set_name(&tdev->dev, "gpu-sys-%d", node); } else if (strstr(tdata->clk_name, "uproc")) { nvl->pwr_devfreq_dev = tdev; dev_set_name(&tdev->dev, "gpu-pwr-%d", node); } err = device_register(&tdev->dev); if (err != 0) { goto error_return; } } if (nvl->gpc_devfreq_dev != NULL) { err = nv_pci_gb10b_add_devfreq_device(nvl->gpc_devfreq_dev); if (err != 0) { nvl->gpc_devfreq_dev->devfreq = NULL; goto error_slave_teardown; } #if defined(NV_DEVFREQ_HAS_SUSPEND_FREQ) else { // Set the devfreq suspend frequency for the GPC devfreq device. nvl->gpc_devfreq_dev->devfreq->suspend_freq = nvl->tegra_suspend_freq; } #endif if (nvl->sys_devfreq_dev != NULL) { list_add_tail(&nvl->sys_devfreq_dev->gpc_cluster, &nvl->gpc_devfreq_dev->gpc_cluster); nvl->sys_devfreq_dev->gpc_master = nvl->gpc_devfreq_dev; } if (nvl->pwr_devfreq_dev != NULL) { list_add_tail(&nvl->pwr_devfreq_dev->gpc_cluster, &nvl->gpc_devfreq_dev->gpc_cluster); nvl->pwr_devfreq_dev->gpc_master = nvl->gpc_devfreq_dev; } } if (nvl->nvd_devfreq_dev != NULL) { err = nv_pci_gb10b_add_devfreq_device(nvl->nvd_devfreq_dev); if (err != 0) { nvl->nvd_devfreq_dev->devfreq = NULL; goto error_slave_teardown; } #if defined(NV_DEVFREQ_HAS_SUSPEND_FREQ) else { // Set the devfreq suspend frequency for the NVD devfreq device. 
nvl->nvd_devfreq_dev->devfreq->suspend_freq = nvl->tegra_suspend_freq; } #endif if (nvl->sys_devfreq_dev != NULL) { list_add_tail(&nvl->sys_devfreq_dev->nvd_cluster, &nvl->nvd_devfreq_dev->nvd_cluster); nvl->sys_devfreq_dev->nvd_master = nvl->nvd_devfreq_dev; } if (nvl->pwr_devfreq_dev != NULL) { list_add_tail(&nvl->pwr_devfreq_dev->nvd_cluster, &nvl->nvd_devfreq_dev->nvd_cluster); nvl->pwr_devfreq_dev->nvd_master = nvl->nvd_devfreq_dev; } } return 0; error_slave_teardown: if (nvl->sys_devfreq_dev != NULL) { if (nvl->sys_devfreq_dev->gpc_master != NULL) { list_del(&nvl->sys_devfreq_dev->gpc_cluster); nvl->sys_devfreq_dev->gpc_master = NULL; } device_unregister(&nvl->sys_devfreq_dev->dev); nvl->sys_devfreq_dev = NULL; } if (nvl->pwr_devfreq_dev != NULL) { if (nvl->pwr_devfreq_dev->gpc_master != NULL) { list_del(&nvl->pwr_devfreq_dev->gpc_cluster); nvl->pwr_devfreq_dev->gpc_master = NULL; } device_unregister(&nvl->pwr_devfreq_dev->dev); nvl->pwr_devfreq_dev = NULL; } error_return: /* The caller will call unregister to unwind on failure */ return err; } static int nv_pci_gb10b_suspend_devfreq(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); nv_linux_state_t *nvl = pci_get_drvdata(pci_dev); int err = 0; if (nvl->gpc_devfreq_dev != NULL && nvl->gpc_devfreq_dev->devfreq != NULL) { err = devfreq_suspend_device(nvl->gpc_devfreq_dev->devfreq); if (err) { return err; } #if NV_HAS_ICC_SUPPORTED if (nvl->gpc_devfreq_dev->icc_path != NULL) { icc_set_bw(nvl->gpc_devfreq_dev->icc_path, 0, 0); } #endif } if (nvl->nvd_devfreq_dev != NULL && nvl->nvd_devfreq_dev->devfreq != NULL) { err = devfreq_suspend_device(nvl->nvd_devfreq_dev->devfreq); if (err) { return err; } #if NV_HAS_ICC_SUPPORTED if (nvl->nvd_devfreq_dev->icc_path != NULL) { icc_set_bw(nvl->nvd_devfreq_dev->icc_path, 0, 0); } #endif } return err; } static int nv_pci_gb10b_resume_devfreq(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); nv_linux_state_t *nvl = pci_get_drvdata(pci_dev); int err = 0; if (nvl->gpc_devfreq_dev != NULL && nvl->gpc_devfreq_dev->devfreq != NULL) { err = devfreq_resume_device(nvl->gpc_devfreq_dev->devfreq); if (err) { return err; } } if (nvl->nvd_devfreq_dev != NULL && nvl->nvd_devfreq_dev->devfreq != NULL) { err = devfreq_resume_device(nvl->nvd_devfreq_dev->devfreq); if (err) { return err; } } return err; } static void nv_pci_devfreq_disable_boost(struct work_struct *work) { #if defined(NV_UPDATE_DEVFREQ_PRESENT) struct nv_pci_tegra_devfreq_dev *tdev; tdev = container_of(work, struct nv_pci_tegra_devfreq_dev, boost_disable.work); tdev->boost_enabled = 0; #endif } static int nv_pci_gb10b_devfreq_enable_boost(struct device *dev, unsigned int duration) { #if defined(NV_UPDATE_DEVFREQ_PRESENT) struct pci_dev *pci_dev = to_pci_dev(dev); nv_linux_state_t *nvl = pci_get_drvdata(pci_dev); struct nv_pci_tegra_devfreq_dev *tdev; unsigned long delay; if (duration == 0) return 0; delay = msecs_to_jiffies(duration * 1000); tdev = nvl->gpc_devfreq_dev; if (tdev != NULL && tdev->devfreq != NULL && tdev->boost_enabled == 0) { tdev->boost_enabled = 1; INIT_DELAYED_WORK(&tdev->boost_disable, nv_pci_devfreq_disable_boost); schedule_delayed_work(&tdev->boost_disable, delay); } tdev = nvl->nvd_devfreq_dev; if (tdev != NULL && tdev->devfreq != NULL && tdev->boost_enabled == 0) { tdev->boost_enabled = 1; INIT_DELAYED_WORK(&tdev->boost_disable, nv_pci_devfreq_disable_boost); schedule_delayed_work(&tdev->boost_disable, delay); } return 0; #else // !defined(NV_UPDATE_DEVFREQ_PRESENT) return -1; #endif } static int 
nv_pci_gb10b_devfreq_disable_boost(struct device *dev)
{
#if defined(NV_UPDATE_DEVFREQ_PRESENT)
    struct pci_dev *pci_dev = to_pci_dev(dev);
    nv_linux_state_t *nvl = pci_get_drvdata(pci_dev);
    struct nv_pci_tegra_devfreq_dev *tdev;

    tdev = nvl->gpc_devfreq_dev;
    if (tdev != NULL && tdev->devfreq != NULL && tdev->boost_enabled)
    {
        tdev->boost_enabled = 0;
        cancel_delayed_work_sync(&tdev->boost_disable);
    }

    tdev = nvl->nvd_devfreq_dev;
    if (tdev != NULL && tdev->devfreq != NULL && tdev->boost_enabled)
    {
        tdev->boost_enabled = 0;
        cancel_delayed_work_sync(&tdev->boost_disable);
    }

    return 0;
#else // !defined(NV_UPDATE_DEVFREQ_PRESENT)
    return -1;
#endif
}

struct nv_pci_tegra_data
{
    unsigned short vendor;
    unsigned short device;
    const struct nv_pci_tegra_devfreq_data *devfreq_table;
    unsigned int devfreq_table_size;
    int (*devfreq_register)(struct pci_dev*);
    int (*devfreq_suspend)(struct device*);
    int (*devfreq_resume)(struct device*);
    int (*devfreq_enable_boost)(struct device*, unsigned int);
    int (*devfreq_disable_boost)(struct device*);
};

static const struct nv_pci_tegra_data nv_pci_tegra_table[] = {
    {
        .vendor = 0x10de,
        .device = 0x2b00,
        .devfreq_table = gb10b_tegra_devfreq_table,
        .devfreq_table_size = ARRAY_SIZE(gb10b_tegra_devfreq_table),
        .devfreq_register = nv_pci_gb10b_register_devfreq,
        .devfreq_suspend = nv_pci_gb10b_suspend_devfreq,
        .devfreq_resume = nv_pci_gb10b_resume_devfreq,
        .devfreq_enable_boost = nv_pci_gb10b_devfreq_enable_boost,
        .devfreq_disable_boost = nv_pci_gb10b_devfreq_disable_boost,
    },
};

static void nv_pci_tegra_devfreq_remove(struct nv_pci_tegra_devfreq_dev *tdev)
{
    struct nv_pci_tegra_devfreq_dev *tptr, *next;
#if NV_HAS_COOLING_SUPPORTED
    struct nv_pci_tegra_thermal_data *data;
    struct thermal_zone_device *tzdev;
#endif

    if (tdev->devfreq != NULL)
    {
#if NV_HAS_COOLING_SUPPORTED
        list_for_each_entry(data, &tdev->therm_zones, zones)
        {
            tzdev = thermal_zone_get_zone_by_name(data->tz_name);
            if (IS_ERR(tzdev))
            {
                continue;
            }
            thermal_unbind_cdev_from_trip(tzdev, data->passive_trip,
                                          tdev->devfreq->cdev);
        }
#endif
        devm_devfreq_remove_device(&tdev->dev, tdev->devfreq);
        nv_pci_tegra_devfreq_remove_opps(tdev);
        tdev->devfreq = NULL;
    }

#if NV_HAS_ICC_SUPPORTED
    if (tdev->icc_path != NULL)
    {
        icc_set_bw(tdev->icc_path, 0, 0);
    }
#endif

    list_for_each_entry_safe(tptr, next, &tdev->gpc_cluster, gpc_cluster)
    {
        if (tptr->clk != NULL)
        {
            devm_clk_put(tdev->dev.parent, tptr->clk);
            tptr->clk = NULL;
            device_unregister(&tptr->dev);
        }
        list_del(&tptr->gpc_cluster);
        tptr->gpc_master = NULL;
    }

    list_for_each_entry_safe(tptr, next, &tdev->nvd_cluster, nvd_cluster)
    {
        if (tptr->clk != NULL)
        {
            devm_clk_put(tdev->dev.parent, tptr->clk);
            tptr->clk = NULL;
            device_unregister(&tptr->dev);
        }
        list_del(&tptr->nvd_cluster);
        tptr->nvd_master = NULL;
    }

    if (tdev->clk != NULL)
    {
        devm_clk_put(tdev->dev.parent, tdev->clk);
        tdev->clk = NULL;
        device_unregister(&tdev->dev);
    }
}

static void nv_pci_tegra_unregister_devfreq(struct pci_dev *pdev)
{
    nv_linux_state_t *nvl = pci_get_drvdata(pdev);

    if (nvl->gpc_devfreq_dev != NULL)
    {
        nv_pci_tegra_devfreq_remove(nvl->gpc_devfreq_dev);
        nvl->gpc_devfreq_dev = NULL;
    }

    if (nvl->nvd_devfreq_dev != NULL)
    {
        nv_pci_tegra_devfreq_remove(nvl->nvd_devfreq_dev);
        nvl->nvd_devfreq_dev = NULL;
    }
}

static const struct nv_pci_tegra_data* nv_pci_get_tegra_igpu_data(struct pci_dev *pdev)
{
    const struct nv_pci_tegra_data *tegra_data = NULL;
    int i;

    for (i = 0; i < ARRAY_SIZE(nv_pci_tegra_table); i++)
    {
        tegra_data = &nv_pci_tegra_table[i];
        if ((tegra_data->vendor == pdev->vendor) &&
            (tegra_data->device == pdev->device))
        {
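            /* Matched a supported Tegra iGPU entry (e.g. GB10B,
             * 0x10de:0x2b00, in nv_pci_tegra_table above). */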
return tegra_data; } } return NULL; } void nv_pci_tegra_boost_sys_uproc(struct device *dev) { #if defined(CONFIG_PM_DEVFREQ) nv_linux_state_t *nvl = pci_get_drvdata(to_pci_dev(dev)); const struct nv_pci_tegra_devfreq_data *tdata; struct clk *clk = NULL; unsigned long max_freq; int i; for (i = 0; i < nvl->devfreq_table_size; i++) { tdata = &nvl->devfreq_table[i]; /* * Don't boost the GPC, NVD clocks but only the uprocclk * and sysclk. This saves power consumption of GPU rail * and reduces impact to GPU suspend/resume latency. */ if (strstr(tdata->clk_name, "uproc") || strstr(tdata->clk_name, "sys")) { clk = devm_clk_get(dev, tdata->clk_name); } else { continue; } max_freq = clk_round_rate(clk, ULONG_MAX); clk_set_rate(clk, max_freq); } #endif } static int nv_pci_tegra_register_devfreq(struct pci_dev *pdev) { nv_linux_state_t *nvl = pci_get_drvdata(pdev); const struct nv_pci_tegra_data *tegra_data = NULL; int err; #if defined(NV_DEVFREQ_HAS_SUSPEND_FREQ) struct device_node *np = pdev->dev.of_node; NvU32 suspend_freq = 0; #endif tegra_data = nv_pci_get_tegra_igpu_data(pdev); if (tegra_data == NULL) { return 0; } nvl->devfreq_table = tegra_data->devfreq_table; nvl->devfreq_table_size = tegra_data->devfreq_table_size; nvl->devfreq_suspend = tegra_data->devfreq_suspend; nvl->devfreq_resume = tegra_data->devfreq_resume; nvl->devfreq_enable_boost = tegra_data->devfreq_enable_boost; nvl->devfreq_disable_boost = tegra_data->devfreq_disable_boost; #if defined(NV_DEVFREQ_HAS_SUSPEND_FREQ) of_property_read_u32(np, "nvidia,suspend-freq", &suspend_freq); if (suspend_freq == 0) { suspend_freq = NV_PCI_TEGRA_DEVFREQ_SUSPEND_FREQ; } nvl->tegra_suspend_freq = suspend_freq; #endif err = tegra_data->devfreq_register(pdev); if (err != 0) { nv_pci_tegra_unregister_devfreq(pdev); return err; } return 0; } #endif static void nv_init_dynamic_power_management ( nvidia_stack_t *sp, struct pci_dev *pci_dev ) { nv_linux_state_t *nvl = pci_get_drvdata(pci_dev); nv_state_t *nv = NV_STATE_PTR(nvl); NvBool pr3_acpi_method_present = NV_FALSE; nvl->sysfs_config_file = NULL; nv_get_pci_sysfs_config(pci_dev, nvl); if (nv_get_hypervisor_type() != OS_HYPERVISOR_UNKNOWN) { pr3_acpi_method_present = nv_acpi_power_resource_method_present(pci_dev); } else if (pci_dev->bus && pci_dev->bus->self) { pr3_acpi_method_present = nv_acpi_power_resource_method_present(pci_dev->bus->self); } rm_init_dynamic_power_management(sp, nv, pr3_acpi_method_present); } static void nv_init_tegra_gpu_pg_mask(nvidia_stack_t *sp, struct pci_dev *pci_dev) { nv_linux_state_t *nvl = pci_get_drvdata(pci_dev); nv_state_t *nv = NV_STATE_PTR(nvl); struct device_node *np = pci_dev->dev.of_node; u32 gpu_pg_mask = 0; /* Only continue with certain Tegra PCI iGPUs */ if (!nv->supports_tegra_igpu_rg) { return; } nv->tegra_pci_igpu_pg_mask = NV_TEGRA_PCI_IGPU_PG_MASK_DEFAULT; of_property_read_u32(np, "nvidia,fuse-overrides", &gpu_pg_mask); if (gpu_pg_mask != 0) { nv_printf(NV_DBG_INFO, "NVRM: nvidia,fuse-overrides parsed from device tree: 0x%x\n", gpu_pg_mask); nv->tegra_pci_igpu_pg_mask = gpu_pg_mask; } nv_set_gpu_pg_mask(nv); } static NvBool nv_pci_validate_bars(const struct pci_dev *pci_dev, NvBool only_bar0) { unsigned int i, j; NvBool last_bar_64bit = NV_FALSE; for (i = 0, j = 0; i < NVRM_PCICFG_NUM_BARS && j < NV_GPU_NUM_BARS; i++) { if (NV_PCI_RESOURCE_VALID(pci_dev, i)) { if ((NV_PCI_RESOURCE_FLAGS(pci_dev, i) & PCI_BASE_ADDRESS_MEM_TYPE_64) && (NV_PCI_RESOURCE_FLAGS(pci_dev, i) & PCI_BASE_ADDRESS_MEM_PREFETCH)) { last_bar_64bit = NV_TRUE; } // // If we are 
            // here, then we have found a valid BAR -- 32 or 64-bit.
            //
            j++;

            if (only_bar0)
                return NV_TRUE;

            continue;
        }

        //
        // If last_bar_64bit is "true" then we are looking at the 2nd (upper)
        // half of the 64-bit BAR. This is typically all 0s, which looks
        // invalid but is normal; we can ignore it and continue.
        //
        if (last_bar_64bit)
        {
            last_bar_64bit = NV_FALSE;
            continue;
        }

        // Invalid 32 or 64-bit BAR.
        nv_printf(NV_DBG_ERRORS,
            "NVRM: This PCI I/O region assigned to your NVIDIA device is invalid:\n"
            "NVRM: BAR%d is %" NvU64_fmtu "M @ 0x%" NvU64_fmtx " (PCI:%04x:%02x:%02x.%x)\n",
            i,
            (NvU64)(NV_PCI_RESOURCE_SIZE(pci_dev, i) >> 20),
            (NvU64)NV_PCI_RESOURCE_START(pci_dev, i),
            NV_PCI_DOMAIN_NUMBER(pci_dev), NV_PCI_BUS_NUMBER(pci_dev),
            NV_PCI_SLOT_NUMBER(pci_dev), PCI_FUNC(pci_dev->devfn));

        return NV_FALSE;
    }

    return NV_TRUE;
}

/* find nvidia devices and set initial state */
static int
nv_pci_probe
(
    struct pci_dev *pci_dev,
    const struct pci_device_id *id_table
)
{
    nv_state_t *nv = NULL;
    nv_linux_state_t *nvl = NULL;
    unsigned int i, j;
    int flags = 0;
    nvidia_stack_t *sp = NULL;
    NvBool prev_nv_ats_supported = nv_ats_supported;
    NV_STATUS status;
    NvU8 regs_bar_index = nv_bar_index_to_os_bar_index(pci_dev,
                                                       NV_GPU_BAR_INDEX_REGS);
    NvBool bar0_requested = NV_FALSE;

    nv_printf(NV_DBG_SETUP, "NVRM: probing 0x%x 0x%x, class 0x%x\n",
              pci_dev->vendor, pci_dev->device, pci_dev->class);

    if (nv_kmem_cache_alloc_stack(&sp) != 0)
    {
        return -1;
    }

#ifdef NV_PCI_SRIOV_SUPPORT
    if (pci_dev->is_virtfn)
    {
#if defined(NV_VGPU_KVM_BUILD)
#if defined(NV_BUS_TYPE_HAS_IOMMU_OPS)
        if (pci_dev->dev.bus->iommu_ops == NULL)
#else
        if ((pci_dev->dev.iommu != NULL) &&
            (pci_dev->dev.iommu->iommu_dev != NULL) &&
            (pci_dev->dev.iommu->iommu_dev->ops == NULL))
#endif
        {
            nv_printf(NV_DBG_ERRORS,
                "NVRM: Aborting probe for VF %04x:%02x:%02x.%x "
                "since IOMMU is not present on the system.\n",
                NV_PCI_DOMAIN_NUMBER(pci_dev), NV_PCI_BUS_NUMBER(pci_dev),
                NV_PCI_SLOT_NUMBER(pci_dev), PCI_FUNC(pci_dev->devfn));
            goto failed;
        }

        nv_kmem_cache_free_stack(sp);
        return 0;
#else
        nv_printf(NV_DBG_ERRORS,
            "NVRM: Ignoring probe for VF %04x:%02x:%02x.%x\n",
            NV_PCI_DOMAIN_NUMBER(pci_dev), NV_PCI_BUS_NUMBER(pci_dev),
            NV_PCI_SLOT_NUMBER(pci_dev), PCI_FUNC(pci_dev->devfn));
        goto failed;
#endif /* NV_VGPU_KVM_BUILD */
    }
#endif /* NV_PCI_SRIOV_SUPPORT */

    if (!rm_wait_for_bar_firewall(
            sp,
            NV_PCI_DOMAIN_NUMBER(pci_dev), NV_PCI_BUS_NUMBER(pci_dev),
            NV_PCI_SLOT_NUMBER(pci_dev), PCI_FUNC(pci_dev->devfn),
            pci_dev->device, pci_dev->subsystem_device))
    {
        nv_printf(NV_DBG_ERRORS,
                  "NVRM: failed to wait for bar firewall to lower\n");
        goto failed;
    }

    if (!rm_is_supported_pci_device(
                (pci_dev->class >> 16) & 0xFF,
                (pci_dev->class >> 8) & 0xFF,
                pci_dev->vendor,
                pci_dev->device,
                pci_dev->subsystem_vendor,
                pci_dev->subsystem_device,
                NV_FALSE /* print_legacy_warning */))
    {
        nv_printf(NV_DBG_ERRORS,
                  "NVRM: ignoring the legacy GPU %04x:%02x:%02x.%x\n",
                  NV_PCI_DOMAIN_NUMBER(pci_dev), NV_PCI_BUS_NUMBER(pci_dev),
                  NV_PCI_SLOT_NUMBER(pci_dev), PCI_FUNC(pci_dev->devfn));
        goto failed;
    }

    num_probed_nv_devices++;

    if (pci_enable_device(pci_dev) != 0)
    {
        nv_printf(NV_DBG_ERRORS,
                  "NVRM: pci_enable_device failed, aborting\n");
        goto failed;
    }

    if ((pci_dev->irq == 0 && !pci_find_capability(pci_dev, PCI_CAP_ID_MSIX))
        && nv_treat_missing_irq_as_error())
    {
        nv_printf(NV_DBG_ERRORS, "NVRM: Can't find an IRQ for your NVIDIA card!\n");
        nv_printf(NV_DBG_ERRORS, "NVRM: Please check your BIOS settings.\n");
        nv_printf(NV_DBG_ERRORS, "NVRM: [Plug & Play OS] should be set to NO\n");
        nv_printf(NV_DBG_ERRORS,
"NVRM: [Assign IRQ to VGA] should be set to YES \n"); goto failed; } // Validate if BAR0 is usable if (!nv_pci_validate_bars(pci_dev, /* only_bar0 = */ NV_TRUE)) goto failed; if (!request_mem_region(NV_PCI_RESOURCE_START(pci_dev, regs_bar_index), NV_PCI_RESOURCE_SIZE(pci_dev, regs_bar_index), nv_device_name)) { nv_printf(NV_DBG_ERRORS, "NVRM: request_mem_region failed for %" NvU64_fmtu "M @ 0x%" NvU64_fmtx ". This can\n" "NVRM: occur when a driver such as rivatv is loaded and claims\n" "NVRM: ownership of the device's registers.\n", (NvU64)(NV_PCI_RESOURCE_SIZE(pci_dev, regs_bar_index) >> 20), (NvU64)NV_PCI_RESOURCE_START(pci_dev, regs_bar_index)); goto failed; } bar0_requested = NV_TRUE; NV_KZALLOC(nvl, sizeof(nv_linux_state_t)); if (nvl == NULL) { nv_printf(NV_DBG_ERRORS, "NVRM: failed to allocate memory\n"); goto err_not_supported; } nv = NV_STATE_PTR(nvl); // Map BAR0 for (i = 0; i < NVRM_PCICFG_NUM_BARS; i++) { if ((NV_PCI_RESOURCE_VALID(pci_dev, i)) && (NV_PCI_RESOURCE_FLAGS(pci_dev, i) & PCI_BASE_ADDRESS_SPACE) == PCI_BASE_ADDRESS_SPACE_MEMORY) { nv->bars[NV_GPU_BAR_INDEX_REGS].offset = NVRM_PCICFG_BAR_OFFSET(i); nv->bars[NV_GPU_BAR_INDEX_REGS].cpu_address = NV_PCI_RESOURCE_START(pci_dev, i); nv->bars[NV_GPU_BAR_INDEX_REGS].size = NV_PCI_RESOURCE_SIZE(pci_dev, i); break; } } nv->regs = &nv->bars[NV_GPU_BAR_INDEX_REGS]; pci_set_drvdata(pci_dev, (void *)nvl); /* default to 32-bit PCI bus address space */ pci_dev->dma_mask = 0xffffffffULL; nvl->dev = &pci_dev->dev; nvl->pci_dev = pci_dev; nvl->dma_dev.dev = nvl->dev; nv->pci_info.vendor_id = pci_dev->vendor; nv->pci_info.device_id = pci_dev->device; nv->subsystem_id = pci_dev->subsystem_device; nv->subsystem_vendor = pci_dev->subsystem_vendor; nv->os_state = (void *) nvl; nv->dma_dev = &nvl->dma_dev; nv->pci_info.domain = NV_PCI_DOMAIN_NUMBER(pci_dev); nv->pci_info.bus = NV_PCI_BUS_NUMBER(pci_dev); nv->pci_info.slot = NV_PCI_SLOT_NUMBER(pci_dev); nv->handle = pci_dev; nv->flags |= flags; if (!nv_lock_init_locks(sp, nv)) { goto err_not_supported; } // Wire RM HAL if (!rm_init_private_state(sp, nv)) { NV_DEV_PRINTF(NV_DBG_ERRORS, nv, "rm_init_private_state() failed!\n"); goto err_zero_dev; } if (!nv->is_tegra_pci_igpu && !pci_devid_is_self_hosted(pci_dev->device) && !nv_pci_validate_bars(pci_dev, /* only_bar0 = */ NV_FALSE)) goto err_zero_dev; if (nv_resize_pcie_bars(pci_dev)) { nv_printf(NV_DBG_ERRORS, "NVRM: Fatal Error while attempting to resize PCIe BARs.\n"); goto err_zero_dev; } nvl->all_mappings_revoked = NV_TRUE; nvl->safe_to_mmap = NV_TRUE; nvl->gpu_wakeup_callback_needed = NV_TRUE; INIT_LIST_HEAD(&nvl->open_files); // Map BAR>=1 if (!nv->is_tegra_pci_igpu) { for (i = 0, j = 0; i < NVRM_PCICFG_NUM_BARS && j < NV_GPU_NUM_BARS; i++) { if (j == NV_GPU_BAR_INDEX_REGS) { j++; continue; } if ((NV_PCI_RESOURCE_VALID(pci_dev, i)) && (NV_PCI_RESOURCE_FLAGS(pci_dev, i) & PCI_BASE_ADDRESS_SPACE) == PCI_BASE_ADDRESS_SPACE_MEMORY) { nv->bars[j].offset = NVRM_PCICFG_BAR_OFFSET(i); nv->bars[j].cpu_address = NV_PCI_RESOURCE_START(pci_dev, i); nv->bars[j].size = NV_PCI_RESOURCE_SIZE(pci_dev, i); j++; } } } // Assign empty BAR1 if absent nv->fb = &nv->bars[NV_GPU_BAR_INDEX_FB]; nv->interrupt_line = pci_dev->irq; NV_ATOMIC_SET(nvl->numa_info.status, NV_IOCTL_NUMA_STATUS_DISABLED); nvl->numa_info.node_id = NUMA_NO_NODE; #if NV_IS_EXPORT_SYMBOL_GPL_pci_ats_supported nv->ats_support = pci_ats_supported(nvl->pci_dev); #else nv->ats_support = nvl->pci_dev->ats_enabled; #endif if (nv->ats_support) { int ret __attribute__ ((unused)); 
NV_DEV_PRINTF(NV_DBG_INFO, nv, "ATS supported by this GPU!\n"); #if defined(CONFIG_IOMMU_SVA) && \ (defined(NV_IOASID_GET_PRESENT) || defined(NV_MM_PASID_DROP_PRESENT)) ret = iommu_dev_enable_feature(nvl->dev, IOMMU_DEV_FEAT_SVA); if (ret == 0) { NV_DEV_PRINTF(NV_DBG_INFO, nv, "Enabled SMMU SVA feature! \n"); } else if (ret == -EBUSY) { NV_DEV_PRINTF(NV_DBG_INFO, nv, "SMMU SVA feature already enabled!\n"); } else { NV_DEV_PRINTF(NV_DBG_ERRORS, nv, "Enabling SMMU SVA feature failed! ret: %d\n", ret); nv->ats_support = NV_FALSE; } #else NV_DEV_PRINTF(NV_DBG_ERRORS, nv, "Enabling SMMU SVA feature failed due to lack of necessary kernel configs.\n"); nv->ats_support = NV_FALSE; #endif } if (pci_devid_is_self_hosted(pci_dev->device)) { nv_init_coherent_link_info(nv); } nv_ats_supported |= nv->ats_support; nv_clk_get_handles(nv); pci_set_master(pci_dev); #if defined(CONFIG_VGA_ARB) #if defined(VGA_DEFAULT_DEVICE) #if defined(NV_VGA_TRYGET_PRESENT) vga_tryget(VGA_DEFAULT_DEVICE, VGA_RSRC_LEGACY_MASK); #endif #endif vga_set_legacy_decoding(pci_dev, VGA_RSRC_NONE); #endif status = nv_check_gpu_state(nv); if (status == NV_ERR_GPU_IS_LOST) { NV_DEV_PRINTF(NV_DBG_INFO, nv, "GPU is lost, skipping nv_pci_probe\n"); goto err_not_supported; } if ((rm_is_supported_device(sp, nv)) != NV_OK) goto err_not_supported; nv->cpu_numa_node_id = dev_to_node(nvl->dev); if (nv_linux_init_open_q(nvl) != 0) { NV_DEV_PRINTF(NV_DBG_ERRORS, nv, "nv_linux_init_open_q() failed!\n"); goto err_zero_dev; } nv_printf(NV_DBG_INFO, "NVRM: PCI:%04x:%02x:%02x.%x (%04x:%04x): BAR0 @ 0x%llx (%lluMB)\n", nv->pci_info.domain, nv->pci_info.bus, nv->pci_info.slot, PCI_FUNC(pci_dev->devfn), nv->pci_info.vendor_id, nv->pci_info.device_id, nv->regs->cpu_address, (nv->regs->size >> 20)); nv_printf(NV_DBG_INFO, "NVRM: PCI:%04x:%02x:%02x.%x (%04x:%04x): BAR1 @ 0x%llx (%lluMB)\n", nv->pci_info.domain, nv->pci_info.bus, nv->pci_info.slot, PCI_FUNC(pci_dev->devfn), nv->pci_info.vendor_id, nv->pci_info.device_id, nv->fb->cpu_address, (nv->fb->size >> 20)); num_nv_devices++; /* * The newly created nvl object is added to the nv_linux_devices global list * only after all the initialization operations for that nvl object are * completed, so as to protect against simultaneous lookup operations which * may discover a partially initialized nvl object in the list */ LOCK_NV_LINUX_DEVICES(); if (nv_linux_add_device_locked(nvl) != 0) { UNLOCK_NV_LINUX_DEVICES(); goto err_add_device; } UNLOCK_NV_LINUX_DEVICES(); pm_vt_switch_required(nvl->dev, NV_TRUE); #if defined(CONFIG_PM_DEVFREQ) // Support dynamic power management if device is a tegra PCI iGPU rm_init_tegra_dynamic_power_management(sp, nv); #endif nv_init_dynamic_power_management(sp, pci_dev); nv_init_tegra_gpu_pg_mask(sp, pci_dev); nv_procfs_add_gpu(nvl); /* Parse and set any per-GPU registry keys specified. */ nv_parse_per_device_option_string(sp); rm_set_rm_firmware_requested(sp, nv); #if defined(NV_VGPU_KVM_BUILD) if (nvidia_vgpu_vfio_probe(nvl->pci_dev) != NV_OK) { NV_DEV_PRINTF(NV_DBG_ERRORS, nv, "Failed to register device to vGPU VFIO module"); goto err_free_all; } #endif nv_check_and_exclude_gpu(sp, nv); #if defined(DPM_FLAG_NO_DIRECT_COMPLETE) dev_pm_set_driver_flags(nvl->dev, DPM_FLAG_NO_DIRECT_COMPLETE); #elif defined(DPM_FLAG_NEVER_SKIP) dev_pm_set_driver_flags(nvl->dev, DPM_FLAG_NEVER_SKIP); #endif #if defined(CONFIG_PM_DEVFREQ) /* * Expose clock control interface via devfreq framework for Tegra iGPU and * let the linux kernel itself to support all the clock scaling logic for * Tegra iGPU. 
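 * (devfreq periodically samples device load via the get_dev_status callback,
 * picks an operating point through the active governor, and applies it with
 * clk_set_rate() in nv_pci_gb10b_devfreq_target().)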
 *
 * On Tegra platforms, most of the clocks are managed by the BPMP. The PMU
 * inside the iGPU has no direct communication path to the BPMP, so the
 * existing clock management features (e.g., Pstates, PerfCf, etc.) that rely
 * on the PMU will not work.
 */
    if (nv_pci_tegra_register_devfreq(pci_dev) != 0)
    {
        NV_DEV_PRINTF(NV_DBG_ERRORS, nv, "Failed to register linux devfreq\n");
        goto err_free_all;
    }
#endif

    /*
     * Dynamic power management should be enabled as the last step.
     * The kernel runtime power management framework can put the device
     * into the suspended state. Hardware register access should not be done
     * after enabling dynamic power management.
     */
    rm_enable_dynamic_power_management(sp, nv);

    /*
     * This must be the last action in nv_pci_probe(). Do not add code after this line.
     */
    rm_notify_gpu_addition(sp, nv);

    nv_kmem_cache_free_stack(sp);

    nvidia_modeset_probe(nvl);

    return 0;

err_free_all:
    nv_procfs_remove_gpu(nvl);
    rm_cleanup_dynamic_power_management(sp, nv);
    pm_vt_switch_unregister(nvl->dev);

    LOCK_NV_LINUX_DEVICES();
    nv_linux_remove_device_locked(nvl);
    UNLOCK_NV_LINUX_DEVICES();

err_add_device:
    nv_linux_stop_open_q(nvl);

err_zero_dev:
    rm_free_private_state(sp, nv);

err_not_supported:
    nv_clk_clear_handles(nv);
    nv_ats_supported = prev_nv_ats_supported;
    nv_lock_destroy_locks(sp, nv);

failed:
    if (bar0_requested)
    {
        release_mem_region(NV_PCI_RESOURCE_START(pci_dev, regs_bar_index),
                           NV_PCI_RESOURCE_SIZE(pci_dev, regs_bar_index));
    }
    NV_PCI_DISABLE_DEVICE(pci_dev);
    pci_set_drvdata(pci_dev, NULL);
    if (nvl != NULL)
    {
        NV_KFREE(nvl, sizeof(nv_linux_state_t));
    }
    nv_kmem_cache_free_stack(sp);
    return -1;
}

static void
nv_pci_remove_helper(struct pci_dev *pci_dev, bool block_if_gpu_in_use)
{
    nv_linux_state_t *nvl = NULL;
    nv_state_t *nv;
    nvidia_stack_t *sp = NULL;
    NvU8 regs_bar_index = nv_bar_index_to_os_bar_index(pci_dev,
                                                       NV_GPU_BAR_INDEX_REGS);

#ifdef NV_PCI_SRIOV_SUPPORT
    if (pci_dev->is_virtfn)
    {
#if defined(NV_VGPU_KVM_BUILD)
        /* Arg 2 == NV_TRUE means that the PCI device should be removed */
        nvidia_vgpu_vfio_remove(pci_dev, NV_TRUE);
#endif /* NV_VGPU_KVM_BUILD */
        return;
    }
#endif /* NV_PCI_SRIOV_SUPPORT */

    nvl = pci_get_drvdata(pci_dev);
    if (!nvl || (nvl->pci_dev != pci_dev))
    {
        return;
    }

    nv = NV_STATE_PTR(nvl);

#if defined(CONFIG_IOMMU_SVA) && \
    (defined(NV_IOASID_GET_PRESENT) || defined(NV_MM_PASID_DROP_PRESENT))
    if (nv->ats_support)
    {
        int ret;

        ret = iommu_dev_disable_feature(nvl->dev, IOMMU_DEV_FEAT_SVA);
        if (ret == 0)
        {
            NV_DEV_PRINTF(NV_DBG_INFO, nv, "Disabled SMMU SVA feature!\n");
        }
        else
        {
            NV_DEV_PRINTF(NV_DBG_ERRORS, nv,
                          "Disabling SMMU SVA feature failed! ret: %d\n", ret);
        }
    }
#endif

    /*
     * Flush and stop open_q before proceeding with removal to ensure nvl
     * outlives all enqueued work items.
     */
    nv_linux_stop_open_q(nvl);

    nvidia_modeset_remove(nv->gpu_id);

    LOCK_NV_LINUX_DEVICES();
    down(&nvl->ldata_lock);

    if (!block_if_gpu_in_use && (NV_ATOMIC_READ(nvl->usage_count) != 0))
    {
        up(&nvl->ldata_lock);
        UNLOCK_NV_LINUX_DEVICES();
        return;
    }

    if (nv_kmem_cache_alloc_stack(&sp) != 0)
    {
        up(&nvl->ldata_lock);
        UNLOCK_NV_LINUX_DEVICES();
        return;
    }

    nv->flags |= NV_FLAG_PCI_REMOVE_IN_PROGRESS;

    rm_notify_gpu_removal(sp, nv);

    /*
     * Sanity check: a removed device shouldn't have a non-zero usage_count.
     * For an eGPU, falling off the bus while clients are still active is a
     * valid scenario, so the sanity check is skipped for eGPU.
*/ if ((NV_ATOMIC_READ(nvl->usage_count) != 0) && !(nv->is_external_gpu)) { nv_printf(NV_DBG_ERRORS, "NVRM: Attempting to remove device %04x:%02x:%02x.%x with non-zero usage count!\n", NV_PCI_DOMAIN_NUMBER(pci_dev), NV_PCI_BUS_NUMBER(pci_dev), NV_PCI_SLOT_NUMBER(pci_dev), PCI_FUNC(pci_dev->devfn)); /* * We can't return from this function without corrupting state, so we wait for * the usage count to go to zero. */ while (NV_ATOMIC_READ(nvl->usage_count) != 0) { /* * While waiting, release the locks so that other threads can make * forward progress. */ up(&nvl->ldata_lock); UNLOCK_NV_LINUX_DEVICES(); os_delay(500); /* Re-acquire the locks before checking again */ LOCK_NV_LINUX_DEVICES(); nvl = pci_get_drvdata(pci_dev); if (!nvl) { /* The device was not found, which should not happen */ nv_printf(NV_DBG_ERRORS, "NVRM: Failed removal of device %04x:%02x:%02x.%x!\n", NV_PCI_DOMAIN_NUMBER(pci_dev), NV_PCI_BUS_NUMBER(pci_dev), NV_PCI_SLOT_NUMBER(pci_dev), PCI_FUNC(pci_dev->devfn)); WARN_ON(1); goto done; } nv = NV_STATE_PTR(nvl); down(&nvl->ldata_lock); } nv_printf(NV_DBG_ERRORS, "NVRM: Continuing with GPU removal for device %04x:%02x:%02x.%x\n", NV_PCI_DOMAIN_NUMBER(pci_dev), NV_PCI_BUS_NUMBER(pci_dev), NV_PCI_SLOT_NUMBER(pci_dev), PCI_FUNC(pci_dev->devfn)); } rm_check_for_gpu_surprise_removal(sp, nv); nv_linux_remove_device_locked(nvl); /* Remove proc entry for this GPU */ nv_procfs_remove_gpu(nvl); #if defined(CONFIG_PM_DEVFREQ) nv_pci_tegra_unregister_devfreq(pci_dev); #endif nv_clk_clear_handles(nv); rm_cleanup_dynamic_power_management(sp, nv); nv->removed = NV_TRUE; UNLOCK_NV_LINUX_DEVICES(); pm_vt_switch_unregister(&pci_dev->dev); #if defined(NV_VGPU_KVM_BUILD) /* Arg 2 == NV_TRUE means that the PCI device should be removed */ nvidia_vgpu_vfio_remove(pci_dev, NV_TRUE); #endif if ((nv->flags & NV_FLAG_PERSISTENT_SW_STATE) || (nv->flags & NV_FLAG_OPEN)) { nv_acpi_unregister_notifier(nvl); if (nv->flags & NV_FLAG_PERSISTENT_SW_STATE) { rm_disable_gpu_state_persistence(sp, nv); } nv_shutdown_adapter(sp, nv, nvl); nv_dev_free_stacks(nvl); } if (nvl->sysfs_config_file != NULL) { filp_close(nvl->sysfs_config_file, NULL); nvl->sysfs_config_file = NULL; } if (NV_ATOMIC_READ(nvl->usage_count) == 0) { nv_lock_destroy_locks(sp, nv); } num_probed_nv_devices--; pci_set_drvdata(pci_dev, NULL); rm_i2c_remove_adapters(sp, nv); rm_free_private_state(sp, nv); release_mem_region(NV_PCI_RESOURCE_START(pci_dev, regs_bar_index), NV_PCI_RESOURCE_SIZE(pci_dev, regs_bar_index)); num_nv_devices--; if (NV_ATOMIC_READ(nvl->usage_count) == 0) { NV_PCI_DISABLE_DEVICE(pci_dev); NV_KFREE(nvl, sizeof(nv_linux_state_t)); } else { up(&nvl->ldata_lock); } nv_kmem_cache_free_stack(sp); return; done: UNLOCK_NV_LINUX_DEVICES(); nv_kmem_cache_free_stack(sp); } static void nv_pci_remove(struct pci_dev *pci_dev) { nv_printf(NV_DBG_SETUP, "NVRM: removing GPU %04x:%02x:%02x.%x\n", NV_PCI_DOMAIN_NUMBER(pci_dev), NV_PCI_BUS_NUMBER(pci_dev), NV_PCI_SLOT_NUMBER(pci_dev), PCI_FUNC(pci_dev->devfn)); nv_pci_remove_helper(pci_dev, true); } static void nv_pci_shutdown(struct pci_dev *pci_dev) { nv_linux_state_t *nvl = pci_get_drvdata(pci_dev); if (!nvl || (nvl->pci_dev != pci_dev)) { return; } if (nvl->is_forced_shutdown) { nvl->is_forced_shutdown = NV_FALSE; return; } #if defined(CONFIG_PM_DEVFREQ) nv_pci_tegra_unregister_devfreq(pci_dev); #endif nv_pci_remove_helper(pci_dev, false); if (pci_get_drvdata(pci_dev) != NULL) { nvl->nv_state.is_shutdown = NV_TRUE; } /* pci_clear_master is not defined for !CONFIG_PCI */ #ifdef CONFIG_PCI 
static void
nv_pci_shutdown(struct pci_dev *pci_dev)
{
    nv_linux_state_t *nvl = pci_get_drvdata(pci_dev);

    if (!nvl || (nvl->pci_dev != pci_dev))
    {
        return;
    }

    if (nvl->is_forced_shutdown)
    {
        nvl->is_forced_shutdown = NV_FALSE;
        return;
    }

#if defined(CONFIG_PM_DEVFREQ)
    nv_pci_tegra_unregister_devfreq(pci_dev);
#endif

    nv_pci_remove_helper(pci_dev, false);

    if (pci_get_drvdata(pci_dev) != NULL)
    {
        nvl->nv_state.is_shutdown = NV_TRUE;
    }

    /* pci_clear_master is not defined for !CONFIG_PCI */
#ifdef CONFIG_PCI
    pci_clear_master(pci_dev);
#endif

    /*
     * SHH HW mandates a 1us delay for the effects of disabling Bus Master
     * Enable (BME) to take hold. Apply the 1us delay on all chips, since the
     * delay is not in the data path and is small; creating a HAL for this
     * would be overkill.
     */
    udelay(1);
}

/*!
 * @brief This function accepts PCI information corresponding to a GPU and
 * returns a reference to the nv_linux_state_t corresponding to that GPU.
 *
 * @param[in] domain   PCI domain number for the GPU to be found.
 * @param[in] bus      PCI bus number for the GPU to be found.
 * @param[in] slot     PCI slot number for the GPU to be found.
 * @param[in] function PCI function number for the GPU to be found.
 *
 * @return Pointer to the nv_linux_state_t for the GPU if it is found,
 *         or NULL otherwise.
 */
nv_linux_state_t * find_pci(NvU32 domain, NvU8 bus, NvU8 slot, NvU8 function)
{
    nv_linux_state_t *nvl = NULL;

    LOCK_NV_LINUX_DEVICES();

    for (nvl = nv_linux_devices; nvl != NULL; nvl = nvl->next)
    {
        nv_state_t *nv = NV_STATE_PTR(nvl);

        if (nv->pci_info.domain == domain &&
            nv->pci_info.bus == bus &&
            nv->pci_info.slot == slot &&
            nv->pci_info.function == function)
        {
            break;
        }
    }

    UNLOCK_NV_LINUX_DEVICES();
    return nvl;
}

int nvidia_dev_get_pci_info(const NvU8 *uuid, struct pci_dev **pci_dev_out,
                            NvU64 *dma_start, NvU64 *dma_limit)
{
    nv_linux_state_t *nvl;

    /* Takes nvl->ldata_lock */
    nvl = find_uuid(uuid);
    if (!nvl)
        return -ENODEV;

    *pci_dev_out = nvl->pci_dev;
    *dma_start = nvl->dma_dev.addressable_range.start;
    *dma_limit = nvl->dma_dev.addressable_range.limit;

    up(&nvl->ldata_lock);

    return 0;
}

NvU8 nv_find_pci_capability(struct pci_dev *pci_dev, NvU8 capability)
{
    u16 status = 0;
    u8  cap_ptr = 0, cap_id = 0xff;

    pci_read_config_word(pci_dev, PCI_STATUS, &status);
    status &= PCI_STATUS_CAP_LIST;
    if (!status)
        return 0;

    switch (pci_dev->hdr_type)
    {
        case PCI_HEADER_TYPE_NORMAL:
        case PCI_HEADER_TYPE_BRIDGE:
            pci_read_config_byte(pci_dev, PCI_CAPABILITY_LIST, &cap_ptr);
            break;
        default:
            return 0;
    }

    do
    {
        cap_ptr &= 0xfc;
        pci_read_config_byte(pci_dev, cap_ptr + PCI_CAP_LIST_ID, &cap_id);
        if (cap_id == capability)
            return cap_ptr;
        pci_read_config_byte(pci_dev, cap_ptr + PCI_CAP_LIST_NEXT, &cap_ptr);
    } while (cap_ptr && cap_id != 0xff);

    return 0;
}

static void check_for_bound_driver(struct pci_dev *pci_dev)
{
    if (pci_dev->dev.driver)
    {
        const char *driver_name = pci_dev->dev.driver->name;

        nv_printf(NV_DBG_WARNINGS,
                  "NVRM: GPU %04x:%02x:%02x.%x is already bound to %s.\n",
                  NV_PCI_DOMAIN_NUMBER(pci_dev), NV_PCI_BUS_NUMBER(pci_dev),
                  NV_PCI_SLOT_NUMBER(pci_dev), PCI_FUNC(pci_dev->devfn),
                  driver_name ? driver_name : "another driver");
    }
}
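/*
 * Example (illustrative only): nv_find_pci_capability() walks the standard
 * PCI capability list rooted at PCI_CAPABILITY_LIST, so the config-space
 * offset of, say, the power-management capability could be obtained with:
 *
 *     NvU8 pm_cap = nv_find_pci_capability(pci_dev, PCI_CAP_ID_PM);
 *     if (pm_cap != 0)
 *     {
 *         // pm_cap is the offset of the PM capability structure
 *     }
 *
 * This mirrors what the kernel's pci_find_capability() does, implemented
 * here directly on the raw configuration space via pci_read_config_byte().
 */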
/*
 * Make sure the pci_driver called probe for all of our devices. We've seen
 * cases where rivafb claims the device first and our driver doesn't get
 * called.
 */
int nv_pci_count_devices(void)
{
    struct pci_dev *pci_dev;
    int count = 0;

    if (NVreg_RegisterPCIDriver == 0)
    {
        return 0;
    }

    pci_dev = pci_get_class(PCI_CLASS_DISPLAY_VGA << 8, NULL);
    while (pci_dev)
    {
        if (rm_is_supported_pci_device(
                PCI_BASE_CLASS_DISPLAY,
                PCI_CLASS_DISPLAY_VGA & 0xFF,
                pci_dev->vendor,
                pci_dev->device,
                pci_dev->subsystem_vendor,
                pci_dev->subsystem_device,
                NV_TRUE /* print_legacy_warning */))
        {
            check_for_bound_driver(pci_dev);
            count++;
        }
        pci_dev = pci_get_class(PCI_CLASS_DISPLAY_VGA << 8, pci_dev);
    }

    pci_dev = pci_get_class(PCI_CLASS_DISPLAY_3D << 8, NULL);
    while (pci_dev)
    {
        if (rm_is_supported_pci_device(
                (pci_dev->class >> 16) & 0xFF,
                (pci_dev->class >> 8) & 0xFF,
                pci_dev->vendor,
                pci_dev->device,
                pci_dev->subsystem_vendor,
                pci_dev->subsystem_device,
                NV_TRUE /* print_legacy_warning */))
        {
            check_for_bound_driver(pci_dev);
            count++;
        }
        pci_dev = pci_get_class(PCI_CLASS_DISPLAY_3D << 8, pci_dev);
    }

    return count;
}

/*
 * On coherent platforms that support BAR1 mappings for GPUDirect RDMA, the
 * dma-buf and nv-p2p subsystems need to ensure the two devices belong to
 * the same IOMMU group.
 */
NvBool nv_pci_is_valid_topology_for_direct_pci(nv_state_t *nv,
                                               struct pci_dev *peer)
{
    struct pci_dev *pdev0 = to_pci_dev(nv->dma_dev->dev);
    struct pci_dev *pdev1 = peer;

    if (!nv->coherent)
    {
        return NV_FALSE;
    }

    switch (NVreg_GrdmaPciTopoCheckOverride)
    {
        case NV_REG_GRDMA_PCI_TOPO_CHECK_OVERRIDE_ALLOW_ACCESS:
            return NV_TRUE;
        case NV_REG_GRDMA_PCI_TOPO_CHECK_OVERRIDE_DENY_ACCESS:
            return NV_FALSE;
        default:
            return (pdev0->dev.iommu_group == pdev1->dev.iommu_group);
    }
}

NvBool nv_pci_has_common_pci_switch(nv_state_t *nv, struct pci_dev *peer)
{
    struct pci_dev *pci_dev0, *pci_dev1;

    /*
     * Walk every upstream bridge of the GPU against every upstream bridge
     * of the peer, looking for a shared ancestor.
     */
    pci_dev0 = pci_upstream_bridge(to_pci_dev(nv->dma_dev->dev));
    while (pci_dev0 != NULL)
    {
        pci_dev1 = pci_upstream_bridge(peer);
        while (pci_dev1 != NULL)
        {
            if (pci_dev0 == pci_dev1)
                return NV_TRUE;
            pci_dev1 = pci_upstream_bridge(pci_dev1);
        }
        pci_dev0 = pci_upstream_bridge(pci_dev0);
    }

    return NV_FALSE;
}

NvBool NV_API_CALL nv_grdma_pci_topology_supported(nv_state_t *nv,
                                                   nv_dma_device_t *dma_peer)
{
    //
    // Skip the topology check on coherent platforms, since the NIC can map
    // over C2C anyway and the PCIe topology shouldn't matter.
    //
    if (nv->coherent)
    {
        return NV_TRUE;
    }

    switch (NVreg_GrdmaPciTopoCheckOverride)
    {
        case NV_REG_GRDMA_PCI_TOPO_CHECK_OVERRIDE_ALLOW_ACCESS:
            return NV_TRUE;
        case NV_REG_GRDMA_PCI_TOPO_CHECK_OVERRIDE_DENY_ACCESS:
            return NV_FALSE;
        default:
            break;
    }

    // Allow RDMA by default on passthrough VMs.
    if ((nv->flags & NV_FLAG_PASSTHRU) != 0)
        return NV_TRUE;

    //
    // Only allow RDMA on unsupported chipsets if there exists a common PCI
    // switch between the GPU and the other device.
    //
    if ((nv->flags & NV_FLAG_PCI_P2P_UNSUPPORTED_CHIPSET) != 0)
        return nv_pci_has_common_pci_switch(nv, to_pci_dev(dma_peer->dev));

    return NV_TRUE;
}

#if defined(CONFIG_PM)
extern struct dev_pm_ops nv_pm_ops;
#endif

struct pci_driver nv_pci_driver = {
    .name     = MODULE_NAME,
    .id_table = nv_pci_table,
    .probe    = nv_pci_probe,
    .remove   = nv_pci_remove,
    .shutdown = nv_pci_shutdown,
#if defined(NV_USE_VFIO_PCI_CORE) && \
    defined(NV_PCI_DRIVER_HAS_DRIVER_MANAGED_DMA)
    .driver_managed_dma = NV_TRUE,
#endif
#if defined(CONFIG_PM)
    .driver.pm = &nv_pm_ops,
#endif
};

void nv_pci_unregister_driver(void)
{
    if (NVreg_RegisterPCIDriver == 0)
    {
        return;
    }
    pci_unregister_driver(&nv_pci_driver);
}

int nv_pci_register_driver(void)
{
    if (NVreg_RegisterPCIDriver == 0)
    {
        return 0;
    }
    return pci_register_driver(&nv_pci_driver);
}
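/*
 * Usage note (illustrative only): module init/exit code would pair these
 * helpers, e.g.:
 *
 *     if (nv_pci_register_driver() < 0)
 *         return -ENODEV;
 *     ...
 *     nv_pci_unregister_driver();
 *
 * Both helpers are no-ops when the NVreg_RegisterPCIDriver regkey is 0; in
 * that configuration nv_pci_count_devices() likewise reports zero devices.
 */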