
Commit aac6db7

vfio/pci: Use unmap_mapping_range()
With the vfio device fd tied to the address space of the pseudo fs inode, we can use the mm to track all vmas that might be mmap'ing device BARs, which removes our vma_list and all the complicated lock ordering necessary to manually zap each related vma.

Note that we can no longer store the pfn in vm_pgoff if we want to use unmap_mapping_range() to zap a selective portion of the device fd corresponding to BAR mappings.

This also converts our mmap fault handler to use vmf_insert_pfn() because we no longer have a vma_list to avoid the concurrency problem with io_remap_pfn_range(). The goal is to eventually use the vm_ops huge_fault handler to avoid the additional faulting overhead, but vmf_insert_pfn_{pmd,pud}() need to learn about pfnmaps first.

Also, Jason notes that a race exists between unmap_mapping_range() and the fops mmap callback if we were to call io_remap_pfn_range() to populate the vma on mmap. Specifically, mmap_region() does call_mmap() before it does vma_link_file() which gives a window where the vma is populated but invisible to unmap_mapping_range().

Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Link: https://lore.kernel.org/r/20240530045236.1005864-3-alex.williamson@redhat.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
1 parent b7c5e64 commit aac6db7

2 files changed: +55 −211 lines

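Before diving into the diff, the shape of the new scheme in isolation: every vma that mmaps the device fd shares the inode's address_space, so a single unmap_mapping_range() call on a byte range of the fd tears down the BAR mappings in every process, and the fault handler rebuilds PTEs one page at a time with vmf_insert_pfn(). For this to work, vm_pgoff must keep holding the user's mmap offset (in pages) rather than a raw pfn, because unmap_mapping_range() locates vmas through the pgoff-indexed interval tree of that address_space. The sketch below is not the patch itself; the toy_* names and fields are hypothetical, and the real code follows in the hunks.

/* Hypothetical sketch of the pattern; the real code is in vfio_pci_core.c below. */
static void toy_zap_bars(struct toy_device *tdev)
{
        /*
         * Remove the PTEs of every vma, in every mm, covering this byte
         * range of the device fd.  No driver-side vma list is needed:
         * all such vmas hang off tdev->inode->i_mapping.
         */
        unmap_mapping_range(tdev->inode->i_mapping,
                            TOY_BAR_RANGE_START, TOY_BAR_RANGE_LEN, true);
}

static vm_fault_t toy_fault(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        struct toy_device *tdev = vma->vm_private_data;
        vm_fault_t ret = VM_FAULT_SIGBUS;

        down_read(&tdev->memory_lock);
        if (toy_memory_enabled(tdev))
                /* Re-insert a single PTE; later faults fill the other pages. */
                ret = vmf_insert_pfn(vma, vmf->address,
                                     toy_vma_to_pfn(vma) +
                                     vmf->pgoff - vma->vm_pgoff);
        up_read(&tdev->memory_lock);

        return ret;
}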

drivers/vfio/pci/vfio_pci_core.c

Lines changed: 55 additions & 209 deletions
@@ -1610,100 +1610,20 @@ ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *bu
 }
 EXPORT_SYMBOL_GPL(vfio_pci_core_write);
 
-/* Return 1 on zap and vma_lock acquired, 0 on contention (only with @try) */
-static int vfio_pci_zap_and_vma_lock(struct vfio_pci_core_device *vdev, bool try)
+static void vfio_pci_zap_bars(struct vfio_pci_core_device *vdev)
 {
-        struct vfio_pci_mmap_vma *mmap_vma, *tmp;
+        struct vfio_device *core_vdev = &vdev->vdev;
+        loff_t start = VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_BAR0_REGION_INDEX);
+        loff_t end = VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_ROM_REGION_INDEX);
+        loff_t len = end - start;
 
-        /*
-         * Lock ordering:
-         * vma_lock is nested under mmap_lock for vm_ops callback paths.
-         * The memory_lock semaphore is used by both code paths calling
-         * into this function to zap vmas and the vm_ops.fault callback
-         * to protect the memory enable state of the device.
-         *
-         * When zapping vmas we need to maintain the mmap_lock => vma_lock
-         * ordering, which requires using vma_lock to walk vma_list to
-         * acquire an mm, then dropping vma_lock to get the mmap_lock and
-         * reacquiring vma_lock. This logic is derived from similar
-         * requirements in uverbs_user_mmap_disassociate().
-         *
-         * mmap_lock must always be the top-level lock when it is taken.
-         * Therefore we can only hold the memory_lock write lock when
-         * vma_list is empty, as we'd need to take mmap_lock to clear
-         * entries. vma_list can only be guaranteed empty when holding
-         * vma_lock, thus memory_lock is nested under vma_lock.
-         *
-         * This enables the vm_ops.fault callback to acquire vma_lock,
-         * followed by memory_lock read lock, while already holding
-         * mmap_lock without risk of deadlock.
-         */
-        while (1) {
-                struct mm_struct *mm = NULL;
-
-                if (try) {
-                        if (!mutex_trylock(&vdev->vma_lock))
-                                return 0;
-                } else {
-                        mutex_lock(&vdev->vma_lock);
-                }
-                while (!list_empty(&vdev->vma_list)) {
-                        mmap_vma = list_first_entry(&vdev->vma_list,
-                                                    struct vfio_pci_mmap_vma,
-                                                    vma_next);
-                        mm = mmap_vma->vma->vm_mm;
-                        if (mmget_not_zero(mm))
-                                break;
-
-                        list_del(&mmap_vma->vma_next);
-                        kfree(mmap_vma);
-                        mm = NULL;
-                }
-                if (!mm)
-                        return 1;
-                mutex_unlock(&vdev->vma_lock);
-
-                if (try) {
-                        if (!mmap_read_trylock(mm)) {
-                                mmput(mm);
-                                return 0;
-                        }
-                } else {
-                        mmap_read_lock(mm);
-                }
-                if (try) {
-                        if (!mutex_trylock(&vdev->vma_lock)) {
-                                mmap_read_unlock(mm);
-                                mmput(mm);
-                                return 0;
-                        }
-                } else {
-                        mutex_lock(&vdev->vma_lock);
-                }
-                list_for_each_entry_safe(mmap_vma, tmp,
-                                         &vdev->vma_list, vma_next) {
-                        struct vm_area_struct *vma = mmap_vma->vma;
-
-                        if (vma->vm_mm != mm)
-                                continue;
-
-                        list_del(&mmap_vma->vma_next);
-                        kfree(mmap_vma);
-
-                        zap_vma_ptes(vma, vma->vm_start,
-                                     vma->vm_end - vma->vm_start);
-                }
-                mutex_unlock(&vdev->vma_lock);
-                mmap_read_unlock(mm);
-                mmput(mm);
-        }
+        unmap_mapping_range(core_vdev->inode->i_mapping, start, len, true);
 }
 
 void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_core_device *vdev)
 {
-        vfio_pci_zap_and_vma_lock(vdev, false);
         down_write(&vdev->memory_lock);
-        mutex_unlock(&vdev->vma_lock);
+        vfio_pci_zap_bars(vdev);
 }
 
 u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_core_device *vdev)
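The start/len used by vfio_pci_zap_bars() above fall out of the fixed layout of the vfio-pci device fd: each region index sits at a 2^VFIO_PCI_OFFSET_SHIFT-byte stride (the shift is 40 in the existing headers), BAR0 through BAR5 occupy indexes 0-5 and the ROM is index 6, so zapping [BAR0 offset, ROM offset) covers exactly the offsets at which a BAR can be mmap'd. A sketch of the arithmetic, using macros already defined in vfio_pci_core.h and the uAPI header (not introduced by this patch):

/* Existing definitions (include/linux/vfio_pci_core.h, include/uapi/linux/vfio.h): */
#define VFIO_PCI_OFFSET_SHIFT           40
#define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT)
/* VFIO_PCI_BAR0_REGION_INDEX == 0, ..., VFIO_PCI_ROM_REGION_INDEX == 6 */

loff_t start = VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_BAR0_REGION_INDEX); /* 0       */
loff_t end   = VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_ROM_REGION_INDEX);  /* 6 << 40 */
loff_t len   = end - start;  /* every offset a BAR mapping can occupy */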
@@ -1725,99 +1645,41 @@ void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev, u16 c
         up_write(&vdev->memory_lock);
 }
 
-/* Caller holds vma_lock */
-static int __vfio_pci_add_vma(struct vfio_pci_core_device *vdev,
-                              struct vm_area_struct *vma)
-{
-        struct vfio_pci_mmap_vma *mmap_vma;
-
-        mmap_vma = kmalloc(sizeof(*mmap_vma), GFP_KERNEL_ACCOUNT);
-        if (!mmap_vma)
-                return -ENOMEM;
-
-        mmap_vma->vma = vma;
-        list_add(&mmap_vma->vma_next, &vdev->vma_list);
-
-        return 0;
-}
-
-/*
- * Zap mmaps on open so that we can fault them in on access and therefore
- * our vma_list only tracks mappings accessed since last zap.
- */
-static void vfio_pci_mmap_open(struct vm_area_struct *vma)
-{
-        zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
-}
-
-static void vfio_pci_mmap_close(struct vm_area_struct *vma)
+static unsigned long vma_to_pfn(struct vm_area_struct *vma)
 {
         struct vfio_pci_core_device *vdev = vma->vm_private_data;
-        struct vfio_pci_mmap_vma *mmap_vma;
+        int index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
+        u64 pgoff;
 
-        mutex_lock(&vdev->vma_lock);
-        list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) {
-                if (mmap_vma->vma == vma) {
-                        list_del(&mmap_vma->vma_next);
-                        kfree(mmap_vma);
-                        break;
-                }
-        }
-        mutex_unlock(&vdev->vma_lock);
+        pgoff = vma->vm_pgoff &
+                ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
+
+        return (pci_resource_start(vdev->pdev, index) >> PAGE_SHIFT) + pgoff;
 }
 
 static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf)
 {
         struct vm_area_struct *vma = vmf->vma;
         struct vfio_pci_core_device *vdev = vma->vm_private_data;
-        struct vfio_pci_mmap_vma *mmap_vma;
-        vm_fault_t ret = VM_FAULT_NOPAGE;
+        unsigned long pfn, pgoff = vmf->pgoff - vma->vm_pgoff;
+        vm_fault_t ret = VM_FAULT_SIGBUS;
 
-        mutex_lock(&vdev->vma_lock);
-        down_read(&vdev->memory_lock);
+        pfn = vma_to_pfn(vma);
 
-        /*
-         * Memory region cannot be accessed if the low power feature is engaged
-         * or memory access is disabled.
-         */
-        if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev)) {
-                ret = VM_FAULT_SIGBUS;
-                goto up_out;
-        }
+        down_read(&vdev->memory_lock);
 
-        /*
-         * We populate the whole vma on fault, so we need to test whether
-         * the vma has already been mapped, such as for concurrent faults
-         * to the same vma. io_remap_pfn_range() will trigger a BUG_ON if
-         * we ask it to fill the same range again.
-         */
-        list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) {
-                if (mmap_vma->vma == vma)
-                        goto up_out;
-        }
+        if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev))
+                goto out_disabled;
 
-        if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
-                               vma->vm_end - vma->vm_start,
-                               vma->vm_page_prot)) {
-                ret = VM_FAULT_SIGBUS;
-                zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
-                goto up_out;
-        }
+        ret = vmf_insert_pfn(vma, vmf->address, pfn + pgoff);
 
-        if (__vfio_pci_add_vma(vdev, vma)) {
-                ret = VM_FAULT_OOM;
-                zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
-        }
-
-up_out:
+out_disabled:
         up_read(&vdev->memory_lock);
-        mutex_unlock(&vdev->vma_lock);
+
         return ret;
 }
 
 static const struct vm_operations_struct vfio_pci_mmap_ops = {
-        .open = vfio_pci_mmap_open,
-        .close = vfio_pci_mmap_close,
         .fault = vfio_pci_mmap_fault,
 };
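The user-visible mmap interface is unchanged by the new fault path: a BAR is still mapped by handing the region's reported offset to mmap() on the device fd. What changes is behind it: pages are faulted in individually via vmf_insert_pfn(), and touching a zapped mapping while device memory is disabled now takes the SIGBUS path above. A minimal user-space illustration (error handling omitted; assumes an already-open vfio-pci device fd):

#include <linux/vfio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>

static void *map_bar0(int device_fd)
{
        struct vfio_region_info info = {
                .argsz = sizeof(info),
                .index = VFIO_PCI_BAR0_REGION_INDEX,
        };

        ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, &info);

        /* For vfio-pci, info.offset is VFIO_PCI_INDEX_TO_OFFSET(index). */
        return mmap(NULL, info.size, PROT_READ | PROT_WRITE, MAP_SHARED,
                    device_fd, info.offset);
}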

@@ -1880,11 +1742,12 @@ int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma
 
         vma->vm_private_data = vdev;
         vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
-        vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff;
+        vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot);
 
         /*
-         * See remap_pfn_range(), called from vfio_pci_fault() but we can't
-         * change vm_flags within the fault handler. Set them now.
+         * Set vm_flags now, they should not be changed in the fault handler.
+         * We want the same flags and page protection (decrypted above) as
+         * io_remap_pfn_range() would set.
          *
          * VM_ALLOW_ANY_UNCACHED: The VMA flag is implemented for ARM64,
          * allowing KVM stage 2 device mapping attributes to use Normal-NC
@@ -2202,8 +2065,6 @@ int vfio_pci_core_init_dev(struct vfio_device *core_vdev)
         mutex_init(&vdev->ioeventfds_lock);
         INIT_LIST_HEAD(&vdev->dummy_resources_list);
         INIT_LIST_HEAD(&vdev->ioeventfds_list);
-        mutex_init(&vdev->vma_lock);
-        INIT_LIST_HEAD(&vdev->vma_list);
         INIT_LIST_HEAD(&vdev->sriov_pfs_item);
         init_rwsem(&vdev->memory_lock);
         xa_init(&vdev->ctx);
@@ -2219,7 +2080,6 @@ void vfio_pci_core_release_dev(struct vfio_device *core_vdev)
 
         mutex_destroy(&vdev->igate);
         mutex_destroy(&vdev->ioeventfds_lock);
-        mutex_destroy(&vdev->vma_lock);
         kfree(vdev->region);
         kfree(vdev->pm_save);
 }
@@ -2497,26 +2357,15 @@ static int vfio_pci_dev_set_pm_runtime_get(struct vfio_device_set *dev_set)
         return ret;
 }
 
-/*
- * We need to get memory_lock for each device, but devices can share mmap_lock,
- * therefore we need to zap and hold the vma_lock for each device, and only then
- * get each memory_lock.
- */
 static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
                                       struct vfio_pci_group_info *groups,
                                       struct iommufd_ctx *iommufd_ctx)
 {
-        struct vfio_pci_core_device *cur_mem;
-        struct vfio_pci_core_device *cur_vma;
-        struct vfio_pci_core_device *cur;
+        struct vfio_pci_core_device *vdev;
         struct pci_dev *pdev;
-        bool is_mem = true;
         int ret;
 
         mutex_lock(&dev_set->lock);
-        cur_mem = list_first_entry(&dev_set->device_list,
-                                   struct vfio_pci_core_device,
-                                   vdev.dev_set_list);
 
         pdev = vfio_pci_dev_set_resettable(dev_set);
         if (!pdev) {
@@ -2533,7 +2382,7 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
         if (ret)
                 goto err_unlock;
 
-        list_for_each_entry(cur_vma, &dev_set->device_list, vdev.dev_set_list) {
+        list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list) {
                 bool owned;
 
                 /*
@@ -2557,38 +2406,38 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
                  * Otherwise, reset is not allowed.
                  */
                 if (iommufd_ctx) {
-                        int devid = vfio_iommufd_get_dev_id(&cur_vma->vdev,
+                        int devid = vfio_iommufd_get_dev_id(&vdev->vdev,
                                                             iommufd_ctx);
 
                         owned = (devid > 0 || devid == -ENOENT);
                 } else {
-                        owned = vfio_dev_in_groups(&cur_vma->vdev, groups);
+                        owned = vfio_dev_in_groups(&vdev->vdev, groups);
                 }
 
                 if (!owned) {
                         ret = -EINVAL;
-                        goto err_undo;
+                        break;
                 }
 
                 /*
-                 * Locking multiple devices is prone to deadlock, runaway and
-                 * unwind if we hit contention.
+                 * Take the memory write lock for each device and zap BAR
+                 * mappings to prevent the user accessing the device while in
+                 * reset. Locking multiple devices is prone to deadlock,
+                 * runaway and unwind if we hit contention.
                  */
-                if (!vfio_pci_zap_and_vma_lock(cur_vma, true)) {
+                if (!down_write_trylock(&vdev->memory_lock)) {
                         ret = -EBUSY;
-                        goto err_undo;
+                        break;
                 }
+
+                vfio_pci_zap_bars(vdev);
         }
-        cur_vma = NULL;
 
-        list_for_each_entry(cur_mem, &dev_set->device_list, vdev.dev_set_list) {
-                if (!down_write_trylock(&cur_mem->memory_lock)) {
-                        ret = -EBUSY;
-                        goto err_undo;
-                }
-                mutex_unlock(&cur_mem->vma_lock);
+        if (!list_entry_is_head(vdev,
+                                &dev_set->device_list, vdev.dev_set_list)) {
+                vdev = list_prev_entry(vdev, vdev.dev_set_list);
+                goto err_undo;
         }
-        cur_mem = NULL;
 
         /*
          * The pci_reset_bus() will reset all the devices in the bus.
@@ -2599,25 +2448,22 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
          * cause the PCI config space reset without restoring the original
          * state (saved locally in 'vdev->pm_save').
          */
-        list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list)
-                vfio_pci_set_power_state(cur, PCI_D0);
+        list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list)
+                vfio_pci_set_power_state(vdev, PCI_D0);
 
         ret = pci_reset_bus(pdev);
 
+        vdev = list_last_entry(&dev_set->device_list,
+                               struct vfio_pci_core_device, vdev.dev_set_list);
+
 err_undo:
-        list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) {
-                if (cur == cur_mem)
-                        is_mem = false;
-                if (cur == cur_vma)
-                        break;
-                if (is_mem)
-                        up_write(&cur->memory_lock);
-                else
-                        mutex_unlock(&cur->vma_lock);
-        }
+        list_for_each_entry_from_reverse(vdev, &dev_set->device_list,
+                                         vdev.dev_set_list)
+                up_write(&vdev->memory_lock);
+
+        list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list)
+                pm_runtime_put(&vdev->pdev->dev);
 
-        list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list)
-                pm_runtime_put(&cur->pdev->dev);
 err_unlock:
         mutex_unlock(&dev_set->lock);
         return ret;
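The hot-reset rework above also replaces the old cur_mem/cur_vma bookkeeping with a single cursor and a reverse walk for unwind: the forward loop break s with vdev pointing at the device that failed, list_prev_entry() steps back to the last device whose memory_lock was actually taken, and on full success vdev is pointed at list_last_entry() so that falling into err_undo releases every lock. An annotated condensation of the same calls (the power-state loop and pci_reset_bus() between the two halves are elided; the iterators are the stock ones from <linux/list.h>):

        list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list) {
                if (!down_write_trylock(&vdev->memory_lock)) {
                        ret = -EBUSY;
                        break;          /* vdev = first device NOT locked */
                }
                vfio_pci_zap_bars(vdev);
        }

        /* Contention: back up to the last locked device and unwind from it. */
        if (!list_entry_is_head(vdev, &dev_set->device_list, vdev.dev_set_list)) {
                vdev = list_prev_entry(vdev, vdev.dev_set_list);
                goto err_undo;
        }

        /* Success: start the unwind at the tail so every lock is dropped. */
        vdev = list_last_entry(&dev_set->device_list,
                               struct vfio_pci_core_device, vdev.dev_set_list);
err_undo:
        list_for_each_entry_from_reverse(vdev, &dev_set->device_list,
                                         vdev.dev_set_list)
                up_write(&vdev->memory_lock);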

include/linux/vfio_pci_core.h

Lines changed: 0 additions & 2 deletions
@@ -93,8 +93,6 @@ struct vfio_pci_core_device {
         struct list_head        sriov_pfs_item;
         struct vfio_pci_core_device     *sriov_pf_core_dev;
         struct notifier_block   nb;
-        struct mutex            vma_lock;
-        struct list_head        vma_list;
         struct rw_semaphore     memory_lock;
 };
