@@ -1610,100 +1610,20 @@ ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *bu
 }
 EXPORT_SYMBOL_GPL(vfio_pci_core_write);

-/* Return 1 on zap and vma_lock acquired, 0 on contention (only with @try) */
-static int vfio_pci_zap_and_vma_lock(struct vfio_pci_core_device *vdev, bool try)
+static void vfio_pci_zap_bars(struct vfio_pci_core_device *vdev)
 {
-	struct vfio_pci_mmap_vma *mmap_vma, *tmp;
+	struct vfio_device *core_vdev = &vdev->vdev;
+	loff_t start = VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_BAR0_REGION_INDEX);
+	loff_t end = VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_ROM_REGION_INDEX);
+	loff_t len = end - start;

-	/*
-	 * Lock ordering:
-	 * vma_lock is nested under mmap_lock for vm_ops callback paths.
-	 * The memory_lock semaphore is used by both code paths calling
-	 * into this function to zap vmas and the vm_ops.fault callback
-	 * to protect the memory enable state of the device.
-	 *
-	 * When zapping vmas we need to maintain the mmap_lock => vma_lock
-	 * ordering, which requires using vma_lock to walk vma_list to
-	 * acquire an mm, then dropping vma_lock to get the mmap_lock and
-	 * reacquiring vma_lock.  This logic is derived from similar
-	 * requirements in uverbs_user_mmap_disassociate().
-	 *
-	 * mmap_lock must always be the top-level lock when it is taken.
-	 * Therefore we can only hold the memory_lock write lock when
-	 * vma_list is empty, as we'd need to take mmap_lock to clear
-	 * entries.  vma_list can only be guaranteed empty when holding
-	 * vma_lock, thus memory_lock is nested under vma_lock.
-	 *
-	 * This enables the vm_ops.fault callback to acquire vma_lock,
-	 * followed by memory_lock read lock, while already holding
-	 * mmap_lock without risk of deadlock.
-	 */
-	while (1) {
-		struct mm_struct *mm = NULL;
-
-		if (try) {
-			if (!mutex_trylock(&vdev->vma_lock))
-				return 0;
-		} else {
-			mutex_lock(&vdev->vma_lock);
-		}
-		while (!list_empty(&vdev->vma_list)) {
-			mmap_vma = list_first_entry(&vdev->vma_list,
-						    struct vfio_pci_mmap_vma,
-						    vma_next);
-			mm = mmap_vma->vma->vm_mm;
-			if (mmget_not_zero(mm))
-				break;
-
-			list_del(&mmap_vma->vma_next);
-			kfree(mmap_vma);
-			mm = NULL;
-		}
-		if (!mm)
-			return 1;
-		mutex_unlock(&vdev->vma_lock);
-
-		if (try) {
-			if (!mmap_read_trylock(mm)) {
-				mmput(mm);
-				return 0;
-			}
-		} else {
-			mmap_read_lock(mm);
-		}
-		if (try) {
-			if (!mutex_trylock(&vdev->vma_lock)) {
-				mmap_read_unlock(mm);
-				mmput(mm);
-				return 0;
-			}
-		} else {
-			mutex_lock(&vdev->vma_lock);
-		}
-		list_for_each_entry_safe(mmap_vma, tmp,
-					 &vdev->vma_list, vma_next) {
-			struct vm_area_struct *vma = mmap_vma->vma;
-
-			if (vma->vm_mm != mm)
-				continue;
-
-			list_del(&mmap_vma->vma_next);
-			kfree(mmap_vma);
-
-			zap_vma_ptes(vma, vma->vm_start,
-				     vma->vm_end - vma->vm_start);
-		}
-		mutex_unlock(&vdev->vma_lock);
-		mmap_read_unlock(mm);
-		mmput(mm);
-	}
+	unmap_mapping_range(core_vdev->inode->i_mapping, start, len, true);
 }

 void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_core_device *vdev)
 {
-	vfio_pci_zap_and_vma_lock(vdev, false);
 	down_write(&vdev->memory_lock);
-	mutex_unlock(&vdev->vma_lock);
+	vfio_pci_zap_bars(vdev);
 }

 u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_core_device *vdev)
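The new vfio_pci_zap_bars() works because vfio-pci packs the region index into the high bits of the device-fd file offset, so all six BARs occupy one contiguous offset window and a single unmap_mapping_range() on the vfio device inode tears down every BAR mapping, regardless of which process created it. A minimal sketch of that offset math, assuming the usual definitions (VFIO_PCI_OFFSET_SHIFT of 40 from include/linux/vfio_pci_core.h and the region-index enum values from the vfio uAPI):

/* Sketch only: the macro values below are assumptions restated here,
 * not part of this patch. */
#include <stdio.h>

#define VFIO_PCI_OFFSET_SHIFT		40
#define VFIO_PCI_INDEX_TO_OFFSET(i)	((unsigned long long)(i) << VFIO_PCI_OFFSET_SHIFT)
#define VFIO_PCI_BAR0_REGION_INDEX	0
#define VFIO_PCI_ROM_REGION_INDEX	6

int main(void)
{
	unsigned long long start = VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_BAR0_REGION_INDEX);
	unsigned long long end = VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_ROM_REGION_INDEX);

	/* One contiguous window of device-fd offsets covers BAR0..BAR5. */
	printf("zap range: [%#llx, %#llx), len %#llx\n", start, end, end - start);
	return 0;
}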
@@ -1725,99 +1645,41 @@ void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev, u16 c
 	up_write(&vdev->memory_lock);
 }

-/* Caller holds vma_lock */
-static int __vfio_pci_add_vma(struct vfio_pci_core_device *vdev,
-			      struct vm_area_struct *vma)
-{
-	struct vfio_pci_mmap_vma *mmap_vma;
-
-	mmap_vma = kmalloc(sizeof(*mmap_vma), GFP_KERNEL_ACCOUNT);
-	if (!mmap_vma)
-		return -ENOMEM;
-
-	mmap_vma->vma = vma;
-	list_add(&mmap_vma->vma_next, &vdev->vma_list);
-
-	return 0;
-}
-
-/*
- * Zap mmaps on open so that we can fault them in on access and therefore
- * our vma_list only tracks mappings accessed since last zap.
- */
-static void vfio_pci_mmap_open(struct vm_area_struct *vma)
-{
-	zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
-}
-
-static void vfio_pci_mmap_close(struct vm_area_struct *vma)
+static unsigned long vma_to_pfn(struct vm_area_struct *vma)
 {
 	struct vfio_pci_core_device *vdev = vma->vm_private_data;
-	struct vfio_pci_mmap_vma *mmap_vma;
+	int index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
+	u64 pgoff;

-	mutex_lock(&vdev->vma_lock);
-	list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) {
-		if (mmap_vma->vma == vma) {
-			list_del(&mmap_vma->vma_next);
-			kfree(mmap_vma);
-			break;
-		}
-	}
-	mutex_unlock(&vdev->vma_lock);
+	pgoff = vma->vm_pgoff &
+		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
+
+	return (pci_resource_start(vdev->pdev, index) >> PAGE_SHIFT) + pgoff;
 }

 static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf)
 {
 	struct vm_area_struct *vma = vmf->vma;
 	struct vfio_pci_core_device *vdev = vma->vm_private_data;
-	struct vfio_pci_mmap_vma *mmap_vma;
-	vm_fault_t ret = VM_FAULT_NOPAGE;
+	unsigned long pfn, pgoff = vmf->pgoff - vma->vm_pgoff;
+	vm_fault_t ret = VM_FAULT_SIGBUS;

-	mutex_lock(&vdev->vma_lock);
-	down_read(&vdev->memory_lock);
+	pfn = vma_to_pfn(vma);

-	/*
-	 * Memory region cannot be accessed if the low power feature is engaged
-	 * or memory access is disabled.
-	 */
-	if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev)) {
-		ret = VM_FAULT_SIGBUS;
-		goto up_out;
-	}
+	down_read(&vdev->memory_lock);

-	/*
-	 * We populate the whole vma on fault, so we need to test whether
-	 * the vma has already been mapped, such as for concurrent faults
-	 * to the same vma.  io_remap_pfn_range() will trigger a BUG_ON if
-	 * we ask it to fill the same range again.
-	 */
-	list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) {
-		if (mmap_vma->vma == vma)
-			goto up_out;
-	}
+	if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev))
+		goto out_disabled;

-	if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
-			       vma->vm_end - vma->vm_start,
-			       vma->vm_page_prot)) {
-		ret = VM_FAULT_SIGBUS;
-		zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
-		goto up_out;
-	}
+	ret = vmf_insert_pfn(vma, vmf->address, pfn + pgoff);

-	if (__vfio_pci_add_vma(vdev, vma)) {
-		ret = VM_FAULT_OOM;
-		zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
-	}
-
-up_out:
+out_disabled:
 	up_read(&vdev->memory_lock);
-	mutex_unlock(&vdev->vma_lock);
+
 	return ret;
 }

 static const struct vm_operations_struct vfio_pci_mmap_ops = {
-	.open = vfio_pci_mmap_open,
-	.close = vfio_pci_mmap_close,
 	.fault = vfio_pci_mmap_fault,
 };
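With the vma_list bookkeeping gone, the fault handler inserts exactly one page per fault via vmf_insert_pfn() instead of populating the whole vma with io_remap_pfn_range() on first touch; the address_space of the vfio device fd is what lets vfio_pci_zap_bars() find those PTEs again later. From userspace nothing changes: BARs are still mmapped at the offset reported by VFIO_DEVICE_GET_REGION_INFO, whose high bits carry the region index that vma_to_pfn() decodes from vm_pgoff. A minimal userspace sketch (the device fd is assumed to have been obtained elsewhere, e.g. via VFIO_GROUP_GET_DEVICE_FD or the cdev interface):

/*
 * Userspace sketch: mmap a BAR through an already-open vfio device fd.
 * Pages are faulted in on access rather than populated up front.
 */
#include <stddef.h>
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/vfio.h>

static void *map_bar(int device_fd, unsigned int bar)
{
	struct vfio_region_info info = {
		.argsz = sizeof(info),
		.index = VFIO_PCI_BAR0_REGION_INDEX + bar,
	};

	if (ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, &info) < 0 ||
	    !(info.flags & VFIO_REGION_INFO_FLAG_MMAP))
		return MAP_FAILED;

	/* info.offset encodes the region index in its high bits. */
	return mmap(NULL, info.size, PROT_READ | PROT_WRITE, MAP_SHARED,
		    device_fd, (off_t)info.offset);
}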
@@ -1880,11 +1742,12 @@ int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma

 	vma->vm_private_data = vdev;
 	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
-	vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff;
+	vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot);

 	/*
-	 * See remap_pfn_range(), called from vfio_pci_fault() but we can't
-	 * change vm_flags within the fault handler.  Set them now.
+	 * Set vm_flags now, they should not be changed in the fault handler.
+	 * We want the same flags and page protection (decrypted above) as
+	 * io_remap_pfn_range() would set.
 	 *
 	 * VM_ALLOW_ANY_UNCACHED: The VMA flag is implemented for ARM64,
 	 * allowing KVM stage 2 device mapping attributes to use Normal-NC
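The added pgprot_decrypted() matters because vmf_insert_pfn() uses vma->vm_page_prot as-is, whereas the old path got the decrypted attribute (needed so MMIO stays in shared, unencrypted mappings under memory encryption) folded in implicitly by io_remap_pfn_range(). For reference, the generic wrapper is roughly the following (paraphrased from include/linux/mm.h; architectures may provide their own version):

static inline int io_remap_pfn_range(struct vm_area_struct *vma,
				     unsigned long addr, unsigned long pfn,
				     unsigned long size, pgprot_t prot)
{
	/* The decrypted protection that vfio-pci now applies explicitly at mmap time. */
	return remap_pfn_range(vma, addr, pfn, size, pgprot_decrypted(prot));
}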
@@ -2202,8 +2065,6 @@ int vfio_pci_core_init_dev(struct vfio_device *core_vdev)
 	mutex_init(&vdev->ioeventfds_lock);
 	INIT_LIST_HEAD(&vdev->dummy_resources_list);
 	INIT_LIST_HEAD(&vdev->ioeventfds_list);
-	mutex_init(&vdev->vma_lock);
-	INIT_LIST_HEAD(&vdev->vma_list);
 	INIT_LIST_HEAD(&vdev->sriov_pfs_item);
 	init_rwsem(&vdev->memory_lock);
 	xa_init(&vdev->ctx);
@@ -2219,7 +2080,6 @@ void vfio_pci_core_release_dev(struct vfio_device *core_vdev)

 	mutex_destroy(&vdev->igate);
 	mutex_destroy(&vdev->ioeventfds_lock);
-	mutex_destroy(&vdev->vma_lock);
 	kfree(vdev->region);
 	kfree(vdev->pm_save);
 }
@@ -2497,26 +2357,15 @@ static int vfio_pci_dev_set_pm_runtime_get(struct vfio_device_set *dev_set)
 	return ret;
 }

-/*
- * We need to get memory_lock for each device, but devices can share mmap_lock,
- * therefore we need to zap and hold the vma_lock for each device, and only then
- * get each memory_lock.
- */
 static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
 				      struct vfio_pci_group_info *groups,
 				      struct iommufd_ctx *iommufd_ctx)
 {
-	struct vfio_pci_core_device *cur_mem;
-	struct vfio_pci_core_device *cur_vma;
-	struct vfio_pci_core_device *cur;
+	struct vfio_pci_core_device *vdev;
 	struct pci_dev *pdev;
-	bool is_mem = true;
 	int ret;

 	mutex_lock(&dev_set->lock);
-	cur_mem = list_first_entry(&dev_set->device_list,
-				   struct vfio_pci_core_device,
-				   vdev.dev_set_list);

 	pdev = vfio_pci_dev_set_resettable(dev_set);
 	if (!pdev) {
@@ -2533,7 +2382,7 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
 	if (ret)
 		goto err_unlock;

-	list_for_each_entry(cur_vma, &dev_set->device_list, vdev.dev_set_list) {
+	list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list) {
 		bool owned;

 		/*
@@ -2557,38 +2406,38 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
 		 * Otherwise, reset is not allowed.
 		 */
 		if (iommufd_ctx) {
-			int devid = vfio_iommufd_get_dev_id(&cur_vma->vdev,
+			int devid = vfio_iommufd_get_dev_id(&vdev->vdev,
 							    iommufd_ctx);

 			owned = (devid > 0 || devid == -ENOENT);
 		} else {
-			owned = vfio_dev_in_groups(&cur_vma->vdev, groups);
+			owned = vfio_dev_in_groups(&vdev->vdev, groups);
 		}

 		if (!owned) {
 			ret = -EINVAL;
-			goto err_undo;
+			break;
 		}

 		/*
-		 * Locking multiple devices is prone to deadlock, runaway and
-		 * unwind if we hit contention.
+		 * Take the memory write lock for each device and zap BAR
+		 * mappings to prevent the user accessing the device while in
+		 * reset.  Locking multiple devices is prone to deadlock,
+		 * runaway and unwind if we hit contention.
 		 */
-		if (!vfio_pci_zap_and_vma_lock(cur_vma, true)) {
+		if (!down_write_trylock(&vdev->memory_lock)) {
 			ret = -EBUSY;
-			goto err_undo;
+			break;
 		}
+
+		vfio_pci_zap_bars(vdev);
 	}
-	cur_vma = NULL;

-	list_for_each_entry(cur_mem, &dev_set->device_list, vdev.dev_set_list) {
-		if (!down_write_trylock(&cur_mem->memory_lock)) {
-			ret = -EBUSY;
-			goto err_undo;
-		}
-		mutex_unlock(&cur_mem->vma_lock);
+	if (!list_entry_is_head(vdev,
+				&dev_set->device_list, vdev.dev_set_list)) {
+		vdev = list_prev_entry(vdev, vdev.dev_set_list);
+		goto err_undo;
 	}
-	cur_mem = NULL;

 	/*
 	 * The pci_reset_bus() will reset all the devices in the bus.
@@ -2599,25 +2448,22 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
 	 * cause the PCI config space reset without restoring the original
 	 * state (saved locally in 'vdev->pm_save').
 	 */
-	list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list)
-		vfio_pci_set_power_state(cur, PCI_D0);
+	list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list)
+		vfio_pci_set_power_state(vdev, PCI_D0);

 	ret = pci_reset_bus(pdev);

+	vdev = list_last_entry(&dev_set->device_list,
+			       struct vfio_pci_core_device, vdev.dev_set_list);
+
 err_undo:
-	list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) {
-		if (cur == cur_mem)
-			is_mem = false;
-		if (cur == cur_vma)
-			break;
-		if (is_mem)
-			up_write(&cur->memory_lock);
-		else
-			mutex_unlock(&cur->vma_lock);
-	}
+	list_for_each_entry_from_reverse(vdev, &dev_set->device_list,
+					 vdev.dev_set_list)
+		up_write(&vdev->memory_lock);
+
+	list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list)
+		pm_runtime_put(&vdev->pdev->dev);

-	list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list)
-		pm_runtime_put(&cur->pdev->dev);
 err_unlock:
 	mutex_unlock(&dev_set->lock);
 	return ret;
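The reworked hot-reset path keeps a single cursor: it try-locks memory_lock and zaps BARs device by device, breaks out on the first failure, then either backs the cursor up one entry (lock failure) or points it at the last device (after pci_reset_bus()) before walking the list in reverse to drop the write locks. A condensed sketch of that lock/unwind pattern using the same list helpers; the helper name is hypothetical and this is a restatement for clarity, not a drop-in replacement for the function above:

/* Hypothetical helper: condensed restatement of the lock/unwind pattern. */
static int vfio_pci_dev_set_lock_all(struct vfio_device_set *dev_set)
{
	struct vfio_pci_core_device *vdev;
	int ret = 0;

	list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list) {
		if (!down_write_trylock(&vdev->memory_lock)) {
			ret = -EBUSY;	/* vdev points at the unlocked entry */
			break;
		}
		vfio_pci_zap_bars(vdev);
	}

	if (!ret)
		return 0;

	/*
	 * Back up to the last entry that was actually locked; if the very
	 * first trylock failed this lands on the list head and the reverse
	 * walk below simply does nothing.
	 */
	vdev = list_prev_entry(vdev, vdev.dev_set_list);
	list_for_each_entry_from_reverse(vdev, &dev_set->device_list,
					 vdev.dev_set_list)
		up_write(&vdev->memory_lock);

	return ret;
}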