@@ -853,11 +853,98 @@ static struct kvm_memory_slot *gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu
  * About rmap_head encoding:
  *
  * If the bit zero of rmap_head->val is clear, then it points to the only spte
- * in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct
+ * in this rmap chain. Otherwise, (rmap_head->val & ~3) points to a struct
  * pte_list_desc containing more mappings.
  */
 #define KVM_RMAP_MANY	BIT(0)
 
+/*
+ * rmaps and PTE lists are mostly protected by mmu_lock (the shadow MMU always
+ * operates with mmu_lock held for write), but rmaps can be walked without
+ * holding mmu_lock so long as the caller can tolerate SPTEs in the rmap chain
+ * being zapped/dropped _while the rmap is locked_.
+ *
+ * Other than the KVM_RMAP_LOCKED flag, modifications to rmap entries must be
+ * done while holding mmu_lock for write.  This allows a task walking rmaps
+ * without holding mmu_lock to concurrently walk the same entries as a task
+ * that is holding mmu_lock but _not_ the rmap lock.  Neither task will modify
+ * the rmaps, thus the walks are stable.
+ *
+ * As alluded to above, SPTEs in rmaps are _not_ protected by KVM_RMAP_LOCKED,
+ * only the rmap chains themselves are protected.  E.g. holding an rmap's lock
+ * ensures all "struct pte_list_desc" fields are stable.
+ */
+#define KVM_RMAP_LOCKED	BIT(1)
+
+static unsigned long kvm_rmap_lock(struct kvm_rmap_head *rmap_head)
+{
+	unsigned long old_val, new_val;
+
+	lockdep_assert_preemption_disabled();
+
+	/*
+	 * Elide the lock if the rmap is empty, as lockless walkers (read-only
+	 * mode) don't need to (and can't) walk an empty rmap, nor can they add
+	 * entries to the rmap.  I.e. the only paths that process empty rmaps
+	 * do so while holding mmu_lock for write, and are mutually exclusive.
+	 */
+	old_val = atomic_long_read(&rmap_head->val);
+	if (!old_val)
+		return 0;
+
+	do {
+		/*
+		 * If the rmap is locked, wait for it to be unlocked before
+		 * trying to acquire the lock, e.g. to avoid bouncing the
+		 * cache line.
+		 */
+		while (old_val & KVM_RMAP_LOCKED) {
+			cpu_relax();
+			old_val = atomic_long_read(&rmap_head->val);
+		}
+
+		/*
+		 * Recheck for an empty rmap, it may have been purged by the
+		 * task that held the lock.
+		 */
+		if (!old_val)
+			return 0;
+
+		new_val = old_val | KVM_RMAP_LOCKED;
+	/*
+	 * Use try_cmpxchg_acquire() to prevent reads and writes to the rmap
+	 * from being reordered outside of the critical section created by
+	 * kvm_rmap_lock().
+	 *
+	 * Pairs with the atomic_long_set_release() in kvm_rmap_unlock().
+	 *
+	 * For the !old_val case, no ordering is needed, as there is no rmap
+	 * to walk.
+	 */
+	} while (!atomic_long_try_cmpxchg_acquire(&rmap_head->val, &old_val, new_val));
+
+	/* Return the old value, i.e. _without_ the LOCKED bit set. */
+	return old_val;
+}
+
+static void kvm_rmap_unlock(struct kvm_rmap_head *rmap_head,
+			    unsigned long new_val)
+{
+	WARN_ON_ONCE(new_val & KVM_RMAP_LOCKED);
+	/*
+	 * Ensure that all accesses to the rmap have completed before unlocking
+	 * the rmap.
+	 *
+	 * Pairs with the atomic_long_try_cmpxchg_acquire() in kvm_rmap_lock().
+	 */
+	atomic_long_set_release(&rmap_head->val, new_val);
+}
+
+static unsigned long kvm_rmap_get(struct kvm_rmap_head *rmap_head)
+{
+	return atomic_long_read(&rmap_head->val) & ~KVM_RMAP_LOCKED;
+}
+
 /*
  * Returns the number of pointers in the rmap chain, not counting the new one.
  */
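The helpers added above implement a per-rmap bit spinlock: bit 1 of the encoded rmap_head value is the lock, taken with acquire semantics via a cmpxchg loop and released with a release store that also publishes the new value. The sketch below is a minimal user-space model of that technique, not the kernel code: it uses C11 atomics in place of the kernel's atomic_long_*/lockdep/cpu_relax() APIs, and the model_rmap_* names and values are illustrative assumptions only.

```c
/* Simplified model of the KVM_RMAP_LOCKED bit-spinlock; illustrative only. */
#include <stdatomic.h>
#include <stdio.h>

#define MODEL_RMAP_MANY		(1UL << 0)	/* models KVM_RMAP_MANY */
#define MODEL_RMAP_LOCKED	(1UL << 1)	/* models KVM_RMAP_LOCKED */

struct model_rmap_head {
	atomic_ulong val;
};

/* Lock the rmap and return its value with the LOCKED bit cleared. */
static unsigned long model_rmap_lock(struct model_rmap_head *head)
{
	unsigned long old_val, new_val;

	/* An empty rmap is only touched by mutually exclusive paths; elide. */
	old_val = atomic_load_explicit(&head->val, memory_order_relaxed);
	if (!old_val)
		return 0;

	do {
		/* Spin until the current holder drops the LOCKED bit. */
		while (old_val & MODEL_RMAP_LOCKED)
			old_val = atomic_load_explicit(&head->val,
						       memory_order_relaxed);
		if (!old_val)
			return 0;

		new_val = old_val | MODEL_RMAP_LOCKED;
		/* Acquire ordering keeps critical-section accesses inside. */
	} while (!atomic_compare_exchange_weak_explicit(&head->val, &old_val,
							new_val,
							memory_order_acquire,
							memory_order_relaxed));
	return old_val;
}

/* Publish new_val and drop the lock in a single release store. */
static void model_rmap_unlock(struct model_rmap_head *head,
			      unsigned long new_val)
{
	atomic_store_explicit(&head->val, new_val, memory_order_release);
}

int main(void)
{
	/* Pretend 0x1000 is the address of a lone, 8-byte-aligned spte. */
	struct model_rmap_head head = { .val = 0x1000 };
	unsigned long val = model_rmap_lock(&head);

	printf("locked, flag-free value: %#lx\n", val);
	model_rmap_unlock(&head, val);
	return 0;
}
```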
@@ -868,7 +955,7 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
 	struct pte_list_desc *desc;
 	int count = 0;
 
-	old_val = rmap_head->val;
+	old_val = kvm_rmap_lock(rmap_head);
 
 	if (!old_val) {
 		new_val = (unsigned long)spte;
@@ -900,7 +987,7 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
 		desc->sptes[desc->spte_count++] = spte;
 	}
 
-	rmap_head->val = new_val;
+	kvm_rmap_unlock(rmap_head, new_val);
 
 	return count;
 }
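With these two hunks, pte_list_add() brackets its read-modify-write of the rmap with kvm_rmap_lock()/kvm_rmap_unlock(): the old encoded value comes back from the lock, the chain is grown, and the new value is published by the release store that also drops the lock bit, so a concurrent lockless reader never observes a half-built chain. A hedged sketch of that writer-side shape, reusing the hypothetical model_rmap_* helpers from the previous example (the chain-manipulation step is deliberately elided):

```c
/* Writer-side shape of pte_list_add()/pte_list_remove(), modeled loosely. */
static void model_rmap_update(struct model_rmap_head *head,
			      unsigned long new_encoded_val)
{
	unsigned long old_val = model_rmap_lock(head);

	/*
	 * ... compute new_encoded_val from old_val here, e.g. grow or shrink
	 * a pte_list_desc-style chain; omitted in this sketch ...
	 */
	(void)old_val;

	/* The release store in unlock publishes the new value atomically. */
	model_rmap_unlock(head, new_encoded_val);
}
```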
@@ -948,7 +1035,7 @@ static void pte_list_remove(struct kvm *kvm, u64 *spte,
 	unsigned long rmap_val;
 	int i;
 
-	rmap_val = rmap_head->val;
+	rmap_val = kvm_rmap_lock(rmap_head);
 	if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_val, kvm))
 		goto out;
 
@@ -974,7 +1061,7 @@ static void pte_list_remove(struct kvm *kvm, u64 *spte,
 	}
 
 out:
-	rmap_head->val = rmap_val;
+	kvm_rmap_unlock(rmap_head, rmap_val);
 }
 
 static void kvm_zap_one_rmap_spte(struct kvm *kvm,
@@ -992,7 +1079,7 @@ static bool kvm_zap_all_rmap_sptes(struct kvm *kvm,
 	unsigned long rmap_val;
 	int i;
 
-	rmap_val = rmap_head->val;
+	rmap_val = kvm_rmap_lock(rmap_head);
 	if (!rmap_val)
 		return false;
 
@@ -1011,13 +1098,13 @@ static bool kvm_zap_all_rmap_sptes(struct kvm *kvm,
 	}
 out:
 	/* rmap_head is meaningless now, remember to reset it */
-	rmap_head->val = 0;
+	kvm_rmap_unlock(rmap_head, 0);
 	return true;
 }
 
 unsigned int pte_list_count(struct kvm_rmap_head *rmap_head)
 {
-	unsigned long rmap_val = rmap_head->val;
+	unsigned long rmap_val = kvm_rmap_get(rmap_head);
 	struct pte_list_desc *desc;
 
 	if (!rmap_val)
@@ -1083,7 +1170,7 @@ struct rmap_iterator {
 static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
 			   struct rmap_iterator *iter)
 {
-	unsigned long rmap_val = rmap_head->val;
+	unsigned long rmap_val = kvm_rmap_get(rmap_head);
 	u64 *sptep;
 
 	if (!rmap_val)
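On the read side, pte_list_count() and rmap_get_first() now fetch the encoded value through kvm_rmap_get(), which simply masks off KVM_RMAP_LOCKED so the usual decode (empty vs. single spte vs. pte_list_desc chain, per the `& ~3` comment above) is unaffected by a concurrent lock holder. A hedged sketch of that decode against the hypothetical user-space model above:

```c
/* Reader-side decode in the spirit of kvm_rmap_get() + rmap_get_first(). */
static unsigned long model_rmap_get(struct model_rmap_head *head)
{
	return atomic_load_explicit(&head->val, memory_order_relaxed) &
	       ~MODEL_RMAP_LOCKED;
}

static void *model_rmap_first(struct model_rmap_head *head)
{
	unsigned long val = model_rmap_get(head);

	if (!val)
		return NULL;				/* empty rmap */
	if (!(val & MODEL_RMAP_MANY))
		return (void *)val;			/* lone spte pointer */
	return (void *)(val & ~MODEL_RMAP_MANY);	/* pte_list_desc chain */
}
```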
@@ -1418,7 +1505,7 @@ static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
 	while (++iterator->rmap <= iterator->end_rmap) {
 		iterator->gfn += KVM_PAGES_PER_HPAGE(iterator->level);
 
-		if (iterator->rmap->val)
+		if (atomic_long_read(&iterator->rmap->val))
 			return;
 	}
 
@@ -2444,7 +2531,8 @@ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
 		 * avoids retaining a large number of stale nested SPs.
 		 */
 		if (tdp_enabled && invalid_list &&
-		    child->role.guest_mode && !child->parent_ptes.val)
+		    child->role.guest_mode &&
+		    !atomic_long_read(&child->parent_ptes.val))
 			return kvm_mmu_prepare_zap_page(kvm, child,
 							invalid_list);
 	}
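The last two hunks are plain accessor conversions: slot_rmap_walk_next() and mmu_page_zap_pte() only need an emptiness check, but once the rmap_head val field is backed by an atomic type (presumably converted elsewhere in this series), even that check has to go through atomic_long_read(). In the hypothetical user-space model above, the equivalent would be:

```c
/* Emptiness check through the atomic accessor; illustrative only. */
static _Bool model_rmap_is_empty(struct model_rmap_head *head)
{
	return !atomic_load_explicit(&head->val, memory_order_relaxed);
}
```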