@@ -876,7 +876,7 @@ static struct kvm_memory_slot *gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu
 */
#define KVM_RMAP_LOCKED		BIT(1)

-static unsigned long kvm_rmap_lock(struct kvm_rmap_head *rmap_head)
+static unsigned long __kvm_rmap_lock(struct kvm_rmap_head *rmap_head)
{
        unsigned long old_val, new_val;

@@ -914,7 +914,7 @@ static unsigned long kvm_rmap_lock(struct kvm_rmap_head *rmap_head)
        /*
         * Use try_cmpxchg_acquire() to prevent reads and writes to the rmap
         * from being reordered outside of the critical section created by
-        * kvm_rmap_lock().
+        * __kvm_rmap_lock().
         *
         * Pairs with the atomic_long_set_release() in kvm_rmap_unlock().
         *
@@ -923,39 +923,92 @@ static unsigned long kvm_rmap_lock(struct kvm_rmap_head *rmap_head)
         */
        } while (!atomic_long_try_cmpxchg_acquire(&rmap_head->val, &old_val, new_val));

-       /* Return the old value, i.e. _without_ the LOCKED bit set. */
+       /*
+        * Return the old value, i.e. _without_ the LOCKED bit set.  It's
+        * impossible for the return value to be 0 (see above), i.e. the read-
+        * only unlock flow can't get a false positive and fail to unlock.
+        */
        return old_val;
}

-static void kvm_rmap_unlock(struct kvm_rmap_head *rmap_head,
-                            unsigned long new_val)
+static unsigned long kvm_rmap_lock(struct kvm *kvm,
+                                   struct kvm_rmap_head *rmap_head)
+{
+       lockdep_assert_held_write(&kvm->mmu_lock);
+
+       return __kvm_rmap_lock(rmap_head);
+}
+
+static void __kvm_rmap_unlock(struct kvm_rmap_head *rmap_head,
+                              unsigned long val)
{
-       WARN_ON_ONCE(new_val & KVM_RMAP_LOCKED);
+       KVM_MMU_WARN_ON(val & KVM_RMAP_LOCKED);
        /*
         * Ensure that all accesses to the rmap have completed before unlocking
         * the rmap.
         *
-        * Pairs with the atomic_long_try_cmpxchg_acquire() in kvm_rmap_lock.
+        * Pairs with the atomic_long_try_cmpxchg_acquire() in __kvm_rmap_lock().
         */
-       atomic_long_set_release(&rmap_head->val, new_val);
+       atomic_long_set_release(&rmap_head->val, val);
+}
+
+static void kvm_rmap_unlock(struct kvm *kvm,
+                            struct kvm_rmap_head *rmap_head,
+                            unsigned long new_val)
+{
+       lockdep_assert_held_write(&kvm->mmu_lock);
+
+       __kvm_rmap_unlock(rmap_head, new_val);
}

static unsigned long kvm_rmap_get(struct kvm_rmap_head *rmap_head)
{
        return atomic_long_read(&rmap_head->val) & ~KVM_RMAP_LOCKED;
}

+/*
+ * If mmu_lock isn't held, rmaps can only be locked in read-only mode.  The
+ * actual locking is the same, but the caller is disallowed from modifying the
+ * rmap, and so the unlock flow is a nop if the rmap is/was empty.
+ */
+__maybe_unused
+static unsigned long kvm_rmap_lock_readonly(struct kvm_rmap_head *rmap_head)
+{
+       unsigned long rmap_val;
+
+       preempt_disable();
+       rmap_val = __kvm_rmap_lock(rmap_head);
+
+       if (!rmap_val)
+               preempt_enable();
+
+       return rmap_val;
+}
+
+__maybe_unused
+static void kvm_rmap_unlock_readonly(struct kvm_rmap_head *rmap_head,
+                                     unsigned long old_val)
+{
+       if (!old_val)
+               return;
+
+       KVM_MMU_WARN_ON(old_val != kvm_rmap_get(rmap_head));
+
+       __kvm_rmap_unlock(rmap_head, old_val);
+       preempt_enable();
+}
+
/*
 * Returns the number of pointers in the rmap chain, not counting the new one.
 */
-static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
-                        struct kvm_rmap_head *rmap_head)
+static int pte_list_add(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
+                        u64 *spte, struct kvm_rmap_head *rmap_head)
{
        unsigned long old_val, new_val;
        struct pte_list_desc *desc;
        int count = 0;

-       old_val = kvm_rmap_lock(rmap_head);
+       old_val = kvm_rmap_lock(kvm, rmap_head);

        if (!old_val) {
                new_val = (unsigned long)spte;
@@ -987,7 +1040,7 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
                desc->sptes[desc->spte_count++] = spte;
        }

-       kvm_rmap_unlock(rmap_head, new_val);
+       kvm_rmap_unlock(kvm, rmap_head, new_val);

        return count;
}
@@ -1035,7 +1088,7 @@ static void pte_list_remove(struct kvm *kvm, u64 *spte,
        unsigned long rmap_val;
        int i;

-       rmap_val = kvm_rmap_lock(rmap_head);
+       rmap_val = kvm_rmap_lock(kvm, rmap_head);
        if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_val, kvm))
                goto out;

@@ -1061,7 +1114,7 @@ static void pte_list_remove(struct kvm *kvm, u64 *spte,
        }

out:
-       kvm_rmap_unlock(rmap_head, rmap_val);
+       kvm_rmap_unlock(kvm, rmap_head, rmap_val);
}

static void kvm_zap_one_rmap_spte(struct kvm *kvm,
@@ -1079,7 +1132,7 @@ static bool kvm_zap_all_rmap_sptes(struct kvm *kvm,
        unsigned long rmap_val;
        int i;

-       rmap_val = kvm_rmap_lock(rmap_head);
+       rmap_val = kvm_rmap_lock(kvm, rmap_head);
        if (!rmap_val)
                return false;

@@ -1098,7 +1151,7 @@ static bool kvm_zap_all_rmap_sptes(struct kvm *kvm,
        }
out:
        /* rmap_head is meaningless now, remember to reset it */
-       kvm_rmap_unlock(rmap_head, 0);
+       kvm_rmap_unlock(kvm, rmap_head, 0);
        return true;
}

@@ -1171,23 +1224,18 @@ static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
                           struct rmap_iterator *iter)
{
        unsigned long rmap_val = kvm_rmap_get(rmap_head);
-       u64 *sptep;

        if (!rmap_val)
                return NULL;

        if (!(rmap_val & KVM_RMAP_MANY)) {
                iter->desc = NULL;
-               sptep = (u64 *)rmap_val;
-               goto out;
+               return (u64 *)rmap_val;
        }

        iter->desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY);
        iter->pos = 0;
-       sptep = iter->desc->sptes[iter->pos];
-out:
-       BUG_ON(!is_shadow_present_pte(*sptep));
-       return sptep;
+       return iter->desc->sptes[iter->pos];
}

/*
@@ -1197,35 +1245,36 @@ static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
 */
static u64 *rmap_get_next(struct rmap_iterator *iter)
{
-       u64 *sptep;
-
        if (iter->desc) {
                if (iter->pos < PTE_LIST_EXT - 1) {
                        ++iter->pos;
-                       sptep = iter->desc->sptes[iter->pos];
-                       if (sptep)
-                               goto out;
+                       if (iter->desc->sptes[iter->pos])
+                               return iter->desc->sptes[iter->pos];
                }

                iter->desc = iter->desc->more;

                if (iter->desc) {
                        iter->pos = 0;
                        /* desc->sptes[0] cannot be NULL */
-                       sptep = iter->desc->sptes[iter->pos];
-                       goto out;
+                       return iter->desc->sptes[iter->pos];
                }
        }

        return NULL;
-out:
-       BUG_ON(!is_shadow_present_pte(*sptep));
-       return sptep;
}

-#define for_each_rmap_spte(_rmap_head_, _iter_, _spte_)                        \
-       for (_spte_ = rmap_get_first(_rmap_head_, _iter_);                      \
-            _spte_; _spte_ = rmap_get_next(_iter_))
+#define __for_each_rmap_spte(_rmap_head_, _iter_, _sptep_)                     \
+       for (_sptep_ = rmap_get_first(_rmap_head_, _iter_);                     \
+            _sptep_; _sptep_ = rmap_get_next(_iter_))
+
+#define for_each_rmap_spte(_rmap_head_, _iter_, _sptep_)                       \
+       __for_each_rmap_spte(_rmap_head_, _iter_, _sptep_)                      \
+               if (!WARN_ON_ONCE(!is_shadow_present_pte(*(_sptep_))))
+
+#define for_each_rmap_spte_lockless(_rmap_head_, _iter_, _sptep_, _spte_)      \
+       __for_each_rmap_spte(_rmap_head_, _iter_, _sptep_)                      \
+               if (is_shadow_present_pte(_spte_ = mmu_spte_get_lockless(_sptep_)))

static void drop_spte(struct kvm *kvm, u64 *sptep)
{
@@ -1311,12 +1360,13 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
        struct rmap_iterator iter;
        bool flush = false;

-       for_each_rmap_spte(rmap_head, &iter, sptep)
+       for_each_rmap_spte(rmap_head, &iter, sptep) {
                if (spte_ad_need_write_protect(*sptep))
                        flush |= test_and_clear_bit(PT_WRITABLE_SHIFT,
                                                    (unsigned long *)sptep);
                else
                        flush |= spte_clear_dirty(sptep);
+       }

        return flush;
}
@@ -1637,7 +1687,7 @@ static void __rmap_add(struct kvm *kvm,
        kvm_update_page_stats(kvm, sp->role.level, 1);

        rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
-       rmap_count = pte_list_add(cache, spte, rmap_head);
+       rmap_count = pte_list_add(kvm, cache, spte, rmap_head);

        if (rmap_count > kvm->stat.max_mmu_rmap_size)
                kvm->stat.max_mmu_rmap_size = rmap_count;
@@ -1771,13 +1821,14 @@ static unsigned kvm_page_table_hashfn(gfn_t gfn)
        return hash_64(gfn, KVM_MMU_HASH_SHIFT);
}

-static void mmu_page_add_parent_pte(struct kvm_mmu_memory_cache *cache,
+static void mmu_page_add_parent_pte(struct kvm *kvm,
+                                    struct kvm_mmu_memory_cache *cache,
                                    struct kvm_mmu_page *sp, u64 *parent_pte)
{
        if (!parent_pte)
                return;

-       pte_list_add(cache, parent_pte, &sp->parent_ptes);
+       pte_list_add(kvm, cache, parent_pte, &sp->parent_ptes);
}

static void mmu_page_remove_parent_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
@@ -2467,7 +2518,7 @@ static void __link_shadow_page(struct kvm *kvm,

        mmu_spte_set(sptep, spte);

-       mmu_page_add_parent_pte(cache, sp, sptep);
+       mmu_page_add_parent_pte(kvm, cache, sp, sptep);

        /*
         * The non-direct sub-pagetable must be updated before linking.  For
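
The comments in the hunks above rely on the acquire/release pairing between atomic_long_try_cmpxchg_acquire() and atomic_long_set_release(). The snippet below is a user-space analogue of that bitlock protocol, written with C11 atomics purely for illustration; it is not kernel code and not part of this diff, and the names, the single-threaded main() and <stdatomic.h> usage are assumptions. It shows how one bit of the rmap word doubles as a lock while the remaining bits carry the payload (an SPTE pointer or descriptor address).

#include <stdatomic.h>
#include <stdio.h>

#define RMAP_LOCKED	(1UL << 1)	/* mirrors KVM_RMAP_LOCKED, i.e. BIT(1) */

static atomic_ulong rmap_val;

static unsigned long rmap_lock(void)
{
        unsigned long old, new;

        old = atomic_load_explicit(&rmap_val, memory_order_relaxed);
        do {
                old &= ~RMAP_LOCKED;	/* only succeed when the bit is clear */
                new = old | RMAP_LOCKED;
                /* acquire: accesses after the lock can't move before it */
        } while (!atomic_compare_exchange_weak_explicit(&rmap_val, &old, new,
                                                        memory_order_acquire,
                                                        memory_order_relaxed));
        return old;	/* old value, without the LOCKED bit */
}

static void rmap_unlock(unsigned long val)
{
        /* release: accesses before the unlock can't move past it */
        atomic_store_explicit(&rmap_val, val, memory_order_release);
}

int main(void)
{
        unsigned long v = rmap_lock();

        printf("locked, payload = %#lx\n", v);
        rmap_unlock(v);	/* republish the payload with the lock bit cleared */
        return 0;
}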
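Both kvm_rmap_lock_readonly() and kvm_rmap_unlock_readonly() are added as __maybe_unused, i.e. this patch introduces infrastructure without a caller. The sketch below shows how a later, hypothetical user might walk an rmap without holding mmu_lock. Only the lock/unlock helpers and for_each_rmap_spte_lockless() come from the diff above; the function name, the aging use case and the is_accessed_spte() check are illustrative assumptions, not part of this change.

/*
 * Hypothetical caller, NOT part of the diff above: a lockless rmap walker
 * built on kvm_rmap_lock_readonly()/kvm_rmap_unlock_readonly().
 */
static bool kvm_rmap_age_lockless(struct kvm_rmap_head *rmap_head)
{
        struct rmap_iterator iter;
        unsigned long rmap_val;
        bool young = false;
        u64 *sptep;
        u64 spte;

        /*
         * Locking in read-only mode disables preemption; a zero return means
         * the rmap was empty, in which case the unlock flow is a nop.
         */
        rmap_val = kvm_rmap_lock_readonly(rmap_head);
        if (!rmap_val)
                return false;

        /*
         * The lockless iterator snapshots each SPTE via
         * mmu_spte_get_lockless() and skips entries that aren't
         * shadow-present at the time of the read, since the walker holds
         * only the rmap's bitlock, not mmu_lock.
         */
        for_each_rmap_spte_lockless(rmap_head, &iter, sptep, spte) {
                if (is_accessed_spte(spte))
                        young = true;
        }

        /*
         * Unlock with the value returned by the lock: read-only walkers must
         * not publish a modified rmap, they only clear the LOCKED bit.
         */
        kvm_rmap_unlock_readonly(rmap_head, rmap_val);
        return young;
}

Disabling preemption in the read-only lock path presumably keeps the bitlock hold time bounded, since writers that do hold mmu_lock spin in the try_cmpxchg_acquire() loop until the bit clears.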