
Commit bb6c774

KVM: x86/mmu: Add support for lockless walks of rmap SPTEs
Add a lockless version of for_each_rmap_spte(), which is pretty much the same as the normal version, except that it doesn't BUG() the host if a non-present SPTE is encountered.

When mmu_lock is held, it should be impossible for a different task to zap a SPTE, _and_ zapped SPTEs must be removed from their rmap chain prior to dropping mmu_lock. Thus, the normal walker BUG()s if a non-present SPTE is encountered, as something is wildly broken.

When walking rmaps without holding mmu_lock, the SPTEs pointed at by the rmap chain can be zapped/dropped, and so a lockless walk can observe a non-present SPTE if it runs concurrently with a different operation that is zapping SPTEs.

Signed-off-by: James Houghton <jthoughton@google.com>
Link: https://lore.kernel.org/r/20250204004038.1680123-11-jthoughton@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
1 parent 4834ead commit bb6c774
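
For illustration only (not part of this commit), here is a minimal sketch of what a caller of the new lockless walker could look like inside mmu.c. The wrapper name kvm_count_rmap_sptes_lockless is hypothetical; kvm_rmap_lock_readonly(), kvm_rmap_unlock_readonly() and for_each_rmap_spte_lockless() are the helpers this patch adds, and mmu_spte_get_lockless() is existing KVM MMU code the new macro builds on:

/*
 * Hypothetical caller, for illustration: count the present SPTEs in an rmap
 * chain without holding mmu_lock.  The read-only lock keeps the chain itself
 * stable (and disables preemption), but the SPTEs it points at can still be
 * zapped concurrently, so the lockless walker skips non-present entries
 * instead of BUG()ing on them.
 */
static int kvm_count_rmap_sptes_lockless(struct kvm_rmap_head *rmap_head)
{
	struct rmap_iterator iter;
	unsigned long rmap_val;
	int count = 0;
	u64 *sptep;
	u64 spte;

	rmap_val = kvm_rmap_lock_readonly(rmap_head);
	if (!rmap_val)
		return 0;	/* empty rmap => not locked, preemption already re-enabled */

	for_each_rmap_spte_lockless(rmap_head, &iter, sptep, spte)
		count++;

	kvm_rmap_unlock_readonly(rmap_head, rmap_val);
	return count;
}

The new read-only helpers are marked __maybe_unused in this patch, which suggests their first in-tree users are introduced later in the same series.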

1 file changed: 92 additions, 41 deletions


arch/x86/kvm/mmu/mmu.c

Lines changed: 92 additions & 41 deletions
@@ -876,7 +876,7 @@ static struct kvm_memory_slot *gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu
  */
 #define KVM_RMAP_LOCKED	BIT(1)
 
-static unsigned long kvm_rmap_lock(struct kvm_rmap_head *rmap_head)
+static unsigned long __kvm_rmap_lock(struct kvm_rmap_head *rmap_head)
 {
 	unsigned long old_val, new_val;
 
@@ -914,7 +914,7 @@ static unsigned long kvm_rmap_lock(struct kvm_rmap_head *rmap_head)
 		/*
 		 * Use try_cmpxchg_acquire() to prevent reads and writes to the rmap
 		 * from being reordered outside of the critical section created by
-		 * kvm_rmap_lock().
+		 * __kvm_rmap_lock().
 		 *
 		 * Pairs with the atomic_long_set_release() in kvm_rmap_unlock().
 		 *
@@ -923,39 +923,92 @@ static unsigned long kvm_rmap_lock(struct kvm_rmap_head *rmap_head)
 		 */
 	} while (!atomic_long_try_cmpxchg_acquire(&rmap_head->val, &old_val, new_val));
 
-	/* Return the old value, i.e. _without_ the LOCKED bit set. */
+	/*
+	 * Return the old value, i.e. _without_ the LOCKED bit set. It's
+	 * impossible for the return value to be 0 (see above), i.e. the read-
+	 * only unlock flow can't get a false positive and fail to unlock.
+	 */
 	return old_val;
 }
 
-static void kvm_rmap_unlock(struct kvm_rmap_head *rmap_head,
-			    unsigned long new_val)
+static unsigned long kvm_rmap_lock(struct kvm *kvm,
+				   struct kvm_rmap_head *rmap_head)
+{
+	lockdep_assert_held_write(&kvm->mmu_lock);
+
+	return __kvm_rmap_lock(rmap_head);
+}
+
+static void __kvm_rmap_unlock(struct kvm_rmap_head *rmap_head,
+			      unsigned long val)
 {
-	WARN_ON_ONCE(new_val & KVM_RMAP_LOCKED);
+	KVM_MMU_WARN_ON(val & KVM_RMAP_LOCKED);
 	/*
 	 * Ensure that all accesses to the rmap have completed before unlocking
 	 * the rmap.
 	 *
-	 * Pairs with the atomic_long_try_cmpxchg_acquire() in kvm_rmap_lock.
+	 * Pairs with the atomic_long_try_cmpxchg_acquire() in __kvm_rmap_lock().
 	 */
-	atomic_long_set_release(&rmap_head->val, new_val);
+	atomic_long_set_release(&rmap_head->val, val);
+}
+
+static void kvm_rmap_unlock(struct kvm *kvm,
+			    struct kvm_rmap_head *rmap_head,
+			    unsigned long new_val)
+{
+	lockdep_assert_held_write(&kvm->mmu_lock);
+
+	__kvm_rmap_unlock(rmap_head, new_val);
 }
 
 static unsigned long kvm_rmap_get(struct kvm_rmap_head *rmap_head)
 {
 	return atomic_long_read(&rmap_head->val) & ~KVM_RMAP_LOCKED;
 }
 
+/*
+ * If mmu_lock isn't held, rmaps can only be locked in read-only mode. The
+ * actual locking is the same, but the caller is disallowed from modifying the
+ * rmap, and so the unlock flow is a nop if the rmap is/was empty.
+ */
+__maybe_unused
+static unsigned long kvm_rmap_lock_readonly(struct kvm_rmap_head *rmap_head)
+{
+	unsigned long rmap_val;
+
+	preempt_disable();
+	rmap_val = __kvm_rmap_lock(rmap_head);
+
+	if (!rmap_val)
+		preempt_enable();
+
+	return rmap_val;
+}
+
+__maybe_unused
+static void kvm_rmap_unlock_readonly(struct kvm_rmap_head *rmap_head,
+				     unsigned long old_val)
+{
+	if (!old_val)
+		return;
+
+	KVM_MMU_WARN_ON(old_val != kvm_rmap_get(rmap_head));
+
+	__kvm_rmap_unlock(rmap_head, old_val);
+	preempt_enable();
+}
+
 /*
  * Returns the number of pointers in the rmap chain, not counting the new one.
  */
-static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
-			struct kvm_rmap_head *rmap_head)
+static int pte_list_add(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
+			u64 *spte, struct kvm_rmap_head *rmap_head)
 {
 	unsigned long old_val, new_val;
 	struct pte_list_desc *desc;
 	int count = 0;
 
-	old_val = kvm_rmap_lock(rmap_head);
+	old_val = kvm_rmap_lock(kvm, rmap_head);
 
 	if (!old_val) {
 		new_val = (unsigned long)spte;
@@ -987,7 +1040,7 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
 		desc->sptes[desc->spte_count++] = spte;
 	}
 
-	kvm_rmap_unlock(rmap_head, new_val);
+	kvm_rmap_unlock(kvm, rmap_head, new_val);
 
 	return count;
 }
@@ -1035,7 +1088,7 @@ static void pte_list_remove(struct kvm *kvm, u64 *spte,
 	unsigned long rmap_val;
 	int i;
 
-	rmap_val = kvm_rmap_lock(rmap_head);
+	rmap_val = kvm_rmap_lock(kvm, rmap_head);
 	if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_val, kvm))
 		goto out;
 
@@ -1061,7 +1114,7 @@ static void pte_list_remove(struct kvm *kvm, u64 *spte,
 	}
 
 out:
-	kvm_rmap_unlock(rmap_head, rmap_val);
+	kvm_rmap_unlock(kvm, rmap_head, rmap_val);
 }
 
 static void kvm_zap_one_rmap_spte(struct kvm *kvm,
@@ -1079,7 +1132,7 @@ static bool kvm_zap_all_rmap_sptes(struct kvm *kvm,
 	unsigned long rmap_val;
 	int i;
 
-	rmap_val = kvm_rmap_lock(rmap_head);
+	rmap_val = kvm_rmap_lock(kvm, rmap_head);
 	if (!rmap_val)
 		return false;
 
@@ -1098,7 +1151,7 @@ static bool kvm_zap_all_rmap_sptes(struct kvm *kvm,
 	}
 out:
 	/* rmap_head is meaningless now, remember to reset it */
-	kvm_rmap_unlock(rmap_head, 0);
+	kvm_rmap_unlock(kvm, rmap_head, 0);
 	return true;
 }
 
@@ -1171,23 +1224,18 @@ static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
 			   struct rmap_iterator *iter)
 {
 	unsigned long rmap_val = kvm_rmap_get(rmap_head);
-	u64 *sptep;
 
 	if (!rmap_val)
 		return NULL;
 
 	if (!(rmap_val & KVM_RMAP_MANY)) {
 		iter->desc = NULL;
-		sptep = (u64 *)rmap_val;
-		goto out;
+		return (u64 *)rmap_val;
 	}
 
 	iter->desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY);
 	iter->pos = 0;
-	sptep = iter->desc->sptes[iter->pos];
-out:
-	BUG_ON(!is_shadow_present_pte(*sptep));
-	return sptep;
+	return iter->desc->sptes[iter->pos];
 }
 
 /*
@@ -1197,35 +1245,36 @@ static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
  */
 static u64 *rmap_get_next(struct rmap_iterator *iter)
 {
-	u64 *sptep;
-
 	if (iter->desc) {
 		if (iter->pos < PTE_LIST_EXT - 1) {
 			++iter->pos;
-			sptep = iter->desc->sptes[iter->pos];
-			if (sptep)
-				goto out;
+			if (iter->desc->sptes[iter->pos])
+				return iter->desc->sptes[iter->pos];
 		}
 
 		iter->desc = iter->desc->more;
 
 		if (iter->desc) {
 			iter->pos = 0;
 			/* desc->sptes[0] cannot be NULL */
-			sptep = iter->desc->sptes[iter->pos];
-			goto out;
+			return iter->desc->sptes[iter->pos];
 		}
 	}
 
 	return NULL;
-out:
-	BUG_ON(!is_shadow_present_pte(*sptep));
-	return sptep;
 }
 
-#define for_each_rmap_spte(_rmap_head_, _iter_, _spte_)			\
-	for (_spte_ = rmap_get_first(_rmap_head_, _iter_);			\
-	     _spte_; _spte_ = rmap_get_next(_iter_))
+#define __for_each_rmap_spte(_rmap_head_, _iter_, _sptep_)			\
+	for (_sptep_ = rmap_get_first(_rmap_head_, _iter_);			\
+	     _sptep_; _sptep_ = rmap_get_next(_iter_))
+
+#define for_each_rmap_spte(_rmap_head_, _iter_, _sptep_)			\
+	__for_each_rmap_spte(_rmap_head_, _iter_, _sptep_)			\
+		if (!WARN_ON_ONCE(!is_shadow_present_pte(*(_sptep_))))		\
+
+#define for_each_rmap_spte_lockless(_rmap_head_, _iter_, _sptep_, _spte_)	\
+	__for_each_rmap_spte(_rmap_head_, _iter_, _sptep_)			\
+		if (is_shadow_present_pte(_spte_ = mmu_spte_get_lockless(_sptep_)))
 
 static void drop_spte(struct kvm *kvm, u64 *sptep)
 {
@@ -1311,12 +1360,13 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
 	struct rmap_iterator iter;
 	bool flush = false;
 
-	for_each_rmap_spte(rmap_head, &iter, sptep)
+	for_each_rmap_spte(rmap_head, &iter, sptep) {
 		if (spte_ad_need_write_protect(*sptep))
 			flush |= test_and_clear_bit(PT_WRITABLE_SHIFT,
 						    (unsigned long *)sptep);
 		else
 			flush |= spte_clear_dirty(sptep);
+	}
 
 	return flush;
 }
@@ -1637,7 +1687,7 @@ static void __rmap_add(struct kvm *kvm,
 	kvm_update_page_stats(kvm, sp->role.level, 1);
 
 	rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
-	rmap_count = pte_list_add(cache, spte, rmap_head);
+	rmap_count = pte_list_add(kvm, cache, spte, rmap_head);
 
 	if (rmap_count > kvm->stat.max_mmu_rmap_size)
 		kvm->stat.max_mmu_rmap_size = rmap_count;
@@ -1771,13 +1821,14 @@ static unsigned kvm_page_table_hashfn(gfn_t gfn)
 	return hash_64(gfn, KVM_MMU_HASH_SHIFT);
 }
 
-static void mmu_page_add_parent_pte(struct kvm_mmu_memory_cache *cache,
+static void mmu_page_add_parent_pte(struct kvm *kvm,
+				    struct kvm_mmu_memory_cache *cache,
 				    struct kvm_mmu_page *sp, u64 *parent_pte)
 {
 	if (!parent_pte)
 		return;
 
-	pte_list_add(cache, parent_pte, &sp->parent_ptes);
+	pte_list_add(kvm, cache, parent_pte, &sp->parent_ptes);
 }
 
 static void mmu_page_remove_parent_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
@@ -2467,7 +2518,7 @@ static void __link_shadow_page(struct kvm *kvm,
 
 	mmu_spte_set(sptep, spte);
 
-	mmu_page_add_parent_pte(cache, sp, sptep);
+	mmu_page_add_parent_pte(kvm, cache, sp, sptep);
 
 	/*
 	 * The non-direct sub-pagetable must be updated before linking. For
