Commit 4834ead
KVM: x86/mmu: Add infrastructure to allow walking rmaps outside of mmu_lock
Steal another bit from rmap entries (which are word aligned pointers, i.e. have 2 free bits on 32-bit KVM, and 3 free bits on 64-bit KVM), and use the bit to implement a *very* rudimentary per-rmap spinlock. The only anticipated usage of the lock outside of mmu_lock is for aging gfns, and collisions between aging and other MMU rmap operations are quite rare, e.g. unless userspace is being silly and aging a tiny range over and over in a tight loop, time between contention when aging an actively running VM is O(seconds). In short, a more sophisticated locking scheme shouldn't be necessary.

Note, the lock only protects the rmap structure itself, SPTEs that are pointed at by a locked rmap can still be modified and zapped by another task (KVM drops/zaps SPTEs before deleting the rmap entries).

Co-developed-by: James Houghton <jthoughton@google.com>
Signed-off-by: James Houghton <jthoughton@google.com>
Link: https://lore.kernel.org/r/20250204004038.1680123-10-jthoughton@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
1 parent 9fb13ba commit 4834ead
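For readers unfamiliar with the pointer bit-stealing trick the commit message describes, the following is a minimal userspace sketch of the same idea using C11 atomics. The demo_* names and the standalone form are illustrative only; the actual KVM helpers added by this patch (kvm_rmap_lock()/kvm_rmap_unlock()) additionally elide the lock entirely for empty rmaps and are built on the kernel's atomic_long API.

#include <stdatomic.h>

/* A word-aligned pointer value always has bit 1 free; use it as a spinlock. */
#define DEMO_RMAP_LOCKED	0x2ul

/*
 * Spin until the LOCKED bit is clear, then set it with acquire ordering so
 * that accesses inside the critical section cannot be hoisted above the lock.
 * Returns the stored value, i.e. without the LOCKED bit set.
 */
static unsigned long demo_rmap_lock(_Atomic unsigned long *val)
{
	unsigned long old = atomic_load_explicit(val, memory_order_relaxed);

	for (;;) {
		/* Wait for the holder to drop the lock before attempting CAS. */
		while (old & DEMO_RMAP_LOCKED)
			old = atomic_load_explicit(val, memory_order_relaxed);

		if (atomic_compare_exchange_weak_explicit(val, &old,
							  old | DEMO_RMAP_LOCKED,
							  memory_order_acquire,
							  memory_order_relaxed))
			return old;
	}
}

/* Publish the (possibly updated) value and drop the lock in one release store. */
static void demo_rmap_unlock(_Atomic unsigned long *val, unsigned long new_val)
{
	atomic_store_explicit(val, new_val, memory_order_release);
}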

File tree: 2 files changed (+101, -12 lines)

arch/x86/include/asm/kvm_host.h

Lines changed: 2 additions & 1 deletion
@@ -27,6 +27,7 @@
 #include <linux/kfifo.h>
 #include <linux/sched/vhost_task.h>
 #include <linux/call_once.h>
+#include <linux/atomic.h>
 
 #include <asm/apic.h>
 #include <asm/pvclock-abi.h>
@@ -405,7 +406,7 @@ union kvm_cpu_role {
 };
 
 struct kvm_rmap_head {
-	unsigned long val;
+	atomic_long_t val;
 };
 
 struct kvm_pio_request {
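Switching val to atomic_long_t lets tasks that do not hold mmu_lock read and cmpxchg the field with atomic accessors. Because the stored value is a word-aligned pointer (to a single spte or to a struct pte_list_desc), its low bits are always zero and are free to carry the KVM_RMAP_MANY and new KVM_RMAP_LOCKED flags. A hypothetical decode helper, not part of this patch, would simply mask both bits off, per the "& ~3" comment update in mmu.c below:

struct pte_list_desc;

/*
 * Hypothetical helper (illustration only): when KVM_RMAP_MANY is set, masking
 * off the two low flag bits recovers the pte_list_desc pointer, matching the
 * updated comment that (rmap_head->val & ~3) points to the descriptor.
 */
static inline struct pte_list_desc *demo_rmap_val_to_desc(unsigned long rmap_val)
{
	return (struct pte_list_desc *)(rmap_val & ~3ul);
}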

arch/x86/kvm/mmu/mmu.c

Lines changed: 99 additions & 11 deletions
@@ -853,11 +853,98 @@ static struct kvm_memory_slot *gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu
  * About rmap_head encoding:
  *
  * If the bit zero of rmap_head->val is clear, then it points to the only spte
- * in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct
+ * in this rmap chain. Otherwise, (rmap_head->val & ~3) points to a struct
  * pte_list_desc containing more mappings.
  */
 #define KVM_RMAP_MANY	BIT(0)
 
+/*
+ * rmaps and PTE lists are mostly protected by mmu_lock (the shadow MMU always
+ * operates with mmu_lock held for write), but rmaps can be walked without
+ * holding mmu_lock so long as the caller can tolerate SPTEs in the rmap chain
+ * being zapped/dropped _while the rmap is locked_.
+ *
+ * Other than the KVM_RMAP_LOCKED flag, modifications to rmap entries must be
+ * done while holding mmu_lock for write. This allows a task walking rmaps
+ * without holding mmu_lock to concurrently walk the same entries as a task
+ * that is holding mmu_lock but _not_ the rmap lock. Neither task will modify
+ * the rmaps, thus the walks are stable.
+ *
+ * As alluded to above, SPTEs in rmaps are _not_ protected by KVM_RMAP_LOCKED,
+ * only the rmap chains themselves are protected. E.g. holding an rmap's lock
+ * ensures all "struct pte_list_desc" fields are stable.
+ */
+#define KVM_RMAP_LOCKED	BIT(1)
+
+static unsigned long kvm_rmap_lock(struct kvm_rmap_head *rmap_head)
+{
+	unsigned long old_val, new_val;
+
+	lockdep_assert_preemption_disabled();
+
+	/*
+	 * Elide the lock if the rmap is empty, as lockless walkers (read-only
+	 * mode) don't need to (and can't) walk an empty rmap, nor can they add
+	 * entries to the rmap. I.e. the only paths that process empty rmaps
+	 * do so while holding mmu_lock for write, and are mutually exclusive.
+	 */
+	old_val = atomic_long_read(&rmap_head->val);
+	if (!old_val)
+		return 0;
+
+	do {
+		/*
+		 * If the rmap is locked, wait for it to be unlocked before
+		 * trying acquire the lock, e.g. to avoid bouncing the cache
+		 * line.
+		 */
+		while (old_val & KVM_RMAP_LOCKED) {
+			cpu_relax();
+			old_val = atomic_long_read(&rmap_head->val);
+		}
+
+		/*
+		 * Recheck for an empty rmap, it may have been purged by the
+		 * task that held the lock.
+		 */
+		if (!old_val)
+			return 0;
+
+		new_val = old_val | KVM_RMAP_LOCKED;
+	/*
+	 * Use try_cmpxchg_acquire() to prevent reads and writes to the rmap
+	 * from being reordered outside of the critical section created by
+	 * kvm_rmap_lock().
+	 *
+	 * Pairs with the atomic_long_set_release() in kvm_rmap_unlock().
+	 *
+	 * For the !old_val case, no ordering is needed, as there is no rmap
+	 * to walk.
+	 */
+	} while (!atomic_long_try_cmpxchg_acquire(&rmap_head->val, &old_val, new_val));
+
+	/* Return the old value, i.e. _without_ the LOCKED bit set. */
+	return old_val;
+}
+
+static void kvm_rmap_unlock(struct kvm_rmap_head *rmap_head,
+			    unsigned long new_val)
+{
+	WARN_ON_ONCE(new_val & KVM_RMAP_LOCKED);
+	/*
+	 * Ensure that all accesses to the rmap have completed before unlocking
+	 * the rmap.
+	 *
+	 * Pairs with the atomic_long_try_cmpxchg_acquire() in kvm_rmap_lock.
+	 */
+	atomic_long_set_release(&rmap_head->val, new_val);
+}
+
+static unsigned long kvm_rmap_get(struct kvm_rmap_head *rmap_head)
+{
+	return atomic_long_read(&rmap_head->val) & ~KVM_RMAP_LOCKED;
+}
+
 /*
  * Returns the number of pointers in the rmap chain, not counting the new one.
  */
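The remaining hunks convert the existing rmap mutators to bracket their read-modify-write of rmap_head->val with the new helpers. Schematically (not verbatim; the "compute new_val" comment stands in for the pte_list_add()/pte_list_remove()/zap bodies, and the function name is invented for illustration):

static void demo_rmap_mutation_pattern(struct kvm_rmap_head *rmap_head)
{
	unsigned long old_val, new_val;

	old_val = kvm_rmap_lock(rmap_head);	/* was: old_val = rmap_head->val; */

	new_val = old_val;			/* ... compute new_val ... */

	kvm_rmap_unlock(rmap_head, new_val);	/* was: rmap_head->val = new_val; */
}

Pure readers such as pte_list_count() and rmap_get_first() instead use kvm_rmap_get(), which strips KVM_RMAP_LOCKED from the value but takes no lock.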
@@ -868,7 +955,7 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
 	struct pte_list_desc *desc;
 	int count = 0;
 
-	old_val = rmap_head->val;
+	old_val = kvm_rmap_lock(rmap_head);
 
 	if (!old_val) {
 		new_val = (unsigned long)spte;
@@ -900,7 +987,7 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
 		desc->sptes[desc->spte_count++] = spte;
 	}
 
-	rmap_head->val = new_val;
+	kvm_rmap_unlock(rmap_head, new_val);
 
 	return count;
 }
@@ -948,7 +1035,7 @@ static void pte_list_remove(struct kvm *kvm, u64 *spte,
 	unsigned long rmap_val;
 	int i;
 
-	rmap_val = rmap_head->val;
+	rmap_val = kvm_rmap_lock(rmap_head);
 	if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_val, kvm))
 		goto out;
 
@@ -974,7 +1061,7 @@ static void pte_list_remove(struct kvm *kvm, u64 *spte,
 	}
 
 out:
-	rmap_head->val = rmap_val;
+	kvm_rmap_unlock(rmap_head, rmap_val);
 }
 
 static void kvm_zap_one_rmap_spte(struct kvm *kvm,
@@ -992,7 +1079,7 @@ static bool kvm_zap_all_rmap_sptes(struct kvm *kvm,
 	unsigned long rmap_val;
 	int i;
 
-	rmap_val = rmap_head->val;
+	rmap_val = kvm_rmap_lock(rmap_head);
 	if (!rmap_val)
 		return false;
 
@@ -1011,13 +1098,13 @@ static bool kvm_zap_all_rmap_sptes(struct kvm *kvm,
 	}
 out:
 	/* rmap_head is meaningless now, remember to reset it */
-	rmap_head->val = 0;
+	kvm_rmap_unlock(rmap_head, 0);
 	return true;
 }
 
 unsigned int pte_list_count(struct kvm_rmap_head *rmap_head)
 {
-	unsigned long rmap_val = rmap_head->val;
+	unsigned long rmap_val = kvm_rmap_get(rmap_head);
 	struct pte_list_desc *desc;
 
 	if (!rmap_val)
@@ -1083,7 +1170,7 @@ struct rmap_iterator {
 static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
			   struct rmap_iterator *iter)
 {
-	unsigned long rmap_val = rmap_head->val;
+	unsigned long rmap_val = kvm_rmap_get(rmap_head);
 	u64 *sptep;
 
 	if (!rmap_val)
@@ -1418,7 +1505,7 @@ static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
 	while (++iterator->rmap <= iterator->end_rmap) {
 		iterator->gfn += KVM_PAGES_PER_HPAGE(iterator->level);
 
-		if (iterator->rmap->val)
+		if (atomic_long_read(&iterator->rmap->val))
 			return;
 	}
 
@@ -2444,7 +2531,8 @@ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
 			 * avoids retaining a large number of stale nested SPs.
 			 */
 			if (tdp_enabled && invalid_list &&
-			    child->role.guest_mode && !child->parent_ptes.val)
+			    child->role.guest_mode &&
+			    !atomic_long_read(&child->parent_ptes.val))
				return kvm_mmu_prepare_zap_page(kvm, child,
								invalid_list);
 		}
