Commit af3b6a9

KVM: x86/mmu: Walk rmaps (shadow MMU) without holding mmu_lock when aging gfns
Convert the shadow MMU to use per-rmap locking instead of the per-VM mmu_lock to protect rmaps when aging SPTEs. When A/D bits are enabled, it is safe to simply clear the Accessed bits, i.e. KVM just needs to ensure the parent page table isn't freed.

The less obvious case is marking SPTEs for access tracking in the non-A/D case (for EPT only). Because aging a gfn means making the SPTE not-present, KVM needs to play nice with the case where the CPU has TLB entries for a SPTE that is not-present in memory. For example, when doing dirty tracking, if KVM encounters a non-present shadow accessed SPTE, KVM must know to do a TLB invalidation.

Fortunately, KVM already provides (and relies upon) the necessary functionality. E.g. KVM doesn't flush TLBs when aging pages (even in the clear_flush_young() case), and when harvesting dirty bitmaps, KVM flushes based on the dirty bitmaps, not on SPTEs.

Co-developed-by: James Houghton <jthoughton@google.com>
Signed-off-by: James Houghton <jthoughton@google.com>
Link: https://lore.kernel.org/r/20250204004038.1680123-12-jthoughton@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
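To make the aging decision above concrete, here is a minimal, hypothetical user-space sketch of the two paths taken per young SPTE: with A/D bits, atomically clear the Accessed bit in place; without A/D bits, attempt a single cmpxchg to the access-tracked (not-present) encoding and treat a lost race as "leave the page young". The bit layout, helper names, and C11 atomics below are illustrative stand-ins, not KVM's real SPTE code; the actual logic is in the kvm_rmap_age_gfn_range() hunk further down.

/*
 * Hypothetical sketch only: user-space C, not kernel code. SPTE_ACCESSED_BIT,
 * SPTE_PRESENT_BITS, and make_access_tracked() are stand-ins for
 * shadow_accessed_mask, the EPT R/W/X bits, and mark_spte_for_access_track().
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define SPTE_ACCESSED_BIT	(1ull << 8)	/* stand-in for shadow_accessed_mask */
#define SPTE_PRESENT_BITS	0x7ull		/* stand-in for the EPT R/W/X bits */

static uint64_t make_access_tracked(uint64_t spte)
{
	/*
	 * Make the SPTE non-present and clear Accessed; the real helper also
	 * stashes the original protection bits so a later fast page fault can
	 * restore them.
	 */
	return spte & ~(SPTE_PRESENT_BITS | SPTE_ACCESSED_BIT);
}

/* Returns true if the SPTE was young (Accessed) when examined. */
static bool age_spte(_Atomic uint64_t *sptep, bool ad_enabled)
{
	uint64_t old = atomic_load(sptep);

	if (!(old & SPTE_ACCESSED_BIT))
		return false;

	if (ad_enabled) {
		/* Safe under the per-rmap lock: just drop the Accessed bit. */
		atomic_fetch_and(sptep, ~SPTE_ACCESSED_BIT);
	} else {
		/*
		 * One attempt, no retry and no TLB flush: if the SPTE changed
		 * concurrently, whoever changed it wins and the page simply
		 * stays young, mirroring the cmpxchg64() in the patch.
		 */
		atomic_compare_exchange_strong(sptep, &old, make_access_tracked(old));
	}

	return true;
}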
arch/x86/kvm/mmu/mmu.c

Lines changed: 39 additions & 33 deletions
@@ -971,7 +971,6 @@ static unsigned long kvm_rmap_get(struct kvm_rmap_head *rmap_head)
  * actual locking is the same, but the caller is disallowed from modifying the
  * rmap, and so the unlock flow is a nop if the rmap is/was empty.
  */
-__maybe_unused
 static unsigned long kvm_rmap_lock_readonly(struct kvm_rmap_head *rmap_head)
 {
 	unsigned long rmap_val;
@@ -985,7 +984,6 @@ static unsigned long kvm_rmap_lock_readonly(struct kvm_rmap_head *rmap_head)
 	return rmap_val;
 }
 
-__maybe_unused
 static void kvm_rmap_unlock_readonly(struct kvm_rmap_head *rmap_head,
 				     unsigned long old_val)
 {
@@ -1706,37 +1704,48 @@ static void rmap_add(struct kvm_vcpu *vcpu, const struct kvm_memory_slot *slot,
 }
 
 static bool kvm_rmap_age_gfn_range(struct kvm *kvm,
-				   struct kvm_gfn_range *range, bool test_only)
+				   struct kvm_gfn_range *range,
+				   bool test_only)
 {
-	struct slot_rmap_walk_iterator iterator;
+	struct kvm_rmap_head *rmap_head;
 	struct rmap_iterator iter;
+	unsigned long rmap_val;
 	bool young = false;
 	u64 *sptep;
+	gfn_t gfn;
+	int level;
+	u64 spte;
 
-	for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
-				 range->start, range->end - 1, &iterator) {
-		for_each_rmap_spte(iterator.rmap, &iter, sptep) {
-			u64 spte = *sptep;
+	for (level = PG_LEVEL_4K; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) {
+		for (gfn = range->start; gfn < range->end;
+		     gfn += KVM_PAGES_PER_HPAGE(level)) {
+			rmap_head = gfn_to_rmap(gfn, level, range->slot);
+			rmap_val = kvm_rmap_lock_readonly(rmap_head);
 
-			if (!is_accessed_spte(spte))
-				continue;
+			for_each_rmap_spte_lockless(rmap_head, &iter, sptep, spte) {
+				if (!is_accessed_spte(spte))
+					continue;
+
+				if (test_only) {
+					kvm_rmap_unlock_readonly(rmap_head, rmap_val);
+					return true;
+				}
 
-			if (test_only)
-				return true;
-
-			if (spte_ad_enabled(spte)) {
-				clear_bit((ffs(shadow_accessed_mask) - 1),
-					  (unsigned long *)sptep);
-			} else {
-				/*
-				 * WARN if mmu_spte_update() signals the need
-				 * for a TLB flush, as Access tracking a SPTE
-				 * should never trigger an _immediate_ flush.
-				 */
-				spte = mark_spte_for_access_track(spte);
-				WARN_ON_ONCE(mmu_spte_update(sptep, spte));
+				if (spte_ad_enabled(spte))
+					clear_bit((ffs(shadow_accessed_mask) - 1),
+						  (unsigned long *)sptep);
+				else
+					/*
+					 * If the following cmpxchg fails, the
+					 * spte is being concurrently modified
+					 * and should most likely stay young.
+					 */
+					cmpxchg64(sptep, spte,
+						  mark_spte_for_access_track(spte));
+				young = true;
 			}
-			young = true;
+
+			kvm_rmap_unlock_readonly(rmap_head, rmap_val);
 		}
 	}
 	return young;
@@ -1754,11 +1763,8 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 	if (tdp_mmu_enabled)
 		young = kvm_tdp_mmu_age_gfn_range(kvm, range);
 
-	if (kvm_may_have_shadow_mmu_sptes(kvm)) {
-		write_lock(&kvm->mmu_lock);
+	if (kvm_may_have_shadow_mmu_sptes(kvm))
 		young |= kvm_rmap_age_gfn_range(kvm, range, false);
-		write_unlock(&kvm->mmu_lock);
-	}
 
 	return young;
 }
@@ -1770,11 +1776,11 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 	if (tdp_mmu_enabled)
 		young = kvm_tdp_mmu_test_age_gfn(kvm, range);
 
-	if (!young && kvm_may_have_shadow_mmu_sptes(kvm)) {
-		write_lock(&kvm->mmu_lock);
+	if (young)
+		return young;
+
+	if (kvm_may_have_shadow_mmu_sptes(kvm))
 		young |= kvm_rmap_age_gfn_range(kvm, range, true);
-		write_unlock(&kvm->mmu_lock);
-	}
 
 	return young;
 }
