Skip to content

Commit b146a9b

Browse files
committed
KVM: x86/mmu: Age TDP MMU SPTEs without holding mmu_lock
Walk the TDP MMU in an RCU read-side critical section without holding mmu_lock when harvesting and potentially updating age information on TDP MMU SPTEs. Add a new macro to do RCU-safe walking of TDP MMU roots, and do all SPTE aging with atomic updates; while clobbering Accessed information is ok, KVM must not corrupt other bits, e.g. must not drop a Dirty or Writable bit when making a SPTE young. If updating a SPTE to mark it for access tracking fails, leave it as is and treat it as if it were young. If the SPTE is being actively modified, it is most likely young. Acquire and release mmu_lock for write when harvesting age information from the shadow MMU, as the shadow MMU doesn't yet support aging outside of mmu_lock. Suggested-by: Yu Zhao <yuzhao@google.com> Signed-off-by: James Houghton <jthoughton@google.com> Reviewed-by: David Matlack <dmatlack@google.com> Link: https://lore.kernel.org/r/20250204004038.1680123-5-jthoughton@google.com [sean: massage changelog] Signed-off-by: Sean Christopherson <seanjc@google.com>
1 parent 928c54b commit b146a9b

File tree

4 files changed

+35
-13
lines changed

4 files changed

+35
-13
lines changed

arch/x86/include/asm/kvm_host.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1478,6 +1478,7 @@ struct kvm_arch {
14781478
* tdp_mmu_page set.
14791479
*
14801480
* For reads, this list is protected by:
1481+
* RCU alone or
14811482
* the MMU lock in read mode + RCU or
14821483
* the MMU lock in write mode
14831484
*

arch/x86/kvm/Kconfig

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ config KVM_X86
2222
select KVM_COMMON
2323
select KVM_GENERIC_MMU_NOTIFIER
2424
select KVM_ELIDE_TLB_FLUSH_IF_YOUNG
25+
select KVM_MMU_LOCKLESS_AGING
2526
select HAVE_KVM_IRQCHIP
2627
select HAVE_KVM_PFNCACHE
2728
select HAVE_KVM_DIRTY_RING_TSO

arch/x86/kvm/mmu/mmu.c

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1592,8 +1592,11 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
15921592
{
15931593
bool young = false;
15941594

1595-
if (kvm_memslots_have_rmaps(kvm))
1595+
if (kvm_memslots_have_rmaps(kvm)) {
1596+
write_lock(&kvm->mmu_lock);
15961597
young = kvm_rmap_age_gfn_range(kvm, range, false);
1598+
write_unlock(&kvm->mmu_lock);
1599+
}
15971600

15981601
if (tdp_mmu_enabled)
15991602
young |= kvm_tdp_mmu_age_gfn_range(kvm, range);
@@ -1605,8 +1608,11 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
16051608
{
16061609
bool young = false;
16071610

1608-
if (kvm_memslots_have_rmaps(kvm))
1611+
if (kvm_memslots_have_rmaps(kvm)) {
1612+
write_lock(&kvm->mmu_lock);
16091613
young = kvm_rmap_age_gfn_range(kvm, range, true);
1614+
write_unlock(&kvm->mmu_lock);
1615+
}
16101616

16111617
if (tdp_mmu_enabled)
16121618
young |= kvm_tdp_mmu_test_age_gfn(kvm, range);

arch/x86/kvm/mmu/tdp_mmu.c

Lines changed: 25 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,19 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
193193
!tdp_mmu_root_match((_root), (_types)))) { \
194194
} else
195195

196+
/*
197+
* Iterate over all TDP MMU roots in an RCU read-side critical section.
198+
* It is safe to iterate over the SPTEs under the root, but their values will
199+
* be unstable, so all writes must be atomic. As this routine is meant to be
200+
* used without holding the mmu_lock at all, any bits that are flipped must
201+
* be reflected in kvm_tdp_mmu_spte_need_atomic_write().
202+
*/
203+
#define for_each_tdp_mmu_root_rcu(_kvm, _root, _as_id, _types) \
204+
list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link) \
205+
if ((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) || \
206+
!tdp_mmu_root_match((_root), (_types))) { \
207+
} else
208+
196209
#define for_each_valid_tdp_mmu_root(_kvm, _root, _as_id) \
197210
__for_each_tdp_mmu_root(_kvm, _root, _as_id, KVM_VALID_ROOTS)
198211

@@ -1332,21 +1345,22 @@ bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
13321345
* from the clear_young() or clear_flush_young() notifier, which uses the
13331346
* return value to determine if the page has been accessed.
13341347
*/
1335-
static void kvm_tdp_mmu_age_spte(struct tdp_iter *iter)
1348+
static void kvm_tdp_mmu_age_spte(struct kvm *kvm, struct tdp_iter *iter)
13361349
{
13371350
u64 new_spte;
13381351

13391352
if (spte_ad_enabled(iter->old_spte)) {
1340-
iter->old_spte = tdp_mmu_clear_spte_bits(iter->sptep,
1341-
iter->old_spte,
1342-
shadow_accessed_mask,
1343-
iter->level);
1353+
iter->old_spte = tdp_mmu_clear_spte_bits_atomic(iter->sptep,
1354+
shadow_accessed_mask);
13441355
new_spte = iter->old_spte & ~shadow_accessed_mask;
13451356
} else {
13461357
new_spte = mark_spte_for_access_track(iter->old_spte);
1347-
iter->old_spte = kvm_tdp_mmu_write_spte(iter->sptep,
1348-
iter->old_spte, new_spte,
1349-
iter->level);
1358+
/*
1359+
* It is safe for the following cmpxchg to fail. Leave the
1360+
* Accessed bit set, as the spte is most likely young anyway.
1361+
*/
1362+
if (__tdp_mmu_set_spte_atomic(kvm, iter, new_spte))
1363+
return;
13501364
}
13511365

13521366
trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level,
@@ -1371,9 +1385,9 @@ static bool __kvm_tdp_mmu_age_gfn_range(struct kvm *kvm,
13711385
* valid roots!
13721386
*/
13731387
WARN_ON(types & ~KVM_VALID_ROOTS);
1374-
__for_each_tdp_mmu_root(kvm, root, range->slot->as_id, types) {
1375-
guard(rcu)();
13761388

1389+
guard(rcu)();
1390+
for_each_tdp_mmu_root_rcu(kvm, root, range->slot->as_id, types) {
13771391
tdp_root_for_each_leaf_pte(iter, kvm, root, range->start, range->end) {
13781392
if (!is_accessed_spte(iter.old_spte))
13791393
continue;
@@ -1382,7 +1396,7 @@ static bool __kvm_tdp_mmu_age_gfn_range(struct kvm *kvm,
13821396
return true;
13831397

13841398
ret = true;
1385-
kvm_tdp_mmu_age_spte(&iter);
1399+
kvm_tdp_mmu_age_spte(kvm, &iter);
13861400
}
13871401
}
13881402

0 commit comments

Comments
 (0)