
Commit c59de14

Merge tag 'kvm-x86-mmu-6.13' of https://github.com/kvm-x86/linux into HEAD
KVM x86 MMU changes for 6.13

 - Cleanup KVM's handling of Accessed and Dirty bits to dedup code, improve
   documentation, harden against unexpected changes, and to simplify
   A/D-disabled MMUs by using the hardware-defined A/D bits to track if a
   PFN is Accessed and/or Dirty.

 - Elide TLB flushes when aging SPTEs, as has been done in x86's primary
   MMU for over 10 years.

 - Batch TLB flushes when zapping collapsible TDP MMU SPTEs, i.e. when
   dirty logging is toggled off, which reduces the time it takes to disable
   dirty logging by ~3x.

 - Recover huge pages in-place in the TDP MMU instead of zapping the SP
   and waiting until the page is re-accessed to create a huge mapping.
   Proactively installing huge pages can reduce vCPU jitter in extreme
   scenarios.

 - Remove support for (poorly) reclaiming page tables in shadow MMUs via
   the primary MMU's shrinker interface.
2 parents b39d157 + 4cf20d4 commit c59de14
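The "recover huge pages in-place" item above changes how a huge mapping comes back after dirty logging: instead of zapping the shadow page and waiting for a guest access to fault in a 2M/1G mapping, the TDP MMU now writes the huge SPTE over the existing table. The toy model below only illustrates that idea; it is not KVM code, and the collapsibility test is a stand-in for KVM's real check (kvm_mmu_max_mapping_level() against the host mapping and memslot state).

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PTES_PER_TABLE 512
#define PTE_PRESENT    (1ULL << 0)
#define PTE_HUGE       (1ULL << 7)

/*
 * Toy last-level table: entry i is expected to map pfn base_pfn + i.  This
 * contiguity check stands in for KVM's real max-mapping-level decision.
 */
static bool table_is_collapsible(const uint64_t *pt, uint64_t base_pfn)
{
        for (int i = 0; i < PTES_PER_TABLE; i++) {
                if (!(pt[i] & PTE_PRESENT) || (pt[i] >> 12) != base_pfn + i)
                        return false;
        }
        return true;
}

/*
 * Old behavior (conceptually): zap the parent entry and let the next guest
 * access fault in a huge mapping.  New behavior: write the huge entry over
 * the existing table in place, so no vCPU has to fault on the region.
 */
static void recover_huge_page(uint64_t *parent_entry, const uint64_t *pt,
                              uint64_t base_pfn)
{
        if (table_is_collapsible(pt, base_pfn))
                *parent_entry = (base_pfn << 12) | PTE_HUGE | PTE_PRESENT;
}

int main(void)
{
        static uint64_t pt[PTES_PER_TABLE];
        uint64_t parent = 0, base_pfn = 0x100000;

        for (int i = 0; i < PTES_PER_TABLE; i++)
                pt[i] = ((base_pfn + i) << 12) | PTE_PRESENT;

        recover_huge_page(&parent, pt, base_pfn);
        printf("parent entry: %#llx\n", (unsigned long long)parent);
        return 0;
}

The payoff described in the commit message is that vCPUs no longer take a fault per recovered region, which is where the jitter reduction comes from.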

File tree

11 files changed: +278 -408 lines changed


arch/x86/include/asm/kvm_host.h

Lines changed: 2 additions & 3 deletions
@@ -1306,7 +1306,6 @@ struct kvm_arch {
         bool pre_fault_allowed;
         struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
         struct list_head active_mmu_pages;
-        struct list_head zapped_obsolete_pages;
         /*
          * A list of kvm_mmu_page structs that, if zapped, could possibly be
          * replaced by an NX huge page. A shadow page is on this list if its
@@ -1955,8 +1954,8 @@ void kvm_mmu_try_split_huge_pages(struct kvm *kvm,
                                   const struct kvm_memory_slot *memslot,
                                   u64 start, u64 end,
                                   int target_level);
-void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
-                                   const struct kvm_memory_slot *memslot);
+void kvm_mmu_recover_huge_pages(struct kvm *kvm,
+                                const struct kvm_memory_slot *memslot);
 void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
                                    const struct kvm_memory_slot *memslot);
 void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen);

arch/x86/kvm/Kconfig

Lines changed: 1 addition & 0 deletions
@@ -22,6 +22,7 @@ config KVM_X86
         depends on X86_LOCAL_APIC
         select KVM_COMMON
         select KVM_GENERIC_MMU_NOTIFIER
+        select KVM_ELIDE_TLB_FLUSH_IF_YOUNG
         select HAVE_KVM_IRQCHIP
         select HAVE_KVM_PFNCACHE
         select HAVE_KVM_DIRTY_RING_TSO
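The new select ties into the "elide TLB flushes when aging SPTEs" item from the commit message: with KVM_ELIDE_TLB_FLUSH_IF_YOUNG selected, generic KVM's aging path is expected to clear the young/Accessed state without forcing a remote TLB flush, on the theory that a stale Accessed bit cached in a TLB only risks a harmless false "young" report. The standalone sketch below shows just that gating pattern; the helper names and the compile-time constant standing in for the Kconfig symbol are illustrative assumptions, not the kernel's actual plumbing.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-ins; the real bit layout and Kconfig plumbing differ. */
#define SPTE_ACCESSED            (1ULL << 8)
#define ELIDE_TLB_FLUSH_IF_YOUNG 1   /* stands in for IS_ENABLED(CONFIG_KVM_ELIDE_TLB_FLUSH_IF_YOUNG) */

static bool spte_clear_accessed(uint64_t *sptep)
{
        bool young = *sptep & SPTE_ACCESSED;

        *sptep &= ~SPTE_ACCESSED;
        return young;
}

/*
 * Aging sketch: report whether any SPTE in the range was young, and request
 * a remote TLB flush only when the elide-flush behavior is compiled out.
 */
static bool age_range(uint64_t *sptes, int nr, bool *need_flush)
{
        bool young = false;

        for (int i = 0; i < nr; i++)
                young |= spte_clear_accessed(&sptes[i]);

        *need_flush = young && !ELIDE_TLB_FLUSH_IF_YOUNG;
        return young;
}

int main(void)
{
        uint64_t sptes[2] = { SPTE_ACCESSED, 0 };
        bool flush;

        printf("young=%d flush=%d\n", age_range(sptes, 2, &flush), flush);
        return 0;
}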

arch/x86/kvm/mmu/mmu.c

Lines changed: 34 additions & 167 deletions
@@ -179,7 +179,6 @@ struct kvm_shadow_walk_iterator {
 
 static struct kmem_cache *pte_list_desc_cache;
 struct kmem_cache *mmu_page_header_cache;
-static struct percpu_counter kvm_total_used_mmu_pages;
 
 static void mmu_spte_set(u64 *sptep, u64 spte);
 
@@ -485,11 +484,12 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte)
         __set_spte(sptep, new_spte);
 }
 
-/*
- * Update the SPTE (excluding the PFN), but do not track changes in its
- * accessed/dirty status.
+/* Rules for using mmu_spte_update:
+ * Update the state bits, it means the mapped pfn is not changed.
+ *
+ * Returns true if the TLB needs to be flushed
  */
-static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
+static bool mmu_spte_update(u64 *sptep, u64 new_spte)
 {
         u64 old_spte = *sptep;
 
@@ -498,57 +498,18 @@ static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
 
         if (!is_shadow_present_pte(old_spte)) {
                 mmu_spte_set(sptep, new_spte);
-                return old_spte;
+                return false;
         }
 
         if (!spte_has_volatile_bits(old_spte))
                 __update_clear_spte_fast(sptep, new_spte);
         else
                 old_spte = __update_clear_spte_slow(sptep, new_spte);
 
-        WARN_ON_ONCE(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
-
-        return old_spte;
-}
-
-/* Rules for using mmu_spte_update:
- * Update the state bits, it means the mapped pfn is not changed.
- *
- * Whenever an MMU-writable SPTE is overwritten with a read-only SPTE, remote
- * TLBs must be flushed. Otherwise rmap_write_protect will find a read-only
- * spte, even though the writable spte might be cached on a CPU's TLB.
- *
- * Returns true if the TLB needs to be flushed
- */
-static bool mmu_spte_update(u64 *sptep, u64 new_spte)
-{
-        bool flush = false;
-        u64 old_spte = mmu_spte_update_no_track(sptep, new_spte);
-
-        if (!is_shadow_present_pte(old_spte))
-                return false;
-
-        /*
-         * For the spte updated out of mmu-lock is safe, since
-         * we always atomically update it, see the comments in
-         * spte_has_volatile_bits().
-         */
-        if (is_mmu_writable_spte(old_spte) &&
-            !is_writable_pte(new_spte))
-                flush = true;
-
-        /*
-         * Flush TLB when accessed/dirty states are changed in the page tables,
-         * to guarantee consistency between TLB and page tables.
-         */
-
-        if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte))
-                flush = true;
-
-        if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte))
-                flush = true;
+        WARN_ON_ONCE(!is_shadow_present_pte(old_spte) ||
+                     spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
 
-        return flush;
+        return leaf_spte_change_needs_tlb_flush(old_spte, new_spte);
 }
 
 /*
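With mmu_spte_update_no_track() folded away, mmu_spte_update() delegates the flush decision to leaf_spte_change_needs_tlb_flush(), which lives in spte.h and is not part of this diff. Based on the conditions visible in the removed code, and on the rule that clearing only the Accessed bit no longer forces an immediate flush, the helper plausibly reduces to something like the sketch below; the bit positions are illustrative stand-ins, not the real SPTE layout.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative bit positions; the real masks are defined in spte.h. */
#define SPTE_WRITABLE      (1ULL << 1)
#define SPTE_DIRTY         (1ULL << 9)
#define SPTE_MMU_WRITABLE  (1ULL << 58)

/*
 * Flush if write access is being removed from an MMU-writable SPTE (so
 * rmap_write_protect() never races with a stale writable translation), or
 * if the Dirty bit is being cleared (so dirty logging stays consistent).
 * An Accessed-only transition is deliberately not a flush condition.
 */
static bool leaf_spte_change_needs_tlb_flush(uint64_t old_spte, uint64_t new_spte)
{
        if ((old_spte & SPTE_MMU_WRITABLE) && !(new_spte & SPTE_WRITABLE))
                return true;

        if ((old_spte & SPTE_DIRTY) && !(new_spte & SPTE_DIRTY))
                return true;

        return false;
}

int main(void)
{
        uint64_t old = SPTE_MMU_WRITABLE | SPTE_WRITABLE | SPTE_DIRTY;

        /* Removing write access and the Dirty bit: flush needed. */
        printf("%d\n", leaf_spte_change_needs_tlb_flush(old, 0));
        /* No relevant transition: no flush. */
        printf("%d\n", leaf_spte_change_needs_tlb_flush(old, old));
        return 0;
}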
@@ -1606,8 +1567,13 @@ static bool kvm_rmap_age_gfn_range(struct kvm *kvm,
                                 clear_bit((ffs(shadow_accessed_mask) - 1),
                                           (unsigned long *)sptep);
                         } else {
+                                /*
+                                 * WARN if mmu_spte_update() signals the need
+                                 * for a TLB flush, as Access tracking a SPTE
+                                 * should never trigger an _immediate_ flush.
+                                 */
                                 spte = mark_spte_for_access_track(spte);
-                                mmu_spte_update_no_track(sptep, spte);
+                                WARN_ON_ONCE(mmu_spte_update(sptep, spte));
                         }
                         young = true;
                 }
@@ -1655,27 +1621,15 @@ static void kvm_mmu_check_sptes_at_free(struct kvm_mmu_page *sp)
 #endif
 }
 
-/*
- * This value is the sum of all of the kvm instances's
- * kvm->arch.n_used_mmu_pages values. We need a global,
- * aggregate version in order to make the slab shrinker
- * faster
- */
-static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, long nr)
-{
-        kvm->arch.n_used_mmu_pages += nr;
-        percpu_counter_add(&kvm_total_used_mmu_pages, nr);
-}
-
 static void kvm_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
-        kvm_mod_used_mmu_pages(kvm, +1);
+        kvm->arch.n_used_mmu_pages++;
         kvm_account_pgtable_pages((void *)sp->spt, +1);
 }
 
 static void kvm_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
-        kvm_mod_used_mmu_pages(kvm, -1);
+        kvm->arch.n_used_mmu_pages--;
         kvm_account_pgtable_pages((void *)sp->spt, -1);
 }
 
@@ -3147,13 +3101,12 @@ static int __kvm_mmu_max_mapping_level(struct kvm *kvm,
 }
 
 int kvm_mmu_max_mapping_level(struct kvm *kvm,
-                              const struct kvm_memory_slot *slot, gfn_t gfn,
-                              int max_level)
+                              const struct kvm_memory_slot *slot, gfn_t gfn)
 {
         bool is_private = kvm_slot_can_be_private(slot) &&
                           kvm_mem_is_private(kvm, gfn);
 
-        return __kvm_mmu_max_mapping_level(kvm, slot, gfn, max_level, is_private);
+        return __kvm_mmu_max_mapping_level(kvm, slot, gfn, PG_LEVEL_NUM, is_private);
 }
 
 void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
@@ -3373,7 +3326,7 @@ static bool page_fault_can_be_fast(struct kvm *kvm, struct kvm_page_fault *fault
          * by setting the Writable bit, which can be done out of mmu_lock.
          */
         if (!fault->present)
-                return !kvm_ad_enabled();
+                return !kvm_ad_enabled;
 
         /*
          * Note, instruction fetches and writes are mutually exclusive, ignore
@@ -3508,8 +3461,9 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
                  * uses A/D bits for non-nested MMUs. Thus, if A/D bits are
                  * enabled, the SPTE can't be an access-tracked SPTE.
                  */
-                if (unlikely(!kvm_ad_enabled()) && is_access_track_spte(spte))
-                        new_spte = restore_acc_track_spte(new_spte);
+                if (unlikely(!kvm_ad_enabled) && is_access_track_spte(spte))
+                        new_spte = restore_acc_track_spte(new_spte) |
+                                   shadow_accessed_mask;
 
                 /*
                  * To keep things simple, only SPTEs that are MMU-writable can
@@ -5485,7 +5439,7 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu,
         role.efer_nx = true;
         role.smm = cpu_role.base.smm;
         role.guest_mode = cpu_role.base.guest_mode;
-        role.ad_disabled = !kvm_ad_enabled();
+        role.ad_disabled = !kvm_ad_enabled;
         role.level = kvm_mmu_get_tdp_level(vcpu);
         role.direct = true;
         role.has_4_byte_gpte = false;
@@ -6413,8 +6367,11 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm)
 {
         struct kvm_mmu_page *sp, *node;
         int nr_zapped, batch = 0;
+        LIST_HEAD(invalid_list);
         bool unstable;
 
+        lockdep_assert_held(&kvm->slots_lock);
+
 restart:
         list_for_each_entry_safe_reverse(sp, node,
               &kvm->arch.active_mmu_pages, link) {
@@ -6446,7 +6403,7 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm)
                 }
 
                 unstable = __kvm_mmu_prepare_zap_page(kvm, sp,
-                                &kvm->arch.zapped_obsolete_pages, &nr_zapped);
+                                &invalid_list, &nr_zapped);
                 batch += nr_zapped;
 
                 if (unstable)
@@ -6462,7 +6419,7 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm)
          * kvm_mmu_load()), and the reload in the caller ensure no vCPUs are
          * running with an obsolete MMU.
          */
-        kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);
+        kvm_mmu_commit_zap_page(kvm, &invalid_list);
 }
 
 /*
@@ -6525,16 +6482,10 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
                 kvm_tdp_mmu_zap_invalidated_roots(kvm);
 }
 
-static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
-{
-        return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
-}
-
 void kvm_mmu_init_vm(struct kvm *kvm)
 {
         kvm->arch.shadow_mmio_value = shadow_mmio_value;
         INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
-        INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
         INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages);
         spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
 
@@ -6768,7 +6719,7 @@ static void shadow_mmu_split_huge_page(struct kvm *kvm,
                         continue;
                 }
 
-                spte = make_huge_page_split_spte(kvm, huge_spte, sp->role, index);
+                spte = make_small_spte(kvm, huge_spte, sp->role, index);
                 mmu_spte_set(sptep, spte);
                 __rmap_add(kvm, cache, slot, sptep, gfn, sp->role.access);
         }
@@ -6951,8 +6902,7 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
          * mapping if the indirect sp has level = 1.
          */
         if (sp->role.direct &&
-            sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn,
-                                                       PG_LEVEL_NUM)) {
+            sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn)) {
                 kvm_zap_one_rmap_spte(kvm, rmap_head, sptep);
 
                 if (kvm_available_flush_remote_tlbs_range())
@@ -6980,8 +6930,8 @@ static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm,
                 kvm_flush_remote_tlbs_memslot(kvm, slot);
 }
 
-void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
-                                   const struct kvm_memory_slot *slot)
+void kvm_mmu_recover_huge_pages(struct kvm *kvm,
+                                const struct kvm_memory_slot *slot)
 {
         if (kvm_memslots_have_rmaps(kvm)) {
                 write_lock(&kvm->mmu_lock);
@@ -6991,7 +6941,7 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
 
         if (tdp_mmu_enabled) {
                 read_lock(&kvm->mmu_lock);
-                kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot);
+                kvm_tdp_mmu_recover_huge_pages(kvm, slot);
                 read_unlock(&kvm->mmu_lock);
         }
 }
@@ -7146,72 +7096,6 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
         }
 }
 
-static unsigned long mmu_shrink_scan(struct shrinker *shrink,
-                                     struct shrink_control *sc)
-{
-        struct kvm *kvm;
-        int nr_to_scan = sc->nr_to_scan;
-        unsigned long freed = 0;
-
-        mutex_lock(&kvm_lock);
-
-        list_for_each_entry(kvm, &vm_list, vm_list) {
-                int idx;
-
-                /*
-                 * Never scan more than sc->nr_to_scan VM instances.
-                 * Will not hit this condition practically since we do not try
-                 * to shrink more than one VM and it is very unlikely to see
-                 * !n_used_mmu_pages so many times.
-                 */
-                if (!nr_to_scan--)
-                        break;
-                /*
-                 * n_used_mmu_pages is accessed without holding kvm->mmu_lock
-                 * here. We may skip a VM instance errorneosly, but we do not
-                 * want to shrink a VM that only started to populate its MMU
-                 * anyway.
-                 */
-                if (!kvm->arch.n_used_mmu_pages &&
-                    !kvm_has_zapped_obsolete_pages(kvm))
-                        continue;
-
-                idx = srcu_read_lock(&kvm->srcu);
-                write_lock(&kvm->mmu_lock);
-
-                if (kvm_has_zapped_obsolete_pages(kvm)) {
-                        kvm_mmu_commit_zap_page(kvm,
-                                                &kvm->arch.zapped_obsolete_pages);
-                        goto unlock;
-                }
-
-                freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan);
-
-unlock:
-                write_unlock(&kvm->mmu_lock);
-                srcu_read_unlock(&kvm->srcu, idx);
-
-                /*
-                 * unfair on small ones
-                 * per-vm shrinkers cry out
-                 * sadness comes quickly
-                 */
-                list_move_tail(&kvm->vm_list, &vm_list);
-                break;
-        }
-
-        mutex_unlock(&kvm_lock);
-        return freed;
-}
-
-static unsigned long mmu_shrink_count(struct shrinker *shrink,
-                                      struct shrink_control *sc)
-{
-        return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
-}
-
-static struct shrinker *mmu_shrinker;
-
 static void mmu_destroy_caches(void)
 {
         kmem_cache_destroy(pte_list_desc_cache);
@@ -7338,23 +7222,8 @@ int kvm_mmu_vendor_module_init(void)
         if (!mmu_page_header_cache)
                 goto out;
 
-        if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
-                goto out;
-
-        mmu_shrinker = shrinker_alloc(0, "x86-mmu");
-        if (!mmu_shrinker)
-                goto out_shrinker;
-
-        mmu_shrinker->count_objects = mmu_shrink_count;
-        mmu_shrinker->scan_objects = mmu_shrink_scan;
-        mmu_shrinker->seeks = DEFAULT_SEEKS * 10;
-
-        shrinker_register(mmu_shrinker);
-
         return 0;
 
-out_shrinker:
-        percpu_counter_destroy(&kvm_total_used_mmu_pages);
 out:
         mmu_destroy_caches();
         return ret;
@@ -7371,8 +7240,6 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
 void kvm_mmu_vendor_module_exit(void)
 {
         mmu_destroy_caches();
-        percpu_counter_destroy(&kvm_total_used_mmu_pages);
-        shrinker_free(mmu_shrinker);
 }
 
 /*
