@@ -179,7 +179,6 @@ struct kvm_shadow_walk_iterator {
 
 static struct kmem_cache *pte_list_desc_cache;
 struct kmem_cache *mmu_page_header_cache;
-static struct percpu_counter kvm_total_used_mmu_pages;
 
 static void mmu_spte_set(u64 *sptep, u64 spte);
 
@@ -485,11 +484,12 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte)
 	__set_spte(sptep, new_spte);
 }
 
-/*
- * Update the SPTE (excluding the PFN), but do not track changes in its
- * accessed/dirty status.
+/* Rules for using mmu_spte_update:
+ * Update the state bits, it means the mapped pfn is not changed.
+ *
+ * Returns true if the TLB needs to be flushed
  */
-static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
+static bool mmu_spte_update(u64 *sptep, u64 new_spte)
 {
 	u64 old_spte = *sptep;
 
@@ -498,57 +498,18 @@ static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
 
 	if (!is_shadow_present_pte(old_spte)) {
 		mmu_spte_set(sptep, new_spte);
-		return old_spte;
+		return false;
 	}
 
 	if (!spte_has_volatile_bits(old_spte))
 		__update_clear_spte_fast(sptep, new_spte);
 	else
 		old_spte = __update_clear_spte_slow(sptep, new_spte);
 
-	WARN_ON_ONCE(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
-
-	return old_spte;
-}
-
-/* Rules for using mmu_spte_update:
- * Update the state bits, it means the mapped pfn is not changed.
- *
- * Whenever an MMU-writable SPTE is overwritten with a read-only SPTE, remote
- * TLBs must be flushed. Otherwise rmap_write_protect will find a read-only
- * spte, even though the writable spte might be cached on a CPU's TLB.
- *
- * Returns true if the TLB needs to be flushed
- */
-static bool mmu_spte_update(u64 *sptep, u64 new_spte)
-{
-	bool flush = false;
-	u64 old_spte = mmu_spte_update_no_track(sptep, new_spte);
-
-	if (!is_shadow_present_pte(old_spte))
-		return false;
-
-	/*
-	 * For the spte updated out of mmu-lock is safe, since
-	 * we always atomically update it, see the comments in
-	 * spte_has_volatile_bits().
-	 */
-	if (is_mmu_writable_spte(old_spte) &&
-	    !is_writable_pte(new_spte))
-		flush = true;
-
-	/*
-	 * Flush TLB when accessed/dirty states are changed in the page tables,
-	 * to guarantee consistency between TLB and page tables.
-	 */
-
-	if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte))
-		flush = true;
-
-	if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte))
-		flush = true;
+	WARN_ON_ONCE(!is_shadow_present_pte(old_spte) ||
+		     spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
 
-	return flush;
+	return leaf_spte_change_needs_tlb_flush(old_spte, new_spte);
 }
 
 /*
@@ -1606,8 +1567,13 @@ static bool kvm_rmap_age_gfn_range(struct kvm *kvm,
 				clear_bit((ffs(shadow_accessed_mask) - 1),
 					  (unsigned long *)sptep);
 			} else {
+				/*
+				 * WARN if mmu_spte_update() signals the need
+				 * for a TLB flush, as Access tracking a SPTE
+				 * should never trigger an _immediate_ flush.
+				 */
 				spte = mark_spte_for_access_track(spte);
-				mmu_spte_update_no_track(sptep, spte);
+				WARN_ON_ONCE(mmu_spte_update(sptep, spte));
 			}
 			young = true;
 		}
@@ -1655,27 +1621,15 @@ static void kvm_mmu_check_sptes_at_free(struct kvm_mmu_page *sp)
 #endif
 }
 
-/*
- * This value is the sum of all of the kvm instances's
- * kvm->arch.n_used_mmu_pages values.  We need a global,
- * aggregate version in order to make the slab shrinker
- * faster
- */
-static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, long nr)
-{
-	kvm->arch.n_used_mmu_pages += nr;
-	percpu_counter_add(&kvm_total_used_mmu_pages, nr);
-}
-
 static void kvm_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
-	kvm_mod_used_mmu_pages(kvm, +1);
+	kvm->arch.n_used_mmu_pages++;
 	kvm_account_pgtable_pages((void *)sp->spt, +1);
 }
 
 static void kvm_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
-	kvm_mod_used_mmu_pages(kvm, -1);
+	kvm->arch.n_used_mmu_pages--;
 	kvm_account_pgtable_pages((void *)sp->spt, -1);
 }
 
@@ -3147,13 +3101,12 @@ static int __kvm_mmu_max_mapping_level(struct kvm *kvm,
 }
 
 int kvm_mmu_max_mapping_level(struct kvm *kvm,
-			      const struct kvm_memory_slot *slot, gfn_t gfn,
-			      int max_level)
+			      const struct kvm_memory_slot *slot, gfn_t gfn)
 {
 	bool is_private = kvm_slot_can_be_private(slot) &&
 			  kvm_mem_is_private(kvm, gfn);
 
-	return __kvm_mmu_max_mapping_level(kvm, slot, gfn, max_level, is_private);
+	return __kvm_mmu_max_mapping_level(kvm, slot, gfn, PG_LEVEL_NUM, is_private);
 }
 
 void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
@@ -3373,7 +3326,7 @@ static bool page_fault_can_be_fast(struct kvm *kvm, struct kvm_page_fault *fault
 	 * by setting the Writable bit, which can be done out of mmu_lock.
 	 */
 	if (!fault->present)
-		return !kvm_ad_enabled();
+		return !kvm_ad_enabled;
 
 	/*
 	 * Note, instruction fetches and writes are mutually exclusive, ignore
@@ -3508,8 +3461,9 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 	 * uses A/D bits for non-nested MMUs.  Thus, if A/D bits are
 	 * enabled, the SPTE can't be an access-tracked SPTE.
 	 */
-	if (unlikely(!kvm_ad_enabled()) && is_access_track_spte(spte))
-		new_spte = restore_acc_track_spte(new_spte);
+	if (unlikely(!kvm_ad_enabled) && is_access_track_spte(spte))
+		new_spte = restore_acc_track_spte(new_spte) |
+			   shadow_accessed_mask;
 
 	/*
 	 * To keep things simple, only SPTEs that are MMU-writable can
@@ -5485,7 +5439,7 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu,
 	role.efer_nx = true;
 	role.smm = cpu_role.base.smm;
 	role.guest_mode = cpu_role.base.guest_mode;
-	role.ad_disabled = !kvm_ad_enabled();
+	role.ad_disabled = !kvm_ad_enabled;
 	role.level = kvm_mmu_get_tdp_level(vcpu);
 	role.direct = true;
 	role.has_4_byte_gpte = false;
@@ -6413,8 +6367,11 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm)
 {
 	struct kvm_mmu_page *sp, *node;
 	int nr_zapped, batch = 0;
+	LIST_HEAD(invalid_list);
 	bool unstable;
 
+	lockdep_assert_held(&kvm->slots_lock);
+
 restart:
 	list_for_each_entry_safe_reverse(sp, node,
 	      &kvm->arch.active_mmu_pages, link) {
@@ -6446,7 +6403,7 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm)
 		}
 
 		unstable = __kvm_mmu_prepare_zap_page(kvm, sp,
-				&kvm->arch.zapped_obsolete_pages, &nr_zapped);
+				&invalid_list, &nr_zapped);
 		batch += nr_zapped;
 
 		if (unstable)
@@ -6462,7 +6419,7 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm)
 	 * kvm_mmu_load()), and the reload in the caller ensure no vCPUs are
 	 * running with an obsolete MMU.
 	 */
-	kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);
+	kvm_mmu_commit_zap_page(kvm, &invalid_list);
 }
 
 /*
@@ -6525,16 +6482,10 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
 		kvm_tdp_mmu_zap_invalidated_roots(kvm);
 }
 
-static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
-{
-	return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
-}
-
 void kvm_mmu_init_vm(struct kvm *kvm)
 {
 	kvm->arch.shadow_mmio_value = shadow_mmio_value;
 	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
-	INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
 	INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages);
 	spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
 
@@ -6768,7 +6719,7 @@ static void shadow_mmu_split_huge_page(struct kvm *kvm,
 			continue;
 		}
 
-		spte = make_huge_page_split_spte(kvm, huge_spte, sp->role, index);
+		spte = make_small_spte(kvm, huge_spte, sp->role, index);
 		mmu_spte_set(sptep, spte);
 		__rmap_add(kvm, cache, slot, sptep, gfn, sp->role.access);
 	}
@@ -6951,8 +6902,7 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
 		 * mapping if the indirect sp has level = 1.
 		 */
 		if (sp->role.direct &&
-		    sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn,
-							       PG_LEVEL_NUM)) {
+		    sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn)) {
 			kvm_zap_one_rmap_spte(kvm, rmap_head, sptep);
 
 			if (kvm_available_flush_remote_tlbs_range())
@@ -6980,8 +6930,8 @@ static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm,
 		kvm_flush_remote_tlbs_memslot(kvm, slot);
 }
 
-void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
-				   const struct kvm_memory_slot *slot)
+void kvm_mmu_recover_huge_pages(struct kvm *kvm,
+				const struct kvm_memory_slot *slot)
 {
 	if (kvm_memslots_have_rmaps(kvm)) {
 		write_lock(&kvm->mmu_lock);
@@ -6991,7 +6941,7 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
 
 	if (tdp_mmu_enabled) {
 		read_lock(&kvm->mmu_lock);
-		kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot);
+		kvm_tdp_mmu_recover_huge_pages(kvm, slot);
 		read_unlock(&kvm->mmu_lock);
 	}
 }
@@ -7146,72 +7096,6 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
 	}
 }
 
-static unsigned long mmu_shrink_scan(struct shrinker *shrink,
-				     struct shrink_control *sc)
-{
-	struct kvm *kvm;
-	int nr_to_scan = sc->nr_to_scan;
-	unsigned long freed = 0;
-
-	mutex_lock(&kvm_lock);
-
-	list_for_each_entry(kvm, &vm_list, vm_list) {
-		int idx;
-
-		/*
-		 * Never scan more than sc->nr_to_scan VM instances.
-		 * Will not hit this condition practically since we do not try
-		 * to shrink more than one VM and it is very unlikely to see
-		 * !n_used_mmu_pages so many times.
-		 */
-		if (!nr_to_scan--)
-			break;
-		/*
-		 * n_used_mmu_pages is accessed without holding kvm->mmu_lock
-		 * here. We may skip a VM instance errorneosly, but we do not
-		 * want to shrink a VM that only started to populate its MMU
-		 * anyway.
-		 */
-		if (!kvm->arch.n_used_mmu_pages &&
-		    !kvm_has_zapped_obsolete_pages(kvm))
-			continue;
-
-		idx = srcu_read_lock(&kvm->srcu);
-		write_lock(&kvm->mmu_lock);
-
-		if (kvm_has_zapped_obsolete_pages(kvm)) {
-			kvm_mmu_commit_zap_page(kvm,
-			      &kvm->arch.zapped_obsolete_pages);
-			goto unlock;
-		}
-
-		freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan);
-
-unlock:
-		write_unlock(&kvm->mmu_lock);
-		srcu_read_unlock(&kvm->srcu, idx);
-
-		/*
-		 * unfair on small ones
-		 * per-vm shrinkers cry out
-		 * sadness comes quickly
-		 */
-		list_move_tail(&kvm->vm_list, &vm_list);
-		break;
-	}
-
-	mutex_unlock(&kvm_lock);
-	return freed;
-}
-
-static unsigned long mmu_shrink_count(struct shrinker *shrink,
-				      struct shrink_control *sc)
-{
-	return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
-}
-
-static struct shrinker *mmu_shrinker;
-
 static void mmu_destroy_caches(void)
 {
 	kmem_cache_destroy(pte_list_desc_cache);
@@ -7338,23 +7222,8 @@ int kvm_mmu_vendor_module_init(void)
 	if (!mmu_page_header_cache)
 		goto out;
 
-	if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
-		goto out;
-
-	mmu_shrinker = shrinker_alloc(0, "x86-mmu");
-	if (!mmu_shrinker)
-		goto out_shrinker;
-
-	mmu_shrinker->count_objects = mmu_shrink_count;
-	mmu_shrinker->scan_objects = mmu_shrink_scan;
-	mmu_shrinker->seeks = DEFAULT_SEEKS * 10;
-
-	shrinker_register(mmu_shrinker);
-
 	return 0;
 
-out_shrinker:
-	percpu_counter_destroy(&kvm_total_used_mmu_pages);
 out:
 	mmu_destroy_caches();
 	return ret;
@@ -7371,8 +7240,6 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
 void kvm_mmu_vendor_module_exit(void)
 {
 	mmu_destroy_caches();
-	percpu_counter_destroy(&kvm_total_used_mmu_pages);
-	shrinker_free(mmu_shrinker);
 }
 
 /*