Commit 41ebae2
Merge tag 'kvm-x86-mmu-6.9' of https://github.com/kvm-x86/linux into HEAD
KVM x86 MMU changes for 6.9:

- Clean up code related to unprotecting shadow pages when retrying a guest
  instruction after failed #PF-induced emulation.

- Zap TDP MMU roots at 4KiB granularity to minimize the delay in yielding if
  a reschedule is needed, e.g. if a high priority task needs to run. Because
  KVM doesn't support yielding in the middle of processing a zapped non-leaf
  SPTE, zapping at 1GiB granularity can result in multi-millisecond lag when
  attempting to schedule in a high priority task.

- Rework TDP MMU root unload, free, and alloc to run with mmu_lock held for
  read, e.g. to avoid serializing vCPUs when userspace deletes a memslot.

- Allocate write-tracking metadata on-demand to avoid the memory overhead when
  running kernels built with KVMGT support (external write-tracking enabled),
  but for workloads that don't use nested virtualization (shadow paging) or
  KVMGT.
2 parents c9cd0be + a364c01 commit 41ebae2
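
The zapping-granularity point in the commit message is worth unpacking. Below
is a minimal user-space sketch of the idea, not KVM code: processing in small
units bounds how long the loop runs before it can check for a pending
reschedule and yield, whereas one coarse unit must run to completion. The
names zap_one_entry and resched_requested are hypothetical stand-ins (for a
TDP MMU SPTE zap and the kernel's need_resched(), respectively).

	#include <sched.h>
	#include <stdatomic.h>
	#include <stddef.h>

	/* Hypothetical stand-in for need_resched(); set from another thread. */
	static atomic_bool resched_requested;

	/* Hypothetical stand-in for zapping a single 4KiB SPTE. */
	static void zap_one_entry(size_t idx)
	{
		(void)idx;
	}

	/*
	 * Zapping entry-by-entry bounds the latency before a yield to roughly
	 * the cost of one entry.  Zapping a whole 1GiB region as a single unit
	 * would delay the yield until the entire region had been processed.
	 */
	static void zap_range(size_t nr_entries)
	{
		for (size_t i = 0; i < nr_entries; i++) {
			zap_one_entry(i);
			if (atomic_load(&resched_requested))
				sched_yield();	/* analogue of cond_resched() */
		}
	}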

File tree

6 files changed: +203 -76 lines changed

arch/x86/include/asm/kvm_host.h

Lines changed: 9 additions & 0 deletions

@@ -1468,6 +1468,15 @@ struct kvm_arch {
 	 */
 	bool shadow_root_allocated;
 
+#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING
+	/*
+	 * If set, the VM has (or had) an external write tracking user, and
+	 * thus all write tracking metadata has been allocated, even if KVM
+	 * itself isn't using write tracking.
+	 */
+	bool external_write_tracking_enabled;
+#endif
+
 #if IS_ENABLED(CONFIG_HYPERV)
 	hpa_t hv_root_tdp;
 	spinlock_t hv_root_tdp_lock;
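
The comment on the new field promises that all write tracking metadata "has
been allocated" whenever the flag is observed set; the page_track.c changes
below enforce that with release/acquire ordering. As a hedged user-space
analogue of the same publication pattern, using C11 atomics in place of the
kernel's smp_store_release()/smp_load_acquire() (the struct and function
names here are invented for illustration):

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stddef.h>

	struct tracking_state {
		void *metadata;		/* stands in for per-memslot tracking arrays */
		atomic_bool enabled;	/* stands in for external_write_tracking_enabled */
	};

	/* Writer: all metadata stores happen-before the flag can be seen true. */
	static void publish_tracking(struct tracking_state *s, void *metadata)
	{
		s->metadata = metadata;
		atomic_store_explicit(&s->enabled, true, memory_order_release);
	}

	/* Reader: if the flag reads true, the metadata is guaranteed visible. */
	static void *use_tracking(struct tracking_state *s)
	{
		if (atomic_load_explicit(&s->enabled, memory_order_acquire))
			return s->metadata;
		return NULL;
	}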

arch/x86/kvm/mmu/mmu.c

Lines changed: 24 additions & 13 deletions

@@ -3575,10 +3575,14 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
 	if (WARN_ON_ONCE(!sp))
 		return;
 
-	if (is_tdp_mmu_page(sp))
+	if (is_tdp_mmu_page(sp)) {
+		lockdep_assert_held_read(&kvm->mmu_lock);
 		kvm_tdp_mmu_put_root(kvm, sp);
-	else if (!--sp->root_count && sp->role.invalid)
-		kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
+	} else {
+		lockdep_assert_held_write(&kvm->mmu_lock);
+		if (!--sp->root_count && sp->role.invalid)
+			kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
+	}
 
 	*root_hpa = INVALID_PAGE;
 }
@@ -3587,6 +3591,7 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
 void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
 			ulong roots_to_free)
 {
+	bool is_tdp_mmu = tdp_mmu_enabled && mmu->root_role.direct;
 	int i;
 	LIST_HEAD(invalid_list);
 	bool free_active_root;
@@ -3609,7 +3614,10 @@ void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
 		return;
 	}
 
-	write_lock(&kvm->mmu_lock);
+	if (is_tdp_mmu)
+		read_lock(&kvm->mmu_lock);
+	else
+		write_lock(&kvm->mmu_lock);
 
 	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
 		if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i))
@@ -3635,8 +3643,13 @@ void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
 		mmu->root.pgd = 0;
 	}
 
-	kvm_mmu_commit_zap_page(kvm, &invalid_list);
-	write_unlock(&kvm->mmu_lock);
+	if (is_tdp_mmu) {
+		read_unlock(&kvm->mmu_lock);
+		WARN_ON_ONCE(!list_empty(&invalid_list));
+	} else {
+		kvm_mmu_commit_zap_page(kvm, &invalid_list);
+		write_unlock(&kvm->mmu_lock);
+	}
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
 
@@ -3693,15 +3706,15 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
 	unsigned i;
 	int r;
 
+	if (tdp_mmu_enabled)
+		return kvm_tdp_mmu_alloc_root(vcpu);
+
 	write_lock(&vcpu->kvm->mmu_lock);
 	r = make_mmu_pages_available(vcpu);
 	if (r < 0)
 		goto out_unlock;
 
-	if (tdp_mmu_enabled) {
-		root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
-		mmu->root.hpa = root;
-	} else if (shadow_root_level >= PT64_ROOT_4LEVEL) {
+	if (shadow_root_level >= PT64_ROOT_4LEVEL) {
 		root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level);
 		mmu->root.hpa = root;
 	} else if (shadow_root_level == PT32E_ROOT_LEVEL) {
@@ -6997,9 +7010,7 @@ int kvm_mmu_vendor_module_init(void)
 
 	kvm_mmu_reset_all_pte_masks();
 
-	pte_list_desc_cache = kmem_cache_create("pte_list_desc",
-						sizeof(struct pte_list_desc),
-						0, SLAB_ACCOUNT, NULL);
+	pte_list_desc_cache = KMEM_CACHE(pte_list_desc, SLAB_ACCOUNT);
 	if (!pte_list_desc_cache)
 		goto out;
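
A condensed view of the locking split above, as a user-space pthread analogue
rather than KVM code: TDP MMU roots are reference-counted, so dropping one is
safe under mmu_lock held for read and vCPUs can do it concurrently, while
shadow roots mutate shared MMU state and still require the exclusive write
lock. The helper names are hypothetical.

	#include <pthread.h>
	#include <stdbool.h>

	static pthread_rwlock_t mmu_lock = PTHREAD_RWLOCK_INITIALIZER;

	/* Hypothetical: refcount-based put, safe under the shared lock. */
	static void put_tdp_root(void)
	{
	}

	/* Hypothetical: shadow-root teardown, mutates shared state. */
	static void zap_shadow_root(void)
	{
	}

	static void free_root(bool is_tdp_mmu)
	{
		if (is_tdp_mmu) {
			/* Shared: many vCPUs may drop TDP roots in parallel. */
			pthread_rwlock_rdlock(&mmu_lock);
			put_tdp_root();
			pthread_rwlock_unlock(&mmu_lock);
		} else {
			/* Exclusive: shadow teardown serializes against everyone. */
			pthread_rwlock_wrlock(&mmu_lock);
			zap_shadow_root();
			pthread_rwlock_unlock(&mmu_lock);
		}
	}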

arch/x86/kvm/mmu/page_track.c

Lines changed: 66 additions & 2 deletions

@@ -20,10 +20,23 @@
 #include "mmu_internal.h"
 #include "page_track.h"
 
+static bool kvm_external_write_tracking_enabled(struct kvm *kvm)
+{
+#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING
+	/*
+	 * Read external_write_tracking_enabled before related pointers. Pairs
+	 * with the smp_store_release in kvm_page_track_write_tracking_enable().
+	 */
+	return smp_load_acquire(&kvm->arch.external_write_tracking_enabled);
+#else
+	return false;
+#endif
+}
+
 bool kvm_page_track_write_tracking_enabled(struct kvm *kvm)
 {
-	return IS_ENABLED(CONFIG_KVM_EXTERNAL_WRITE_TRACKING) ||
-	       !tdp_enabled || kvm_shadow_root_allocated(kvm);
+	return kvm_external_write_tracking_enabled(kvm) ||
+	       kvm_shadow_root_allocated(kvm) || !tdp_enabled;
 }
 
 void kvm_page_track_free_memslot(struct kvm_memory_slot *slot)
@@ -153,6 +166,50 @@ int kvm_page_track_init(struct kvm *kvm)
 	return init_srcu_struct(&head->track_srcu);
 }
 
+static int kvm_enable_external_write_tracking(struct kvm *kvm)
+{
+	struct kvm_memslots *slots;
+	struct kvm_memory_slot *slot;
+	int r = 0, i, bkt;
+
+	mutex_lock(&kvm->slots_arch_lock);
+
+	/*
+	 * Check for *any* write tracking user (not just external users) under
+	 * lock.  This avoids unnecessary work, e.g. if KVM itself is using
+	 * write tracking, or if two external users raced when registering.
+	 */
+	if (kvm_page_track_write_tracking_enabled(kvm))
+		goto out_success;
+
+	for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
+		slots = __kvm_memslots(kvm, i);
+		kvm_for_each_memslot(slot, bkt, slots) {
+			/*
+			 * Intentionally do NOT free allocations on failure to
+			 * avoid having to track which allocations were made
+			 * now versus when the memslot was created.  The
+			 * metadata is guaranteed to be freed when the slot is
+			 * freed, and will be kept/used if userspace retries
+			 * the failed ioctl() instead of killing the VM.
+			 */
+			r = kvm_page_track_write_tracking_alloc(slot);
+			if (r)
+				goto out_unlock;
+		}
+	}
+
+out_success:
+	/*
+	 * Ensure that external_write_tracking_enabled becomes true strictly
+	 * after all the related pointers are set.
+	 */
+	smp_store_release(&kvm->arch.external_write_tracking_enabled, true);
+out_unlock:
+	mutex_unlock(&kvm->slots_arch_lock);
+	return r;
+}
+
 /*
  * register the notifier so that event interception for the tracked guest
  * pages can be received.
@@ -161,10 +218,17 @@ int kvm_page_track_register_notifier(struct kvm *kvm,
 				     struct kvm_page_track_notifier_node *n)
 {
 	struct kvm_page_track_notifier_head *head;
+	int r;
 
 	if (!kvm || kvm->mm != current->mm)
 		return -ESRCH;
 
+	if (!kvm_external_write_tracking_enabled(kvm)) {
+		r = kvm_enable_external_write_tracking(kvm);
+		if (r)
+			return r;
+	}
+
 	kvm_get_kvm(kvm);
 
 	head = &kvm->arch.track_notifier_head;
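
The registration path above has a check, lock, re-check shape: a lockless
acquire-load fast path in kvm_page_track_register_notifier(), then a second
check under slots_arch_lock so racing registrants (or KVM's own internal use
of write tracking) perform only one allocation pass. A hedged user-space
analogue of that shape, with invented names:

	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdbool.h>

	static pthread_mutex_t slots_lock = PTHREAD_MUTEX_INITIALIZER;
	static atomic_bool tracking_enabled;

	/* Hypothetical allocation pass; returns 0 on success like the KVM helper. */
	static int alloc_tracking_metadata(void)
	{
		return 0;
	}

	static int enable_tracking_once(void)
	{
		int r = 0;

		/* Fast path: pairs with the release store below. */
		if (atomic_load_explicit(&tracking_enabled, memory_order_acquire))
			return 0;

		pthread_mutex_lock(&slots_lock);
		/* Re-check under the lock: another registrant may have won the race. */
		if (!atomic_load_explicit(&tracking_enabled, memory_order_acquire))
			r = alloc_tracking_metadata();
		if (!r)
			atomic_store_explicit(&tracking_enabled, true,
					      memory_order_release);
		pthread_mutex_unlock(&slots_lock);
		return r;
	}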
