
Commit 1b6c146

Merge tag 'kvm-x86-fixes-6.8-2' of https://github.com/kvm-x86/linux into HEAD
KVM x86 fixes for 6.8, round 2:

 - When emulating an atomic access, mark the gfn as dirty in the memslot to
   fix a bug where KVM could fail to mark the slot as dirty during live
   migration, ultimately resulting in guest data corruption due to a dirty
   page not being re-copied from the source to the target.

 - Check for mmu_notifier invalidation events before faulting in the pfn,
   and before acquiring mmu_lock, to avoid unnecessary work and lock
   contention. Contending mmu_lock is especially problematic on preemptible
   kernels, as KVM may yield mmu_lock in response to the contention, which
   severely degrades overall performance, as the contending vCPUs make it
   difficult for the task that triggered the invalidation to make forward
   progress. Note, due to another kernel bug, this fix isn't limited to
   preemptible kernels, as any kernel built with CONFIG_PREEMPT_DYNAMIC=y
   will yield contended rwlocks and spinlocks.

https://lore.kernel.org/all/20240110214723.695930-1-seanjc@google.com
2 parents: 5ef1d8c + d02c357

3 files changed: 78 additions & 0 deletions

arch/x86/kvm/mmu/mmu.c

Lines changed: 42 additions & 0 deletions
@@ -4405,6 +4405,31 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
 	fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq;
 	smp_rmb();
 
+	/*
+	 * Check for a relevant mmu_notifier invalidation event before getting
+	 * the pfn from the primary MMU, and before acquiring mmu_lock.
+	 *
+	 * For mmu_lock, if there is an in-progress invalidation and the kernel
+	 * allows preemption, the invalidation task may drop mmu_lock and yield
+	 * in response to mmu_lock being contended, which is *very* counter-
+	 * productive as this vCPU can't actually make forward progress until
+	 * the invalidation completes.
+	 *
+	 * Retrying now can also avoid unnecessary lock contention in the primary
+	 * MMU, as the primary MMU doesn't necessarily hold a single lock for
+	 * the duration of the invalidation, i.e. faulting in a conflicting pfn
+	 * can cause the invalidation to take longer by holding locks that are
+	 * needed to complete the invalidation.
+	 *
+	 * Do the pre-check even for non-preemptible kernels, i.e. even if KVM
+	 * will never yield mmu_lock in response to contention, as this vCPU is
+	 * *guaranteed* to need to retry, i.e. waiting until mmu_lock is held
+	 * to detect retry guarantees the worst case latency for the vCPU.
+	 */
+	if (fault->slot &&
+	    mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn))
+		return RET_PF_RETRY;
+
 	ret = __kvm_faultin_pfn(vcpu, fault);
 	if (ret != RET_PF_CONTINUE)
 		return ret;

@@ -4415,6 +4440,18 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
 	if (unlikely(!fault->slot))
 		return kvm_handle_noslot_fault(vcpu, fault, access);
 
+	/*
+	 * Check again for a relevant mmu_notifier invalidation event purely to
+	 * avoid contending mmu_lock. Most invalidations will be detected by
+	 * the previous check, but checking is extremely cheap relative to the
+	 * overall cost of failing to detect the invalidation until after
+	 * mmu_lock is acquired.
+	 */
+	if (mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn)) {
+		kvm_release_pfn_clean(fault->pfn);
+		return RET_PF_RETRY;
+	}
+
 	return RET_PF_CONTINUE;
 }
 

@@ -4442,6 +4479,11 @@ static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
 	if (!sp && kvm_test_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu))
 		return true;
 
+	/*
+	 * Check for a relevant mmu_notifier invalidation event one last time
+	 * now that mmu_lock is held, as the "unsafe" checks performed without
+	 * holding mmu_lock can get false negatives.
+	 */
 	return fault->slot &&
 	       mmu_invalidate_retry_gfn(vcpu->kvm, fault->mmu_seq, fault->gfn);
 }
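
Taken together, the three mmu.c hunks form a layered retry protocol: two lockless "unsafe" pre-checks that can only bail out early, backed by the authoritative check performed under mmu_lock. The sketch below condenses that flow into one place; it is illustrative only, not the actual kvm_faultin_pfn()/is_page_fault_stale() code, and fault_in_pfn_from_primary_mmu() and install_mapping() are hypothetical stand-ins for the real pfn-faulting and mapping logic.

/* Illustrative sketch only; "hypothetical" helpers are not real KVM functions. */
static int sketch_fault_path(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
	/* Snapshot the invalidation sequence count before any lockless check;
	 * smp_rmb() pairs with the invalidation side's updates. */
	fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq;
	smp_rmb();

	/* Lockless pre-check: skip faulting in the pfn entirely if a relevant
	 * invalidation is already in flight (cheap, may miss some events). */
	if (fault->slot &&
	    mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn))
		return RET_PF_RETRY;

	if (fault_in_pfn_from_primary_mmu(vcpu, fault))		/* hypothetical */
		return RET_PF_RETRY;

	/* Lockless re-check: drop the pfn and retry rather than contend
	 * mmu_lock if an invalidation raced with the pfn lookup. */
	if (mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn)) {
		kvm_release_pfn_clean(fault->pfn);
		return RET_PF_RETRY;
	}

	/* Authoritative check: only the locked variant may conclude that the
	 * fault is NOT stale, as the unsafe checks can get false negatives. */
	write_lock(&vcpu->kvm->mmu_lock);
	if (mmu_invalidate_retry_gfn(vcpu->kvm, fault->mmu_seq, fault->gfn)) {
		write_unlock(&vcpu->kvm->mmu_lock);
		kvm_release_pfn_clean(fault->pfn);
		return RET_PF_RETRY;
	}
	install_mapping(vcpu, fault);				/* hypothetical */
	write_unlock(&vcpu->kvm->mmu_lock);
	return RET_PF_CONTINUE;
}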

arch/x86/kvm/x86.c

Lines changed: 10 additions & 0 deletions
@@ -8007,6 +8007,16 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
 
 	if (r < 0)
 		return X86EMUL_UNHANDLEABLE;
+
+	/*
+	 * Mark the page dirty _before_ checking whether or not the CMPXCHG was
+	 * successful, as the old value is written back on failure. Note, for
+	 * live migration, this is unnecessarily conservative as CMPXCHG writes
+	 * back the original value and the access is atomic, but KVM's ABI is
+	 * that all writes are dirty logged, regardless of the value written.
+	 */
+	kvm_vcpu_mark_page_dirty(vcpu, gpa_to_gfn(gpa));
+
 	if (r)
 		return X86EMUL_CMPXCHG_FAILED;
 
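
The ordering is the crux of the fix: an emulated CMPXCHG writes to guest memory even when the compare fails (the old value is written back), so the page must be dirtied on every path that reaches the memory access, not only on success. Below is a minimal sketch of that ordering, assuming a hypothetical emulate_atomic_cmpxchg_on_hva() helper in place of KVM's actual mapping and cmpxchg plumbing.

/* Sketch of the dirty-logging ordering only, not the real emulator code. */
static int sketch_cmpxchg_emulated(struct kvm_vcpu *vcpu, gpa_t gpa,
				   void *old, void *new, unsigned int bytes)
{
	int r;

	/*
	 * Hypothetical helper: map the gpa and attempt the atomic CMPXCHG.
	 * Returns < 0 if the access can't be handled, 0 on success, and > 0
	 * if the compare failed (in which case the old value was written).
	 */
	r = emulate_atomic_cmpxchg_on_hva(vcpu, gpa, old, new, bytes);
	if (r < 0)
		return X86EMUL_UNHANDLEABLE;

	/*
	 * Dirty the page before looking at the compare result: KVM's ABI is
	 * that every emulated write is dirty logged, and a failed CMPXCHG
	 * still wrote (the old value) to guest memory.
	 */
	kvm_vcpu_mark_page_dirty(vcpu, gpa_to_gfn(gpa));

	return r ? X86EMUL_CMPXCHG_FAILED : X86EMUL_CONTINUE;
}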

include/linux/kvm_host.h

Lines changed: 26 additions & 0 deletions
@@ -2031,6 +2031,32 @@ static inline int mmu_invalidate_retry_gfn(struct kvm *kvm,
 		return 1;
 	return 0;
 }
+
+/*
+ * This lockless version of the range-based retry check *must* be paired with a
+ * call to the locked version after acquiring mmu_lock, i.e. this is safe to
+ * use only as a pre-check to avoid contending mmu_lock. This version *will*
+ * get false negatives and false positives.
+ */
+static inline bool mmu_invalidate_retry_gfn_unsafe(struct kvm *kvm,
+						   unsigned long mmu_seq,
+						   gfn_t gfn)
+{
+	/*
+	 * Use READ_ONCE() to ensure the in-progress flag and sequence counter
+	 * are always read from memory, e.g. so that checking for retry in a
+	 * loop won't result in an infinite retry loop. Don't force loads for
+	 * start+end, as the key to avoiding infinite retry loops is observing
+	 * the 1=>0 transition of in-progress, i.e. getting false negatives
+	 * due to stale start+end values is acceptable.
+	 */
+	if (unlikely(READ_ONCE(kvm->mmu_invalidate_in_progress)) &&
+	    gfn >= kvm->mmu_invalidate_range_start &&
+	    gfn < kvm->mmu_invalidate_range_end)
+		return true;
+
+	return READ_ONCE(kvm->mmu_invalidate_seq) != mmu_seq;
+}
 #endif
 
 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
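
For context on why the READ_ONCE() placement is sufficient, the invalidation side bumps the in-progress count before touching any mappings and bumps the sequence count before dropping in-progress again, so a reader that observes the 1=>0 transition of in-progress is guaranteed to also observe a new sequence count. The sketch below shows only that field ordering; it is a simplified model of kvm_mmu_invalidate_begin()/kvm_mmu_invalidate_end(), which in the real code also run under mmu_lock and track a union of invalidation ranges.

/* Simplified writer-side sketch; not the real invalidation code. */
static void sketch_invalidate_begin(struct kvm *kvm, gfn_t start, gfn_t end)
{
	kvm->mmu_invalidate_in_progress++;	/* readers of [start, end) must retry */
	kvm->mmu_invalidate_range_start = start;
	kvm->mmu_invalidate_range_end = end;
}

static void sketch_invalidate_end(struct kvm *kvm)
{
	/*
	 * Bump the sequence count before clearing in-progress so that a
	 * reader that sees the 1=>0 transition of in-progress cannot also
	 * see a stale sequence count, i.e. cannot miss the invalidation.
	 */
	kvm->mmu_invalidate_seq++;
	kvm->mmu_invalidate_in_progress--;
}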
