
Commit 636b528

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull more kvm updates from Paolo Bonzini:

 "Generic:

   - selftest compilation fix for non-x86
   - KVM: avoid warning on s390 in mark_page_dirty

  x86:

   - fix page write-protection bug and improve comments
   - use binary search to lookup the PMU event filter, add test
   - enable_pmu module parameter support for Intel CPUs
   - switch blocked_vcpu_on_cpu_lock to raw spinlock
   - cleanups of blocked vCPU logic
   - partially allow KVM_SET_CPUID{,2} after KVM_RUN (5.16 regression)
   - various small fixes"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (46 commits)
  docs: kvm: fix WARNINGs from api.rst
  selftests: kvm/x86: Fix the warning in lib/x86_64/processor.c
  selftests: kvm/x86: Fix the warning in pmu_event_filter_test.c
  kvm: selftests: Do not indent with spaces
  kvm: selftests: sync uapi/linux/kvm.h with Linux header
  selftests: kvm: add amx_test to .gitignore
  KVM: SVM: Nullify vcpu_(un)blocking() hooks if AVIC is disabled
  KVM: SVM: Move svm_hardware_setup() and its helpers below svm_x86_ops
  KVM: SVM: Drop AVIC's intermediate avic_set_running() helper
  KVM: VMX: Don't do full kick when handling posted interrupt wakeup
  KVM: VMX: Fold fallback path into triggering posted IRQ helper
  KVM: VMX: Pass desired vector instead of bool for triggering posted IRQ
  KVM: VMX: Don't do full kick when triggering posted interrupt "fails"
  KVM: SVM: Skip AVIC and IRTE updates when loading blocking vCPU
  KVM: SVM: Use kvm_vcpu_is_blocking() in AVIC load to handle preemption
  KVM: SVM: Remove unnecessary APICv/AVIC update in vCPU unblocking path
  KVM: SVM: Don't bother checking for "running" AVIC when kicking for IPIs
  KVM: SVM: Signal AVIC doorbell iff vCPU is in guest mode
  KVM: x86: Remove defunct pre_block/post_block kvm_x86_ops hooks
  KVM: x86: Unexport LAPIC's switch_to_{hv,sw}_timer() helpers
  ...
2 parents: dc5341f + e2e83a7

36 files changed: +1425 -632 lines
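One of the changes called out above replaces a linear scan of the PMU event filter with a binary search over a sorted list of event codes. The snippet below is a minimal, self-contained illustration of that lookup strategy, not KVM's internal code; the array contents and function names are made up for the example:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Compare two raw event codes; shared by qsort() and bsearch(). */
static int cmp_event(const void *a, const void *b)
{
        uint64_t l = *(const uint64_t *)a;
        uint64_t r = *(const uint64_t *)b;

        return l < r ? -1 : (l > r ? 1 : 0);
}

/* Return 1 if @event is present in the sorted @filter list of @nevents entries. */
static int filter_contains(const uint64_t *filter, size_t nevents, uint64_t event)
{
        return bsearch(&event, filter, nevents, sizeof(*filter), cmp_event) != NULL;
}

int main(void)
{
        uint64_t filter[] = { 0x003c, 0x00c0, 0x012e, 0x41c2 };
        size_t n = sizeof(filter) / sizeof(filter[0]);

        qsort(filter, n, sizeof(*filter), cmp_event);   /* sort the filter once */
        printf("0x00c0 allowed: %d\n", filter_contains(filter, n, 0x00c0));
        printf("0x1234 allowed: %d\n", filter_contains(filter, n, 0x1234));
        return 0;
}

The filter only needs to be sorted once when it is installed; each subsequent guest attempt to program a PMU event then costs O(log n) rather than O(n).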

Documentation/virt/kvm/api.rst

Lines changed: 3 additions & 3 deletions
@@ -5545,8 +5545,8 @@ the trailing ``'\0'``, is indicated by ``name_size`` in the header.
 The Stats Data block contains an array of 64-bit values in the same order
 as the descriptors in Descriptors block.
 
-4.42 KVM_GET_XSAVE2
-------------------
+4.134 KVM_GET_XSAVE2
+--------------------
 
 :Capability: KVM_CAP_XSAVE2
 :Architectures: x86
@@ -7363,7 +7363,7 @@ trap and emulate MSRs that are outside of the scope of KVM as well as
 limit the attack surface on KVM's MSR emulation code.
 
 8.28 KVM_CAP_ENFORCE_PV_FEATURE_CPUID
------------------------------
+-------------------------------------
 
 Architectures: x86
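The renumbered section documents KVM_GET_XSAVE2, which can return more XSAVE state than fits in the legacy 4 KiB struct kvm_xsave. A rough, untested userspace sketch of the intended usage, assuming KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2) on the VM fd reports the required buffer size as described in api.rst (error handling mostly omitted):

#include <fcntl.h>
#include <linux/kvm.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
        int kvm = open("/dev/kvm", O_RDWR);
        int vm = ioctl(kvm, KVM_CREATE_VM, 0);
        int vcpu = ioctl(vm, KVM_CREATE_VCPU, 0);
        struct kvm_xsave *xsave;
        int size;

        if (kvm < 0 || vm < 0 || vcpu < 0)
                return 1;

        /* KVM_CAP_XSAVE2 reports the buffer size KVM_GET_XSAVE2 will fill. */
        size = ioctl(vm, KVM_CHECK_EXTENSION, KVM_CAP_XSAVE2);
        if (size < (int)sizeof(struct kvm_xsave))
                size = sizeof(struct kvm_xsave);

        xsave = calloc(1, size);
        if (ioctl(vcpu, KVM_GET_XSAVE2, xsave) < 0)
                perror("KVM_GET_XSAVE2");
        else
                printf("read %d bytes of XSAVE state\n", size);

        free(xsave);
        close(vcpu);
        close(vm);
        close(kvm);
        return 0;
}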

arch/x86/include/asm/kvm-x86-ops.h

Lines changed: 1 addition & 2 deletions
@@ -55,6 +55,7 @@ KVM_X86_OP_NULL(tlb_remote_flush)
 KVM_X86_OP_NULL(tlb_remote_flush_with_range)
 KVM_X86_OP(tlb_flush_gva)
 KVM_X86_OP(tlb_flush_guest)
+KVM_X86_OP(vcpu_pre_run)
 KVM_X86_OP(run)
 KVM_X86_OP_NULL(handle_exit)
 KVM_X86_OP_NULL(skip_emulated_instruction)
@@ -98,8 +99,6 @@ KVM_X86_OP(handle_exit_irqoff)
 KVM_X86_OP_NULL(request_immediate_exit)
 KVM_X86_OP(sched_in)
 KVM_X86_OP_NULL(update_cpu_dirty_logging)
-KVM_X86_OP_NULL(pre_block)
-KVM_X86_OP_NULL(post_block)
 KVM_X86_OP_NULL(vcpu_blocking)
 KVM_X86_OP_NULL(vcpu_unblocking)
 KVM_X86_OP_NULL(update_pi_irte)
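kvm-x86-ops.h is an X-macro style list: each KVM_X86_OP()/KVM_X86_OP_NULL() entry is expanded by whatever definition the including file provides, so adding vcpu_pre_run (and dropping pre_block/post_block) here automatically updates every expansion site. KVM's real macros also generate static calls; the stand-alone sketch below only illustrates the generic pattern, with invented op names and a toy struct:

#include <stdio.h>

/* The single source of truth for the op list (the "ops.h" equivalent). */
#define MY_OPS(OP)       \
        OP(vcpu_pre_run) \
        OP(run)          \
        OP(handle_exit)

/* Expansion 1: declare function-pointer members of an ops struct. */
struct my_ops {
#define OP(name) int (*name)(int vcpu_id);
        MY_OPS(OP)
#undef OP
};

/* Expansion 2: report which ops are wired up. */
static void dump_ops(const struct my_ops *ops)
{
#define OP(name) printf(#name ": %s\n", ops->name ? "set" : "null");
        MY_OPS(OP)
#undef OP
}

static int demo_run(int vcpu_id) { return vcpu_id; }

int main(void)
{
        struct my_ops ops = { .run = demo_run };

        dump_ops(&ops);
        return 0;
}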

arch/x86/include/asm/kvm_host.h

Lines changed: 1 addition & 12 deletions
@@ -1381,6 +1381,7 @@ struct kvm_x86_ops {
         */
        void (*tlb_flush_guest)(struct kvm_vcpu *vcpu);
 
+       int (*vcpu_pre_run)(struct kvm_vcpu *vcpu);
        enum exit_fastpath_completion (*run)(struct kvm_vcpu *vcpu);
        int (*handle_exit)(struct kvm_vcpu *vcpu,
                enum exit_fastpath_completion exit_fastpath);
@@ -1454,18 +1455,6 @@ struct kvm_x86_ops {
        const struct kvm_pmu_ops *pmu_ops;
        const struct kvm_x86_nested_ops *nested_ops;
 
-       /*
-        * Architecture specific hooks for vCPU blocking due to
-        * HLT instruction.
-        * Returns for .pre_block():
-        *  - 0 means continue to block the vCPU.
-        *  - 1 means we cannot block the vCPU since some event
-        *    happens during this period, such as, 'ON' bit in
-        *    posted-interrupts descriptor is set.
-        */
-       int (*pre_block)(struct kvm_vcpu *vcpu);
-       void (*post_block)(struct kvm_vcpu *vcpu);
-
        void (*vcpu_blocking)(struct kvm_vcpu *vcpu);
        void (*vcpu_unblocking)(struct kvm_vcpu *vcpu);

arch/x86/kvm/cpuid.c

Lines changed: 66 additions & 13 deletions
@@ -119,6 +119,28 @@ static int kvm_check_cpuid(struct kvm_vcpu *vcpu,
        return fpu_enable_guest_xfd_features(&vcpu->arch.guest_fpu, xfeatures);
 }
 
+/* Check whether the supplied CPUID data is equal to what is already set for the vCPU. */
+static int kvm_cpuid_check_equal(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
+                                int nent)
+{
+       struct kvm_cpuid_entry2 *orig;
+       int i;
+
+       if (nent != vcpu->arch.cpuid_nent)
+               return -EINVAL;
+
+       for (i = 0; i < nent; i++) {
+               orig = &vcpu->arch.cpuid_entries[i];
+               if (e2[i].function != orig->function ||
+                   e2[i].index != orig->index ||
+                   e2[i].eax != orig->eax || e2[i].ebx != orig->ebx ||
+                   e2[i].ecx != orig->ecx || e2[i].edx != orig->edx)
+                       return -EINVAL;
+       }
+
+       return 0;
+}
+
 static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu)
 {
        u32 function;
@@ -145,14 +167,21 @@ static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu)
        }
 }
 
-static struct kvm_cpuid_entry2 *kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu)
+static struct kvm_cpuid_entry2 *__kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu,
+                                       struct kvm_cpuid_entry2 *entries, int nent)
 {
        u32 base = vcpu->arch.kvm_cpuid_base;
 
        if (!base)
                return NULL;
 
-       return kvm_find_cpuid_entry(vcpu, base | KVM_CPUID_FEATURES, 0);
+       return cpuid_entry2_find(entries, nent, base | KVM_CPUID_FEATURES, 0);
+}
+
+static struct kvm_cpuid_entry2 *kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu)
+{
+       return __kvm_find_kvm_cpuid_features(vcpu, vcpu->arch.cpuid_entries,
+                                            vcpu->arch.cpuid_nent);
 }
 
 void kvm_update_pv_runtime(struct kvm_vcpu *vcpu)
@@ -167,11 +196,12 @@ void kvm_update_pv_runtime(struct kvm_vcpu *vcpu)
        vcpu->arch.pv_cpuid.features = best->eax;
 }
 
-void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
+static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *entries,
+                                      int nent)
 {
        struct kvm_cpuid_entry2 *best;
 
-       best = kvm_find_cpuid_entry(vcpu, 1, 0);
+       best = cpuid_entry2_find(entries, nent, 1, 0);
        if (best) {
                /* Update OSXSAVE bit */
                if (boot_cpu_has(X86_FEATURE_XSAVE))
@@ -182,33 +212,38 @@ void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
                           vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE);
        }
 
-       best = kvm_find_cpuid_entry(vcpu, 7, 0);
+       best = cpuid_entry2_find(entries, nent, 7, 0);
        if (best && boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7)
                cpuid_entry_change(best, X86_FEATURE_OSPKE,
                                   kvm_read_cr4_bits(vcpu, X86_CR4_PKE));
 
-       best = kvm_find_cpuid_entry(vcpu, 0xD, 0);
+       best = cpuid_entry2_find(entries, nent, 0xD, 0);
        if (best)
                best->ebx = xstate_required_size(vcpu->arch.xcr0, false);
 
-       best = kvm_find_cpuid_entry(vcpu, 0xD, 1);
+       best = cpuid_entry2_find(entries, nent, 0xD, 1);
        if (best && (cpuid_entry_has(best, X86_FEATURE_XSAVES) ||
                     cpuid_entry_has(best, X86_FEATURE_XSAVEC)))
                best->ebx = xstate_required_size(vcpu->arch.xcr0, true);
 
-       best = kvm_find_kvm_cpuid_features(vcpu);
+       best = __kvm_find_kvm_cpuid_features(vcpu, entries, nent);
        if (kvm_hlt_in_guest(vcpu->kvm) && best &&
                (best->eax & (1 << KVM_FEATURE_PV_UNHALT)))
                best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT);
 
        if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) {
-               best = kvm_find_cpuid_entry(vcpu, 0x1, 0);
+               best = cpuid_entry2_find(entries, nent, 0x1, 0);
                if (best)
                        cpuid_entry_change(best, X86_FEATURE_MWAIT,
                                           vcpu->arch.ia32_misc_enable_msr &
                                           MSR_IA32_MISC_ENABLE_MWAIT);
        }
 }
+
+void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
+{
+       __kvm_update_cpuid_runtime(vcpu, vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent);
+}
 EXPORT_SYMBOL_GPL(kvm_update_cpuid_runtime);
 
 static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
@@ -298,6 +333,22 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
 {
        int r;
 
+       __kvm_update_cpuid_runtime(vcpu, e2, nent);
+
+       /*
+        * KVM does not correctly handle changing guest CPUID after KVM_RUN, as
+        * MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't
+        * tracked in kvm_mmu_page_role. As a result, KVM may miss guest page
+        * faults due to reusing SPs/SPTEs. In practice no sane VMM mucks with
+        * the core vCPU model on the fly. It would've been better to forbid any
+        * KVM_SET_CPUID{,2} calls after KVM_RUN altogether but unfortunately
+        * some VMMs (e.g. QEMU) reuse vCPU fds for CPU hotplug/unplug and do
+        * KVM_SET_CPUID{,2} again. To support this legacy behavior, check
+        * whether the supplied CPUID data is equal to what's already set.
+        */
+       if (vcpu->arch.last_vmentry_cpu != -1)
+               return kvm_cpuid_check_equal(vcpu, e2, nent);
+
        r = kvm_check_cpuid(vcpu, e2, nent);
        if (r)
                return r;
@@ -307,7 +358,6 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
        vcpu->arch.cpuid_nent = nent;
 
        kvm_update_kvm_cpuid_base(vcpu);
-       kvm_update_cpuid_runtime(vcpu);
        kvm_vcpu_after_set_cpuid(vcpu);
 
        return 0;
@@ -795,10 +845,10 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
                perf_get_x86_pmu_capability(&cap);
 
                /*
-                * Only support guest architectural pmu on a host
-                * with architectural pmu.
+                * The guest architecture pmu is only supported if the architecture
+                * pmu exists on the host and the module parameters allow it.
                 */
-               if (!cap.version)
+               if (!cap.version || !enable_pmu)
                        memset(&cap, 0, sizeof(cap));
 
                eax.split.version_id = min(cap.version, 2);
@@ -886,6 +936,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
                                --array->nent;
                                continue;
                        }
+
+                       if (!kvm_cpu_cap_has(X86_FEATURE_XFD))
+                               entry->ecx &= ~BIT_ULL(2);
                        entry->edx = 0;
                }
                break;

arch/x86/kvm/lapic.c

Lines changed: 0 additions & 2 deletions
@@ -1950,7 +1950,6 @@ void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu)
 {
        restart_apic_timer(vcpu->arch.apic);
 }
-EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_hv_timer);
 
 void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu)
 {
@@ -1962,7 +1961,6 @@ void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu)
        start_sw_timer(apic);
        preempt_enable();
 }
-EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer);
 
 void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu)
 {

arch/x86/kvm/mmu/mmu.c

Lines changed: 22 additions & 9 deletions
@@ -5756,6 +5756,7 @@ static bool __kvm_zap_rmaps(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
                        continue;
 
                flush = slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
+
                                                PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
                                                start, end - 1, true, flush);
        }
@@ -5825,15 +5826,27 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
        }
 
        /*
-        * We can flush all the TLBs out of the mmu lock without TLB
-        * corruption since we just change the spte from writable to
-        * readonly so that we only need to care the case of changing
-        * spte from present to present (changing the spte from present
-        * to nonpresent will flush all the TLBs immediately), in other
-        * words, the only case we care is mmu_spte_update() where we
-        * have checked Host-writable | MMU-writable instead of
-        * PT_WRITABLE_MASK, that means it does not depend on PT_WRITABLE_MASK
-        * anymore.
+        * Flush TLBs if any SPTEs had to be write-protected to ensure that
+        * guest writes are reflected in the dirty bitmap before the memslot
+        * update completes, i.e. before enabling dirty logging is visible to
+        * userspace.
+        *
+        * Perform the TLB flush outside the mmu_lock to reduce the amount of
+        * time the lock is held. However, this does mean that another CPU can
+        * now grab mmu_lock and encounter a write-protected SPTE while CPUs
+        * still have a writable mapping for the associated GFN in their TLB.
+        *
+        * This is safe but requires KVM to be careful when making decisions
+        * based on the write-protection status of an SPTE. Specifically, KVM
+        * also write-protects SPTEs to monitor changes to guest page tables
+        * during shadow paging, and must guarantee no CPUs can write to those
+        * page before the lock is dropped. As mentioned in the previous
+        * paragraph, a write-protected SPTE is no guarantee that CPU cannot
+        * perform writes. So to determine if a TLB flush is truly required, KVM
+        * will clear a separate software-only bit (MMU-writable) and skip the
+        * flush if-and-only-if this bit was already clear.
+        *
+        * See DEFAULT_SPTE_MMU_WRITEABLE for more details.
         */
        if (flush)
                kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);

arch/x86/kvm/mmu/spte.c

Lines changed: 1 addition & 0 deletions
@@ -216,6 +216,7 @@ u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn)
 
        new_spte &= ~PT_WRITABLE_MASK;
        new_spte &= ~shadow_host_writable_mask;
+       new_spte &= ~shadow_mmu_writable_mask;
 
        new_spte = mark_spte_for_access_track(new_spte);

arch/x86/kvm/mmu/spte.h

Lines changed: 36 additions & 6 deletions
@@ -60,10 +60,6 @@ static_assert(SPTE_TDP_AD_ENABLED_MASK == 0);
        (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
 
-/* Bits 9 and 10 are ignored by all non-EPT PTEs. */
-#define DEFAULT_SPTE_HOST_WRITEABLE    BIT_ULL(9)
-#define DEFAULT_SPTE_MMU_WRITEABLE     BIT_ULL(10)
-
 /*
  * The mask/shift to use for saving the original R/X bits when marking the PTE
  * as not-present for access tracking purposes. We do not save the W bit as the
@@ -78,6 +74,35 @@ static_assert(SPTE_TDP_AD_ENABLED_MASK == 0);
          SHADOW_ACC_TRACK_SAVED_BITS_SHIFT)
 static_assert(!(SPTE_TDP_AD_MASK & SHADOW_ACC_TRACK_SAVED_MASK));
 
+/*
+ * *_SPTE_HOST_WRITEABLE (aka Host-writable) indicates whether the host permits
+ * writes to the guest page mapped by the SPTE. This bit is cleared on SPTEs
+ * that map guest pages in read-only memslots and read-only VMAs.
+ *
+ * Invariants:
+ *  - If Host-writable is clear, PT_WRITABLE_MASK must be clear.
+ *
+ *
+ * *_SPTE_MMU_WRITEABLE (aka MMU-writable) indicates whether the shadow MMU
+ * allows writes to the guest page mapped by the SPTE. This bit is cleared when
+ * the guest page mapped by the SPTE contains a page table that is being
+ * monitored for shadow paging. In this case the SPTE can only be made writable
+ * by unsyncing the shadow page under the mmu_lock.
+ *
+ * Invariants:
+ *  - If MMU-writable is clear, PT_WRITABLE_MASK must be clear.
+ *  - If MMU-writable is set, Host-writable must be set.
+ *
+ * If MMU-writable is set, PT_WRITABLE_MASK is normally set but can be cleared
+ * to track writes for dirty logging. For such SPTEs, KVM will locklessly set
+ * PT_WRITABLE_MASK upon the next write from the guest and record the write in
+ * the dirty log (see fast_page_fault()).
+ */
+
+/* Bits 9 and 10 are ignored by all non-EPT PTEs. */
+#define DEFAULT_SPTE_HOST_WRITEABLE    BIT_ULL(9)
+#define DEFAULT_SPTE_MMU_WRITEABLE     BIT_ULL(10)
+
 /*
  * Low ignored bits are at a premium for EPT, use high ignored bits, taking care
  * to not overlap the A/D type mask or the saved access bits of access-tracked
@@ -316,8 +341,13 @@ static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check,
 
 static inline bool spte_can_locklessly_be_made_writable(u64 spte)
 {
-       return (spte & shadow_host_writable_mask) &&
-              (spte & shadow_mmu_writable_mask);
+       if (spte & shadow_mmu_writable_mask) {
+               WARN_ON_ONCE(!(spte & shadow_host_writable_mask));
+               return true;
+       }
+
+       WARN_ON_ONCE(spte & PT_WRITABLE_MASK);
+       return false;
 }
 
 static inline u64 get_mmio_spte_generation(u64 spte)
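The invariants documented above reduce to two implications: writable implies MMU-writable, and MMU-writable implies Host-writable. A tiny self-contained check of those implications, reusing the non-EPT bit positions from the patch (PT_WRITABLE here is bit 1, as in x86 page tables; this is an illustration, not KVM code):

#include <assert.h>
#include <stdint.h>

#define BIT_ULL(n)                  (1ULL << (n))
#define PT_WRITABLE_MASK            BIT_ULL(1)
#define DEFAULT_SPTE_HOST_WRITEABLE BIT_ULL(9)
#define DEFAULT_SPTE_MMU_WRITEABLE  BIT_ULL(10)

/* Returns 1 if the SPTE honours the documented writability invariants. */
static int spte_writable_invariants_hold(uint64_t spte)
{
        /* Writable => MMU-writable, and MMU-writable => Host-writable. */
        if ((spte & PT_WRITABLE_MASK) && !(spte & DEFAULT_SPTE_MMU_WRITEABLE))
                return 0;
        if ((spte & DEFAULT_SPTE_MMU_WRITEABLE) && !(spte & DEFAULT_SPTE_HOST_WRITEABLE))
                return 0;
        return 1;
}

int main(void)
{
        /* Dirty-logged SPTE: MMU/Host-writable set, hardware-writable cleared. */
        assert(spte_writable_invariants_hold(DEFAULT_SPTE_HOST_WRITEABLE |
                                             DEFAULT_SPTE_MMU_WRITEABLE));
        /* Violation: hardware-writable without MMU-writable. */
        assert(!spte_writable_invariants_hold(PT_WRITABLE_MASK));
        return 0;
}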

arch/x86/kvm/mmu/tdp_mmu.c

Lines changed: 3 additions & 3 deletions
@@ -1442,12 +1442,12 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
                    !is_last_spte(iter.old_spte, iter.level))
                        continue;
 
-               if (!is_writable_pte(iter.old_spte))
-                       break;
-
                new_spte = iter.old_spte &
                        ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
 
+               if (new_spte == iter.old_spte)
+                       break;
+
                tdp_mmu_set_spte(kvm, &iter, new_spte);
                spte_set = true;
        }
