
Commit 41786cc

Merge tag 'kvm-x86-misc-6.12' of https://github.com/kvm-x86/linux into HEAD
KVM x86 misc changes for 6.12:

- Advertise AVX10.1 to userspace (effectively prep work for the "real" AVX10 functionality that is on the horizon).

- Rework common MSR handling code to suppress errors on userspace accesses to unsupported-but-advertised MSRs. This will allow removing (almost?) all of KVM's exemptions for userspace access to MSRs that shouldn't exist based on the vCPU model (the actual cleanup is non-trivial future work).

- Rework KVM's handling of x2APIC ICR, again, because AMD (x2AVIC) splits the 64-bit value into the legacy ICR and ICR2 storage, whereas Intel (APICv) stores the entire 64-bit value at the ICR offset (a minimal layout sketch follows below).

- Fix a bug where KVM would fail to exit to userspace if such an exit was triggered by a fastpath exit handler.

- Add fastpath handling of HLT VM-Exit to expedite re-entering the guest when there's already a pending wake event at the time of the exit.

- Finally fix the RSM vs. nested VM-Enter WARN by forcing the vCPU out of guest mode prior to signalling SHUTDOWN (architecturally, the SHUTDOWN is supposed to hit L1, not L2).
2 parents 7056c4e + 4ca077f commit 41786cc
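To make the x2APIC ICR point concrete: the Intel/AMD difference is purely one of storage layout, and the real conversion lives in kvm_x2apic_icr_read()/kvm_x2apic_icr_write() in the arch/x86/kvm/lapic.c hunk further down. Below is a minimal, hypothetical C sketch of the two layouts; it is not KVM code, and the struct and helper names are illustrative only.

#include <stdint.h>

/* Split layout (AMD/x2AVIC): two 32-bit registers at the legacy offsets. */
struct split_icr {
    uint32_t icr;   /* low half, legacy APIC_ICR offset  */
    uint32_t icr2;  /* high half, legacy APIC_ICR2 offset */
};

/* Merge the split halves into the architectural 64-bit x2APIC ICR value. */
static inline uint64_t icr_from_split(struct split_icr s)
{
    return (uint64_t)s.icr | ((uint64_t)s.icr2 << 32);
}

/* Split a 64-bit ICR value (Intel/APICv keeps it whole at the ICR offset). */
static inline struct split_icr icr_to_split(uint64_t icr)
{
    return (struct split_icr){
        .icr  = (uint32_t)icr,
        .icr2 = (uint32_t)(icr >> 32),
    };
}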

21 files changed: 702 additions (+), 525 deletions (−)

arch/x86/include/asm/cpuid.h

Lines changed: 1 addition & 0 deletions
@@ -179,6 +179,7 @@ static __always_inline bool cpuid_function_is_indexed(u32 function)
 	case 0x1d:
 	case 0x1e:
 	case 0x1f:
+	case 0x24:
 	case 0x8000001d:
 		return true;
 	}

arch/x86/include/asm/kvm-x86-ops.h

Lines changed: 1 addition & 1 deletion
@@ -125,7 +125,7 @@ KVM_X86_OP_OPTIONAL(mem_enc_unregister_region)
 KVM_X86_OP_OPTIONAL(vm_copy_enc_context_from)
 KVM_X86_OP_OPTIONAL(vm_move_enc_context_from)
 KVM_X86_OP_OPTIONAL(guest_memory_reclaimed)
-KVM_X86_OP(get_msr_feature)
+KVM_X86_OP(get_feature_msr)
 KVM_X86_OP(check_emulate_instruction)
 KVM_X86_OP(apic_init_signal_blocked)
 KVM_X86_OP_OPTIONAL(enable_l2_tlb_flush)

arch/x86/include/asm/kvm_host.h

Lines changed: 4 additions & 1 deletion
@@ -212,6 +212,7 @@ enum exit_fastpath_completion {
 	EXIT_FASTPATH_NONE,
 	EXIT_FASTPATH_REENTER_GUEST,
 	EXIT_FASTPATH_EXIT_HANDLED,
+	EXIT_FASTPATH_EXIT_USERSPACE,
 };
 typedef enum exit_fastpath_completion fastpath_t;

@@ -1730,6 +1731,8 @@ struct kvm_x86_ops {
 	void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
 	void (*enable_irq_window)(struct kvm_vcpu *vcpu);
 	void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
+
+	const bool x2apic_icr_is_split;
 	const unsigned long required_apicv_inhibits;
 	bool allow_apicv_in_x2apic_without_x2apic_virtualization;
 	void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu);

@@ -1809,7 +1812,7 @@ struct kvm_x86_ops {
 	int (*vm_move_enc_context_from)(struct kvm *kvm, unsigned int source_fd);
 	void (*guest_memory_reclaimed)(struct kvm *kvm);

-	int (*get_msr_feature)(struct kvm_msr_entry *entry);
+	int (*get_feature_msr)(u32 msr, u64 *data);

 	int (*check_emulate_instruction)(struct kvm_vcpu *vcpu, int emul_type,
 					 void *insn, int insn_len);
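For orientation, a hedged sketch of how a vendor module would populate the two members touched above. The member names and the get_feature_msr() signature come from this hunk; the initializer values and the example_* identifiers are assumptions, not code from this commit (the actual vmx/svm changes live in files outside this excerpt). Per the merge description, AMD's x2AVIC keeps ICR/ICR2 split, so an SVM-like ops table would set the flag to true.

/* Illustrative only: a toy callback reporting a feature MSR's value. */
static int example_get_feature_msr(u32 msr, u64 *data)
{
    *data = 0;  /* report "supported, value 0" for every MSR (toy behavior) */
    return 0;
}

static struct kvm_x86_ops example_x86_ops = {
    .x2apic_icr_is_split = true,    /* split ICR/ICR2 storage, as on AMD x2AVIC */
    .get_feature_msr     = example_get_feature_msr,
    /* ... all other callbacks elided ... */
};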

arch/x86/kvm/cpuid.c

Lines changed: 28 additions & 2 deletions
@@ -705,7 +705,7 @@ void kvm_set_cpu_caps(void)

 	kvm_cpu_cap_init_kvm_defined(CPUID_7_1_EDX,
 		F(AVX_VNNI_INT8) | F(AVX_NE_CONVERT) | F(PREFETCHITI) |
-		F(AMX_COMPLEX)
+		F(AMX_COMPLEX) | F(AVX10)
 	);

 	kvm_cpu_cap_init_kvm_defined(CPUID_7_2_EDX,

@@ -721,6 +721,10 @@ void kvm_set_cpu_caps(void)
 		SF(SGX1) | SF(SGX2) | SF(SGX_EDECCSSA)
 	);

+	kvm_cpu_cap_init_kvm_defined(CPUID_24_0_EBX,
+		F(AVX10_128) | F(AVX10_256) | F(AVX10_512)
+	);
+
 	kvm_cpu_cap_mask(CPUID_8000_0001_ECX,
 		F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
 		F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |

@@ -949,7 +953,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
 	switch (function) {
 	case 0:
 		/* Limited to the highest leaf implemented in KVM. */
-		entry->eax = min(entry->eax, 0x1fU);
+		entry->eax = min(entry->eax, 0x24U);
 		break;
 	case 1:
 		cpuid_entry_override(entry, CPUID_1_EDX);

@@ -1174,6 +1178,28 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
 			break;
 		}
 		break;
+	case 0x24: {
+		u8 avx10_version;
+
+		if (!kvm_cpu_cap_has(X86_FEATURE_AVX10)) {
+			entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+			break;
+		}
+
+		/*
+		 * The AVX10 version is encoded in EBX[7:0].  Note, the version
+		 * is guaranteed to be >=1 if AVX10 is supported.  Note #2, the
+		 * version needs to be captured before overriding EBX features!
+		 */
+		avx10_version = min_t(u8, entry->ebx & 0xff, 1);
+		cpuid_entry_override(entry, CPUID_24_0_EBX);
+		entry->ebx |= avx10_version;
+
+		entry->eax = 0;
+		entry->ecx = 0;
+		entry->edx = 0;
+		break;
+	}
 	case KVM_CPUID_SIGNATURE: {
 		const u32 *sigptr = (const u32 *)KVM_SIGNATURE;
 		entry->eax = KVM_CPUID_FEATURES;
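As a guest-side illustration of what this leaf now advertises, here is an assumption-laden sketch: it presumes a GCC/Clang userspace build with <cpuid.h>, and mirrors the bit layout used by this commit (version in EBX[7:0], vector lengths in EBX bits 16-18, per the reverse_cpuid.h hunk further down; AVX10 itself is CPUID.7.1:EDX bit 19).

#include <cpuid.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    unsigned int eax, ebx, ecx, edx;

    /* Leaf 0x7 subleaf 1: EDX bit 19 advertises AVX10 itself. */
    if (!__get_cpuid_count(0x7, 1, &eax, &ebx, &ecx, &edx) ||
        !(edx & (1u << 19))) {
        puts("AVX10 not enumerated");
        return 0;
    }

    /* Leaf 0x24 subleaf 0: version and supported vector lengths. */
    __get_cpuid_count(0x24, 0, &eax, &ebx, &ecx, &edx);
    printf("AVX10 version %u, 128-bit:%d 256-bit:%d 512-bit:%d\n",
           ebx & 0xff,
           !!(ebx & (1u << 16)), !!(ebx & (1u << 17)), !!(ebx & (1u << 18)));
    return 0;
}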

arch/x86/kvm/lapic.c

Lines changed: 53 additions & 22 deletions
@@ -1944,7 +1944,7 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic)
 	u64 ns = 0;
 	ktime_t expire;
 	struct kvm_vcpu *vcpu = apic->vcpu;
-	unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz;
+	u32 this_tsc_khz = vcpu->arch.virtual_tsc_khz;
 	unsigned long flags;
 	ktime_t now;

@@ -2453,6 +2453,43 @@ void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi);

+#define X2APIC_ICR_RESERVED_BITS (GENMASK_ULL(31, 20) | GENMASK_ULL(17, 16) | BIT(13))
+
+int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data)
+{
+	if (data & X2APIC_ICR_RESERVED_BITS)
+		return 1;
+
+	/*
+	 * The BUSY bit is reserved on both Intel and AMD in x2APIC mode, but
+	 * only AMD requires it to be zero, Intel essentially just ignores the
+	 * bit.  And if IPI virtualization (Intel) or x2AVIC (AMD) is enabled,
+	 * the CPU performs the reserved bits checks, i.e. the underlying CPU
+	 * behavior will "win".  Arbitrarily clear the BUSY bit, as there is no
+	 * sane way to provide consistent behavior with respect to hardware.
+	 */
+	data &= ~APIC_ICR_BUSY;
+
+	kvm_apic_send_ipi(apic, (u32)data, (u32)(data >> 32));
+	if (kvm_x86_ops.x2apic_icr_is_split) {
+		kvm_lapic_set_reg(apic, APIC_ICR, data);
+		kvm_lapic_set_reg(apic, APIC_ICR2, data >> 32);
+	} else {
+		kvm_lapic_set_reg64(apic, APIC_ICR, data);
+	}
+	trace_kvm_apic_write(APIC_ICR, data);
+	return 0;
+}
+
+static u64 kvm_x2apic_icr_read(struct kvm_lapic *apic)
+{
+	if (kvm_x86_ops.x2apic_icr_is_split)
+		return (u64)kvm_lapic_get_reg(apic, APIC_ICR) |
+		       (u64)kvm_lapic_get_reg(apic, APIC_ICR2) << 32;
+
+	return kvm_lapic_get_reg64(apic, APIC_ICR);
+}
+
 /* emulate APIC access in a trap manner */
 void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset)
 {

@@ -2470,7 +2507,7 @@ void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset)
 	 * maybe-unecessary write, and both are in the noise anyways.
 	 */
 	if (apic_x2apic_mode(apic) && offset == APIC_ICR)
-		kvm_x2apic_icr_write(apic, kvm_lapic_get_reg64(apic, APIC_ICR));
+		WARN_ON_ONCE(kvm_x2apic_icr_write(apic, kvm_x2apic_icr_read(apic)));
 	else
 		kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset));
 }

@@ -2990,18 +3027,22 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,

 		/*
 		 * In x2APIC mode, the LDR is fixed and based on the id.  And
-		 * ICR is internally a single 64-bit register, but needs to be
-		 * split to ICR+ICR2 in userspace for backwards compatibility.
+		 * if the ICR is _not_ split, ICR is internally a single 64-bit
+		 * register, but needs to be split to ICR+ICR2 in userspace for
+		 * backwards compatibility.
 		 */
-		if (set) {
+		if (set)
 			*ldr = kvm_apic_calc_x2apic_ldr(x2apic_id);

-			icr = __kvm_lapic_get_reg(s->regs, APIC_ICR) |
-			      (u64)__kvm_lapic_get_reg(s->regs, APIC_ICR2) << 32;
-			__kvm_lapic_set_reg64(s->regs, APIC_ICR, icr);
-		} else {
-			icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR);
-			__kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32);
+		if (!kvm_x86_ops.x2apic_icr_is_split) {
+			if (set) {
+				icr = __kvm_lapic_get_reg(s->regs, APIC_ICR) |
+				      (u64)__kvm_lapic_get_reg(s->regs, APIC_ICR2) << 32;
+				__kvm_lapic_set_reg64(s->regs, APIC_ICR, icr);
+			} else {
+				icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR);
+				__kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32);
+			}
 		}
 	}

@@ -3194,22 +3235,12 @@ int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr)
 	return 0;
 }

-int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data)
-{
-	data &= ~APIC_ICR_BUSY;
-
-	kvm_apic_send_ipi(apic, (u32)data, (u32)(data >> 32));
-	kvm_lapic_set_reg64(apic, APIC_ICR, data);
-	trace_kvm_apic_write(APIC_ICR, data);
-	return 0;
-}
-
 static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data)
 {
 	u32 low;

 	if (reg == APIC_ICR) {
-		*data = kvm_lapic_get_reg64(apic, APIC_ICR);
+		*data = kvm_x2apic_icr_read(apic);
 		return 0;
 	}

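A quick, standalone sanity check of the new reserved-bits mask (plain userspace C, not KVM code): GENMASK_ULL(31, 20) | GENMASK_ULL(17, 16) | BIT(13) covers ICR bits 31:20, 17:16 and 13, i.e. 0xfff32000, and a write with any of those bits set now makes kvm_x2apic_icr_write() return 1 instead of being silently accepted.

#include <assert.h>
#include <stdint.h>

int main(void)
{
    /* Same bits as X2APIC_ICR_RESERVED_BITS, spelled out with plain shifts. */
    uint64_t mask = (0xfffULL << 20) | (0x3ULL << 16) | (1ULL << 13);

    assert(mask == 0xfff32000ULL);

    /* e.g. an ICR write with the (reserved) bit 13 set would be rejected. */
    uint64_t bogus_icr = 1ULL << 13;
    assert(bogus_icr & mask);
    return 0;
}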
arch/x86/kvm/lapic.h

Lines changed: 0 additions & 1 deletion
@@ -96,7 +96,6 @@ u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
 void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
 void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu);
 void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
-u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
 void kvm_recalculate_apic_map(struct kvm *kvm);
 void kvm_apic_set_version(struct kvm_vcpu *vcpu);
 void kvm_apic_after_set_mcg_cap(struct kvm_vcpu *vcpu);

arch/x86/kvm/mmu.h

Lines changed: 0 additions & 2 deletions
@@ -223,8 +223,6 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,

 bool kvm_mmu_may_ignore_guest_pat(void);

-int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
-
 int kvm_mmu_post_init_vm(struct kvm *kvm);
 void kvm_mmu_pre_destroy_vm(struct kvm *kvm);

arch/x86/kvm/mmu/mmu_internal.h

Lines changed: 0 additions & 2 deletions
@@ -349,8 +349,6 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm,
 void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
 void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level);

-void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
-
 void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
 void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);

arch/x86/kvm/reverse_cpuid.h

Lines changed: 8 additions & 0 deletions
@@ -17,6 +17,7 @@ enum kvm_only_cpuid_leafs {
 	CPUID_8000_0007_EDX,
 	CPUID_8000_0022_EAX,
 	CPUID_7_2_EDX,
+	CPUID_24_0_EBX,
 	NR_KVM_CPU_CAPS,

 	NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS,

@@ -46,6 +47,7 @@ enum kvm_only_cpuid_leafs {
 #define X86_FEATURE_AVX_NE_CONVERT	KVM_X86_FEATURE(CPUID_7_1_EDX, 5)
 #define X86_FEATURE_AMX_COMPLEX		KVM_X86_FEATURE(CPUID_7_1_EDX, 8)
 #define X86_FEATURE_PREFETCHITI		KVM_X86_FEATURE(CPUID_7_1_EDX, 14)
+#define X86_FEATURE_AVX10		KVM_X86_FEATURE(CPUID_7_1_EDX, 19)

 /* Intel-defined sub-features, CPUID level 0x00000007:2 (EDX) */
 #define X86_FEATURE_INTEL_PSFD		KVM_X86_FEATURE(CPUID_7_2_EDX, 0)

@@ -55,6 +57,11 @@ enum kvm_only_cpuid_leafs {
 #define KVM_X86_FEATURE_BHI_CTRL	KVM_X86_FEATURE(CPUID_7_2_EDX, 4)
 #define X86_FEATURE_MCDT_NO		KVM_X86_FEATURE(CPUID_7_2_EDX, 5)

+/* Intel-defined sub-features, CPUID level 0x00000024:0 (EBX) */
+#define X86_FEATURE_AVX10_128		KVM_X86_FEATURE(CPUID_24_0_EBX, 16)
+#define X86_FEATURE_AVX10_256		KVM_X86_FEATURE(CPUID_24_0_EBX, 17)
+#define X86_FEATURE_AVX10_512		KVM_X86_FEATURE(CPUID_24_0_EBX, 18)
+
 /* CPUID level 0x80000007 (EDX). */
 #define KVM_X86_FEATURE_CONSTANT_TSC	KVM_X86_FEATURE(CPUID_8000_0007_EDX, 8)

@@ -90,6 +97,7 @@ static const struct cpuid_reg reverse_cpuid[] = {
 	[CPUID_8000_0021_EAX] = {0x80000021, 0, CPUID_EAX},
 	[CPUID_8000_0022_EAX] = {0x80000022, 0, CPUID_EAX},
 	[CPUID_7_2_EDX]       = {         7, 2, CPUID_EDX},
+	[CPUID_24_0_EBX]      = {      0x24, 0, CPUID_EBX},
 };

 /*

arch/x86/kvm/smm.c

Lines changed: 19 additions & 5 deletions
@@ -624,17 +624,31 @@ int emulator_leave_smm(struct x86_emulate_ctxt *ctxt)
 #endif

 	/*
-	 * Give leave_smm() a chance to make ISA-specific changes to the vCPU
-	 * state (e.g. enter guest mode) before loading state from the SMM
-	 * state-save area.
+	 * FIXME: When resuming L2 (a.k.a. guest mode), the transition to guest
+	 * mode should happen _after_ loading state from SMRAM.  However, KVM
+	 * piggybacks the nested VM-Enter flows (which is wrong for many other
+	 * reasons), and so nSVM/nVMX would clobber state that is loaded from
+	 * SMRAM and from the VMCS/VMCB.
 	 */
 	if (kvm_x86_call(leave_smm)(vcpu, &smram))
 		return X86EMUL_UNHANDLEABLE;

 #ifdef CONFIG_X86_64
 	if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
-		return rsm_load_state_64(ctxt, &smram.smram64);
+		ret = rsm_load_state_64(ctxt, &smram.smram64);
 	else
 #endif
-		return rsm_load_state_32(ctxt, &smram.smram32);
+		ret = rsm_load_state_32(ctxt, &smram.smram32);
+
+	/*
+	 * If RSM fails and triggers shutdown, architecturally the shutdown
+	 * occurs *before* the transition to guest mode.  But due to KVM's
+	 * flawed handling of RSM to L2 (see above), the vCPU may already be
+	 * in_guest_mode().  Force the vCPU out of guest mode before delivering
+	 * the shutdown, so that L1 enters shutdown instead of seeing a VM-Exit
+	 * that architecturally shouldn't be possible.
+	 */
+	if (ret != X86EMUL_CONTINUE && is_guest_mode(vcpu))
+		kvm_leave_nested(vcpu);
+	return ret;
 }
