
Commit bd7fe98

Merge tag 'kvm-x86-svm-6.6' of https://github.com/kvm-x86/linux into HEAD
KVM: x86: SVM changes for 6.6:

- Add support for SEV-ES DebugSwap, i.e. allow SEV-ES guests to use debug
  registers and generate/handle #DBs

- Clean up LBR virtualization code

- Fix a bug where KVM fails to set the target pCPU during an IRTE update

- Fix fatal bugs in SEV-ES intrahost migration

- Fix a bug where the recent (architecturally correct) change to reinject
  #BP and skip INT3 broke SEV guests (can't decode INT3 to skip it)
2 parents 755e732 + 80d0f52 commit bd7fe98

7 files changed: +252 -136 lines changed


arch/x86/include/asm/cpufeatures.h

Lines changed: 1 addition & 0 deletions
@@ -438,6 +438,7 @@
 #define X86_FEATURE_SEV_ES		(19*32+ 3) /* AMD Secure Encrypted Virtualization - Encrypted State */
 #define X86_FEATURE_V_TSC_AUX		(19*32+ 9) /* "" Virtual TSC_AUX */
 #define X86_FEATURE_SME_COHERENT	(19*32+10) /* "" AMD hardware-enforced cache coherency */
+#define X86_FEATURE_DEBUG_SWAP		(19*32+14) /* AMD SEV-ES full debug state swap support */
 
 /* AMD-defined Extended Feature 2 EAX, CPUID level 0x80000021 (EAX), word 20 */
 #define X86_FEATURE_NO_NESTED_DATA_BP	(20*32+ 0) /* "" No Nested Data Breakpoints */
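
For readers outside the kernel tree: X86_FEATURE_* constants encode a cpufeature word index and a bit position as (word*32 + bit); word 19 is the AMD memory-encryption leaf (CPUID 0x8000001F EAX), so the new define is bit 14 of that leaf, which sev_hardware_setup() below queries via cpu_feature_enabled(). A minimal standalone sketch of the encoding (FEATURE_DEBUG_SWAP here is a hypothetical stand-in, not the kernel macro):

	/* Illustrative only: decode an X86_FEATURE_*-style constant back into
	 * its cpufeature word and bit.  (19*32+14) => word 19, bit 14. */
	#include <stdio.h>

	#define FEATURE_DEBUG_SWAP (19 * 32 + 14)

	int main(void)
	{
		printf("word %d, bit %d\n",
		       FEATURE_DEBUG_SWAP / 32, FEATURE_DEBUG_SWAP % 32);
		return 0;
	}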

arch/x86/include/asm/svm.h

Lines changed: 3 additions & 2 deletions
@@ -288,6 +288,7 @@ static_assert((X2AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == X2AVIC_
 
 #define AVIC_HPA_MASK	~((0xFFFULL << 52) | 0xFFF)
 
+#define SVM_SEV_FEAT_DEBUG_SWAP		BIT(5)
 
 struct vmcb_seg {
 	u16 selector;
@@ -345,7 +346,7 @@ struct vmcb_save_area {
 	u64 last_excp_from;
 	u64 last_excp_to;
 	u8 reserved_0x298[72];
-	u32 spec_ctrl;		/* Guest version of SPEC_CTRL at 0x2E0 */
+	u64 spec_ctrl;		/* Guest version of SPEC_CTRL at 0x2E0 */
 } __packed;
 
 /* Save area definition for SEV-ES and SEV-SNP guests */
@@ -512,7 +513,7 @@ struct ghcb {
 } __packed;
 
 
-#define EXPECTED_VMCB_SAVE_AREA_SIZE		740
+#define EXPECTED_VMCB_SAVE_AREA_SIZE		744
 #define EXPECTED_GHCB_SAVE_AREA_SIZE		1032
 #define EXPECTED_SEV_ES_SAVE_AREA_SIZE		1648
 #define EXPECTED_VMCB_CONTROL_AREA_SIZE		1024
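
The 740 -> 744 bump follows mechanically from the spec_ctrl widening: in a __packed struct, growing a u32 to a u64 adds exactly 4 bytes. A toy sketch of that invariant (old_tail/new_tail are hypothetical layouts, not the real vmcb_save_area):

	/* Toy layouts, not kernel code: widening a packed u32 field to u64
	 * grows sizeof() by exactly 4 bytes, 740 -> 744 in the real struct. */
	#include <stdint.h>

	struct old_tail {
		uint8_t  reserved[72];
		uint32_t spec_ctrl;
	} __attribute__((packed));

	struct new_tail {
		uint8_t  reserved[72];
		uint64_t spec_ctrl;
	} __attribute__((packed));

	_Static_assert(sizeof(struct new_tail) == sizeof(struct old_tail) + 4,
		       "u32 -> u64 widening adds 4 bytes under packed layout");

The EXPECTED_* values feed build-time size checks, so the constant has to move in lockstep with the struct.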

arch/x86/kvm/svm/avic.c

Lines changed: 51 additions & 8 deletions
@@ -791,6 +791,7 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
 	int ret = 0;
 	unsigned long flags;
 	struct amd_svm_iommu_ir *ir;
+	u64 entry;
 
 	/**
 	 * In some cases, the existing irte is updated and re-set,
@@ -824,6 +825,18 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
 	ir->data = pi->ir_data;
 
 	spin_lock_irqsave(&svm->ir_list_lock, flags);
+
+	/*
+	 * Update the target pCPU for IOMMU doorbells if the vCPU is running.
+	 * If the vCPU is NOT running, i.e. is blocking or scheduled out, KVM
+	 * will update the pCPU info when the vCPU is awakened and/or scheduled in.
+	 * See also avic_vcpu_load().
+	 */
+	entry = READ_ONCE(*(svm->avic_physical_id_cache));
+	if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)
+		amd_iommu_update_ga(entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK,
+				    true, pi->ir_data);
+
 	list_add(&ir->node, &svm->ir_list);
 	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
 out:
@@ -986,37 +999,35 @@ static inline int
 avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
 {
 	int ret = 0;
-	unsigned long flags;
 	struct amd_svm_iommu_ir *ir;
 	struct vcpu_svm *svm = to_svm(vcpu);
 
+	lockdep_assert_held(&svm->ir_list_lock);
+
 	if (!kvm_arch_has_assigned_device(vcpu->kvm))
 		return 0;
 
 	/*
 	 * Here, we go through the per-vcpu ir_list to update all existing
 	 * interrupt remapping table entry targeting this vcpu.
 	 */
-	spin_lock_irqsave(&svm->ir_list_lock, flags);
-
 	if (list_empty(&svm->ir_list))
-		goto out;
+		return 0;
 
 	list_for_each_entry(ir, &svm->ir_list, node) {
 		ret = amd_iommu_update_ga(cpu, r, ir->data);
 		if (ret)
-			break;
+			return ret;
 	}
-out:
-	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
-	return ret;
+	return 0;
 }
 
 void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
 	u64 entry;
 	int h_physical_id = kvm_cpu_get_apicid(cpu);
 	struct vcpu_svm *svm = to_svm(vcpu);
+	unsigned long flags;
 
 	lockdep_assert_preemption_disabled();
 
@@ -1033,6 +1044,15 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	if (kvm_vcpu_is_blocking(vcpu))
 		return;
 
+	/*
+	 * Grab the per-vCPU interrupt remapping lock even if the VM doesn't
+	 * _currently_ have assigned devices, as that can change.  Holding
+	 * ir_list_lock ensures that either svm_ir_list_add() will consume
+	 * up-to-date entry information, or that this task will wait until
+	 * svm_ir_list_add() completes to set the new target pCPU.
+	 */
+	spin_lock_irqsave(&svm->ir_list_lock, flags);
+
 	entry = READ_ONCE(*(svm->avic_physical_id_cache));
 	WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
 
@@ -1042,25 +1062,48 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 
 	WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
 	avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true);
+
+	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
 }
 
 void avic_vcpu_put(struct kvm_vcpu *vcpu)
 {
 	u64 entry;
 	struct vcpu_svm *svm = to_svm(vcpu);
+	unsigned long flags;
 
 	lockdep_assert_preemption_disabled();
 
+	/*
+	 * Note, reading the Physical ID entry outside of ir_list_lock is safe
+	 * as only the pCPU that has loaded (or is loading) the vCPU is allowed
+	 * to modify the entry, and preemption is disabled.  I.e. the vCPU
+	 * can't be scheduled out and thus avic_vcpu_{put,load}() can't run
+	 * recursively.
+	 */
 	entry = READ_ONCE(*(svm->avic_physical_id_cache));
 
 	/* Nothing to do if IsRunning == '0' due to vCPU blocking. */
 	if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK))
 		return;
 
+	/*
+	 * Take and hold the per-vCPU interrupt remapping lock while updating
+	 * the Physical ID entry even though the lock doesn't protect against
+	 * multiple writers (see above).  Holding ir_list_lock ensures that
+	 * either svm_ir_list_add() will consume up-to-date entry information,
+	 * or that this task will wait until svm_ir_list_add() completes to
+	 * mark the vCPU as not running.
+	 */
+	spin_lock_irqsave(&svm->ir_list_lock, flags);
+
 	avic_update_iommu_vcpu_affinity(vcpu, -1, 0);
 
 	entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
 	WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
+
+	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
 }
 
 void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu)
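
The invariant behind all three hunks: an IRTE entry added while the vCPU is running must be pointed at the current pCPU immediately, and a pCPU change must retarget every entry already on the list, with both paths serialized by ir_list_lock (which is why avic_update_iommu_vcpu_affinity() now asserts the lock instead of taking it). A userspace analogy of the protocol (hypothetical pthread sketch, not KVM code):

	#include <pthread.h>

	struct entry {
		struct entry *next;
		int target_cpu;			/* -1: no doorbell target */
	};

	static pthread_mutex_t ir_lock = PTHREAD_MUTEX_INITIALIZER;
	static struct entry *ir_list;
	static int cur_cpu = -1;		/* -1: vCPU not running */

	/* svm_ir_list_add() analogue: publish the entry AND point it at the
	 * current pCPU under the lock, so a concurrent retarget can't be
	 * lost.  Programming the target here was the missing step. */
	static void ir_list_add(struct entry *e)
	{
		pthread_mutex_lock(&ir_lock);
		e->target_cpu = cur_cpu;
		e->next = ir_list;
		ir_list = e;
		pthread_mutex_unlock(&ir_lock);
	}

	/* avic_vcpu_load()/avic_vcpu_put() analogue: retarget all entries. */
	static void vcpu_set_cpu(int cpu)
	{
		struct entry *e;

		pthread_mutex_lock(&ir_lock);
		cur_cpu = cpu;
		for (e = ir_list; e; e = e->next)
			e->target_cpu = cpu;
		pthread_mutex_unlock(&ir_lock);
	}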

arch/x86/kvm/svm/sev.c

Lines changed: 83 additions & 17 deletions
@@ -23,6 +23,7 @@
 #include <asm/pkru.h>
 #include <asm/trapnr.h>
 #include <asm/fpu/xcr.h>
+#include <asm/debugreg.h>
 
 #include "mmu.h"
 #include "x86.h"
@@ -54,9 +55,14 @@ module_param_named(sev, sev_enabled, bool, 0444);
 /* enable/disable SEV-ES support */
 static bool sev_es_enabled = true;
 module_param_named(sev_es, sev_es_enabled, bool, 0444);
+
+/* enable/disable SEV-ES DebugSwap support */
+static bool sev_es_debug_swap_enabled = true;
+module_param_named(debug_swap, sev_es_debug_swap_enabled, bool, 0444);
 #else
 #define sev_enabled false
 #define sev_es_enabled false
+#define sev_es_debug_swap_enabled false
 #endif /* CONFIG_KVM_AMD_SEV */
 
 static u8 sev_enc_bit;
@@ -606,6 +612,9 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm)
 	save->xss = svm->vcpu.arch.ia32_xss;
 	save->dr6 = svm->vcpu.arch.dr6;
 
+	if (sev_es_debug_swap_enabled)
+		save->sev_features |= SVM_SEV_FEAT_DEBUG_SWAP;
+
 	pr_debug("Virtual Machine Save Area (VMSA):\n");
 	print_hex_dump_debug("", DUMP_PREFIX_NONE, 16, 1, save, sizeof(*save), false);
 
@@ -619,6 +628,11 @@ static int __sev_launch_update_vmsa(struct kvm *kvm, struct kvm_vcpu *vcpu,
 	struct vcpu_svm *svm = to_svm(vcpu);
 	int ret;
 
+	if (vcpu->guest_debug) {
+		pr_warn_once("KVM_SET_GUEST_DEBUG for SEV-ES guest is not supported");
+		return -EINVAL;
+	}
+
 	/* Perform some pre-encryption checks against the VMSA */
 	ret = sev_es_sync_vmsa(svm);
 	if (ret)
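
The guest_debug check turns a previously silent conflict (KVM can't intercept debug registers once the VMSA is encrypted) into an explicit launch-time failure. A hypothetical userspace sequence that now trips it (VM/vCPU setup and error handling elided; SEV launch commands go through KVM_MEMORY_ENCRYPT_OP):

	#include <linux/kvm.h>
	#include <sys/ioctl.h>

	static int try_encrypt_vmsa(int vcpu_fd, int vm_fd, struct kvm_sev_cmd *cmd)
	{
		struct kvm_guest_debug dbg = { .control = KVM_GUESTDBG_ENABLE };

		/* Request guest debugging on the vCPU... */
		ioctl(vcpu_fd, KVM_SET_GUEST_DEBUG, &dbg);

		/* ...then KVM_SEV_LAUNCH_UPDATE_VMSA (cmd->id) now fails,
		 * errno == EINVAL, instead of encrypting a VMSA that can
		 * never be debugged. */
		return ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, cmd);
	}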
@@ -1725,7 +1739,7 @@ static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm)
 		 * Note, the source is not required to have the same number of
 		 * vCPUs as the destination when migrating a vanilla SEV VM.
 		 */
-		src_vcpu = kvm_get_vcpu(dst_kvm, i);
+		src_vcpu = kvm_get_vcpu(src_kvm, i);
 		src_svm = to_svm(src_vcpu);
 
 		/*
@@ -2171,7 +2185,7 @@ void __init sev_hardware_setup(void)
 	bool sev_es_supported = false;
 	bool sev_supported = false;
 
-	if (!sev_enabled || !npt_enabled)
+	if (!sev_enabled || !npt_enabled || !nrips)
 		goto out;
 
 	/*
@@ -2256,6 +2270,9 @@ void __init sev_hardware_setup(void)
 
 	sev_enabled = sev_supported;
 	sev_es_enabled = sev_es_supported;
+	if (!sev_es_enabled || !cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP) ||
+	    !cpu_feature_enabled(X86_FEATURE_NO_NESTED_DATA_BP))
+		sev_es_debug_swap_enabled = false;
 #endif
 }
 
@@ -2881,7 +2898,10 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
 					    svm->sev_es.ghcb_sa);
 		break;
 	case SVM_VMGEXIT_NMI_COMPLETE:
-		ret = svm_invoke_exit_handler(vcpu, SVM_EXIT_IRET);
+		++vcpu->stat.nmi_window_exits;
+		svm->nmi_masked = false;
+		kvm_make_request(KVM_REQ_EVENT, vcpu);
+		ret = 1;
 		break;
 	case SVM_VMGEXIT_AP_HLT_LOOP:
 		ret = kvm_emulate_ap_reset_hold(vcpu);
@@ -2944,6 +2964,7 @@ int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in)
 
 static void sev_es_init_vmcb(struct vcpu_svm *svm)
 {
+	struct vmcb *vmcb = svm->vmcb01.ptr;
 	struct kvm_vcpu *vcpu = &svm->vcpu;
 
 	svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ES_ENABLE;
@@ -2952,9 +2973,12 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm)
 	/*
 	 * An SEV-ES guest requires a VMSA area that is a separate from the
	 * VMCB page. Do not include the encryption mask on the VMSA physical
-	 * address since hardware will access it using the guest key.
+	 * address since hardware will access it using the guest key.  Note,
+	 * the VMSA will be NULL if this vCPU is the destination for intrahost
+	 * migration, and will be copied later.
 	 */
-	svm->vmcb->control.vmsa_pa = __pa(svm->sev_es.vmsa);
+	if (svm->sev_es.vmsa)
+		svm->vmcb->control.vmsa_pa = __pa(svm->sev_es.vmsa);
 
 	/* Can't intercept CR register access, HV can't modify CR registers */
 	svm_clr_intercept(svm, INTERCEPT_CR0_READ);
@@ -2972,8 +2996,23 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm)
 	svm_set_intercept(svm, TRAP_CR4_WRITE);
 	svm_set_intercept(svm, TRAP_CR8_WRITE);
 
-	/* No support for enable_vmware_backdoor */
-	clr_exception_intercept(svm, GP_VECTOR);
+	vmcb->control.intercepts[INTERCEPT_DR] = 0;
+	if (!sev_es_debug_swap_enabled) {
+		vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ);
+		vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE);
+		recalc_intercepts(svm);
+	} else {
+		/*
+		 * Disable #DB intercept iff DebugSwap is enabled.  KVM doesn't
		 * allow debugging SEV-ES guests, and enables DebugSwap iff
+		 * NO_NESTED_DATA_BP is supported, so there's no reason to
+		 * intercept #DB when DebugSwap is enabled.  For simplicity
+		 * with respect to guest debug, intercept #DB for other VMs
+		 * even if NO_NESTED_DATA_BP is supported, i.e. even if the
+		 * guest can't DoS the CPU with infinite #DB vectoring.
+		 */
+		clr_exception_intercept(svm, DB_VECTOR);
+	}
 
 	/* Can't intercept XSETBV, HV can't modify XCR0 directly */
 	svm_clr_intercept(svm, INTERCEPT_XSETBV);
@@ -3000,6 +3039,12 @@ void sev_init_vmcb(struct vcpu_svm *svm)
 	svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE;
 	clr_exception_intercept(svm, UD_VECTOR);
 
+	/*
+	 * Don't intercept #GP for SEV guests, e.g. for the VMware backdoor, as
+	 * KVM can't decrypt guest memory to decode the faulting instruction.
+	 */
+	clr_exception_intercept(svm, GP_VECTOR);
+
 	if (sev_es_guest(svm->vcpu.kvm))
 		sev_es_init_vmcb(svm);
 }
@@ -3018,20 +3063,41 @@ void sev_es_vcpu_reset(struct vcpu_svm *svm)
 void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa)
 {
 	/*
-	 * As an SEV-ES guest, hardware will restore the host state on VMEXIT,
-	 * of which one step is to perform a VMLOAD.  KVM performs the
-	 * corresponding VMSAVE in svm_prepare_guest_switch for both
-	 * traditional and SEV-ES guests.
+	 * All host state for SEV-ES guests is categorized into three swap types
+	 * based on how it is handled by hardware during a world switch:
+	 *
+	 * A: VMRUN:   Host state saved in host save area
+	 *    VMEXIT:  Host state loaded from host save area
+	 *
+	 * B: VMRUN:   Host state _NOT_ saved in host save area
+	 *    VMEXIT:  Host state loaded from host save area
+	 *
+	 * C: VMRUN:   Host state _NOT_ saved in host save area
+	 *    VMEXIT:  Host state initialized to default(reset) values
+	 *
+	 * Manually save type-B state, i.e. state that is loaded by VMEXIT but
+	 * isn't saved by VMRUN, that isn't already saved by VMSAVE (performed
+	 * by common SVM code).
 	 */
-
-	/* XCR0 is restored on VMEXIT, save the current host value */
 	hostsa->xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
-
-	/* PKRU is restored on VMEXIT, save the current host value */
 	hostsa->pkru = read_pkru();
-
-	/* MSR_IA32_XSS is restored on VMEXIT, save the currnet host value */
 	hostsa->xss = host_xss;
+
+	/*
+	 * If DebugSwap is enabled, debug registers are loaded but NOT saved by
+	 * the CPU (Type-B).  If DebugSwap is disabled/unsupported, the CPU both
+	 * saves and loads debug registers (Type-A).
+	 */
+	if (sev_es_debug_swap_enabled) {
+		hostsa->dr0 = native_get_debugreg(0);
+		hostsa->dr1 = native_get_debugreg(1);
+		hostsa->dr2 = native_get_debugreg(2);
+		hostsa->dr3 = native_get_debugreg(3);
+		hostsa->dr0_addr_mask = amd_get_dr_addr_mask(0);
+		hostsa->dr1_addr_mask = amd_get_dr_addr_mask(1);
+		hostsa->dr2_addr_mask = amd_get_dr_addr_mask(2);
+		hostsa->dr3_addr_mask = amd_get_dr_addr_mask(3);
	}
 }
 
 void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
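
The rewritten comment's taxonomy is what the code implements: Type-A state needs no help from KVM, Type-B must be saved manually before VMRUN (hence the native_get_debugreg() calls once DebugSwap makes the DRs Type-B), and Type-C would need a full reload after VMEXIT. A condensed sketch of that decision table (hypothetical enum/helpers, not KVM code):

	/* Hypothetical helpers: map each SEV-ES world-switch swap type to
	 * what software must do around VMRUN/VMEXIT. */
	enum swap_type {
		SWAP_TYPE_A,	/* HW saves on VMRUN and loads on VMEXIT */
		SWAP_TYPE_B,	/* HW only loads on VMEXIT               */
		SWAP_TYPE_C,	/* HW resets to defaults on VMEXIT       */
	};

	static int save_before_vmrun(enum swap_type t)
	{
		return t != SWAP_TYPE_A;	/* B and C: software must save */
	}

	static int restore_after_vmexit(enum swap_type t)
	{
		return t == SWAP_TYPE_C;	/* C: HW loaded reset values */
	}

Also worth noting: debug_swap is registered with mode 0444, so it is read-only at runtime and can only be changed at module load, e.g. kvm_amd.debug_swap=0 on the kernel command line.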
