Skip to content

Commit 73331c5

Browse files
committed
Merge branch 'kvm-fixes-for-5.18-rc5' into HEAD
Fixes for (relatively) old bugs, to be merged in both the -rc and next development trees:

* Fix potential races when walking host page table
* Fix bad user ABI for KVM_EXIT_SYSTEM_EVENT
* Fix shadow page table leak when KVM runs nested
2 parents 484c22d + 4418723 commit 73331c5

File tree

10 files changed

+121
-32
lines changed

10 files changed

+121
-32
lines changed

Documentation/virt/kvm/api.rst

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5986,16 +5986,16 @@ should put the acknowledged interrupt vector into the 'epr' field.
59865986
#define KVM_SYSTEM_EVENT_RESET 2
59875987
#define KVM_SYSTEM_EVENT_CRASH 3
59885988
__u32 type;
5989-
__u64 flags;
5989+
__u32 ndata;
5990+
__u64 data[16];
59905991
} system_event;
59915992

59925993
If exit_reason is KVM_EXIT_SYSTEM_EVENT then the vcpu has triggered
59935994
a system-level event using some architecture specific mechanism (hypercall
59945995
or some special instruction). In case of ARM64, this is triggered using
5995-
HVC instruction based PSCI call from the vcpu. The 'type' field describes
5996-
the system-level event type. The 'flags' field describes architecture
5997-
specific flags for the system-level event.
5996+
HVC instruction based PSCI call from the vcpu.
59985997

5998+
The 'type' field describes the system-level event type.
59995999
Valid values for 'type' are:
60006000

60016001
- KVM_SYSTEM_EVENT_SHUTDOWN -- the guest has requested a shutdown of the
@@ -6010,10 +6010,20 @@ Valid values for 'type' are:
60106010
to ignore the request, or to gather VM memory core dump and/or
60116011
reset/shutdown of the VM.
60126012

6013-
Valid flags are:
6013+
If KVM_CAP_SYSTEM_EVENT_DATA is present, the 'data' field can contain
6014+
architecture specific information for the system-level event. Only
6015+
the first ``ndata`` items (possibly zero) of the data array are valid.
60146016

6015-
- KVM_SYSTEM_EVENT_RESET_FLAG_PSCI_RESET2 (arm64 only) -- the guest issued
6016-
a SYSTEM_RESET2 call according to v1.1 of the PSCI specification.
6017+
- for arm64, data[0] is set to KVM_SYSTEM_EVENT_RESET_FLAG_PSCI_RESET2 if
6018+
the guest issued a SYSTEM_RESET2 call according to v1.1 of the PSCI
6019+
specification.
6020+
6021+
- for RISC-V, data[0] is set to the value of the second argument of the
6022+
``sbi_system_reset`` call.
6023+
6024+
Previous versions of Linux defined a ``flags`` member in this struct. The
6025+
field is now aliased to ``data[0]``. Userspace can assume that it is only
6026+
written if ``ndata`` is greater than 0.
60176027

60186028
::
60196029

arch/arm64/kvm/psci.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,8 @@ static void kvm_prepare_system_event(struct kvm_vcpu *vcpu, u32 type, u64 flags)
181181

182182
memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event));
183183
vcpu->run->system_event.type = type;
184-
vcpu->run->system_event.flags = flags;
184+
vcpu->run->system_event.ndata = 1;
185+
vcpu->run->system_event.data[0] = flags;
185186
vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
186187
}
187188

arch/riscv/kvm/vcpu_sbi.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ void kvm_riscv_vcpu_sbi_forward(struct kvm_vcpu *vcpu, struct kvm_run *run)
8383

8484
void kvm_riscv_vcpu_sbi_system_reset(struct kvm_vcpu *vcpu,
8585
struct kvm_run *run,
86-
u32 type, u64 flags)
86+
u32 type, u64 reason)
8787
{
8888
unsigned long i;
8989
struct kvm_vcpu *tmp;
@@ -94,7 +94,8 @@ void kvm_riscv_vcpu_sbi_system_reset(struct kvm_vcpu *vcpu,
9494

9595
memset(&run->system_event, 0, sizeof(run->system_event));
9696
run->system_event.type = type;
97-
run->system_event.flags = flags;
97+
run->system_event.ndata = 1;
98+
run->system_event.data[0] = reason;
9899
run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
99100
}
100101

arch/x86/kvm/mmu.h

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,30 @@ static __always_inline u64 rsvd_bits(int s, int e)
6565
return ((2ULL << (e - s)) - 1) << s;
6666
}
6767

68+
/*
69+
* The number of non-reserved physical address bits irrespective of features
70+
* that repurpose legal bits, e.g. MKTME.
71+
*/
72+
extern u8 __read_mostly shadow_phys_bits;
73+
74+
static inline gfn_t kvm_mmu_max_gfn(void)
75+
{
76+
/*
77+
* Note that this uses the host MAXPHYADDR, not the guest's.
78+
* EPT/NPT cannot support GPAs that would exceed host.MAXPHYADDR;
79+
* assuming KVM is running on bare metal, guest accesses beyond
80+
* host.MAXPHYADDR will hit a #PF(RSVD) and never cause a vmexit
81+
* (either EPT Violation/Misconfig or #NPF), and so KVM will never
82+
* install a SPTE for such addresses. If KVM is running as a VM
83+
* itself, on the other hand, it might see a MAXPHYADDR that is less
84+
* than hardware's real MAXPHYADDR. Using the host MAXPHYADDR
85+
* disallows such SPTEs entirely and simplifies the TDP MMU.
86+
*/
87+
int max_gpa_bits = likely(tdp_enabled) ? shadow_phys_bits : 52;
88+
89+
return (1ULL << (max_gpa_bits - PAGE_SHIFT)) - 1;
90+
}
91+
6892
void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask);
6993
void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only);
7094

arch/x86/kvm/mmu/mmu.c

Lines changed: 50 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2804,8 +2804,12 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
28042804
const struct kvm_memory_slot *slot)
28052805
{
28062806
unsigned long hva;
2807-
pte_t *pte;
2808-
int level;
2807+
unsigned long flags;
2808+
int level = PG_LEVEL_4K;
2809+
pgd_t pgd;
2810+
p4d_t p4d;
2811+
pud_t pud;
2812+
pmd_t pmd;
28092813

28102814
if (!PageCompound(pfn_to_page(pfn)) && !kvm_is_zone_device_pfn(pfn))
28112815
return PG_LEVEL_4K;
@@ -2820,10 +2824,43 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
28202824
*/
28212825
hva = __gfn_to_hva_memslot(slot, gfn);
28222826

2823-
pte = lookup_address_in_mm(kvm->mm, hva, &level);
2824-
if (unlikely(!pte))
2825-
return PG_LEVEL_4K;
2827+
/*
2828+
* Lookup the mapping level in the current mm. The information
2829+
* may become stale soon, but it is safe to use as long as
2830+
* 1) mmu_notifier_retry was checked after taking mmu_lock, and
2831+
* 2) mmu_lock is taken now.
2832+
*
2833+
* We still need to disable IRQs to prevent concurrent tear down
2834+
* of page tables.
2835+
*/
2836+
local_irq_save(flags);
2837+
2838+
pgd = READ_ONCE(*pgd_offset(kvm->mm, hva));
2839+
if (pgd_none(pgd))
2840+
goto out;
2841+
2842+
p4d = READ_ONCE(*p4d_offset(&pgd, hva));
2843+
if (p4d_none(p4d) || !p4d_present(p4d))
2844+
goto out;
28262845

2846+
pud = READ_ONCE(*pud_offset(&p4d, hva));
2847+
if (pud_none(pud) || !pud_present(pud))
2848+
goto out;
2849+
2850+
if (pud_large(pud)) {
2851+
level = PG_LEVEL_1G;
2852+
goto out;
2853+
}
2854+
2855+
pmd = READ_ONCE(*pmd_offset(&pud, hva));
2856+
if (pmd_none(pmd) || !pmd_present(pmd))
2857+
goto out;
2858+
2859+
if (pmd_large(pmd))
2860+
level = PG_LEVEL_2M;
2861+
2862+
out:
2863+
local_irq_restore(flags);
28272864
return level;
28282865
}
28292866

@@ -2992,9 +3029,15 @@ static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fa
29923029
/*
29933030
* If MMIO caching is disabled, emulate immediately without
29943031
* touching the shadow page tables as attempting to install an
2995-
* MMIO SPTE will just be an expensive nop.
3032+
* MMIO SPTE will just be an expensive nop. Do not cache MMIO
3033+
whose gfn is greater than host.MAXPHYADDR; any guest that
3034+
* generates such gfns is running nested and is being tricked
3035+
* by L0 userspace (you can observe gfn > L1.MAXPHYADDR if
3036+
* and only if L1's MAXPHYADDR is inaccurate with respect to
3037+
* the hardware's).
29963038
*/
2997-
if (unlikely(!shadow_mmio_value)) {
3039+
if (unlikely(!shadow_mmio_value) ||
3040+
unlikely(fault->gfn > kvm_mmu_max_gfn())) {
29983041
*ret_val = RET_PF_EMULATE;
29993042
return true;
30003043
}

arch/x86/kvm/mmu/spte.h

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -201,12 +201,6 @@ static inline bool is_removed_spte(u64 spte)
201201
*/
202202
extern u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
203203

204-
/*
205-
* The number of non-reserved physical address bits irrespective of features
206-
* that repurpose legal bits, e.g. MKTME.
207-
*/
208-
extern u8 __read_mostly shadow_phys_bits;
209-
210204
static inline bool is_mmio_spte(u64 spte)
211205
{
212206
return (spte & shadow_mmio_mask) == shadow_mmio_value &&

arch/x86/kvm/mmu/tdp_mmu.c

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -815,22 +815,23 @@ static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
815815
return iter->yielded;
816816
}
817817

818-
static inline gfn_t tdp_mmu_max_gfn_host(void)
818+
static inline gfn_t tdp_mmu_max_gfn_exclusive(void)
819819
{
820820
/*
821-
* Bound TDP MMU walks at host.MAXPHYADDR, guest accesses beyond that
822-
* will hit a #PF(RSVD) and never hit an EPT Violation/Misconfig / #NPF,
823-
* and so KVM will never install a SPTE for such addresses.
821+
* Bound TDP MMU walks at host.MAXPHYADDR. KVM disallows memslots with
822+
* a gpa range that would exceed the max gfn, and KVM does not create
823+
* MMIO SPTEs for "impossible" gfns, instead sending such accesses down
824+
* the slow emulation path every time.
824825
*/
825-
return 1ULL << (shadow_phys_bits - PAGE_SHIFT);
826+
return kvm_mmu_max_gfn() + 1;
826827
}
827828

828829
static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
829830
bool shared, int zap_level)
830831
{
831832
struct tdp_iter iter;
832833

833-
gfn_t end = tdp_mmu_max_gfn_host();
834+
gfn_t end = tdp_mmu_max_gfn_exclusive();
834835
gfn_t start = 0;
835836

836837
for_each_tdp_pte_min_level(iter, root, zap_level, start, end) {
@@ -923,7 +924,7 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
923924
{
924925
struct tdp_iter iter;
925926

926-
end = min(end, tdp_mmu_max_gfn_host());
927+
end = min(end, tdp_mmu_max_gfn_exclusive());
927928

928929
lockdep_assert_held_write(&kvm->mmu_lock);
929930

arch/x86/kvm/x86.c

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10020,12 +10020,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
1002010020
if (kvm_check_request(KVM_REQ_HV_CRASH, vcpu)) {
1002110021
vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
1002210022
vcpu->run->system_event.type = KVM_SYSTEM_EVENT_CRASH;
10023+
vcpu->run->system_event.ndata = 0;
1002310024
r = 0;
1002410025
goto out;
1002510026
}
1002610027
if (kvm_check_request(KVM_REQ_HV_RESET, vcpu)) {
1002710028
vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
1002810029
vcpu->run->system_event.type = KVM_SYSTEM_EVENT_RESET;
10030+
vcpu->run->system_event.ndata = 0;
1002910031
r = 0;
1003010032
goto out;
1003110033
}
@@ -12009,8 +12011,12 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
1200912011
struct kvm_memory_slot *new,
1201012012
enum kvm_mr_change change)
1201112013
{
12012-
if (change == KVM_MR_CREATE || change == KVM_MR_MOVE)
12014+
if (change == KVM_MR_CREATE || change == KVM_MR_MOVE) {
12015+
if ((new->base_gfn + new->npages - 1) > kvm_mmu_max_gfn())
12016+
return -EINVAL;
12017+
1201312018
return kvm_alloc_memslot_metadata(kvm, new);
12019+
}
1201412020

1201512021
if (change == KVM_MR_FLAGS_ONLY)
1201612022
memcpy(&new->arch, &old->arch, sizeof(old->arch));

include/uapi/linux/kvm.h

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -445,7 +445,13 @@ struct kvm_run {
445445
#define KVM_SYSTEM_EVENT_RESET 2
446446
#define KVM_SYSTEM_EVENT_CRASH 3
447447
__u32 type;
448-
__u64 flags;
448+
__u32 ndata;
449+
union {
450+
#ifndef __KERNEL__
451+
__u64 flags;
452+
#endif
453+
__u64 data[16];
454+
};
449455
} system_event;
450456
/* KVM_EXIT_S390_STSI */
451457
struct {
@@ -1144,6 +1150,8 @@ struct kvm_ppc_resize_hpt {
11441150
#define KVM_CAP_S390_MEM_OP_EXTENSION 211
11451151
#define KVM_CAP_PMU_CAPABILITY 212
11461152
#define KVM_CAP_DISABLE_QUIRKS2 213
1153+
/* #define KVM_CAP_VM_TSC_CONTROL 214 */
1154+
#define KVM_CAP_SYSTEM_EVENT_DATA 215
11471155

11481156
#ifdef KVM_CAP_IRQ_ROUTING
11491157

virt/kvm/kvm_main.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4354,6 +4354,7 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
43544354
return 0;
43554355
#endif
43564356
case KVM_CAP_BINARY_STATS_FD:
4357+
case KVM_CAP_SYSTEM_EVENT_DATA:
43574358
return 1;
43584359
default:
43594360
break;

0 commit comments

Comments
 (0)