
Commit 1c3bed8

Merge tag 'kvm-x86-fixes-6.9-rcN' of https://github.com/kvm-x86/linux into HEAD
- Fix a mostly benign bug in the gfn_to_pfn_cache infrastructure where KVM would allow userspace to refresh the cache with a bogus GPA. The bug has existed for quite some time, but was exposed by a new sanity check added in 6.9 (to ensure a cache is either GPA-based or HVA-based).

- Drop an unused param from gfn_to_pfn_cache_invalidate_start() that got left behind during a 6.9 cleanup.

- Disable support for virtualizing adaptive PEBS, as KVM's implementation is architecturally broken and can leak host LBRs to the guest.

- Fix a bug where KVM neglects to set the enable bits for general purpose counters in PERF_GLOBAL_CTRL when initializing the virtual PMU. Both Intel and AMD architectures require the bits to be set at RESET in order for v2 PMUs to be backwards compatible with software that was written for v1 PMUs, i.e. for software that will never manually set the global enables.

- Disable LBR virtualization on CPUs that don't support LBR callstacks, as KVM unconditionally uses PERF_SAMPLE_BRANCH_CALL_STACK when creating the virtual LBR perf event, i.e. KVM will always fail to create LBR events on such CPUs.

- Fix a math goof in x86's hugepage logic for KVM_SET_MEMORY_ATTRIBUTES that results in an array overflow (detected by KASAN).

- Fix a flaw in the max_guest_memory selftest that results in it exhausting the supply of ucall structures when run with more than 256 vCPUs.

- Mark KVM_MEM_READONLY as supported for RISC-V in set_memory_region_test.

- Fix a bug where KVM incorrectly thinks a TDP MMU root is an indirect shadow root due to KVM unnecessarily clobbering root_role.direct when userspace sets guest CPUID.

- Fix a dirty logging bug where KVM fails to write-protect TDP MMU SPTEs used for L2 if Page-Modification Logging is enabled for L1 and the L1 hypervisor is NOT using EPT (if nEPT is enabled, KVM doesn't use the TDP MMU to run L2). For simplicity, KVM always disables PML when running L2, but the TDP MMU wasn't accounting for root-specific conditions that force write-protect based dirty logging.
2 parents: 49ff3b4 + eefb85b

15 files changed: +194 / -89 lines


arch/x86/events/intel/lbr.c

Lines changed: 1 addition & 0 deletions
@@ -1693,6 +1693,7 @@ void x86_perf_get_lbr(struct x86_pmu_lbr *lbr)
 	lbr->from = x86_pmu.lbr_from;
 	lbr->to = x86_pmu.lbr_to;
 	lbr->info = x86_pmu.lbr_info;
+	lbr->has_callstack = x86_pmu_has_lbr_callstack();
 }
 EXPORT_SYMBOL_GPL(x86_perf_get_lbr);

arch/x86/include/asm/perf_event.h

Lines changed: 1 addition & 0 deletions
@@ -555,6 +555,7 @@ struct x86_pmu_lbr {
 	unsigned int	from;
 	unsigned int	to;
 	unsigned int	info;
+	bool		has_callstack;
 };

 extern void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap);

arch/x86/kvm/mmu/mmu.c

Lines changed: 5 additions & 4 deletions
@@ -5576,9 +5576,9 @@ void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
 	 * that problem is swept under the rug; KVM's CPUID API is horrific and
 	 * it's all but impossible to solve it without introducing a new API.
 	 */
-	vcpu->arch.root_mmu.root_role.word = 0;
-	vcpu->arch.guest_mmu.root_role.word = 0;
-	vcpu->arch.nested_mmu.root_role.word = 0;
+	vcpu->arch.root_mmu.root_role.invalid = 1;
+	vcpu->arch.guest_mmu.root_role.invalid = 1;
+	vcpu->arch.nested_mmu.root_role.invalid = 1;
 	vcpu->arch.root_mmu.cpu_role.ext.valid = 0;
 	vcpu->arch.guest_mmu.cpu_role.ext.valid = 0;
 	vcpu->arch.nested_mmu.cpu_role.ext.valid = 0;
@@ -7399,7 +7399,8 @@ bool kvm_arch_post_set_memory_attributes(struct kvm *kvm,
 		 * by the memslot, KVM can't use a hugepage due to the
 		 * misaligned address regardless of memory attributes.
 		 */
-		if (gfn >= slot->base_gfn) {
+		if (gfn >= slot->base_gfn &&
+		    gfn + nr_pages <= slot->base_gfn + slot->npages) {
 			if (hugepage_has_attrs(kvm, slot, gfn, level, attrs))
 				hugepage_clear_mixed(slot, gfn, level);
 			else
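
The second hunk above is the KVM_SET_MEMORY_ATTRIBUTES array-overflow fix: the old code checked only the lower bound, so a hugepage-sized range that starts inside the memslot but runs past its end was still used to index the slot's hugepage metadata. A minimal userspace sketch of the corrected containment check follows; struct slot and the example numbers are made up for illustration, and only the comparison mirrors the kernel change.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for a memslot: the GFN range [base_gfn, base_gfn + npages). */
struct slot {
	uint64_t base_gfn;
	uint64_t npages;
};

/* Old check: lower bound only, so a range may run off the end of the slot. */
static bool contained_buggy(const struct slot *s, uint64_t gfn, uint64_t nr_pages)
{
	(void)nr_pages;	/* the range length was ignored entirely */
	return gfn >= s->base_gfn;
}

/* Fixed check: the range must also end at or before the end of the slot. */
static bool contained_fixed(const struct slot *s, uint64_t gfn, uint64_t nr_pages)
{
	return gfn >= s->base_gfn &&
	       gfn + nr_pages <= s->base_gfn + s->npages;
}

int main(void)
{
	const struct slot s = { .base_gfn = 0x1000, .npages = 0x200 };
	/* 512 pages (one 2MiB hugepage's worth) starting inside the slot but ending past it. */
	const uint64_t gfn = 0x1100, nr_pages = 0x200;

	printf("buggy: %d, fixed: %d\n",
	       contained_buggy(&s, gfn, nr_pages),
	       contained_fixed(&s, gfn, nr_pages));
	return 0;
}

The buggy variant accepts the out-of-bounds range (it prints "buggy: 1, fixed: 0"), which is the case that let the kernel index past the end of the per-slot arrays and triggered KASAN.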

arch/x86/kvm/mmu/tdp_mmu.c

Lines changed: 22 additions & 29 deletions
@@ -1548,17 +1548,21 @@ void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
 	}
 }

-/*
- * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
- * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
- * If AD bits are not enabled, this will require clearing the writable bit on
- * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
- * be flushed.
- */
+static bool tdp_mmu_need_write_protect(struct kvm_mmu_page *sp)
+{
+	/*
+	 * All TDP MMU shadow pages share the same role as their root, aside
+	 * from level, so it is valid to key off any shadow page to determine if
+	 * write protection is needed for an entire tree.
+	 */
+	return kvm_mmu_page_ad_need_write_protect(sp) || !kvm_ad_enabled();
+}
+
 static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
 				  gfn_t start, gfn_t end)
 {
-	u64 dbit = kvm_ad_enabled() ? shadow_dirty_mask : PT_WRITABLE_MASK;
+	const u64 dbit = tdp_mmu_need_write_protect(root) ? PT_WRITABLE_MASK :
+							    shadow_dirty_mask;
 	struct tdp_iter iter;
 	bool spte_set = false;

@@ -1573,7 +1577,7 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
 			continue;

-		KVM_MMU_WARN_ON(kvm_ad_enabled() &&
+		KVM_MMU_WARN_ON(dbit == shadow_dirty_mask &&
 				spte_ad_need_write_protect(iter.old_spte));

 		if (!(iter.old_spte & dbit))
@@ -1590,11 +1594,9 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
 }

 /*
- * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
- * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
- * If AD bits are not enabled, this will require clearing the writable bit on
- * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
- * be flushed.
+ * Clear the dirty status (D-bit or W-bit) of all the SPTEs mapping GFNs in the
+ * memslot. Returns true if an SPTE has been changed and the TLBs need to be
+ * flushed.
  */
 bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
 				  const struct kvm_memory_slot *slot)
@@ -1610,18 +1612,11 @@ bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
 	return spte_set;
 }

-/*
- * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
- * set in mask, starting at gfn. The given memslot is expected to contain all
- * the GFNs represented by set bits in the mask. If AD bits are enabled,
- * clearing the dirty status will involve clearing the dirty bit on each SPTE
- * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
- */
 static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
 				  gfn_t gfn, unsigned long mask, bool wrprot)
 {
-	u64 dbit = (wrprot || !kvm_ad_enabled()) ? PT_WRITABLE_MASK :
-						   shadow_dirty_mask;
+	const u64 dbit = (wrprot || tdp_mmu_need_write_protect(root)) ? PT_WRITABLE_MASK :
+									shadow_dirty_mask;
 	struct tdp_iter iter;

 	lockdep_assert_held_write(&kvm->mmu_lock);
@@ -1633,7 +1628,7 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
 		if (!mask)
 			break;

-		KVM_MMU_WARN_ON(kvm_ad_enabled() &&
+		KVM_MMU_WARN_ON(dbit == shadow_dirty_mask &&
 				spte_ad_need_write_protect(iter.old_spte));

 		if (iter.level > PG_LEVEL_4K ||
@@ -1659,11 +1654,9 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
 }

 /*
- * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
- * set in mask, starting at gfn. The given memslot is expected to contain all
- * the GFNs represented by set bits in the mask. If AD bits are enabled,
- * clearing the dirty status will involve clearing the dirty bit on each SPTE
- * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
+ * Clear the dirty status (D-bit or W-bit) of all the 4k SPTEs mapping GFNs for
+ * which a bit is set in mask, starting at gfn. The given memslot is expected to
+ * contain all the GFNs represented by set bits in the mask.
  */
 void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
 				       struct kvm_memory_slot *slot,
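
The upshot of these hunks: the bit that gets cleared is now chosen per root rather than from the global kvm_ad_enabled() setting, so a root whose role demands write-protect-based dirty logging (the nested PML-without-EPT case from the merge description) clears the W-bit even though A/D bits are enabled globally. Below is a rough, self-contained illustration of that selection logic, with made-up mask values and a plain boolean standing in for the root-role query; none of these helpers are the real KVM ones.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical mask values, purely for illustration. */
#define PT_WRITABLE_MASK_DEMO   (1ull << 1)
#define SHADOW_DIRTY_MASK_DEMO  (1ull << 9)

/*
 * Stand-in for tdp_mmu_need_write_protect(): true when this tree must use
 * write protection for dirty logging even though A/D bits are enabled
 * globally (e.g. the root's role says A/D bits are disabled for it).
 */
static bool root_needs_write_protect(bool ad_enabled, bool root_ad_disabled)
{
	return root_ad_disabled || !ad_enabled;
}

/* Mirrors the shape of the fixed clear_dirty_pt_masked() selection. */
static uint64_t pick_dirty_bit(bool ad_enabled, bool root_ad_disabled, bool wrprot)
{
	return (wrprot || root_needs_write_protect(ad_enabled, root_ad_disabled))
		? PT_WRITABLE_MASK_DEMO : SHADOW_DIRTY_MASK_DEMO;
}

int main(void)
{
	/* A/D bits on globally, but this root must be write-protected (the bug case). */
	uint64_t bit = pick_dirty_bit(true, true, false);
	printf("clearing %s\n", bit == PT_WRITABLE_MASK_DEMO ? "W-bit" : "D-bit");

	/* A/D bits on and the root can use them: clear the D-bit as before. */
	bit = pick_dirty_bit(true, false, false);
	printf("clearing %s\n", bit == PT_WRITABLE_MASK_DEMO ? "W-bit" : "D-bit");
	return 0;
}

The first call models the previously broken case and now selects the W-bit, which is what write-protect-based dirty logging requires.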

arch/x86/kvm/pmu.c

Lines changed: 14 additions & 2 deletions
@@ -775,8 +775,20 @@ void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
 	pmu->pebs_data_cfg_mask = ~0ull;
 	bitmap_zero(pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX);

-	if (vcpu->kvm->arch.enable_pmu)
-		static_call(kvm_x86_pmu_refresh)(vcpu);
+	if (!vcpu->kvm->arch.enable_pmu)
+		return;
+
+	static_call(kvm_x86_pmu_refresh)(vcpu);
+
+	/*
+	 * At RESET, both Intel and AMD CPUs set all enable bits for general
+	 * purpose counters in IA32_PERF_GLOBAL_CTRL (so that software that
+	 * was written for v1 PMUs don't unknowingly leave GP counters disabled
+	 * in the global controls). Emulate that behavior when refreshing the
+	 * PMU so that userspace doesn't need to manually set PERF_GLOBAL_CTRL.
+	 */
+	if (kvm_pmu_has_perf_global_ctrl(pmu) && pmu->nr_arch_gp_counters)
+		pmu->global_ctrl = GENMASK_ULL(pmu->nr_arch_gp_counters - 1, 0);
 }

 void kvm_pmu_init(struct kvm_vcpu *vcpu)
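
For reference, the new global_ctrl value is just a mask with the low nr_arch_gp_counters bits set, matching the architectural RESET value of IA32_PERF_GLOBAL_CTRL described in the merge message. A tiny userspace sketch of the computation, using a local genmask_ull() helper in place of the kernel's GENMASK_ULL() macro (the counter count here is arbitrary):

#include <stdint.h>
#include <stdio.h>

/* Userspace stand-in for GENMASK_ULL(h, l): bits l..h set, everything else clear. */
static uint64_t genmask_ull(unsigned int h, unsigned int l)
{
	return ((~0ull) >> (63 - h)) & ((~0ull) << l);
}

int main(void)
{
	/* e.g. 8 general purpose counters => enable bits 0..7 at "RESET". */
	unsigned int nr_arch_gp_counters = 8;
	uint64_t global_ctrl = genmask_ull(nr_arch_gp_counters - 1, 0);

	printf("global_ctrl = 0x%llx\n", (unsigned long long)global_ctrl);
	return 0;
}

With eight general purpose counters the result is 0xff, i.e. bits 0-7 enabled and the fixed-counter enable bits left clear.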

arch/x86/kvm/vmx/pmu_intel.c

Lines changed: 1 addition & 1 deletion
@@ -535,7 +535,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
 	perf_capabilities = vcpu_get_perf_capabilities(vcpu);
 	if (cpuid_model_is_consistent(vcpu) &&
 	    (perf_capabilities & PMU_CAP_LBR_FMT))
-		x86_perf_get_lbr(&lbr_desc->records);
+		memcpy(&lbr_desc->records, &vmx_lbr_caps, sizeof(vmx_lbr_caps));
 	else
 		lbr_desc->records.nr = 0;

arch/x86/kvm/vmx/vmx.c

Lines changed: 35 additions & 6 deletions
@@ -218,6 +218,8 @@ module_param(ple_window_max, uint, 0444);
 int __read_mostly pt_mode = PT_MODE_SYSTEM;
 module_param(pt_mode, int, S_IRUGO);

+struct x86_pmu_lbr __ro_after_init vmx_lbr_caps;
+
 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
 static DEFINE_MUTEX(vmx_l1d_flush_mutex);
@@ -7862,10 +7864,9 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
 	vmx_update_exception_bitmap(vcpu);
 }

-static u64 vmx_get_perf_capabilities(void)
+static __init u64 vmx_get_perf_capabilities(void)
 {
 	u64 perf_cap = PMU_CAP_FW_WRITES;
-	struct x86_pmu_lbr lbr;
 	u64 host_perf_cap = 0;

 	if (!enable_pmu)
@@ -7875,15 +7876,43 @@
 		rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap);

 	if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) {
-		x86_perf_get_lbr(&lbr);
-		if (lbr.nr)
+		x86_perf_get_lbr(&vmx_lbr_caps);
+
+		/*
+		 * KVM requires LBR callstack support, as the overhead due to
+		 * context switching LBRs without said support is too high.
+		 * See intel_pmu_create_guest_lbr_event() for more info.
+		 */
+		if (!vmx_lbr_caps.has_callstack)
+			memset(&vmx_lbr_caps, 0, sizeof(vmx_lbr_caps));
+		else if (vmx_lbr_caps.nr)
 			perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT;
 	}

 	if (vmx_pebs_supported()) {
 		perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK;
-		if ((perf_cap & PERF_CAP_PEBS_FORMAT) < 4)
-			perf_cap &= ~PERF_CAP_PEBS_BASELINE;
+
+		/*
+		 * Disallow adaptive PEBS as it is functionally broken, can be
+		 * used by the guest to read *host* LBRs, and can be used to
+		 * bypass userspace event filters. To correctly and safely
+		 * support adaptive PEBS, KVM needs to:
+		 *
+		 * 1. Account for the ADAPTIVE flag when (re)programming fixed
+		 *    counters.
+		 *
+		 * 2. Gain support from perf (or take direct control of counter
+		 *    programming) to support events without adaptive PEBS
+		 *    enabled for the hardware counter.
+		 *
+		 * 3. Ensure LBR MSRs cannot hold host data on VM-Entry with
+		 *    adaptive PEBS enabled and MSR_PEBS_DATA_CFG.LBRS=1.
+		 *
+		 * 4. Document which PMU events are effectively exposed to the
+		 *    guest via adaptive PEBS, and make adaptive PEBS mutually
+		 *    exclusive with KVM_SET_PMU_EVENT_FILTER if necessary.
+		 */
+		perf_cap &= ~PERF_CAP_PEBS_BASELINE;
 	}

 	return perf_cap;
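
Taken together, vmx_get_perf_capabilities() now snapshots the host LBR capabilities once at init, drops them entirely when callstack mode is missing, and never advertises the adaptive PEBS (baseline) capability. A rough userspace sketch of that filtering pattern follows, with a trimmed-down struct and hypothetical capability bits standing in for the real MSR_IA32_PERF_CAPABILITIES layout; it only models the shape of the gating, not KVM's actual code.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical, trimmed-down mirror of struct x86_pmu_lbr for illustration only. */
struct lbr_caps {
	unsigned int nr;
	bool has_callstack;
};

/* Hypothetical capability bits, not the real MSR layout. */
#define CAP_FW_WRITES      (1ull << 13)
#define CAP_LBR_FMT        0x3full
#define CAP_PEBS_BASELINE  (1ull << 14)

static uint64_t filter_perf_caps(uint64_t host_cap, struct lbr_caps *lbr)
{
	uint64_t cap = CAP_FW_WRITES;

	/* No callstack support: advertise no LBRs at all (too costly to context switch). */
	if (!lbr->has_callstack)
		memset(lbr, 0, sizeof(*lbr));
	else if (lbr->nr)
		cap |= host_cap & CAP_LBR_FMT;

	/* Adaptive PEBS (the "baseline" bit) is never exposed. */
	cap &= ~CAP_PEBS_BASELINE;
	return cap;
}

int main(void)
{
	struct lbr_caps lbr = { .nr = 32, .has_callstack = false };
	uint64_t cap = filter_perf_caps(CAP_LBR_FMT | CAP_PEBS_BASELINE, &lbr);

	printf("caps = 0x%llx, lbr.nr = %u\n", (unsigned long long)cap, lbr.nr);
	return 0;
}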

arch/x86/kvm/vmx/vmx.h

Lines changed: 5 additions & 1 deletion
@@ -15,6 +15,7 @@
 #include "vmx_ops.h"
 #include "../cpuid.h"
 #include "run_flags.h"
+#include "../mmu.h"

 #define MSR_TYPE_R	1
 #define MSR_TYPE_W	2
@@ -109,6 +110,8 @@ struct lbr_desc {
 	bool msr_passthrough;
 };

+extern struct x86_pmu_lbr vmx_lbr_caps;
+
 /*
  * The nested_vmx structure is part of vcpu_vmx, and holds information we need
  * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
@@ -719,7 +722,8 @@ static inline bool vmx_need_pf_intercept(struct kvm_vcpu *vcpu)
 	if (!enable_ept)
 		return true;

-	return allow_smaller_maxphyaddr && cpuid_maxphyaddr(vcpu) < boot_cpu_data.x86_phys_bits;
+	return allow_smaller_maxphyaddr &&
+	       cpuid_maxphyaddr(vcpu) < kvm_get_shadow_phys_bits();
 }

 static inline bool is_unrestricted_guest(struct kvm_vcpu *vcpu)

tools/testing/selftests/kvm/max_guest_memory_test.c

Lines changed: 6 additions & 9 deletions
@@ -22,10 +22,11 @@ static void guest_code(uint64_t start_gpa, uint64_t end_gpa, uint64_t stride)
 {
 	uint64_t gpa;

-	for (gpa = start_gpa; gpa < end_gpa; gpa += stride)
-		*((volatile uint64_t *)gpa) = gpa;
-
-	GUEST_DONE();
+	for (;;) {
+		for (gpa = start_gpa; gpa < end_gpa; gpa += stride)
+			*((volatile uint64_t *)gpa) = gpa;
+		GUEST_SYNC(0);
+	}
 }

 struct vcpu_info {
@@ -55,7 +56,7 @@ static void rendezvous_with_boss(void)
 static void run_vcpu(struct kvm_vcpu *vcpu)
 {
 	vcpu_run(vcpu);
-	TEST_ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_DONE);
+	TEST_ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_SYNC);
 }

 static void *vcpu_worker(void *data)
@@ -64,17 +65,13 @@ static void *vcpu_worker(void *data)
 	struct kvm_vcpu *vcpu = info->vcpu;
 	struct kvm_vm *vm = vcpu->vm;
 	struct kvm_sregs sregs;
-	struct kvm_regs regs;

 	vcpu_args_set(vcpu, 3, info->start_gpa, info->end_gpa, vm->page_size);

-	/* Snapshot regs before the first run. */
-	vcpu_regs_get(vcpu, &regs);
 	rendezvous_with_boss();

 	run_vcpu(vcpu);
 	rendezvous_with_boss();
-	vcpu_regs_set(vcpu, &regs);
 	vcpu_sregs_get(vcpu, &sregs);
 #ifdef __x86_64__
 	/* Toggle CR0.WP to trigger a MMU context reset. */

tools/testing/selftests/kvm/set_memory_region_test.c

Lines changed: 1 addition & 1 deletion
@@ -333,7 +333,7 @@ static void test_invalid_memory_region_flags(void)
 	struct kvm_vm *vm;
 	int r, i;

-#if defined __aarch64__ || defined __x86_64__
+#if defined __aarch64__ || defined __riscv || defined __x86_64__
 	supported_flags |= KVM_MEM_READONLY;
 #endif
