
Commit e9025cd

Merge tag 'kvm-x86-pmu-6.9' of https://github.com/kvm-x86/linux into HEAD
KVM x86 PMU changes for 6.9:

- Fix several bugs where KVM speciously prevents the guest from utilizing
  fixed counters and architectural event encodings based on whether or not
  guest CPUID reports support for the _architectural_ encoding.

- Fix a variety of bugs in KVM's emulation of RDPMC, e.g. for "fast" reads,
  priority of VMX interception vs. #GP, PMC types in architectural PMUs, etc.

- Add a selftest to verify KVM correctly emulates RDPMC, counter availability,
  and a variety of other PMC-related behaviors that depend on guest CPUID,
  i.e. are difficult to validate via KVM-Unit-Tests.

- Zero out PMU metadata on AMD if the virtual PMU is disabled to avoid wasting
  cycles, e.g. when checking if a PMC event needs to be synthesized when
  skipping an instruction.

- Optimize triggering of emulated events, e.g. for "count instructions" events
  when skipping an instruction, which yields a ~10% performance improvement in
  VM-Exit microbenchmarks when a vPMU is exposed to the guest.

- Tighten the check for "PMI in guest" to reduce false positives if an NMI
  arrives in the host while KVM is handling an IRQ VM-Exit.
2 parents b00471a + 812d432 commit e9025cd

23 files changed: +1262 −398 lines

arch/x86/include/asm/kvm-x86-pmu-ops.h

Lines changed: 1 addition & 3 deletions
@@ -12,11 +12,9 @@ BUILD_BUG_ON(1)
  * a NULL definition, for example if "static_call_cond()" will be used
  * at the call sites.
  */
-KVM_X86_PMU_OP(hw_event_available)
-KVM_X86_PMU_OP(pmc_idx_to_pmc)
 KVM_X86_PMU_OP(rdpmc_ecx_to_pmc)
 KVM_X86_PMU_OP(msr_idx_to_pmc)
-KVM_X86_PMU_OP(is_valid_rdpmc_ecx)
+KVM_X86_PMU_OP_OPTIONAL(check_rdpmc_early)
 KVM_X86_PMU_OP(is_valid_msr)
 KVM_X86_PMU_OP(get_msr)
 KVM_X86_PMU_OP(set_msr)
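
kvm-x86-pmu-ops.h is an X-macro list: each includer defines KVM_X86_PMU_OP() (and may rely on KVM_X86_PMU_OP_OPTIONAL() for hooks a vendor module is allowed to leave NULL, invoked via static_call_cond() per the header comment) before including it, so one list of op names expands into struct members, static-call declarations, and build-time checks. A rough standalone illustration of that pattern only; the macro names and expansions below are hypothetical, not the kernel's:

#include <stdio.h>

/*
 * Sketch of the X-macro pattern: a single list of op names expands into
 * both a struct of function pointers and a table of printable names.
 */
#define PMU_OP_LIST(OP)      \
        OP(rdpmc_ecx_to_pmc) \
        OP(msr_idx_to_pmc)   \
        OP(is_valid_msr)

/* Expansion 1: a struct of function pointers (stand-in for kvm_pmu_ops). */
#define DECLARE_OP(name) int (*name)(int arg);
struct pmu_ops {
        PMU_OP_LIST(DECLARE_OP)
};
#undef DECLARE_OP

/* Expansion 2: a table of op names, e.g. for debugging or tracing. */
#define OP_NAME(name) #name,
static const char *pmu_op_names[] = { PMU_OP_LIST(OP_NAME) };
#undef OP_NAME

int main(void)
{
        for (size_t i = 0; i < sizeof(pmu_op_names) / sizeof(pmu_op_names[0]); i++)
                printf("op %zu: %s\n", i, pmu_op_names[i]);
        return 0;
}

The mandatory/optional distinction is omitted here; in KVM it decides whether a NULL hook is a bug or is simply skipped.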

arch/x86/include/asm/kvm_host.h

Lines changed: 10 additions & 1 deletion
@@ -536,6 +536,7 @@ struct kvm_pmc {
 #define KVM_PMC_MAX_FIXED 3
 #define MSR_ARCH_PERFMON_FIXED_CTR_MAX (MSR_ARCH_PERFMON_FIXED_CTR0 + KVM_PMC_MAX_FIXED - 1)
 #define KVM_AMD_PMC_MAX_GENERIC 6
+
 struct kvm_pmu {
 	u8 version;
 	unsigned nr_arch_gp_counters;
@@ -1889,8 +1890,16 @@ static inline int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn,
 }
 #endif /* CONFIG_HYPERV */
 
+enum kvm_intr_type {
+	/* Values are arbitrary, but must be non-zero. */
+	KVM_HANDLING_IRQ = 1,
+	KVM_HANDLING_NMI,
+};
+
+/* Enable perf NMI and timer modes to work, and minimise false positives. */
 #define kvm_arch_pmi_in_guest(vcpu) \
-	((vcpu) && (vcpu)->arch.handling_intr_from_guest)
+	((vcpu) && (vcpu)->arch.handling_intr_from_guest && \
+	 (!!in_nmi() == ((vcpu)->arch.handling_intr_from_guest == KVM_HANDLING_NMI)))
 
 void __init kvm_mmu_x86_module_init(void);
 int kvm_mmu_vendor_module_init(void);
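
The new enum plus the extra clause in kvm_arch_pmi_in_guest() mean a PMI is attributed to the guest only when the host context matches what KVM recorded at VM-Exit: an NMI counts only while KVM is handling an NMI from the guest, an IRQ only while it is handling an IRQ. A standalone truth-table sketch of that predicate (plain userspace C with made-up helper names, not the kernel macro):

#include <stdbool.h>
#include <stdio.h>

/* Mirrors enum kvm_intr_type above; 0 means "not handling a guest intr". */
enum intr_type { NOT_HANDLING = 0, HANDLING_IRQ = 1, HANDLING_NMI = 2 };

/*
 * Sketch of the tightened check: the vCPU must be handling an interrupt
 * from the guest, and the host context (NMI vs. not) must match the type
 * KVM recorded, so a host NMI landing during IRQ handling is not counted.
 */
static bool pmi_in_guest(enum intr_type handling, bool host_in_nmi)
{
        return handling != NOT_HANDLING &&
               (host_in_nmi == (handling == HANDLING_NMI));
}

int main(void)
{
        const char *names[] = { "not handling", "handling IRQ", "handling NMI" };

        for (int h = NOT_HANDLING; h <= HANDLING_NMI; h++)
                for (int nmi = 0; nmi <= 1; nmi++)
                        printf("%-13s, host in %s -> %s\n", names[h],
                               nmi ? "NMI" : "IRQ",
                               pmi_in_guest(h, nmi) ? "guest PMI" : "not guest");
        return 0;
}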

arch/x86/kvm/emulate.c

Lines changed: 1 addition & 1 deletion
@@ -3955,7 +3955,7 @@ static int check_rdpmc(struct x86_emulate_ctxt *ctxt)
 	 * protected mode.
 	 */
 	if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) ||
-	    ctxt->ops->check_pmc(ctxt, rcx))
+	    ctxt->ops->check_rdpmc_early(ctxt, rcx))
 		return emulate_gp(ctxt, 0);
 
 	return X86EMUL_CONTINUE;

arch/x86/kvm/kvm_emulate.h

Lines changed: 1 addition & 1 deletion
@@ -208,7 +208,7 @@ struct x86_emulate_ops {
 	int (*set_msr_with_filter)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data);
 	int (*get_msr_with_filter)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata);
 	int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata);
-	int (*check_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc);
+	int (*check_rdpmc_early)(struct x86_emulate_ctxt *ctxt, u32 pmc);
 	int (*read_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc, u64 *pdata);
 	void (*halt)(struct x86_emulate_ctxt *ctxt);
 	void (*wbinvd)(struct x86_emulate_ctxt *ctxt);

arch/x86/kvm/pmu.c

Lines changed: 103 additions & 60 deletions
@@ -29,6 +29,9 @@
 struct x86_pmu_capability __read_mostly kvm_pmu_cap;
 EXPORT_SYMBOL_GPL(kvm_pmu_cap);
 
+struct kvm_pmu_emulated_event_selectors __read_mostly kvm_pmu_eventsel;
+EXPORT_SYMBOL_GPL(kvm_pmu_eventsel);
+
 /* Precise Distribution of Instructions Retired (PDIR) */
 static const struct x86_cpu_id vmx_pebs_pdir_cpu[] = {
 	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, NULL),
@@ -67,7 +70,7 @@ static const struct x86_cpu_id vmx_pebs_pdist_cpu[] = {
  * all perf counters (both gp and fixed). The mapping relationship
  * between pmc and perf counters is as the following:
  * * Intel: [0 .. KVM_INTEL_PMC_MAX_GENERIC-1] <=> gp counters
- *          [INTEL_PMC_IDX_FIXED .. INTEL_PMC_IDX_FIXED + 2] <=> fixed
+ *          [KVM_FIXED_PMC_BASE_IDX .. KVM_FIXED_PMC_BASE_IDX + 2] <=> fixed
  * * AMD:   [0 .. AMD64_NUM_COUNTERS-1] and, for families 15H
  *          and later, [0 .. AMD64_NUM_COUNTERS_CORE-1] <=> gp counters
  */
@@ -411,7 +414,7 @@ static bool is_gp_event_allowed(struct kvm_x86_pmu_event_filter *f,
 static bool is_fixed_event_allowed(struct kvm_x86_pmu_event_filter *filter,
 				   int idx)
 {
-	int fixed_idx = idx - INTEL_PMC_IDX_FIXED;
+	int fixed_idx = idx - KVM_FIXED_PMC_BASE_IDX;
 
 	if (filter->action == KVM_PMU_EVENT_DENY &&
 	    test_bit(fixed_idx, (ulong *)&filter->fixed_counter_bitmap))
@@ -441,11 +444,10 @@ static bool check_pmu_event_filter(struct kvm_pmc *pmc)
 static bool pmc_event_is_allowed(struct kvm_pmc *pmc)
 {
 	return pmc_is_globally_enabled(pmc) && pmc_speculative_in_use(pmc) &&
-	       static_call(kvm_x86_pmu_hw_event_available)(pmc) &&
 	       check_pmu_event_filter(pmc);
 }
 
-static void reprogram_counter(struct kvm_pmc *pmc)
+static int reprogram_counter(struct kvm_pmc *pmc)
 {
 	struct kvm_pmu *pmu = pmc_to_pmu(pmc);
 	u64 eventsel = pmc->eventsel;
@@ -456,7 +458,7 @@ static void reprogram_counter(struct kvm_pmc *pmc)
 	emulate_overflow = pmc_pause_counter(pmc);
 
 	if (!pmc_event_is_allowed(pmc))
-		goto reprogram_complete;
+		return 0;
 
 	if (emulate_overflow)
 		__kvm_perf_overflow(pmc, false);
@@ -466,7 +468,7 @@ static void reprogram_counter(struct kvm_pmc *pmc)
 
 	if (pmc_is_fixed(pmc)) {
 		fixed_ctr_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl,
-						  pmc->idx - INTEL_PMC_IDX_FIXED);
+						  pmc->idx - KVM_FIXED_PMC_BASE_IDX);
 		if (fixed_ctr_ctrl & 0x1)
 			eventsel |= ARCH_PERFMON_EVENTSEL_OS;
 		if (fixed_ctr_ctrl & 0x2)
@@ -477,43 +479,45 @@ static void reprogram_counter(struct kvm_pmc *pmc)
 	}
 
 	if (pmc->current_config == new_config && pmc_resume_counter(pmc))
-		goto reprogram_complete;
+		return 0;
 
 	pmc_release_perf_event(pmc);
 
 	pmc->current_config = new_config;
 
-	/*
-	 * If reprogramming fails, e.g. due to contention, leave the counter's
-	 * regprogram bit set, i.e. opportunistically try again on the next PMU
-	 * refresh.  Don't make a new request as doing so can stall the guest
-	 * if reprogramming repeatedly fails.
-	 */
-	if (pmc_reprogram_counter(pmc, PERF_TYPE_RAW,
-				  (eventsel & pmu->raw_event_mask),
-				  !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
-				  !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
-				  eventsel & ARCH_PERFMON_EVENTSEL_INT))
-		return;
-
-reprogram_complete:
-	clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi);
+	return pmc_reprogram_counter(pmc, PERF_TYPE_RAW,
+				     (eventsel & pmu->raw_event_mask),
+				     !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
+				     !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
+				     eventsel & ARCH_PERFMON_EVENTSEL_INT);
 }
 
 void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
 {
+	DECLARE_BITMAP(bitmap, X86_PMC_IDX_MAX);
 	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+	struct kvm_pmc *pmc;
 	int bit;
 
-	for_each_set_bit(bit, pmu->reprogram_pmi, X86_PMC_IDX_MAX) {
-		struct kvm_pmc *pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, bit);
+	bitmap_copy(bitmap, pmu->reprogram_pmi, X86_PMC_IDX_MAX);
 
-		if (unlikely(!pmc)) {
-			clear_bit(bit, pmu->reprogram_pmi);
-			continue;
-		}
+	/*
+	 * The reprogramming bitmap can be written asynchronously by something
+	 * other than the task that holds vcpu->mutex, take care to clear only
+	 * the bits that will actually processed.
+	 */
+	BUILD_BUG_ON(sizeof(bitmap) != sizeof(atomic64_t));
+	atomic64_andnot(*(s64 *)bitmap, &pmu->__reprogram_pmi);
 
-		reprogram_counter(pmc);
+	kvm_for_each_pmc(pmu, pmc, bit, bitmap) {
+		/*
+		 * If reprogramming fails, e.g. due to contention, re-set the
+		 * regprogram bit set, i.e. opportunistically try again on the
+		 * next PMU refresh.  Don't make a new request as doing so can
+		 * stall the guest if reprogramming repeatedly fails.
+		 */
+		if (reprogram_counter(pmc))
+			set_bit(pmc->idx, pmu->reprogram_pmi);
 	}
 
 	/*
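
The rework above snapshots reprogram_pmi and then atomically clears only the snapshotted bits, so bits set concurrently by other tasks are preserved and a failed reprogram can safely re-set its own bit. A self-contained C11 sketch of that snapshot-and-clear idiom on a 64-bit bitmap (illustration only; the kernel does this with atomic64_andnot() on the union'd __reprogram_pmi field):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Pending-work bitmap, written concurrently by producers. */
static _Atomic uint64_t pending;

/* Sketch: returns nonzero on failure so the caller can retry later. */
static int process_bit(int bit)
{
        return bit == 3;        /* pretend bit 3 fails, e.g. due to contention */
}

static void handle_pending(void)
{
        /* Snapshot the bitmap, then clear exactly the bits we snapshotted. */
        uint64_t snap = atomic_load(&pending);

        atomic_fetch_and(&pending, ~snap);

        for (int bit = 0; bit < 64; bit++) {
                if (!(snap & (1ull << bit)))
                        continue;
                /* On failure, re-set only this bit to try again next round. */
                if (process_bit(bit))
                        atomic_fetch_or(&pending, 1ull << bit);
        }
}

int main(void)
{
        atomic_fetch_or(&pending, (1ull << 1) | (1ull << 3) | (1ull << 5));
        handle_pending();
        printf("still pending: %#llx\n",
               (unsigned long long)atomic_load(&pending)); /* 0x8: bit 3 retries */
        return 0;
}
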
@@ -525,10 +529,20 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
 	kvm_pmu_cleanup(vcpu);
 }
 
-/* check if idx is a valid index to access PMU */
-bool kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx)
+int kvm_pmu_check_rdpmc_early(struct kvm_vcpu *vcpu, unsigned int idx)
 {
-	return static_call(kvm_x86_pmu_is_valid_rdpmc_ecx)(vcpu, idx);
+	/*
+	 * On Intel, VMX interception has priority over RDPMC exceptions that
+	 * aren't already handled by the emulator, i.e. there are no additional
+	 * check needed for Intel PMUs.
+	 *
+	 * On AMD, _all_ exceptions on RDPMC have priority over SVM intercepts,
+	 * i.e. an invalid PMC results in a #GP, not #VMEXIT.
+	 */
+	if (!kvm_pmu_ops.check_rdpmc_early)
+		return 0;
+
+	return static_call(kvm_x86_pmu_check_rdpmc_early)(vcpu, idx);
 }
 
 bool is_vmware_backdoor_pmc(u32 pmc_idx)
@@ -567,10 +581,9 @@ static int kvm_pmu_rdpmc_vmware(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
 
 int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
 {
-	bool fast_mode = idx & (1u << 31);
 	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
 	struct kvm_pmc *pmc;
-	u64 mask = fast_mode ? ~0u : ~0ull;
+	u64 mask = ~0ull;
 
 	if (!pmu->version)
 		return 1;
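
With fast_mode gone, this generic RDPMC path no longer truncates the emulated read to 32 bits when bit 31 of the index is set; it always applies a full 64-bit mask. A tiny before/after sketch of the masking change (standalone C, illustrative values; the kernel's handling of the VMware backdoor PMCs lives in the separate helper above):

#include <stdint.h>
#include <stdio.h>

/* Illustrative counter value with bits set above bit 31. */
#define COUNTER_VALUE 0x0000123456789abcULL

/* Old behavior: ECX bit 31 ("fast" read) truncated the result to 32 bits. */
static uint64_t rdpmc_old(uint32_t idx, uint64_t counter)
{
        int fast_mode = idx & (1u << 31);
        uint64_t mask = fast_mode ? ~0u : ~0ull;

        return counter & mask;
}

/* New behavior: the generic path always returns the full 64-bit value. */
static uint64_t rdpmc_new(uint32_t idx, uint64_t counter)
{
        (void)idx;
        return counter & ~0ull;
}

int main(void)
{
        uint32_t fast_idx = (1u << 31) | 0;     /* "fast" read of PMC0 */

        printf("old: %#llx\n", (unsigned long long)rdpmc_old(fast_idx, COUNTER_VALUE));
        printf("new: %#llx\n", (unsigned long long)rdpmc_new(fast_idx, COUNTER_VALUE));
        return 0;
}
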
@@ -716,11 +729,7 @@ static void kvm_pmu_reset(struct kvm_vcpu *vcpu)
 
 	bitmap_zero(pmu->reprogram_pmi, X86_PMC_IDX_MAX);
 
-	for_each_set_bit(i, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX) {
-		pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i);
-		if (!pmc)
-			continue;
-
+	kvm_for_each_pmc(pmu, pmc, i, pmu->all_valid_pmc_idx) {
 		pmc_stop_counter(pmc);
 		pmc->counter = 0;
 		pmc->emulated_counter = 0;
@@ -741,6 +750,8 @@ static void kvm_pmu_reset(struct kvm_vcpu *vcpu)
  */
 void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
 {
+	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+
 	if (KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm))
 		return;
 
@@ -750,8 +761,22 @@ void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
 	 */
 	kvm_pmu_reset(vcpu);
 
-	bitmap_zero(vcpu_to_pmu(vcpu)->all_valid_pmc_idx, X86_PMC_IDX_MAX);
-	static_call(kvm_x86_pmu_refresh)(vcpu);
+	pmu->version = 0;
+	pmu->nr_arch_gp_counters = 0;
+	pmu->nr_arch_fixed_counters = 0;
+	pmu->counter_bitmask[KVM_PMC_GP] = 0;
+	pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
+	pmu->reserved_bits = 0xffffffff00200000ull;
+	pmu->raw_event_mask = X86_RAW_EVENT_MASK;
+	pmu->global_ctrl_mask = ~0ull;
+	pmu->global_status_mask = ~0ull;
+	pmu->fixed_ctr_ctrl_mask = ~0ull;
+	pmu->pebs_enable_mask = ~0ull;
+	pmu->pebs_data_cfg_mask = ~0ull;
+	bitmap_zero(pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX);
+
+	if (vcpu->kvm->arch.enable_pmu)
+		static_call(kvm_x86_pmu_refresh)(vcpu);
 }
 
 void kvm_pmu_init(struct kvm_vcpu *vcpu)
@@ -776,10 +801,8 @@ void kvm_pmu_cleanup(struct kvm_vcpu *vcpu)
 	bitmap_andnot(bitmask, pmu->all_valid_pmc_idx,
 		      pmu->pmc_in_use, X86_PMC_IDX_MAX);
 
-	for_each_set_bit(i, bitmask, X86_PMC_IDX_MAX) {
-		pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i);
-
-		if (pmc && pmc->perf_event && !pmc_speculative_in_use(pmc))
+	kvm_for_each_pmc(pmu, pmc, i, bitmask) {
+		if (pmc->perf_event && !pmc_speculative_in_use(pmc))
 			pmc_stop_counter(pmc);
 	}
 
@@ -799,13 +822,6 @@ static void kvm_pmu_incr_counter(struct kvm_pmc *pmc)
 	kvm_pmu_request_counter_reprogram(pmc);
 }
 
-static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc,
-					     unsigned int perf_hw_id)
-{
-	return !((pmc->eventsel ^ perf_get_hw_event_config(perf_hw_id)) &
-		 AMD64_RAW_EVENT_MASK_NB);
-}
-
 static inline bool cpl_is_matched(struct kvm_pmc *pmc)
 {
 	bool select_os, select_user;
@@ -817,29 +833,56 @@ static inline bool cpl_is_matched(struct kvm_pmc *pmc)
 		select_user = config & ARCH_PERFMON_EVENTSEL_USR;
 	} else {
 		config = fixed_ctrl_field(pmc_to_pmu(pmc)->fixed_ctr_ctrl,
-					  pmc->idx - INTEL_PMC_IDX_FIXED);
+					  pmc->idx - KVM_FIXED_PMC_BASE_IDX);
 		select_os = config & 0x1;
 		select_user = config & 0x2;
 	}
 
+	/*
+	 * Skip the CPL lookup, which isn't free on Intel, if the result will
+	 * be the same regardless of the CPL.
+	 */
+	if (select_os == select_user)
+		return select_os;
+
 	return (static_call(kvm_x86_get_cpl)(pmc->vcpu) == 0) ? select_os : select_user;
 }
 
-void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id)
+void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 eventsel)
 {
+	DECLARE_BITMAP(bitmap, X86_PMC_IDX_MAX);
 	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
 	struct kvm_pmc *pmc;
 	int i;
 
-	for_each_set_bit(i, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX) {
-		pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i);
+	BUILD_BUG_ON(sizeof(pmu->global_ctrl) * BITS_PER_BYTE != X86_PMC_IDX_MAX);
 
-		if (!pmc || !pmc_event_is_allowed(pmc))
+	if (!kvm_pmu_has_perf_global_ctrl(pmu))
+		bitmap_copy(bitmap, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX);
+	else if (!bitmap_and(bitmap, pmu->all_valid_pmc_idx,
+			     (unsigned long *)&pmu->global_ctrl, X86_PMC_IDX_MAX))
+		return;
+
+	kvm_for_each_pmc(pmu, pmc, i, bitmap) {
+		/*
+		 * Ignore checks for edge detect (all events currently emulated
+		 * but KVM are always rising edges), pin control (unsupported
+		 * by modern CPUs), and counter mask and its invert flag (KVM
+		 * doesn't emulate multiple events in a single clock cycle).
+		 *
+		 * Note, the uppermost nibble of AMD's mask overlaps Intel's
+		 * IN_TX (bit 32) and IN_TXCP (bit 33), as well as two reserved
+		 * bits (bits 35:34).  Checking the "in HLE/RTM transaction"
+		 * flags is correct as the vCPU can't be in a transaction if
+		 * KVM is emulating an instruction.  Checking the reserved bits
+		 * might be wrong if they are defined in the future, but so
+		 * could ignoring them, so do the simple thing for now.
+		 */
+		if (((pmc->eventsel ^ eventsel) & AMD64_RAW_EVENT_MASK_NB) ||
+		    !pmc_event_is_allowed(pmc) || !cpl_is_matched(pmc))
 			continue;
 
-		/* Ignore checks for edge detect, pin control, invert and CMASK bits */
-		if (eventsel_match_perf_hw_id(pmc, perf_hw_id) && cpl_is_matched(pmc))
-			kvm_pmu_incr_counter(pmc);
+		kvm_pmu_incr_counter(pmc);
 	}
 }
 EXPORT_SYMBOL_GPL(kvm_pmu_trigger_event);
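
kvm_pmu_trigger_event() now takes a raw event selector and counts a PMC only if its event select and unit mask match the emulated event, deliberately ignoring edge detect, pin control, cmask and invert as the comment explains. A standalone sketch of that XOR-and-mask comparison (the mask constant below is my reading of AMD64_RAW_EVENT_MASK_NB, i.e. event select bits 7:0 and 35:32 plus unit mask bits 15:8; treat it, and the example selectors, as illustrative):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Event select lives in bits 7:0 plus 35:32, the unit mask in bits 15:8;
 * nothing else participates in the match.
 */
#define EVENTSEL_EVENT_AND_UMASK 0x0f0000ffffULL

/* A bit that is deliberately NOT compared (edge detect). */
#define EVENTSEL_EDGE (1ULL << 18)

/* Match if event select and unit mask agree; ignore everything else. */
static bool eventsel_matches(uint64_t pmc_eventsel, uint64_t emulated_eventsel)
{
        return !((pmc_eventsel ^ emulated_eventsel) & EVENTSEL_EVENT_AND_UMASK);
}

int main(void)
{
        uint64_t insn_retired = 0x00c0;                /* event 0xc0, umask 0x00 */
        uint64_t pmc_a = insn_retired | EVENTSEL_EDGE; /* extra edge-detect bit  */
        uint64_t pmc_b = 0x01c0;                       /* different unit mask    */

        printf("pmc_a matches: %d\n", eventsel_matches(pmc_a, insn_retired)); /* 1 */
        printf("pmc_b matches: %d\n", eventsel_matches(pmc_b, insn_retired)); /* 0 */
        return 0;
}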
