Skip to content

Commit fcce7c1

Browse files
committed
Merge tag 'kvm-x86-pvclock-6.15' of https://github.com/kvm-x86/linux into HEAD
KVM PV clock changes for 6.15:

- Don't take kvm->lock when iterating over vCPUs in the suspend notifier to fix a largely theoretical deadlock.

- Use the vCPU's actual Xen PV clock information when starting the Xen timer, as the cached state in arch.hv_clock can be stale/bogus.

- Fix a bug where KVM could bleed PVCLOCK_GUEST_STOPPED across different PV clocks.

- Restrict PVCLOCK_GUEST_STOPPED to kvmclock, as KVM's suspend notifier only accounts for kvmclock, and there's no evidence that the flag is actually supported by Xen guests.

- Clean up the per-vCPU "cache" of its reference pvclock, and instead only track the vCPU's TSC scaling (multiplier+shift) metadata (which is moderately expensive to compute, and rarely changes for modern setups).
2 parents 9b093f5 + 1b3c380 commit fcce7c1

File tree

3 files changed

+121
-66
lines changed

3 files changed

+121
-66
lines changed

arch/x86/include/asm/kvm_host.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -911,7 +911,8 @@ struct kvm_vcpu_arch {
911911
int (*complete_userspace_io)(struct kvm_vcpu *vcpu);
912912

913913
gpa_t time;
914-
struct pvclock_vcpu_time_info hv_clock;
914+
s8 pvclock_tsc_shift;
915+
u32 pvclock_tsc_mul;
915916
unsigned int hw_tsc_khz;
916917
struct gfn_to_pfn_cache pv_time;
917918
/* set guest stopped flag in pvclock flags field */

arch/x86/kvm/x86.c

Lines changed: 57 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -3121,15 +3121,17 @@ u64 get_kvmclock_ns(struct kvm *kvm)
31213121
return data.clock;
31223122
}
31233123

3124-
static void kvm_setup_guest_pvclock(struct kvm_vcpu *v,
3124+
static void kvm_setup_guest_pvclock(struct pvclock_vcpu_time_info *ref_hv_clock,
3125+
struct kvm_vcpu *vcpu,
31253126
struct gfn_to_pfn_cache *gpc,
3126-
unsigned int offset,
3127-
bool force_tsc_unstable)
3127+
unsigned int offset)
31283128
{
3129-
struct kvm_vcpu_arch *vcpu = &v->arch;
31303129
struct pvclock_vcpu_time_info *guest_hv_clock;
3130+
struct pvclock_vcpu_time_info hv_clock;
31313131
unsigned long flags;
31323132

3133+
memcpy(&hv_clock, ref_hv_clock, sizeof(hv_clock));
3134+
31333135
read_lock_irqsave(&gpc->lock, flags);
31343136
while (!kvm_gpc_check(gpc, offset + sizeof(*guest_hv_clock))) {
31353137
read_unlock_irqrestore(&gpc->lock, flags);
@@ -3149,52 +3151,34 @@ static void kvm_setup_guest_pvclock(struct kvm_vcpu *v,
31493151
* it is consistent.
31503152
*/
31513153

3152-
guest_hv_clock->version = vcpu->hv_clock.version = (guest_hv_clock->version + 1) | 1;
3154+
guest_hv_clock->version = hv_clock.version = (guest_hv_clock->version + 1) | 1;
31533155
smp_wmb();
31543156

31553157
/* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
3156-
vcpu->hv_clock.flags |= (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED);
3157-
3158-
if (vcpu->pvclock_set_guest_stopped_request) {
3159-
vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED;
3160-
vcpu->pvclock_set_guest_stopped_request = false;
3161-
}
3162-
3163-
memcpy(guest_hv_clock, &vcpu->hv_clock, sizeof(*guest_hv_clock));
3158+
hv_clock.flags |= (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED);
31643159

3165-
if (force_tsc_unstable)
3166-
guest_hv_clock->flags &= ~PVCLOCK_TSC_STABLE_BIT;
3160+
memcpy(guest_hv_clock, &hv_clock, sizeof(*guest_hv_clock));
31673161

31683162
smp_wmb();
31693163

3170-
guest_hv_clock->version = ++vcpu->hv_clock.version;
3164+
guest_hv_clock->version = ++hv_clock.version;
31713165

31723166
kvm_gpc_mark_dirty_in_slot(gpc);
31733167
read_unlock_irqrestore(&gpc->lock, flags);
31743168

3175-
trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
3169+
trace_kvm_pvclock_update(vcpu->vcpu_id, &hv_clock);
31763170
}
31773171

31783172
static int kvm_guest_time_update(struct kvm_vcpu *v)
31793173
{
3174+
struct pvclock_vcpu_time_info hv_clock = {};
31803175
unsigned long flags, tgt_tsc_khz;
31813176
unsigned seq;
31823177
struct kvm_vcpu_arch *vcpu = &v->arch;
31833178
struct kvm_arch *ka = &v->kvm->arch;
31843179
s64 kernel_ns;
31853180
u64 tsc_timestamp, host_tsc;
3186-
u8 pvclock_flags;
31873181
bool use_master_clock;
3188-
#ifdef CONFIG_KVM_XEN
3189-
/*
3190-
* For Xen guests we may need to override PVCLOCK_TSC_STABLE_BIT as unless
3191-
* explicitly told to use TSC as its clocksource Xen will not set this bit.
3192-
* This default behaviour led to bugs in some guest kernels which cause
3193-
* problems if they observe PVCLOCK_TSC_STABLE_BIT in the pvclock flags.
3194-
*/
3195-
bool xen_pvclock_tsc_unstable =
3196-
ka->xen_hvm_config.flags & KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE;
3197-
#endif
31983182

31993183
kernel_ns = 0;
32003184
host_tsc = 0;
@@ -3255,35 +3239,58 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
32553239

32563240
if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
32573241
kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
3258-
&vcpu->hv_clock.tsc_shift,
3259-
&vcpu->hv_clock.tsc_to_system_mul);
3242+
&vcpu->pvclock_tsc_shift,
3243+
&vcpu->pvclock_tsc_mul);
32603244
vcpu->hw_tsc_khz = tgt_tsc_khz;
32613245
kvm_xen_update_tsc_info(v);
32623246
}
32633247

3264-
vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
3265-
vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
3248+
hv_clock.tsc_shift = vcpu->pvclock_tsc_shift;
3249+
hv_clock.tsc_to_system_mul = vcpu->pvclock_tsc_mul;
3250+
hv_clock.tsc_timestamp = tsc_timestamp;
3251+
hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
32663252
vcpu->last_guest_tsc = tsc_timestamp;
32673253

32683254
/* If the host uses TSC clocksource, then it is stable */
3269-
pvclock_flags = 0;
3255+
hv_clock.flags = 0;
32703256
if (use_master_clock)
3271-
pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
3257+
hv_clock.flags |= PVCLOCK_TSC_STABLE_BIT;
3258+
3259+
if (vcpu->pv_time.active) {
3260+
/*
3261+
* GUEST_STOPPED is only supported by kvmclock, and KVM's
3262+
* historic behavior is to only process the request if kvmclock
3263+
* is active/enabled.
3264+
*/
3265+
if (vcpu->pvclock_set_guest_stopped_request) {
3266+
hv_clock.flags |= PVCLOCK_GUEST_STOPPED;
3267+
vcpu->pvclock_set_guest_stopped_request = false;
3268+
}
3269+
kvm_setup_guest_pvclock(&hv_clock, v, &vcpu->pv_time, 0);
32723270

3273-
vcpu->hv_clock.flags = pvclock_flags;
3271+
hv_clock.flags &= ~PVCLOCK_GUEST_STOPPED;
3272+
}
3273+
3274+
kvm_hv_setup_tsc_page(v->kvm, &hv_clock);
32743275

3275-
if (vcpu->pv_time.active)
3276-
kvm_setup_guest_pvclock(v, &vcpu->pv_time, 0, false);
32773276
#ifdef CONFIG_KVM_XEN
3277+
/*
3278+
* For Xen guests we may need to override PVCLOCK_TSC_STABLE_BIT as unless
3279+
* explicitly told to use TSC as its clocksource Xen will not set this bit.
3280+
* This default behaviour led to bugs in some guest kernels which cause
3281+
* problems if they observe PVCLOCK_TSC_STABLE_BIT in the pvclock flags.
3282+
*
3283+
* Note! Clear TSC_STABLE only for Xen clocks, i.e. the order matters!
3284+
*/
3285+
if (ka->xen_hvm_config.flags & KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE)
3286+
hv_clock.flags &= ~PVCLOCK_TSC_STABLE_BIT;
3287+
32783288
if (vcpu->xen.vcpu_info_cache.active)
3279-
kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_info_cache,
3280-
offsetof(struct compat_vcpu_info, time),
3281-
xen_pvclock_tsc_unstable);
3289+
kvm_setup_guest_pvclock(&hv_clock, v, &vcpu->xen.vcpu_info_cache,
3290+
offsetof(struct compat_vcpu_info, time));
32823291
if (vcpu->xen.vcpu_time_info_cache.active)
3283-
kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_time_info_cache, 0,
3284-
xen_pvclock_tsc_unstable);
3292+
kvm_setup_guest_pvclock(&hv_clock, v, &vcpu->xen.vcpu_time_info_cache, 0);
32853293
#endif
3286-
kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
32873294
return 0;
32883295
}
32893296

@@ -6910,23 +6917,15 @@ static int kvm_arch_suspend_notifier(struct kvm *kvm)
69106917
{
69116918
struct kvm_vcpu *vcpu;
69126919
unsigned long i;
6913-
int ret = 0;
6914-
6915-
mutex_lock(&kvm->lock);
6916-
kvm_for_each_vcpu(i, vcpu, kvm) {
6917-
if (!vcpu->arch.pv_time.active)
6918-
continue;
69196920

6920-
ret = kvm_set_guest_paused(vcpu);
6921-
if (ret) {
6922-
kvm_err("Failed to pause guest VCPU%d: %d\n",
6923-
vcpu->vcpu_id, ret);
6924-
break;
6925-
}
6926-
}
6927-
mutex_unlock(&kvm->lock);
6921+
/*
6922+
* Ignore the return, marking the guest paused only "fails" if the vCPU
6923+
* isn't using kvmclock; continuing on is correct and desirable.
6924+
*/
6925+
kvm_for_each_vcpu(i, vcpu, kvm)
6926+
(void)kvm_set_guest_paused(vcpu);
69286927

6929-
return ret ? NOTIFY_BAD : NOTIFY_DONE;
6928+
return NOTIFY_DONE;
69306929
}
69316930

69326931
int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state)

arch/x86/kvm/xen.c

Lines changed: 62 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -150,11 +150,46 @@ static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer)
150150
return HRTIMER_NORESTART;
151151
}
152152

153+
static int xen_get_guest_pvclock(struct kvm_vcpu *vcpu,
154+
struct pvclock_vcpu_time_info *hv_clock,
155+
struct gfn_to_pfn_cache *gpc,
156+
unsigned int offset)
157+
{
158+
unsigned long flags;
159+
int r;
160+
161+
read_lock_irqsave(&gpc->lock, flags);
162+
while (!kvm_gpc_check(gpc, offset + sizeof(*hv_clock))) {
163+
read_unlock_irqrestore(&gpc->lock, flags);
164+
165+
r = kvm_gpc_refresh(gpc, offset + sizeof(*hv_clock));
166+
if (r)
167+
return r;
168+
169+
read_lock_irqsave(&gpc->lock, flags);
170+
}
171+
172+
memcpy(hv_clock, gpc->khva + offset, sizeof(*hv_clock));
173+
read_unlock_irqrestore(&gpc->lock, flags);
174+
175+
/*
176+
* Sanity check TSC shift+multiplier to verify the guest's view of time
177+
* is more or less consistent.
178+
*/
179+
if (hv_clock->tsc_shift != vcpu->arch.pvclock_tsc_shift ||
180+
hv_clock->tsc_to_system_mul != vcpu->arch.pvclock_tsc_mul)
181+
return -EINVAL;
182+
183+
return 0;
184+
}
185+
153186
static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs,
154187
bool linux_wa)
155188
{
189+
struct kvm_vcpu_xen *xen = &vcpu->arch.xen;
156190
int64_t kernel_now, delta;
157191
uint64_t guest_now;
192+
int r = -EOPNOTSUPP;
158193

159194
/*
160195
* The guest provides the requested timeout in absolute nanoseconds
@@ -173,10 +208,29 @@ static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs,
173208
* the absolute CLOCK_MONOTONIC time at which the timer should
174209
* fire.
175210
*/
176-
if (vcpu->arch.hv_clock.version && vcpu->kvm->arch.use_master_clock &&
177-
static_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
211+
do {
212+
struct pvclock_vcpu_time_info hv_clock;
178213
uint64_t host_tsc, guest_tsc;
179214

215+
if (!static_cpu_has(X86_FEATURE_CONSTANT_TSC) ||
216+
!vcpu->kvm->arch.use_master_clock)
217+
break;
218+
219+
/*
220+
* If both Xen PV clocks are active, arbitrarily try to use the
221+
* compat clock first, but also try to use the non-compat clock
222+
* if the compat clock is unusable. The two PV clocks hold the
223+
* same information, but it's possible one (or both) is stale
224+
* and/or currently unreachable.
225+
*/
226+
if (xen->vcpu_info_cache.active)
227+
r = xen_get_guest_pvclock(vcpu, &hv_clock, &xen->vcpu_info_cache,
228+
offsetof(struct compat_vcpu_info, time));
229+
if (r && xen->vcpu_time_info_cache.active)
230+
r = xen_get_guest_pvclock(vcpu, &hv_clock, &xen->vcpu_time_info_cache, 0);
231+
if (r)
232+
break;
233+
180234
if (!IS_ENABLED(CONFIG_64BIT) ||
181235
!kvm_get_monotonic_and_clockread(&kernel_now, &host_tsc)) {
182236
/*
@@ -197,9 +251,10 @@ static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs,
197251

198252
/* Calculate the guest kvmclock as the guest would do it. */
199253
guest_tsc = kvm_read_l1_tsc(vcpu, host_tsc);
200-
guest_now = __pvclock_read_cycles(&vcpu->arch.hv_clock,
201-
guest_tsc);
202-
} else {
254+
guest_now = __pvclock_read_cycles(&hv_clock, guest_tsc);
255+
} while (0);
256+
257+
if (r) {
203258
/*
204259
* Without CONSTANT_TSC, get_kvmclock_ns() is the only option.
205260
*
@@ -2261,8 +2316,8 @@ void kvm_xen_update_tsc_info(struct kvm_vcpu *vcpu)
22612316

22622317
entry = kvm_find_cpuid_entry_index(vcpu, function, 1);
22632318
if (entry) {
2264-
entry->ecx = vcpu->arch.hv_clock.tsc_to_system_mul;
2265-
entry->edx = vcpu->arch.hv_clock.tsc_shift;
2319+
entry->ecx = vcpu->arch.pvclock_tsc_mul;
2320+
entry->edx = vcpu->arch.pvclock_tsc_shift;
22662321
}
22672322

22682323
entry = kvm_find_cpuid_entry_index(vcpu, function, 2);

0 commit comments

Comments
 (0)