
Commit 451a707

dwmw2 authored and sean-jc committed
KVM: x86/xen: improve accuracy of Xen timers
A test program such as http://david.woodhou.se/timerlat.c confirms user
reports that timers are increasingly inaccurate as the lifetime of a
guest increases. Reporting the actual delay observed when asking for
100µs of sleep, it starts off OK on a newly-launched guest but gets
worse over time, giving incorrect sleep times:

root@ip-10-0-193-21:~# ./timerlat -c -n 5
00000000 latency 103243/100000 (3.2430%)
00000001 latency 103243/100000 (3.2430%)
00000002 latency 103242/100000 (3.2420%)
00000003 latency 103245/100000 (3.2450%)
00000004 latency 103245/100000 (3.2450%)

The biggest problem is that get_kvmclock_ns() returns inaccurate values
when the guest TSC is scaled. The guest sees a TSC value scaled from the
host TSC by a mul/shift conversion (hopefully done in hardware). The
guest then converts that guest TSC value into nanoseconds using the
mul/shift conversion given to it by the KVM pvclock information. But
get_kvmclock_ns() performs only a single conversion directly from host
TSC to nanoseconds, giving a different result. A test program at
http://david.woodhou.se/tsdrift.c demonstrates the cumulative error
over a day.

It's non-trivial to fix get_kvmclock_ns(), although I'll come back to
that. The actual guest hv_clock is per-CPU, and *theoretically* each
vCPU could be running at a *different* frequency. But this patch is
needed anyway because...

The other issue with Xen timers was that the code would snapshot the
host CLOCK_MONOTONIC at some point in time, and then... after a few
interrupts may have occurred, some preemption perhaps... would also
read the guest's kvmclock. Then it would proceed under the false
assumption that those two happened at the *same* time. Any time which
*actually* elapsed between reading the two clocks was introduced as
inaccuracy in the time at which the timer fired.

Fix it to use a variant of kvm_get_time_and_clockread(), which reads
the host TSC just *once*, then use the returned TSC value to calculate
the kvmclock (making sure to do so the way the guest would, instead of
repeating the mistake that get_kvmclock_ns() makes).

Sadly, hrtimers based on CLOCK_MONOTONIC_RAW are not supported, so Xen
timers still have to use CLOCK_MONOTONIC. In practice the difference
between the two won't matter over the timescales involved, as the
*absolute* values don't matter; only the delta does.

This does mean a new variant of kvm_get_time_and_clockread() is needed;
called kvm_get_monotonic_and_clockread() because that's what it does.

Fixes: 5363952 ("KVM: x86/xen: handle PV timers oneshot mode")
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Reviewed-by: Paul Durrant <paul@xen.org>
Link: https://lore.kernel.org/r/20240227115648.3104-2-dwmw2@infradead.org
[sean: massage moved comment, tweak if statement formatting]
Signed-off-by: Sean Christopherson <seanjc@google.com>
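[Editor's note] To make the systemic error concrete, here is a minimal
standalone C sketch (not kernel code; the scaling constants are made up
for a hypothetical 2.8GHz host running a guest whose TSC is scaled down
to 2.5GHz). It mimics the shift-then-multiply fixed-point conversion
that pvclock uses, and compares the two-step path the guest takes
(host TSC -> guest TSC -> nanoseconds) against the single direct
conversion that get_kvmclock_ns() performs:

#include <stdint.h>
#include <stdio.h>

/* Shift, then 32.32 fixed-point multiply, in the style of pvclock_scale_delta(). */
static uint64_t scale_delta(uint64_t delta, uint32_t mul_frac, int shift)
{
        if (shift < 0)
                delta >>= -shift;
        else
                delta <<= shift;

        return (uint64_t)(((__uint128_t)delta * mul_frac) >> 32);
}

int main(void)
{
        /* One day's worth of host TSC ticks at the hypothetical 2.8GHz. */
        uint64_t host_tsc = 2800000000ULL * 86400;

        /* Two-step, as the guest computes it: host TSC -> guest TSC -> ns. */
        uint64_t guest_tsc = scale_delta(host_tsc, 3834792229U, 0);  /* x 25/28 */
        uint64_t two_step = scale_delta(guest_tsc, 3435973837U, -1); /* 2.5GHz -> ns */

        /* One-step, as get_kvmclock_ns() computes it: host TSC -> ns. */
        uint64_t one_step = scale_delta(host_tsc, 3067833783U, -1);  /* 2.8GHz -> ns */

        printf("guest view %llu ns, host view %llu ns, drift %lld ns/day\n",
               (unsigned long long)two_step, (unsigned long long)one_step,
               (long long)(two_step - one_step));
        return 0;
}

Each fixed-point constant carries its own rounding error, so the
composed two-step conversion ticks at a very slightly different rate
than the one-step conversion; with these made-up constants the
divergence works out to roughly ten microseconds per day, and it grows
without bound for as long as the guest runs.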
1 parent 003d914 commit 451a707

File tree

3 files changed: +152 -40 lines


arch/x86/kvm/x86.c

Lines changed: 56 additions & 5 deletions

@@ -2862,7 +2862,11 @@ static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp,
         return v * clock->mult;
 }
 
-static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp)
+/*
+ * As with get_kvmclock_base_ns(), this counts from boot time, at the
+ * frequency of CLOCK_MONOTONIC_RAW (hence adding gtod->offs_boot).
+ */
+static int do_kvmclock_base(s64 *t, u64 *tsc_timestamp)
 {
         struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
         unsigned long seq;
@@ -2881,6 +2885,29 @@ static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp)
         return mode;
 }
 
+/*
+ * This calculates CLOCK_MONOTONIC at the time of the TSC snapshot, with
+ * no boot time offset.
+ */
+static int do_monotonic(s64 *t, u64 *tsc_timestamp)
+{
+        struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+        unsigned long seq;
+        int mode;
+        u64 ns;
+
+        do {
+                seq = read_seqcount_begin(&gtod->seq);
+                ns = gtod->clock.base_cycles;
+                ns += vgettsc(&gtod->clock, tsc_timestamp, &mode);
+                ns >>= gtod->clock.shift;
+                ns += ktime_to_ns(gtod->clock.offset);
+        } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
+        *t = ns;
+
+        return mode;
+}
+
 static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp)
 {
         struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
@@ -2902,18 +2929,42 @@ static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp)
         return mode;
 }
 
-/* returns true if host is using TSC based clocksource */
+/*
+ * Calculates the kvmclock_base_ns (CLOCK_MONOTONIC_RAW + boot time) and
+ * reports the TSC value from which it did so. Returns true if host is
+ * using TSC based clocksource.
+ */
 static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp)
 {
         /* checked again under seqlock below */
         if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
                 return false;
 
-        return gtod_is_based_on_tsc(do_monotonic_raw(kernel_ns,
-                                                      tsc_timestamp));
+        return gtod_is_based_on_tsc(do_kvmclock_base(kernel_ns,
+                                                     tsc_timestamp));
 }
 
-/* returns true if host is using TSC based clocksource */
+/*
+ * Calculates CLOCK_MONOTONIC and reports the TSC value from which it did
+ * so. Returns true if host is using TSC based clocksource.
+ */
+bool kvm_get_monotonic_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp)
+{
+        /* checked again under seqlock below */
+        if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
+                return false;
+
+        return gtod_is_based_on_tsc(do_monotonic(kernel_ns,
+                                                 tsc_timestamp));
+}
+
+/*
+ * Calculates CLOCK_REALTIME and reports the TSC value from which it did
+ * so. Returns true if host is using TSC based clocksource.
+ *
+ * DO NOT USE this for anything related to migration. You want CLOCK_TAI
+ * for that.
+ */
 static bool kvm_get_walltime_and_clockread(struct timespec64 *ts,
                                            u64 *tsc_timestamp)
 {

arch/x86/kvm/x86.h

Lines changed: 1 addition & 0 deletions

@@ -294,6 +294,7 @@ void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
 
 u64 get_kvmclock_ns(struct kvm *kvm);
 uint64_t kvm_get_wall_clock_epoch(struct kvm *kvm);
+bool kvm_get_monotonic_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp);
 
 int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
                         gva_t addr, void *val, unsigned int bytes,

arch/x86/kvm/xen.c

Lines changed: 95 additions & 35 deletions

@@ -24,6 +24,7 @@
 #include <xen/interface/sched.h>
 
 #include <asm/xen/cpuid.h>
+#include <asm/pvclock.h>
 
 #include "cpuid.h"
 #include "trace.h"
@@ -149,8 +150,93 @@ static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer)
         return HRTIMER_NORESTART;
 }
 
-static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs, s64 delta_ns)
+static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs,
+                                bool linux_wa)
 {
+        int64_t kernel_now, delta;
+        uint64_t guest_now;
+
+        /*
+         * The guest provides the requested timeout in absolute nanoseconds
+         * of the KVM clock - as *it* sees it, based on the scaled TSC and
+         * the pvclock information provided by KVM.
+         *
+         * The kernel doesn't support hrtimers based on CLOCK_MONOTONIC_RAW
+         * so use CLOCK_MONOTONIC. In the timescales covered by timers, the
+         * difference won't matter much as there is no cumulative effect.
+         *
+         * Calculate the time for some arbitrary point in time around "now"
+         * in terms of both kvmclock and CLOCK_MONOTONIC. Calculate the
+         * delta between the kvmclock "now" value and the guest's requested
+         * timeout, apply the "Linux workaround" described below, and add
+         * the resulting delta to the CLOCK_MONOTONIC "now" value, to get
+         * the absolute CLOCK_MONOTONIC time at which the timer should
+         * fire.
+         */
+        if (vcpu->arch.hv_clock.version && vcpu->kvm->arch.use_master_clock &&
+            static_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
+                uint64_t host_tsc, guest_tsc;
+
+                if (!IS_ENABLED(CONFIG_64BIT) ||
+                    !kvm_get_monotonic_and_clockread(&kernel_now, &host_tsc)) {
+                        /*
+                         * Don't fall back to get_kvmclock_ns() because it's
+                         * broken; it has a systemic error in its results
+                         * because it scales directly from host TSC to
+                         * nanoseconds, and doesn't scale first to guest TSC
+                         * and *then* to nanoseconds as the guest does.
+                         *
+                         * There is a small error introduced here because time
+                         * continues to elapse between the ktime_get() and the
+                         * subsequent rdtsc(). But not the systemic drift due
+                         * to get_kvmclock_ns().
+                         */
+                        kernel_now = ktime_get(); /* This is CLOCK_MONOTONIC */
+                        host_tsc = rdtsc();
+                }
+
+                /* Calculate the guest kvmclock as the guest would do it. */
+                guest_tsc = kvm_read_l1_tsc(vcpu, host_tsc);
+                guest_now = __pvclock_read_cycles(&vcpu->arch.hv_clock,
+                                                  guest_tsc);
+        } else {
+                /*
+                 * Without CONSTANT_TSC, get_kvmclock_ns() is the only option.
+                 *
+                 * Also if the guest PV clock hasn't been set up yet, as is
+                 * likely to be the case during migration when the vCPU has
+                 * not been run yet. It would be possible to calculate the
+                 * scaling factors properly in that case but there's not much
+                 * point in doing so. The get_kvmclock_ns() drift accumulates
+                 * over time, so it's OK to use it at startup. Besides, on
+                 * migration there's going to be a little bit of skew in the
+                 * precise moment at which timers fire anyway. Often they'll
+                 * be in the "past" by the time the VM is running again after
+                 * migration.
+                 */
+                guest_now = get_kvmclock_ns(vcpu->kvm);
+                kernel_now = ktime_get();
+        }
+
+        delta = guest_abs - guest_now;
+
+        /*
+         * Xen has a 'Linux workaround' in do_set_timer_op() which checks for
+         * negative absolute timeout values (caused by integer overflow), and
+         * for values about 13 days in the future (2^50ns) which would be
+         * caused by jiffies overflow. For those cases, Xen sets the timeout
+         * 100ms in the future (not *too* soon, since if a guest really did
+         * set a long timeout on purpose we don't want to keep churning CPU
+         * time by waking it up). Emulate Xen's workaround when starting the
+         * timer in response to __HYPERVISOR_set_timer_op.
+         */
+        if (linux_wa &&
+            unlikely((int64_t)guest_abs < 0 ||
+                     (delta > 0 && (uint32_t) (delta >> 50) != 0))) {
+                delta = 100 * NSEC_PER_MSEC;
+                guest_abs = guest_now + delta;
+        }
+
         /*
          * Avoid races with the old timer firing. Checking timer_expires
          * to avoid calling hrtimer_cancel() will only have false positives
@@ -162,14 +248,12 @@ static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs, s64 delta_
         atomic_set(&vcpu->arch.xen.timer_pending, 0);
         vcpu->arch.xen.timer_expires = guest_abs;
 
-        if (delta_ns <= 0) {
+        if (delta <= 0)
                 xen_timer_callback(&vcpu->arch.xen.timer);
-        } else {
-                ktime_t ktime_now = ktime_get();
+        else
                 hrtimer_start(&vcpu->arch.xen.timer,
-                              ktime_add_ns(ktime_now, delta_ns),
+                              ktime_add_ns(kernel_now, delta),
                               HRTIMER_MODE_ABS_HARD);
-        }
 }
 
 static void kvm_xen_stop_timer(struct kvm_vcpu *vcpu)
@@ -997,9 +1081,7 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
 
                 /* Start the timer if the new value has a valid vector+expiry. */
                 if (data->u.timer.port && data->u.timer.expires_ns)
-                        kvm_xen_start_timer(vcpu, data->u.timer.expires_ns,
-                                            data->u.timer.expires_ns -
-                                            get_kvmclock_ns(vcpu->kvm));
+                        kvm_xen_start_timer(vcpu, data->u.timer.expires_ns, false);
 
                 r = 0;
                 break;
@@ -1472,7 +1554,6 @@ static bool kvm_xen_hcall_vcpu_op(struct kvm_vcpu *vcpu, bool longmode, int cmd,
 {
         struct vcpu_set_singleshot_timer oneshot;
         struct x86_exception e;
-        s64 delta;
 
         if (!kvm_xen_timer_enabled(vcpu))
                 return false;
@@ -1506,9 +1587,7 @@ static bool kvm_xen_hcall_vcpu_op(struct kvm_vcpu *vcpu, bool longmode, int cmd,
                 return true;
         }
 
-        /* A delta <= 0 results in an immediate callback, which is what we want */
-        delta = oneshot.timeout_abs_ns - get_kvmclock_ns(vcpu->kvm);
-        kvm_xen_start_timer(vcpu, oneshot.timeout_abs_ns, delta);
+        kvm_xen_start_timer(vcpu, oneshot.timeout_abs_ns, false);
 
         *r = 0;
         return true;
@@ -1531,29 +1610,10 @@ static bool kvm_xen_hcall_set_timer_op(struct kvm_vcpu *vcpu, uint64_t timeout,
         if (!kvm_xen_timer_enabled(vcpu))
                 return false;
 
-        if (timeout) {
-                uint64_t guest_now = get_kvmclock_ns(vcpu->kvm);
-                int64_t delta = timeout - guest_now;
-
-                /* Xen has a 'Linux workaround' in do_set_timer_op() which
-                 * checks for negative absolute timeout values (caused by
-                 * integer overflow), and for values about 13 days in the
-                 * future (2^50ns) which would be caused by jiffies
-                 * overflow. For those cases, it sets the timeout 100ms in
-                 * the future (not *too* soon, since if a guest really did
-                 * set a long timeout on purpose we don't want to keep
-                 * churning CPU time by waking it up).
-                 */
-                if (unlikely((int64_t)timeout < 0 ||
-                             (delta > 0 && (uint32_t) (delta >> 50) != 0))) {
-                        delta = 100 * NSEC_PER_MSEC;
-                        timeout = guest_now + delta;
-                }
-
-                kvm_xen_start_timer(vcpu, timeout, delta);
-        } else {
+        if (timeout)
+                kvm_xen_start_timer(vcpu, timeout, true);
+        else
                 kvm_xen_stop_timer(vcpu);
-        }
 
         *r = 0;
         return true;
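[Editor's note] A quick sanity check on the "13 days" figure in the
Linux-workaround comment in kvm_xen_start_timer() above: the
(uint32_t)(delta >> 50) != 0 test fires once the delta reaches 2^50 ns,
which is indeed about 13 days. A trivial standalone C check (not kernel
code):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t threshold_ns = 1ULL << 50;     /* the workaround's cutoff */

        /* 2^50 ns / (1e9 ns per second * 86400 seconds per day) */
        printf("%.2f days\n", threshold_ns / 1e9 / 86400);      /* prints 13.03 */
        return 0;
}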
