Skip to content

Commit b71f980

Browse files
KAGA-KOKO authored and Ingo Molnar committed
timekeeping: Prevent coarse clocks going backwards
Lei Chen raised an issue with CLOCK_MONOTONIC_COARSE seeing time inconsistencies. Lei tracked down that this was being caused by the adjustment: tk->tkr_mono.xtime_nsec -= offset; which is made to compensate for the unaccumulated cycles in offset when the multiplicator is adjusted forward, so that the non-_COARSE clockids don't see inconsistencies. However, the _COARSE clockid getter functions use the adjusted xtime_nsec value directly and do not compensate the negative offset via the clocksource delta multiplied with the new multiplicator. In that case the caller can observe time going backwards in consecutive calls. By design, this negative adjustment should be fine, because the logic run from timekeeping_adjust() is done after it accumulated approximately multiplicator * interval_cycles into xtime_nsec. The accumulated value is always larger than the mult_adj * offset value, which is subtracted from xtime_nsec. Both operations are done together under the tk_core.lock, so the net change to xtime_nsec is always positive. However, do_adjtimex() calls into timekeeping_advance() as well, to apply the NTP frequency adjustment immediately. In this case, timekeeping_advance() does not return early when the offset is smaller than interval_cycles. In that case there is no time accumulated into xtime_nsec. But the subsequent call into timekeeping_adjust(), which modifies the multiplicator, subtracts from xtime_nsec to correct for the new multiplicator. Here, because there was no accumulation, xtime_nsec becomes smaller than before, which opens a window up to the next accumulation, where the _COARSE clockid getters, which don't compensate for the offset, can observe the inconsistency. 
An attempt was made to fix this by forwarding the timekeeper in the case that adjtimex() adjusts the multiplier, which resets the offset to zero: 757b000 ("timekeeping: Fix possible inconsistencies in _COARSE clockids") That works correctly, but unfortunately causes a regression on the adjtimex() side. There are two issues: 1) The forwarding of the base time moves the update out of the original period and establishes a new one. 2) The clearing of the accumulated NTP error is changing the behaviour as well. User-space expects that multiplier/frequency updates are in effect when the syscall returns, so delaying the update to the next tick is not solving the problem either. Commit 757b000 was reverted so that the established expectations of user space implementations (ntpd, chronyd) are restored, but that obviously brought the inconsistencies back. One of the initial approaches to fix this was to establish a separate storage for the coarse time getter nanoseconds part by calculating it from the offset. That was dropped because not having yet another piece of state to maintain was simpler. But given the result of the above exercise, this solution turns out to be the right one. Bring it back in a slightly modified form. Thus introduce timekeeper::coarse_nsec and store that nanoseconds part in it, switch the time getter functions and the VDSO update to use that value. coarse_nsec is set on operations which forward or initialize the timekeeper and after time was accumulated during a tick. If there is no accumulation the timestamp is unchanged. This leaves the adjtimex() behaviour unmodified and prevents coarse time from going backwards. 
[ jstultz: Simplified the coarse_nsec calculation and kept behavior so coarse clockids aren't adjusted on each inter-tick adjtimex call, slightly reworked the comments and commit message ] Fixes: da15cfd ("time: Introduce CLOCK_REALTIME_COARSE") Reported-by: Lei Chen <lei.chen@smartx.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: John Stultz <jstultz@google.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Ingo Molnar <mingo@kernel.org> Link: https://lore.kernel.org/all/20250419054706.2319105-1-jstultz@google.com Closes: https://lore.kernel.org/lkml/20250310030004.3705801-1-lei.chen@smartx.com/
1 parent b443265 commit b71f980

File tree

3 files changed

+49
-13
lines changed

3 files changed

+49
-13
lines changed

include/linux/timekeeper_internal.h

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ struct tk_read_base {
5151
* @offs_real: Offset clock monotonic -> clock realtime
5252
* @offs_boot: Offset clock monotonic -> clock boottime
5353
* @offs_tai: Offset clock monotonic -> clock tai
54-
* @tai_offset: The current UTC to TAI offset in seconds
54+
* @coarse_nsec: The nanoseconds part for coarse time getters
5555
* @tkr_raw: The readout base structure for CLOCK_MONOTONIC_RAW
5656
* @raw_sec: CLOCK_MONOTONIC_RAW time in seconds
5757
* @clock_was_set_seq: The sequence number of clock was set events
@@ -76,6 +76,7 @@ struct tk_read_base {
7676
* ntp shifted nano seconds.
7777
* @ntp_err_mult: Multiplication factor for scaled math conversion
7878
* @skip_second_overflow: Flag used to avoid updating NTP twice with same second
79+
* @tai_offset: The current UTC to TAI offset in seconds
7980
*
8081
* Note: For timespec(64) based interfaces wall_to_monotonic is what
8182
* we need to add to xtime (or xtime corrected for sub jiffy times)
@@ -100,7 +101,7 @@ struct tk_read_base {
100101
* which results in the following cacheline layout:
101102
*
102103
* 0: seqcount, tkr_mono
103-
* 1: xtime_sec ... tai_offset
104+
* 1: xtime_sec ... coarse_nsec
104105
* 2: tkr_raw, raw_sec
105106
* 3,4: Internal variables
106107
*
@@ -121,7 +122,7 @@ struct timekeeper {
121122
ktime_t offs_real;
122123
ktime_t offs_boot;
123124
ktime_t offs_tai;
124-
s32 tai_offset;
125+
u32 coarse_nsec;
125126

126127
/* Cacheline 2: */
127128
struct tk_read_base tkr_raw;
@@ -144,6 +145,7 @@ struct timekeeper {
144145
u32 ntp_error_shift;
145146
u32 ntp_err_mult;
146147
u32 skip_second_overflow;
148+
s32 tai_offset;
147149
};
148150

149151
#ifdef CONFIG_GENERIC_TIME_VSYSCALL

kernel/time/timekeeping.c

Lines changed: 42 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -164,17 +164,42 @@ static inline struct timespec64 tk_xtime(const struct timekeeper *tk)
164164
return ts;
165165
}
166166

167+
static inline struct timespec64 tk_xtime_coarse(const struct timekeeper *tk)
168+
{
169+
struct timespec64 ts;
170+
171+
ts.tv_sec = tk->xtime_sec;
172+
ts.tv_nsec = tk->coarse_nsec;
173+
return ts;
174+
}
175+
176+
/*
177+
* Update the nanoseconds part for the coarse time keepers. They can't rely
178+
* on xtime_nsec because xtime_nsec could be adjusted by a small negative
179+
* amount when the multiplication factor of the clock is adjusted, which
180+
* could cause the coarse clocks to go slightly backwards. See
181+
* timekeeping_apply_adjustment(). Thus we keep a separate copy for the coarse
182+
* clockids which only is updated when the clock has been set or we have
183+
* accumulated time.
184+
*/
185+
static inline void tk_update_coarse_nsecs(struct timekeeper *tk)
186+
{
187+
tk->coarse_nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
188+
}
189+
167190
static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts)
168191
{
169192
tk->xtime_sec = ts->tv_sec;
170193
tk->tkr_mono.xtime_nsec = (u64)ts->tv_nsec << tk->tkr_mono.shift;
194+
tk_update_coarse_nsecs(tk);
171195
}
172196

173197
static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts)
174198
{
175199
tk->xtime_sec += ts->tv_sec;
176200
tk->tkr_mono.xtime_nsec += (u64)ts->tv_nsec << tk->tkr_mono.shift;
177201
tk_normalize_xtime(tk);
202+
tk_update_coarse_nsecs(tk);
178203
}
179204

180205
static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm)
@@ -708,6 +733,7 @@ static void timekeeping_forward_now(struct timekeeper *tk)
708733
tk_normalize_xtime(tk);
709734
delta -= incr;
710735
}
736+
tk_update_coarse_nsecs(tk);
711737
}
712738

713739
/**
@@ -804,16 +830,16 @@ EXPORT_SYMBOL_GPL(ktime_get_with_offset);
804830
ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs)
805831
{
806832
struct timekeeper *tk = &tk_core.timekeeper;
807-
unsigned int seq;
808833
ktime_t base, *offset = offsets[offs];
834+
unsigned int seq;
809835
u64 nsecs;
810836

811837
WARN_ON(timekeeping_suspended);
812838

813839
do {
814840
seq = read_seqcount_begin(&tk_core.seq);
815841
base = ktime_add(tk->tkr_mono.base, *offset);
816-
nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
842+
nsecs = tk->coarse_nsec;
817843

818844
} while (read_seqcount_retry(&tk_core.seq, seq));
819845

@@ -2161,7 +2187,7 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode)
21612187
struct timekeeper *real_tk = &tk_core.timekeeper;
21622188
unsigned int clock_set = 0;
21632189
int shift = 0, maxshift;
2164-
u64 offset;
2190+
u64 offset, orig_offset;
21652191

21662192
guard(raw_spinlock_irqsave)(&tk_core.lock);
21672193

@@ -2172,7 +2198,7 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode)
21722198
offset = clocksource_delta(tk_clock_read(&tk->tkr_mono),
21732199
tk->tkr_mono.cycle_last, tk->tkr_mono.mask,
21742200
tk->tkr_mono.clock->max_raw_delta);
2175-
2201+
orig_offset = offset;
21762202
/* Check if there's really nothing to do */
21772203
if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK)
21782204
return false;
@@ -2205,6 +2231,14 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode)
22052231
*/
22062232
clock_set |= accumulate_nsecs_to_secs(tk);
22072233

2234+
/*
2235+
* To avoid inconsistencies caused adjtimex TK_ADV_FREQ calls
2236+
* making small negative adjustments to the base xtime_nsec
2237+
* value, only update the coarse clocks if we accumulated time
2238+
*/
2239+
if (orig_offset != offset)
2240+
tk_update_coarse_nsecs(tk);
2241+
22082242
timekeeping_update_from_shadow(&tk_core, clock_set);
22092243

22102244
return !!clock_set;
@@ -2248,7 +2282,7 @@ void ktime_get_coarse_real_ts64(struct timespec64 *ts)
22482282
do {
22492283
seq = read_seqcount_begin(&tk_core.seq);
22502284

2251-
*ts = tk_xtime(tk);
2285+
*ts = tk_xtime_coarse(tk);
22522286
} while (read_seqcount_retry(&tk_core.seq, seq));
22532287
}
22542288
EXPORT_SYMBOL(ktime_get_coarse_real_ts64);
@@ -2271,7 +2305,7 @@ void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts)
22712305

22722306
do {
22732307
seq = read_seqcount_begin(&tk_core.seq);
2274-
*ts = tk_xtime(tk);
2308+
*ts = tk_xtime_coarse(tk);
22752309
offset = tk_core.timekeeper.offs_real;
22762310
} while (read_seqcount_retry(&tk_core.seq, seq));
22772311

@@ -2350,12 +2384,12 @@ void ktime_get_coarse_ts64(struct timespec64 *ts)
23502384
do {
23512385
seq = read_seqcount_begin(&tk_core.seq);
23522386

2353-
now = tk_xtime(tk);
2387+
now = tk_xtime_coarse(tk);
23542388
mono = tk->wall_to_monotonic;
23552389
} while (read_seqcount_retry(&tk_core.seq, seq));
23562390

23572391
set_normalized_timespec64(ts, now.tv_sec + mono.tv_sec,
2358-
now.tv_nsec + mono.tv_nsec);
2392+
now.tv_nsec + mono.tv_nsec);
23592393
}
23602394
EXPORT_SYMBOL(ktime_get_coarse_ts64);
23612395

kernel/time/vsyscall.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -98,12 +98,12 @@ void update_vsyscall(struct timekeeper *tk)
9898
/* CLOCK_REALTIME_COARSE */
9999
vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE];
100100
vdso_ts->sec = tk->xtime_sec;
101-
vdso_ts->nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
101+
vdso_ts->nsec = tk->coarse_nsec;
102102

103103
/* CLOCK_MONOTONIC_COARSE */
104104
vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC_COARSE];
105105
vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
106-
nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
106+
nsec = tk->coarse_nsec;
107107
nsec = nsec + tk->wall_to_monotonic.tv_nsec;
108108
vdso_ts->sec += __iter_div_u64_rem(nsec, NSEC_PER_SEC, &vdso_ts->nsec);
109109

0 commit comments

Comments
 (0)