Skip to content

Commit f24dc33

Browse files
committed
Merge tag 'timers-core-2024-01-08' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull timer subsystem updates from Ingo Molnar:

 - Various preparatory cleanups & enhancements of the timer-wheel code,
   in preparation for the WIP 'pull timers at expiry' timer migration
   model series (which will replace the current 'push timers at
   enqueue' migration model), by Anna-Maria Behnsen:

      - Update comments and clean up confusing variable names

      - Add debug check to warn about time travel

      - Improve/expand timer-wheel tracepoints

      - Optimize away unnecessary IPIs for deferrable timers

      - Restructure & clean up next_expiry_recalc()

      - Clean up forward_timer_base()

      - Introduce __forward_timer_base() and use it to simplify and
        micro-optimize get_next_timer_interrupt()

 - Restructure the get_next_timer_interrupt()'s idle logic for better
   readability and to enable a minor optimization.

 - Fix the nextevt calculation when no timers are pending

 - Fix the sysfs_get_uname() prototype declaration

* tag 'timers-core-2024-01-08' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  timers: Fix nextevt calculation when no timers are pending
  timers: Rework idle logic
  timers: Use already existing function for forwarding timer base
  timers: Split out forward timer base functionality
  timers: Clarify check in forward_timer_base()
  timers: Move store of next event into __next_timer_interrupt()
  timers: Do not IPI for deferrable timers
  tracing/timers: Add tracepoint for tracking timer base is_idle flag
  tracing/timers: Enhance timer_start tracepoint
  tick-sched: Warn when next tick seems to be in the past
  tick/sched: Cleanup confusing variables
  tick-sched: Fix function names in comments
  time: Make sysfs_get_uname() function visible in header
2 parents 46a08b4 + da65f29 commit f24dc33

File tree

4 files changed

+109
-69
lines changed

4 files changed

+109
-69
lines changed

include/trace/events/timer.h

Lines changed: 30 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -46,38 +46,38 @@ DEFINE_EVENT(timer_class, timer_init,
4646

4747
/**
4848
* timer_start - called when the timer is started
49-
* @timer: pointer to struct timer_list
50-
* @expires: the timers expiry time
51-
* @flags: the timers flags
49+
* @timer: pointer to struct timer_list
50+
* @bucket_expiry: the bucket expiry time
5251
*/
5352
TRACE_EVENT(timer_start,
5453

5554
TP_PROTO(struct timer_list *timer,
56-
unsigned long expires,
57-
unsigned int flags),
55+
unsigned long bucket_expiry),
5856

59-
TP_ARGS(timer, expires, flags),
57+
TP_ARGS(timer, bucket_expiry),
6058

6159
TP_STRUCT__entry(
6260
__field( void *, timer )
6361
__field( void *, function )
6462
__field( unsigned long, expires )
63+
__field( unsigned long, bucket_expiry )
6564
__field( unsigned long, now )
6665
__field( unsigned int, flags )
6766
),
6867

6968
TP_fast_assign(
7069
__entry->timer = timer;
7170
__entry->function = timer->function;
72-
__entry->expires = expires;
71+
__entry->expires = timer->expires;
72+
__entry->bucket_expiry = bucket_expiry;
7373
__entry->now = jiffies;
74-
__entry->flags = flags;
74+
__entry->flags = timer->flags;
7575
),
7676

77-
TP_printk("timer=%p function=%ps expires=%lu [timeout=%ld] cpu=%u idx=%u flags=%s",
77+
TP_printk("timer=%p function=%ps expires=%lu [timeout=%ld] bucket_expiry=%lu cpu=%u idx=%u flags=%s",
7878
__entry->timer, __entry->function, __entry->expires,
7979
(long)__entry->expires - __entry->now,
80-
__entry->flags & TIMER_CPUMASK,
80+
__entry->bucket_expiry, __entry->flags & TIMER_CPUMASK,
8181
__entry->flags >> TIMER_ARRAYSHIFT,
8282
decode_timer_flags(__entry->flags & TIMER_TRACE_FLAGMASK))
8383
);
@@ -142,6 +142,26 @@ DEFINE_EVENT(timer_class, timer_cancel,
142142
TP_ARGS(timer)
143143
);
144144

145+
TRACE_EVENT(timer_base_idle,
146+
147+
TP_PROTO(bool is_idle, unsigned int cpu),
148+
149+
TP_ARGS(is_idle, cpu),
150+
151+
TP_STRUCT__entry(
152+
__field( bool, is_idle )
153+
__field( unsigned int, cpu )
154+
),
155+
156+
TP_fast_assign(
157+
__entry->is_idle = is_idle;
158+
__entry->cpu = cpu;
159+
),
160+
161+
TP_printk("is_idle=%d cpu=%d",
162+
__entry->is_idle, __entry->cpu)
163+
);
164+
145165
#define decode_clockid(type) \
146166
__print_symbolic(type, \
147167
{ CLOCK_REALTIME, "CLOCK_REALTIME" }, \

kernel/time/tick-internal.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,6 @@ extern int clockevents_program_event(struct clock_event_device *dev,
5656
ktime_t expires, bool force);
5757
extern void clockevents_handle_noop(struct clock_event_device *dev);
5858
extern int __clockevents_update_freq(struct clock_event_device *dev, u32 freq);
59-
extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt);
6059

6160
/* Broadcasting support */
6261
# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
@@ -197,3 +196,5 @@ void hrtimers_resume_local(void);
197196
#else
198197
#define JIFFIES_SHIFT 8
199198
#endif
199+
200+
extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt);

kernel/time/tick-sched.c

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -839,6 +839,10 @@ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
839839
ts->next_timer = next_tick;
840840
}
841841

842+
/* Make sure next_tick is never before basemono! */
843+
if (WARN_ON_ONCE(basemono > next_tick))
844+
next_tick = basemono;
845+
842846
/*
843847
* If the tick is due in the next period, keep it ticking or
844848
* force prod the timer.
@@ -887,7 +891,6 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
887891
struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
888892
u64 basemono = ts->timer_expires_base;
889893
u64 expires = ts->timer_expires;
890-
ktime_t tick = expires;
891894

892895
/* Make sure we won't be trying to stop it twice in a row. */
893896
ts->timer_expires_base = 0;
@@ -910,7 +913,7 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
910913
/* Skip reprogram of event if it's not changed */
911914
if (ts->tick_stopped && (expires == ts->next_tick)) {
912915
/* Sanity check: make sure clockevent is actually programmed */
913-
if (tick == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer))
916+
if (expires == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer))
914917
return;
915918

916919
WARN_ON_ONCE(1);
@@ -920,11 +923,11 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
920923
}
921924

922925
/*
923-
* nohz_stop_sched_tick() can be called several times before
924-
* nohz_restart_sched_tick() is called. This happens when
925-
* interrupts arrive which do not cause a reschedule. In the
926-
* first call we save the current tick time, so we can restart
927-
* the scheduler tick in nohz_restart_sched_tick().
926+
* tick_nohz_stop_tick() can be called several times before
927+
* tick_nohz_restart_sched_tick() is called. This happens when
928+
* interrupts arrive which do not cause a reschedule. In the first
929+
* call we save the current tick time, so we can restart the
930+
* scheduler tick in tick_nohz_restart_sched_tick().
928931
*/
929932
if (!ts->tick_stopped) {
930933
calc_load_nohz_start();
@@ -935,7 +938,7 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
935938
trace_tick_stop(1, TICK_DEP_MASK_NONE);
936939
}
937940

938-
ts->next_tick = tick;
941+
ts->next_tick = expires;
939942

940943
/*
941944
* If the expiration time == KTIME_MAX, then we simply stop
@@ -950,11 +953,11 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
950953
}
951954

952955
if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
953-
hrtimer_start(&ts->sched_timer, tick,
956+
hrtimer_start(&ts->sched_timer, expires,
954957
HRTIMER_MODE_ABS_PINNED_HARD);
955958
} else {
956-
hrtimer_set_expires(&ts->sched_timer, tick);
957-
tick_program_event(tick, 1);
959+
hrtimer_set_expires(&ts->sched_timer, expires);
960+
tick_program_event(expires, 1);
958961
}
959962
}
960963

kernel/time/timer.c

Lines changed: 63 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -571,18 +571,15 @@ static int calc_wheel_index(unsigned long expires, unsigned long clk,
571571
static void
572572
trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
573573
{
574-
if (!is_timers_nohz_active())
575-
return;
576-
577574
/*
578-
* TODO: This wants some optimizing similar to the code below, but we
579-
* will do that when we switch from push to pull for deferrable timers.
575+
* Deferrable timers do not prevent the CPU from entering dynticks and
576+
* are not taken into account on the idle/nohz_full path. An IPI when a
577+
* new deferrable timer is enqueued will wake up the remote CPU but
578+
* nothing will be done with the deferrable timer base. Therefore skip
579+
* the remote IPI for deferrable timers completely.
580580
*/
581-
if (timer->flags & TIMER_DEFERRABLE) {
582-
if (tick_nohz_full_cpu(base->cpu))
583-
wake_up_nohz_cpu(base->cpu);
581+
if (!is_timers_nohz_active() || timer->flags & TIMER_DEFERRABLE)
584582
return;
585-
}
586583

587584
/*
588585
* We might have to IPI the remote CPU if the base is idle and the
@@ -606,7 +603,7 @@ static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
606603
__set_bit(idx, base->pending_map);
607604
timer_set_idx(timer, idx);
608605

609-
trace_timer_start(timer, timer->expires, timer->flags);
606+
trace_timer_start(timer, bucket_expiry);
610607

611608
/*
612609
* Check whether this is the new first expiring timer. The
@@ -942,31 +939,34 @@ get_target_base(struct timer_base *base, unsigned tflags)
942939
return get_timer_this_cpu_base(tflags);
943940
}
944941

945-
static inline void forward_timer_base(struct timer_base *base)
942+
static inline void __forward_timer_base(struct timer_base *base,
943+
unsigned long basej)
946944
{
947-
unsigned long jnow = READ_ONCE(jiffies);
948-
949945
/*
950-
* No need to forward if we are close enough below jiffies.
951-
* Also while executing timers, base->clk is 1 offset ahead
952-
* of jiffies to avoid endless requeuing to current jiffies.
946+
* Check whether we can forward the base. We can only do that when
947+
* @basej is past base->clk otherwise we might rewind base->clk.
953948
*/
954-
if ((long)(jnow - base->clk) < 1)
949+
if (time_before_eq(basej, base->clk))
955950
return;
956951

957952
/*
958953
* If the next expiry value is > jiffies, then we fast forward to
959954
* jiffies otherwise we forward to the next expiry value.
960955
*/
961-
if (time_after(base->next_expiry, jnow)) {
962-
base->clk = jnow;
956+
if (time_after(base->next_expiry, basej)) {
957+
base->clk = basej;
963958
} else {
964959
if (WARN_ON_ONCE(time_before(base->next_expiry, base->clk)))
965960
return;
966961
base->clk = base->next_expiry;
967962
}
963+
968964
}
969965

966+
static inline void forward_timer_base(struct timer_base *base)
967+
{
968+
__forward_timer_base(base, READ_ONCE(jiffies));
969+
}
970970

971971
/*
972972
* We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means
@@ -1803,8 +1803,10 @@ static int next_pending_bucket(struct timer_base *base, unsigned offset,
18031803
/*
18041804
* Search the first expiring timer in the various clock levels. Caller must
18051805
* hold base->lock.
1806+
*
1807+
* Store next expiry time in base->next_expiry.
18061808
*/
1807-
static unsigned long __next_timer_interrupt(struct timer_base *base)
1809+
static void next_expiry_recalc(struct timer_base *base)
18081810
{
18091811
unsigned long clk, next, adj;
18101812
unsigned lvl, offset = 0;
@@ -1870,10 +1872,9 @@ static unsigned long __next_timer_interrupt(struct timer_base *base)
18701872
clk += adj;
18711873
}
18721874

1875+
base->next_expiry = next;
18731876
base->next_expiry_recalc = false;
18741877
base->timers_pending = !(next == base->clk + NEXT_TIMER_MAX_DELTA);
1875-
1876-
return next;
18771878
}
18781879

18791880
#ifdef CONFIG_NO_HZ_COMMON
@@ -1921,8 +1922,9 @@ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)
19211922
u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
19221923
{
19231924
struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
1925+
unsigned long nextevt = basej + NEXT_TIMER_MAX_DELTA;
19241926
u64 expires = KTIME_MAX;
1925-
unsigned long nextevt;
1927+
bool was_idle;
19261928

19271929
/*
19281930
* Pretend that there is no timer pending if the cpu is offline.
@@ -1933,37 +1935,44 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
19331935

19341936
raw_spin_lock(&base->lock);
19351937
if (base->next_expiry_recalc)
1936-
base->next_expiry = __next_timer_interrupt(base);
1937-
nextevt = base->next_expiry;
1938+
next_expiry_recalc(base);
19381939

19391940
/*
19401941
* We have a fresh next event. Check whether we can forward the
1941-
* base. We can only do that when @basej is past base->clk
1942-
* otherwise we might rewind base->clk.
1942+
* base.
19431943
*/
1944-
if (time_after(basej, base->clk)) {
1945-
if (time_after(nextevt, basej))
1946-
base->clk = basej;
1947-
else if (time_after(nextevt, base->clk))
1948-
base->clk = nextevt;
1949-
}
1944+
__forward_timer_base(base, basej);
19501945

1951-
if (time_before_eq(nextevt, basej)) {
1952-
expires = basem;
1953-
base->is_idle = false;
1946+
if (base->timers_pending) {
1947+
nextevt = base->next_expiry;
1948+
1949+
/* If we missed a tick already, force 0 delta */
1950+
if (time_before(nextevt, basej))
1951+
nextevt = basej;
1952+
expires = basem + (u64)(nextevt - basej) * TICK_NSEC;
19541953
} else {
1955-
if (base->timers_pending)
1956-
expires = basem + (u64)(nextevt - basej) * TICK_NSEC;
19571954
/*
1958-
* If we expect to sleep more than a tick, mark the base idle.
1959-
* Also the tick is stopped so any added timer must forward
1960-
* the base clk itself to keep granularity small. This idle
1961-
* logic is only maintained for the BASE_STD base, deferrable
1962-
* timers may still see large granularity skew (by design).
1955+
* Move next_expiry for the empty base into the future to
1956+
* prevent a unnecessary raise of the timer softirq when the
1957+
* next_expiry value will be reached even if there is no timer
1958+
* pending.
19631959
*/
1964-
if ((expires - basem) > TICK_NSEC)
1965-
base->is_idle = true;
1960+
base->next_expiry = nextevt;
19661961
}
1962+
1963+
/*
1964+
* Base is idle if the next event is more than a tick away.
1965+
*
1966+
* If the base is marked idle then any timer add operation must forward
1967+
* the base clk itself to keep granularity small. This idle logic is
1968+
* only maintained for the BASE_STD base, deferrable timers may still
1969+
* see large granularity skew (by design).
1970+
*/
1971+
was_idle = base->is_idle;
1972+
base->is_idle = time_after(nextevt, basej + 1);
1973+
if (was_idle != base->is_idle)
1974+
trace_timer_base_idle(base->is_idle, base->cpu);
1975+
19671976
raw_spin_unlock(&base->lock);
19681977

19691978
return cmp_next_hrtimer_event(basem, expires);
@@ -1984,7 +1993,10 @@ void timer_clear_idle(void)
19841993
* sending the IPI a few instructions smaller for the cost of taking
19851994
* the lock in the exit from idle path.
19861995
*/
1987-
base->is_idle = false;
1996+
if (base->is_idle) {
1997+
base->is_idle = false;
1998+
trace_timer_base_idle(false, smp_processor_id());
1999+
}
19882000
}
19892001
#endif
19902002

@@ -2015,8 +2027,12 @@ static inline void __run_timers(struct timer_base *base)
20152027
*/
20162028
WARN_ON_ONCE(!levels && !base->next_expiry_recalc
20172029
&& base->timers_pending);
2030+
/*
2031+
* While executing timers, base->clk is set 1 offset ahead of
2032+
* jiffies to avoid endless requeuing to current jiffies.
2033+
*/
20182034
base->clk++;
2019-
base->next_expiry = __next_timer_interrupt(base);
2035+
next_expiry_recalc(base);
20202036

20212037
while (levels--)
20222038
expire_timers(base, heads + levels);

0 commit comments

Comments
 (0)