
Commit 7ee9887

anna-marialx authored and KAGA-KOKO committed
timers: Implement the hierarchical pull model
Placing timers at enqueue time on a target CPU based on dubious heuristics
does not make any sense:

 1) Most timer wheel timers are canceled or rearmed before they expire.

 2) The heuristics to predict which CPU will be busy when the timer expires
    are wrong by definition.

So placing the timers at enqueue wastes precious cycles.

The proper solution to this problem is to always queue the timers on the
local CPU and allow the non pinned timers to be pulled onto a busy CPU at
expiry time.

Therefore split the timer storage into local pinned and global timers:
Local pinned timers are always expired on the CPU on which they have been
queued. Global timers can be expired on any CPU.

As long as a CPU is busy it expires both local and global timers. When a
CPU goes idle it arms for the first expiring local timer. If the first
expiring pinned (local) timer is before the first expiring movable timer,
then no action is required because the CPU will wake up before the first
movable timer expires. If the first expiring movable timer is before the
first expiring pinned (local) timer, then this timer is queued into an idle
timerqueue and eventually expired by another active CPU.

To avoid global locking the timerqueues are implemented as a hierarchy. The
lowest level of the hierarchy holds the CPUs. The CPUs are associated to
groups of 8, which are separated per node. If more than one CPU group
exist, then a second level in the hierarchy collects the groups. Depending
on the size of the system more than 2 levels are required. Each group has a
"migrator" which checks the timerqueue during the tick for remote expirable
timers.

If the last CPU in a group goes idle it reports the first expiring event in
the group up to the next group(s) in the hierarchy. If the last CPU goes
idle it arms its timer for the first system wide expiring timer to ensure
that no timer event is missed.

Signed-off-by: Anna-Maria Behnsen <anna-maria@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://lore.kernel.org/r/20240222103710.32582-1-anna-maria@linutronix.de
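The depth of the hierarchy follows directly from the group fan-out described above. As a rough illustrative sketch (plain C, not part of this patch; the helper name tmigr_levels_needed() is invented here and node boundaries, which also split groups, are ignored):

#include <stdio.h>

/*
 * Hypothetical illustration only: with a fan-out of 8 CPUs per group,
 * each additional hierarchy level covers 8x more CPUs.
 */
static unsigned int tmigr_levels_needed(unsigned int ncpus)
{
        unsigned int levels = 1, span = 8;

        while (span < ncpus) {
                span *= 8;
                levels++;
        }
        return levels;
}

int main(void)
{
        /* 8 CPUs -> 1 level, 64 CPUs -> 2 levels, 256 CPUs -> 3 levels */
        printf("%u %u %u\n", tmigr_levels_needed(8),
               tmigr_levels_needed(64), tmigr_levels_needed(256));
        return 0;
}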
1 parent 57e95a5 commit 7ee9887

File tree

6 files changed: +2011 -8 lines changed


include/linux/cpuhotplug.h

Lines changed: 1 addition & 0 deletions
@@ -231,6 +231,7 @@ enum cpuhp_state {
        CPUHP_AP_PERF_POWERPC_HV_24x7_ONLINE,
        CPUHP_AP_PERF_POWERPC_HV_GPCI_ONLINE,
        CPUHP_AP_PERF_CSKY_ONLINE,
+       CPUHP_AP_TMIGR_ONLINE,
        CPUHP_AP_WATCHDOG_ONLINE,
        CPUHP_AP_WORKQUEUE_ONLINE,
        CPUHP_AP_RANDOM_ONLINE,

kernel/time/Makefile

Lines changed: 3 additions & 0 deletions
@@ -17,6 +17,9 @@ endif
 obj-$(CONFIG_GENERIC_SCHED_CLOCK)      += sched_clock.o
 obj-$(CONFIG_TICK_ONESHOT)             += tick-oneshot.o tick-sched.o
 obj-$(CONFIG_LEGACY_TIMER_TICK)        += tick-legacy.o
+ifeq ($(CONFIG_SMP),y)
+ obj-$(CONFIG_NO_HZ_COMMON)            += timer_migration.o
+endif
 obj-$(CONFIG_HAVE_GENERIC_VDSO)        += vsyscall.o
 obj-$(CONFIG_DEBUG_FS)                 += timekeeping_debug.o
 obj-$(CONFIG_TEST_UDELAY)              += test_udelay.o

kernel/time/tick-internal.h

Lines changed: 1 addition & 0 deletions
@@ -166,6 +166,7 @@ extern void fetch_next_timer_interrupt_remote(unsigned long basej, u64 basem,
 extern void timer_lock_remote_bases(unsigned int cpu);
 extern void timer_unlock_remote_bases(unsigned int cpu);
 extern bool timer_base_is_idle(void);
+extern void timer_expire_remote(unsigned int cpu);
 # endif
 #else /* CONFIG_NO_HZ_COMMON */
 static inline void timers_update_nohz(void) { }

kernel/time/timer.c

Lines changed: 105 additions & 8 deletions
@@ -53,6 +53,7 @@
 #include <asm/io.h>

 #include "tick-internal.h"
+#include "timer_migration.h"

 #define CREATE_TRACE_POINTS
 #include <trace/events/timer.h>
@@ -2169,6 +2170,64 @@ bool timer_base_is_idle(void)
 {
        return __this_cpu_read(timer_bases[BASE_LOCAL].is_idle);
 }
+
+static void __run_timer_base(struct timer_base *base);
+
+/**
+ * timer_expire_remote() - expire global timers of cpu
+ * @cpu:       Remote CPU
+ *
+ * Expire timers of global base of remote CPU.
+ */
+void timer_expire_remote(unsigned int cpu)
+{
+       struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu);
+
+       __run_timer_base(base);
+}
+
+static void timer_use_tmigr(unsigned long basej, u64 basem,
+                           unsigned long *nextevt, bool *tick_stop_path,
+                           bool timer_base_idle, struct timer_events *tevt)
+{
+       u64 next_tmigr;
+
+       if (timer_base_idle)
+               next_tmigr = tmigr_cpu_new_timer(tevt->global);
+       else if (tick_stop_path)
+               next_tmigr = tmigr_cpu_deactivate(tevt->global);
+       else
+               next_tmigr = tmigr_quick_check(tevt->global);
+
+       /*
+        * If the CPU is the last going idle in timer migration hierarchy, make
+        * sure the CPU will wake up in time to handle remote timers.
+        * next_tmigr == KTIME_MAX if other CPUs are still active.
+        */
+       if (next_tmigr < tevt->local) {
+               u64 tmp;
+
+               /* If we missed a tick already, force 0 delta */
+               if (next_tmigr < basem)
+                       next_tmigr = basem;
+
+               tmp = div_u64(next_tmigr - basem, TICK_NSEC);
+
+               *nextevt = basej + (unsigned long)tmp;
+               tevt->local = next_tmigr;
+       }
+}
+# else
+static void timer_use_tmigr(unsigned long basej, u64 basem,
+                           unsigned long *nextevt, bool *tick_stop_path,
+                           bool timer_base_idle, struct timer_events *tevt)
+{
+       /*
+        * Make sure first event is written into tevt->local to not miss a
+        * timer on !SMP systems.
+        */
+       tevt->local = min_t(u64, tevt->local, tevt->global);
+}
 # endif /* CONFIG_SMP */

 static inline u64 __get_next_timer_interrupt(unsigned long basej, u64 basem,
@@ -2177,7 +2236,7 @@ static inline u64 __get_next_timer_interrupt(unsigned long basej, u64 basem,
        struct timer_events tevt = { .local = KTIME_MAX, .global = KTIME_MAX };
        struct timer_base *base_local, *base_global;
        unsigned long nextevt;
-       u64 expires;
+       bool idle_is_possible;

        /*
         * Pretend that there is no timer pending if the cpu is offline.
@@ -2198,6 +2257,22 @@ static inline u64 __get_next_timer_interrupt(unsigned long basej, u64 basem,
        nextevt = fetch_next_timer_interrupt(basej, basem, base_local,
                                             base_global, &tevt);

+       /*
+        * If the next event is only one jiffie ahead there is no need to call
+        * timer migration hierarchy related functions. The value for the next
+        * global timer in @tevt struct equals then KTIME_MAX. This is also
+        * true, when the timer base is idle.
+        *
+        * The proper timer migration hierarchy function depends on the callsite
+        * and whether timer base is idle or not. @nextevt will be updated when
+        * this CPU needs to handle the first timer migration hierarchy
+        * event. See timer_use_tmigr() for detailed information.
+        */
+       idle_is_possible = time_after(nextevt, basej + 1);
+       if (idle_is_possible)
+               timer_use_tmigr(basej, basem, &nextevt, idle,
+                               base_local->is_idle, &tevt);
+
        /*
         * We have a fresh next event. Check whether we can forward the
         * base.
@@ -2210,7 +2285,10 @@ static inline u64 __get_next_timer_interrupt(unsigned long basej, u64 basem,
         */
        if (idle) {
                /*
-                * Bases are idle if the next event is more than a tick away.
+                * Bases are idle if the next event is more than a tick
+                * away. Caution: @nextevt could have changed by enqueueing a
+                * global timer into timer migration hierarchy. Therefore a new
+                * check is required here.
                 *
                 * If the base is marked idle then any timer add operation must
                 * forward the base clk itself to keep granularity small. This
@@ -2223,23 +2301,35 @@ static inline u64 __get_next_timer_interrupt(unsigned long basej, u64 basem,
                        trace_timer_base_idle(true, base_local->cpu);
                }
                *idle = base_local->is_idle;
+
+               /*
+                * When timer base is not set idle, undo the effect of
+                * tmigr_cpu_deactivate() to prevent inconsitent states - active
+                * timer base but inactive timer migration hierarchy.
+                *
+                * When timer base was already marked idle, nothing will be
+                * changed here.
+                */
+               if (!base_local->is_idle && idle_is_possible)
+                       tmigr_cpu_activate();
        }

        raw_spin_unlock(&base_global->lock);
        raw_spin_unlock(&base_local->lock);

-       expires = min_t(u64, tevt.local, tevt.global);
-
-       return cmp_next_hrtimer_event(basem, expires);
+       return cmp_next_hrtimer_event(basem, tevt.local);
 }

 /**
  * get_next_timer_interrupt() - return the time (clock mono) of the next timer
  * @basej:     base time jiffies
  * @basem:     base time clock monotonic
  *
- * Returns the tick aligned clock monotonic time of the next pending
- * timer or KTIME_MAX if no timer is pending.
+ * Returns the tick aligned clock monotonic time of the next pending timer or
+ * KTIME_MAX if no timer is pending. If timer of global base was queued into
+ * timer migration hierarchy, first global timer is not taken into account. If
+ * it was the last CPU of timer migration hierarchy going idle, first global
+ * event is taken into account.
  */
 u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 {
@@ -2281,6 +2371,9 @@ void timer_clear_idle(void)
        __this_cpu_write(timer_bases[BASE_LOCAL].is_idle, false);
        __this_cpu_write(timer_bases[BASE_GLOBAL].is_idle, false);
        trace_timer_base_idle(false, smp_processor_id());
+
+       /* Activate without holding the timer_base->lock */
+       tmigr_cpu_activate();
 }
 #endif

@@ -2350,6 +2443,9 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h)
        if (IS_ENABLED(CONFIG_NO_HZ_COMMON)) {
                run_timer_base(BASE_GLOBAL);
                run_timer_base(BASE_DEF);
+
+               if (is_timers_nohz_active())
+                       tmigr_handle_remote();
        }
 }

@@ -2364,7 +2460,8 @@ static void run_local_timers(void)

        for (int i = 0; i < NR_BASES; i++, base++) {
                /* Raise the softirq only if required. */
-               if (time_after_eq(jiffies, base->next_expiry)) {
+               if (time_after_eq(jiffies, base->next_expiry) ||
+                   (i == BASE_DEF && tmigr_requires_handle_remote())) {
                        raise_softirq(TIMER_SOFTIRQ);
                        return;
                }
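For readers following the timer_use_tmigr() hunk above, here is a small standalone sketch (not kernel code; it assumes HZ=1000 so TICK_NSEC is 1,000,000 ns, and the helper name is invented) of how the remote expiry time returned by the hierarchy is folded back into a jiffies-based @nextevt:

#include <stdio.h>
#include <stdint.h>

/* Illustrative stand-in; in the kernel this comes from jiffies.h */
#define TICK_NSEC 1000000ULL   /* assumes HZ=1000 */

/*
 * Mirrors the arithmetic in timer_use_tmigr(): clamp the remote expiry
 * to "now" (basem) and express the remaining delta in jiffies on top
 * of basej, so the idle path can program the next wheel event.
 */
static unsigned long tmigr_to_nextevt(unsigned long basej, uint64_t basem,
                                      uint64_t next_tmigr)
{
        if (next_tmigr < basem)         /* already late: force 0 delta */
                next_tmigr = basem;

        return basej + (unsigned long)((next_tmigr - basem) / TICK_NSEC);
}

int main(void)
{
        /* Remote timer 5 ms in the future => nextevt = basej + 5 = 1005 */
        printf("%lu\n", tmigr_to_nextevt(1000, 50000000ULL, 55000000ULL));
        return 0;
}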
