Skip to content

Commit bcb0442

Browse files
committed
Merge tag 'sched_ext-for-6.15' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext
Pull sched_ext updates from Tejun Heo: - Add mechanism to count and report internal events. This significantly improves visibility on subtle corner conditions. - The default idle CPU selection logic is revamped and improved in multiple ways including being made topology aware. - sched_ext was disabling ttwu_queue for simplicity, which can be costly when hardware topology is more complex. Implement SCX_OPS_ALLOW_QUEUED_WAKEUP so that BPF schedulers can selectively enable ttwu_queue. - tools/sched_ext updates to improve compatibility among others. - Other misc updates and fixes. - sched_ext/for-6.14-fixes were pulled a few times to receive prerequisite fixes and resolve conflicts. * tag 'sched_ext-for-6.15' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext: (42 commits) sched_ext: idle: Refactor scx_select_cpu_dfl() sched_ext: idle: Honor idle flags in the built-in idle selection policy sched_ext: Skip per-CPU tasks in scx_bpf_reenqueue_local() sched_ext: Add trace point to track sched_ext core events sched_ext: Change the event type from u64 to s64 sched_ext: Documentation: add task lifecycle summary tools/sched_ext: Provide a compatible helper for scx_bpf_events() selftests/sched_ext: Add NUMA-aware scheduler test tools/sched_ext: Provide consistent access to scx flags sched_ext: idle: Fix scx_bpf_pick_any_cpu_node() behavior sched_ext: idle: Introduce scx_bpf_nr_node_ids() sched_ext: idle: Introduce node-aware idle cpu kfunc helpers sched_ext: idle: Per-node idle cpumasks sched_ext: idle: Introduce SCX_OPS_BUILTIN_IDLE_PER_NODE sched_ext: idle: Make idle static keys private sched/topology: Introduce for_each_node_numadist() iterator mm/numa: Introduce nearest_node_nodemask() nodemask: numa: reorganize inclusion path nodemask: add nodes_copy() tools/sched_ext: Sync with scx repo ...
2 parents 94dc216 + e4855fc commit bcb0442

File tree

25 files changed

+2148
-783
lines changed

25 files changed

+2148
-783
lines changed

Documentation/scheduler/sched-ext.rst

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -294,6 +294,42 @@ dispatching, and must be dispatched to with ``scx_bpf_dsq_insert()``. See
294294
the function documentation and usage in ``tools/sched_ext/scx_simple.bpf.c``
295295
for more information.
296296

297+
Task Lifecycle
298+
--------------
299+
300+
The following pseudo-code summarizes the entire lifecycle of a task managed
301+
by a sched_ext scheduler:
302+
303+
.. code-block:: c
304+
305+
ops.init_task(); /* A new task is created */
306+
ops.enable(); /* Enable BPF scheduling for the task */
307+
308+
while (task in SCHED_EXT) {
309+
if (task can migrate)
310+
ops.select_cpu(); /* Called on wakeup (optimization) */
311+
312+
ops.runnable(); /* Task becomes ready to run */
313+
314+
while (task is runnable) {
315+
if (task is not in a DSQ) {
316+
ops.enqueue(); /* Task can be added to a DSQ */
317+
318+
/* A CPU becomes available */
319+
320+
ops.dispatch(); /* Task is moved to a local DSQ */
321+
}
322+
ops.running(); /* Task starts running on its assigned CPU */
323+
ops.tick(); /* Called every 1/HZ seconds */
324+
ops.stopping(); /* Task stops running (time slice expires or wait) */
325+
}
326+
327+
ops.quiescent(); /* Task releases its assigned CPU (wait) */
328+
}
329+
330+
ops.disable(); /* Disable BPF scheduling for the task */
331+
ops.exit_task(); /* Task is destroyed */
332+
297333
Where to Look
298334
=============
299335

MAINTAINERS

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21196,8 +21196,7 @@ S: Maintained
2119621196
W: https://github.com/sched-ext/scx
2119721197
T: git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext.git
2119821198
F: include/linux/sched/ext.h
21199-
F: kernel/sched/ext.h
21200-
F: kernel/sched/ext.c
21199+
F: kernel/sched/ext*
2120121200
F: tools/sched_ext/
2120221201
F: tools/testing/selftests/sched_ext
2120321202

include/linux/nodemask.h

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,6 @@
9494
#include <linux/bitmap.h>
9595
#include <linux/minmax.h>
9696
#include <linux/nodemask_types.h>
97-
#include <linux/numa.h>
9897
#include <linux/random.h>
9998

10099
extern nodemask_t _unused_nodemask_arg_;
@@ -191,6 +190,13 @@ static __always_inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *s
191190
bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits);
192191
}
193192

193+
#define nodes_copy(dst, src) __nodes_copy(&(dst), &(src), MAX_NUMNODES)
194+
static __always_inline void __nodes_copy(nodemask_t *dstp,
195+
const nodemask_t *srcp, unsigned int nbits)
196+
{
197+
bitmap_copy(dstp->bits, srcp->bits, nbits);
198+
}
199+
194200
#define nodes_complement(dst, src) \
195201
__nodes_complement(&(dst), &(src), MAX_NUMNODES)
196202
static __always_inline void __nodes_complement(nodemask_t *dstp,

include/linux/nodemask_types.h

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,16 @@
33
#define __LINUX_NODEMASK_TYPES_H
44

55
#include <linux/bitops.h>
6-
#include <linux/numa.h>
6+
7+
#ifdef CONFIG_NODES_SHIFT
8+
#define NODES_SHIFT CONFIG_NODES_SHIFT
9+
#else
10+
#define NODES_SHIFT 0
11+
#endif
12+
13+
#define MAX_NUMNODES (1 << NODES_SHIFT)
14+
15+
#define NUMA_NO_NODE (-1)
716

817
typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t;
918

include/linux/numa.h

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3,16 +3,8 @@
33
#define _LINUX_NUMA_H
44
#include <linux/init.h>
55
#include <linux/types.h>
6+
#include <linux/nodemask.h>
67

7-
#ifdef CONFIG_NODES_SHIFT
8-
#define NODES_SHIFT CONFIG_NODES_SHIFT
9-
#else
10-
#define NODES_SHIFT 0
11-
#endif
12-
13-
#define MAX_NUMNODES (1 << NODES_SHIFT)
14-
15-
#define NUMA_NO_NODE (-1)
168
#define NUMA_NO_MEMBLK (-1)
179

1810
static inline bool numa_valid_node(int nid)
@@ -39,6 +31,8 @@ void __init alloc_offline_node_data(int nid);
3931
/* Generic implementation available */
4032
int numa_nearest_node(int node, unsigned int state);
4133

34+
int nearest_node_nodemask(int node, nodemask_t *mask);
35+
4236
#ifndef memory_add_physaddr_to_nid
4337
int memory_add_physaddr_to_nid(u64 start);
4438
#endif
@@ -55,6 +49,11 @@ static inline int numa_nearest_node(int node, unsigned int state)
5549
return NUMA_NO_NODE;
5650
}
5751

52+
static inline int nearest_node_nodemask(int node, nodemask_t *mask)
53+
{
54+
return NUMA_NO_NODE;
55+
}
56+
5857
static inline int memory_add_physaddr_to_nid(u64 start)
5958
{
6059
return 0;

include/linux/sched/ext.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,7 @@ struct sched_ext_entity {
146146
u32 weight;
147147
s32 sticky_cpu;
148148
s32 holding_cpu;
149+
s32 selected_cpu;
149150
u32 kf_mask; /* see scx_kf_mask above */
150151
struct task_struct *kf_tasks[2]; /* see SCX_CALL_OP_TASK() */
151152
atomic_long_t ops_state;

include/linux/topology.h

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,36 @@ sched_numa_hop_mask(unsigned int node, unsigned int hops)
261261
}
262262
#endif /* CONFIG_NUMA */
263263

264+
/**
265+
* for_each_node_numadist() - iterate over nodes in increasing distance
266+
* order, starting from a given node
267+
* @node: the iteration variable and the starting node.
268+
* @unvisited: a nodemask to keep track of the unvisited nodes.
269+
*
270+
* This macro iterates over NUMA node IDs in increasing distance from the
271+
* starting @node and yields MAX_NUMNODES when all the nodes have been
272+
* visited.
273+
*
274+
* Note that by the time the loop completes, the @unvisited nodemask will
275+
* be fully cleared, unless the loop exits early.
276+
*
277+
* The difference between for_each_node() and for_each_node_numadist() is
278+
 * that the former iterates over nodes in numerical order, whereas
279+
* the latter iterates over nodes in increasing order of distance.
280+
*
281+
 * The complexity of this iterator is O(N^2), where N represents the
282+
* number of nodes, as each iteration involves scanning all nodes to
283+
* find the one with the shortest distance.
284+
*
285+
* Requires rcu_lock to be held.
286+
*/
287+
#define for_each_node_numadist(node, unvisited) \
288+
for (int __start = (node), \
289+
(node) = nearest_node_nodemask((__start), &(unvisited)); \
290+
(node) < MAX_NUMNODES; \
291+
node_clear((node), (unvisited)), \
292+
(node) = nearest_node_nodemask((__start), &(unvisited)))
293+
264294
/**
265295
* for_each_numa_hop_mask - iterate over cpumasks of increasing NUMA distance
266296
* from a given node.

include/trace/events/sched_ext.h

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,25 @@ TRACE_EVENT(sched_ext_dump,
2626
)
2727
);
2828

29+
TRACE_EVENT(sched_ext_event,
30+
TP_PROTO(const char *name, __s64 delta),
31+
TP_ARGS(name, delta),
32+
33+
TP_STRUCT__entry(
34+
__string(name, name)
35+
__field( __s64, delta )
36+
),
37+
38+
TP_fast_assign(
39+
__assign_str(name);
40+
__entry->delta = delta;
41+
),
42+
43+
TP_printk("name %s delta %lld",
44+
__get_str(name), __entry->delta
45+
)
46+
);
47+
2948
#endif /* _TRACE_SCHED_EXT_H */
3049

3150
/* This part must be outside protection */

kernel/sched/build_policy.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@
6161

6262
#ifdef CONFIG_SCHED_CLASS_EXT
6363
# include "ext.c"
64+
# include "ext_idle.c"
6465
#endif
6566

6667
#include "syscalls.c"

kernel/sched/core.c

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3922,13 +3922,8 @@ bool cpus_share_resources(int this_cpu, int that_cpu)
39223922

39233923
static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
39243924
{
3925-
/*
3926-
* The BPF scheduler may depend on select_task_rq() being invoked during
3927-
* wakeups. In addition, @p may end up executing on a different CPU
3928-
* regardless of what happens in the wakeup path making the ttwu_queue
3929-
* optimization less meaningful. Skip if on SCX.
3930-
*/
3931-
if (task_on_scx(p))
3925+
/* See SCX_OPS_ALLOW_QUEUED_WAKEUP. */
3926+
if (!scx_allow_ttwu_queue(p))
39323927
return false;
39333928

39343929
/*

0 commit comments

Comments
 (0)