
Commit 97769a5

Author: Alexei Starovoitov (committed)
mm, bpf: Introduce try_alloc_pages() for opportunistic page allocation
Tracing BPF programs execute from tracepoints and kprobes where the running context is unknown, but they need to request additional memory. The prior workarounds used pre-allocated memory and BPF-specific freelists to satisfy such allocation requests. Instead, introduce a gfpflags_allow_spinning() condition that signals to the allocator that the running context is unknown. Then rely on the percpu free list of pages to allocate a page. try_alloc_pages() -> get_page_from_freelist() -> rmqueue() -> rmqueue_pcplist() will spin_trylock to grab the page from the percpu free list. If that fails (due to re-entrancy or the list being empty), rmqueue_bulk()/rmqueue_buddy() will attempt to spin_trylock zone->lock and grab the page from there. spin_trylock() is not safe in PREEMPT_RT when in NMI or in hard IRQ; bail out early in that case.

The support for gfpflags_allow_spinning() mode for free_page and memcg comes in the next patches.

This is a first step towards supporting BPF requirements in SLUB and getting rid of bpf_mem_alloc. That goal was discussed at LSFMM: https://lwn.net/Articles/974138/

Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/r/20250222024427.30294-3-alexei.starovoitov@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
1 parent 0aaddfb commit 97769a5
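
To make the intended calling convention concrete, here is a minimal caller sketch. It is not part of the commit: grab_scratch_page() is an invented name, and the only API it assumes is the try_alloc_pages() helper added by this patch. Freeing from an unknown context relies on the lock-free free path added by the follow-up patches in this series.

#include <linux/gfp.h>
#include <linux/mm.h>

/*
 * Hypothetical best-effort helper: grab one zeroed page without spinning
 * on any lock. Callers must tolerate NULL since try_alloc_pages() is
 * designed to fail easily.
 */
static struct page *grab_scratch_page(void)
{
	/* NUMA_NO_NODE lets the allocator pick the local node; order 0 = one page */
	return try_alloc_pages(NUMA_NO_NODE, 0);
}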

4 files changed: 127 additions & 5 deletions

include/linux/gfp.h

Lines changed: 22 additions & 0 deletions
@@ -39,6 +39,25 @@ static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags)
 	return !!(gfp_flags & __GFP_DIRECT_RECLAIM);
 }
 
+static inline bool gfpflags_allow_spinning(const gfp_t gfp_flags)
+{
+	/*
+	 * !__GFP_DIRECT_RECLAIM -> direct reclaim is not allowed.
+	 * !__GFP_KSWAPD_RECLAIM -> it's not safe to wake up kswapd.
+	 * All GFP_* flags including GFP_NOWAIT use one or both flags.
+	 * try_alloc_pages() is the only API that doesn't specify either flag.
+	 *
+	 * This is stronger than GFP_NOWAIT or GFP_ATOMIC because
+	 * those are guaranteed to never block on a sleeping lock.
+	 * Here we are enforcing that the allocation doesn't ever spin
+	 * on any locks (i.e. only trylocks). There is no high level
+	 * GFP_$FOO flag for this use in try_alloc_pages() as the
+	 * regular page allocator doesn't fully support this
+	 * allocation mode.
+	 */
+	return !(gfp_flags & __GFP_RECLAIM);
+}
+
 #ifdef CONFIG_HIGHMEM
 #define OPT_ZONE_HIGHMEM ZONE_HIGHMEM
 #else
@@ -335,6 +354,9 @@ static inline struct page *alloc_page_vma_noprof(gfp_t gfp,
 }
 #define alloc_page_vma(...)	alloc_hooks(alloc_page_vma_noprof(__VA_ARGS__))
 
+struct page *try_alloc_pages_noprof(int nid, unsigned int order);
+#define try_alloc_pages(...)	alloc_hooks(try_alloc_pages_noprof(__VA_ARGS__))
+
 extern unsigned long get_free_pages_noprof(gfp_t gfp_mask, unsigned int order);
 #define __get_free_pages(...)	alloc_hooks(get_free_pages_noprof(__VA_ARGS__))
 
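The lib/stackdepot.c hunk below is the first in-tree user of this predicate. As a distilled illustration (not from the commit; maybe_refill() is an invented name), a helper that must avoid spinning locks when the caller's context is unknown can gate its slow path like this:

#include <linux/gfp.h>
#include <linux/slab.h>

/*
 * Illustrative only: skip any path that could spin on a lock when the
 * gfp mask says spinning is not allowed (i.e. no __GFP_RECLAIM bits set).
 */
static void *maybe_refill(gfp_t gfp)
{
	if (!gfpflags_allow_spinning(gfp))
		return NULL;		/* unknown/reentrant context: stay lock-free */

	return kmalloc(64, gfp);	/* normal contexts may block or spin */
}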
lib/stackdepot.c

Lines changed: 3 additions & 2 deletions
@@ -591,7 +591,8 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries,
 	depot_stack_handle_t handle = 0;
 	struct page *page = NULL;
 	void *prealloc = NULL;
-	bool can_alloc = depot_flags & STACK_DEPOT_FLAG_CAN_ALLOC;
+	bool allow_spin = gfpflags_allow_spinning(alloc_flags);
+	bool can_alloc = (depot_flags & STACK_DEPOT_FLAG_CAN_ALLOC) && allow_spin;
 	unsigned long flags;
 	u32 hash;
 
@@ -630,7 +631,7 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries,
 		prealloc = page_address(page);
 	}
 
-	if (in_nmi()) {
+	if (in_nmi() || !allow_spin) {
 		/* We can never allocate in NMI context. */
 		WARN_ON_ONCE(can_alloc);
 		/* Best effort; bail if we fail to take the lock. */

mm/internal.h

Lines changed: 1 addition & 0 deletions
@@ -1187,6 +1187,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
 #define ALLOC_NOFRAGMENT	  0x0
 #endif
 #define ALLOC_HIGHATOMIC	0x200 /* Allows access to MIGRATE_HIGHATOMIC */
+#define ALLOC_TRYLOCK		0x400 /* Only use spin_trylock in allocation path */
 #define ALLOC_KSWAPD		0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */
 
 /* Flags that allow allocations below the min watermark. */

mm/page_alloc.c

Lines changed: 101 additions & 3 deletions
@@ -2307,7 +2307,11 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 	unsigned long flags;
 	int i;
 
-	spin_lock_irqsave(&zone->lock, flags);
+	if (!spin_trylock_irqsave(&zone->lock, flags)) {
+		if (unlikely(alloc_flags & ALLOC_TRYLOCK))
+			return 0;
+		spin_lock_irqsave(&zone->lock, flags);
+	}
 	for (i = 0; i < count; ++i) {
 		struct page *page = __rmqueue(zone, order, migratetype,
 								alloc_flags);
@@ -2907,7 +2911,11 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
 
 	do {
 		page = NULL;
-		spin_lock_irqsave(&zone->lock, flags);
+		if (!spin_trylock_irqsave(&zone->lock, flags)) {
+			if (unlikely(alloc_flags & ALLOC_TRYLOCK))
+				return NULL;
+			spin_lock_irqsave(&zone->lock, flags);
+		}
 		if (alloc_flags & ALLOC_HIGHATOMIC)
 			page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
 		if (!page) {
@@ -4511,7 +4519,12 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
 
 	might_alloc(gfp_mask);
 
-	if (should_fail_alloc_page(gfp_mask, order))
+	/*
+	 * Don't invoke should_fail logic, since it may call
+	 * get_random_u32() and printk() which need to spin_lock.
+	 */
+	if (!(*alloc_flags & ALLOC_TRYLOCK) &&
+	    should_fail_alloc_page(gfp_mask, order))
 		return false;
 
 	*alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, *alloc_flags);
@@ -7071,3 +7084,88 @@ static bool __free_unaccepted(struct page *page)
 }
 
 #endif /* CONFIG_UNACCEPTED_MEMORY */
+
+/**
+ * try_alloc_pages - opportunistic reentrant allocation from any context
+ * @nid: node to allocate from
+ * @order: allocation order size
+ *
+ * Allocates pages of a given order from the given node. This is safe to
+ * call from any context (from atomic, NMI, and also reentrant
+ * allocator -> tracepoint -> try_alloc_pages_noprof).
+ * Allocation is best effort and expected to fail easily, so nobody should
+ * rely on the success. Failures are not reported via warn_alloc().
+ * See the always-fail conditions below.
+ *
+ * Return: allocated page or NULL on failure.
+ */
+struct page *try_alloc_pages_noprof(int nid, unsigned int order)
+{
+	/*
+	 * Do not specify __GFP_DIRECT_RECLAIM, since direct reclaim is not allowed.
+	 * Do not specify __GFP_KSWAPD_RECLAIM either, since wake up of kswapd
+	 * is not safe in arbitrary context.
+	 *
+	 * These two are the conditions for gfpflags_allow_spinning() being true.
+	 *
+	 * Specify __GFP_NOWARN since failing try_alloc_pages() is not a reason
+	 * to warn. Also warn would trigger printk() which is unsafe from
+	 * various contexts. We cannot use printk_deferred_enter() to mitigate,
+	 * since the running context is unknown.
+	 *
+	 * Specify __GFP_ZERO to make sure that call to kmsan_alloc_page() below
+	 * is safe in any context. Also zeroing the page is mandatory for
+	 * BPF use cases.
+	 *
+	 * Though __GFP_NOMEMALLOC is not checked in the code path below,
+	 * specify it here to highlight that try_alloc_pages()
+	 * doesn't want to deplete reserves.
+	 */
+	gfp_t alloc_gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_NOMEMALLOC;
+	unsigned int alloc_flags = ALLOC_TRYLOCK;
+	struct alloc_context ac = { };
+	struct page *page;
+
+	/*
+	 * In PREEMPT_RT spin_trylock() will call raw_spin_lock() which is
+	 * unsafe in NMI. If spin_trylock() is called from hard IRQ the current
+	 * task may be waiting for one rt_spin_lock, but rt_spin_trylock() will
+	 * mark the task as the owner of another rt_spin_lock which will
+	 * confuse PI logic, so return immediately if called from hard IRQ or
+	 * NMI.
+	 *
+	 * Note, irqs_disabled() case is ok. This function can be called
+	 * from raw_spin_lock_irqsave region.
+	 */
+	if (IS_ENABLED(CONFIG_PREEMPT_RT) && (in_nmi() || in_hardirq()))
+		return NULL;
+	if (!pcp_allowed_order(order))
+		return NULL;
+
+#ifdef CONFIG_UNACCEPTED_MEMORY
+	/* Bailout, since try_to_accept_memory_one() needs to take a lock */
+	if (has_unaccepted_memory())
+		return NULL;
+#endif
+	/* Bailout, since _deferred_grow_zone() needs to take a lock */
+	if (deferred_pages_enabled())
+		return NULL;
+
+	if (nid == NUMA_NO_NODE)
+		nid = numa_node_id();
+
+	prepare_alloc_pages(alloc_gfp, order, nid, NULL, &ac,
+			    &alloc_gfp, &alloc_flags);
+
+	/*
+	 * Best effort allocation from percpu free list.
+	 * If it's empty attempt to spin_trylock zone->lock.
+	 */
+	page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac);
+
+	/* Unlike regular alloc_pages() there is no __alloc_pages_slowpath(). */
+
+	trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype);
+	kmsan_alloc_page(page, order, alloc_gfp);
+	return page;
+}
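
The two rmqueue hunks above share one shape. A distilled restatement of that shape (illustrative only, not kernel code; take_page_locked() is an invented helper): try the zone lock first, let opportunistic callers marked with ALLOC_TRYLOCK bail out instead of spinning, and have everyone else fall back to the normal blocking acquisition.

#include <linux/spinlock.h>
#include <linux/mmzone.h>

static struct page *take_page_locked(struct zone *zone, unsigned int order,
				     unsigned int alloc_flags)
{
	struct page *page = NULL;
	unsigned long flags;

	if (!spin_trylock_irqsave(&zone->lock, flags)) {
		if (alloc_flags & ALLOC_TRYLOCK)
			return NULL;	/* unknown context: never spin */
		spin_lock_irqsave(&zone->lock, flags);
	}
	/* ... pull a page off the free lists while holding zone->lock ... */
	spin_unlock_irqrestore(&zone->lock, flags);
	return page;
}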
