Commit 8c57b68

Author and committer: Alexei Starovoitov
mm, bpf: Introduce free_pages_nolock()
Introduce free_pages_nolock() that can free pages without taking locks. It relies on trylock and can be called from any context. Since spin_trylock() cannot be used in PREEMPT_RT from hard IRQ or NMI context, it uses a lockless linked list to stash the pages, which will be freed by a subsequent free_pages() from a good context.

Do not use the llist unconditionally. BPF maps continuously allocate/free, so we cannot unconditionally delay the freeing to the llist. When the memory becomes free, make it available to the kernel and BPF users right away if possible, and fall back to the llist as the last resort.

Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/r/20250222024427.30294-4-alexei.starovoitov@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
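
For orientation, the caller-side picture is simple: a context that must never spin on zone or pcp locks hands pages back through the new helper, and any deferred pages are drained by the next freeing call that does acquire zone->lock. A minimal sketch under those assumptions follows; free_pages_nolock() is the function introduced by this commit, while drop_page_any_context() is a hypothetical caller, not part of the patch.

#include <linux/gfp.h>
#include <linux/mm_types.h>

/*
 * Hypothetical caller (not from this commit) that may run in NMI,
 * hard IRQ, or with a raw spinlock held, so it must never spin on
 * zone/pcp locks.
 */
static void drop_page_any_context(struct page *page, unsigned int order)
{
	/*
	 * free_pages_nolock() only trylocks; if the locks are contended
	 * the page is stashed on zone->trylock_free_pages and freed later
	 * by a free performed from a context that can take zone->lock.
	 */
	free_pages_nolock(page, order);
}
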
1 parent 97769a5 commit 8c57b68

File tree: 6 files changed, +100 −14 lines


include/linux/gfp.h

Lines changed: 1 addition & 0 deletions

@@ -379,6 +379,7 @@ __meminit void *alloc_pages_exact_nid_noprof(int nid, size_t size, gfp_t gfp_mas
 	__get_free_pages((gfp_mask) | GFP_DMA, (order))
 
 extern void __free_pages(struct page *page, unsigned int order);
+extern void free_pages_nolock(struct page *page, unsigned int order);
 extern void free_pages(unsigned long addr, unsigned int order);
 
 #define __free_page(page) __free_pages((page), 0)

include/linux/mm_types.h

Lines changed: 4 additions & 0 deletions

@@ -99,6 +99,10 @@ struct page {
 				/* Or, free page */
 				struct list_head buddy_list;
 				struct list_head pcp_list;
+				struct {
+					struct llist_node pcp_llist;
+					unsigned int order;
+				};
 			};
 			/* See page-flags.h for PAGE_MAPPING_FLAGS */
 			struct address_space *mapping;

include/linux/mmzone.h

Lines changed: 3 additions & 0 deletions

@@ -972,6 +972,9 @@ struct zone {
 	/* Primarily protects free_area */
 	spinlock_t		lock;
 
+	/* Pages to be freed when next trylock succeeds */
+	struct llist_head	trylock_free_pages;
+
 	/* Write-intensive fields used by compaction and vmstats. */
 	CACHELINE_PADDING(_pad2_);
 
lib/stackdepot.c

Lines changed: 4 additions & 1 deletion

@@ -672,7 +672,10 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries,
 exit:
 	if (prealloc) {
 		/* Stack depot didn't use this memory, free it. */
-		free_pages((unsigned long)prealloc, DEPOT_POOL_ORDER);
+		if (!allow_spin)
+			free_pages_nolock(virt_to_page(prealloc), DEPOT_POOL_ORDER);
+		else
+			free_pages((unsigned long)prealloc, DEPOT_POOL_ORDER);
 	}
 	if (found)
 		handle = found->handle.handle;

mm/page_alloc.c

Lines changed: 81 additions & 12 deletions

@@ -88,6 +88,9 @@ typedef int __bitwise fpi_t;
  */
 #define FPI_TO_TAIL		((__force fpi_t)BIT(1))
 
+/* Free the page without taking locks. Rely on trylock only. */
+#define FPI_TRYLOCK		((__force fpi_t)BIT(2))
+
 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
 static DEFINE_MUTEX(pcp_batch_high_lock);
 #define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8)

@@ -1249,13 +1252,44 @@ static void split_large_buddy(struct zone *zone, struct page *page,
 	} while (1);
 }
 
+static void add_page_to_zone_llist(struct zone *zone, struct page *page,
+				   unsigned int order)
+{
+	/* Remember the order */
+	page->order = order;
+	/* Add the page to the free list */
+	llist_add(&page->pcp_llist, &zone->trylock_free_pages);
+}
+
 static void free_one_page(struct zone *zone, struct page *page,
 			  unsigned long pfn, unsigned int order,
 			  fpi_t fpi_flags)
 {
+	struct llist_head *llhead;
 	unsigned long flags;
 
-	spin_lock_irqsave(&zone->lock, flags);
+	if (!spin_trylock_irqsave(&zone->lock, flags)) {
+		if (unlikely(fpi_flags & FPI_TRYLOCK)) {
+			add_page_to_zone_llist(zone, page, order);
+			return;
+		}
+		spin_lock_irqsave(&zone->lock, flags);
+	}
+
+	/* The lock succeeded. Process deferred pages. */
+	llhead = &zone->trylock_free_pages;
+	if (unlikely(!llist_empty(llhead) && !(fpi_flags & FPI_TRYLOCK))) {
+		struct llist_node *llnode;
+		struct page *p, *tmp;
+
+		llnode = llist_del_all(llhead);
+		llist_for_each_entry_safe(p, tmp, llnode, pcp_llist) {
+			unsigned int p_order = p->order;
+
+			split_large_buddy(zone, p, page_to_pfn(p), p_order, fpi_flags);
+			__count_vm_events(PGFREE, 1 << p_order);
+		}
+	}
 	split_large_buddy(zone, page, pfn, order, fpi_flags);
 	spin_unlock_irqrestore(&zone->lock, flags);
 

@@ -2599,7 +2633,7 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
 
 static void free_frozen_page_commit(struct zone *zone,
 		struct per_cpu_pages *pcp, struct page *page, int migratetype,
-		unsigned int order)
+		unsigned int order, fpi_t fpi_flags)
 {
 	int high, batch;
 	int pindex;

@@ -2634,6 +2668,14 @@ static void free_frozen_page_commit(struct zone *zone,
 	}
 	if (pcp->free_count < (batch << CONFIG_PCP_BATCH_SCALE_MAX))
 		pcp->free_count += (1 << order);
+
+	if (unlikely(fpi_flags & FPI_TRYLOCK)) {
+		/*
+		 * Do not attempt to take a zone lock. Let pcp->count get
+		 * over high mark temporarily.
+		 */
+		return;
+	}
 	high = nr_pcp_high(pcp, zone, batch, free_high);
 	if (pcp->count >= high) {
 		free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high),

@@ -2648,7 +2690,8 @@ static void free_frozen_page_commit(struct zone *zone,
 /*
  * Free a pcp page
  */
-void free_frozen_pages(struct page *page, unsigned int order)
+static void __free_frozen_pages(struct page *page, unsigned int order,
+				fpi_t fpi_flags)
 {
 	unsigned long __maybe_unused UP_flags;
 	struct per_cpu_pages *pcp;

@@ -2657,7 +2700,7 @@ void free_frozen_pages(struct page *page, unsigned int order)
 	int migratetype;
 
 	if (!pcp_allowed_order(order)) {
-		__free_pages_ok(page, order, FPI_NONE);
+		__free_pages_ok(page, order, fpi_flags);
 		return;
 	}
 

@@ -2675,23 +2718,33 @@ void free_frozen_pages(struct page *page, unsigned int order)
 	migratetype = get_pfnblock_migratetype(page, pfn);
 	if (unlikely(migratetype >= MIGRATE_PCPTYPES)) {
 		if (unlikely(is_migrate_isolate(migratetype))) {
-			free_one_page(zone, page, pfn, order, FPI_NONE);
+			free_one_page(zone, page, pfn, order, fpi_flags);
 			return;
 		}
 		migratetype = MIGRATE_MOVABLE;
 	}
 
+	if (unlikely((fpi_flags & FPI_TRYLOCK) && IS_ENABLED(CONFIG_PREEMPT_RT)
+		     && (in_nmi() || in_hardirq()))) {
+		add_page_to_zone_llist(zone, page, order);
+		return;
+	}
 	pcp_trylock_prepare(UP_flags);
 	pcp = pcp_spin_trylock(zone->per_cpu_pageset);
 	if (pcp) {
-		free_frozen_page_commit(zone, pcp, page, migratetype, order);
+		free_frozen_page_commit(zone, pcp, page, migratetype, order, fpi_flags);
 		pcp_spin_unlock(pcp);
 	} else {
-		free_one_page(zone, page, pfn, order, FPI_NONE);
+		free_one_page(zone, page, pfn, order, fpi_flags);
 	}
 	pcp_trylock_finish(UP_flags);
 }
 
+void free_frozen_pages(struct page *page, unsigned int order)
+{
+	__free_frozen_pages(page, order, FPI_NONE);
+}
+
 /*
  * Free a batch of folios
  */

@@ -2780,7 +2833,7 @@ void free_unref_folios(struct folio_batch *folios)
 
 		trace_mm_page_free_batched(&folio->page);
 		free_frozen_page_commit(zone, pcp, &folio->page, migratetype,
-					order);
+					order, FPI_NONE);
 	}
 
 	if (pcp) {

@@ -4822,9 +4875,10 @@ unsigned long get_zeroed_page_noprof(gfp_t gfp_mask)
 EXPORT_SYMBOL(get_zeroed_page_noprof);
 
 /**
- * __free_pages - Free pages allocated with alloc_pages().
+ * ___free_pages - Free pages allocated with alloc_pages().
  * @page: The page pointer returned from alloc_pages().
  * @order: The order of the allocation.
+ * @fpi_flags: Free Page Internal flags.
  *
  * This function can free multi-page allocations that are not compound
 * pages. It does not check that the @order passed in matches that of

@@ -4841,22 +4895,37 @@ EXPORT_SYMBOL(get_zeroed_page_noprof);
 * Context: May be called in interrupt context or while holding a normal
 * spinlock, but not in NMI context or while holding a raw spinlock.
 */
-void __free_pages(struct page *page, unsigned int order)
+static void ___free_pages(struct page *page, unsigned int order,
+			  fpi_t fpi_flags)
 {
 	/* get PageHead before we drop reference */
 	int head = PageHead(page);
 	struct alloc_tag *tag = pgalloc_tag_get(page);
 
 	if (put_page_testzero(page))
-		free_frozen_pages(page, order);
+		__free_frozen_pages(page, order, fpi_flags);
 	else if (!head) {
 		pgalloc_tag_sub_pages(tag, (1 << order) - 1);
 		while (order-- > 0)
-			free_frozen_pages(page + (1 << order), order);
+			__free_frozen_pages(page + (1 << order), order,
+					    fpi_flags);
 	}
 }
+void __free_pages(struct page *page, unsigned int order)
+{
+	___free_pages(page, order, FPI_NONE);
+}
 EXPORT_SYMBOL(__free_pages);
 
+/*
+ * Can be called while holding raw_spin_lock or from IRQ and NMI for any
+ * page type (not only those that came from try_alloc_pages)
+ */
+void free_pages_nolock(struct page *page, unsigned int order)
+{
+	___free_pages(page, order, FPI_TRYLOCK);
+}
+
 void free_pages(unsigned long addr, unsigned int order)
 {
 	if (addr != 0) {
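
The free_one_page() change above is an instance of the kernel's usual lockless-list deferral pattern: a caller that cannot spin pushes its work onto an llist, and the next caller that does take the lock drains it. A minimal standalone sketch of that pattern is below; it uses only the real <linux/llist.h> and spinlock APIs, while struct item, deferred_head, and release_item() are illustrative names, not from this commit.

#include <linux/llist.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct item {
	struct llist_node node;
	int payload;
};

static LLIST_HEAD(deferred_head);	/* plays the role of zone->trylock_free_pages */
static DEFINE_SPINLOCK(big_lock);	/* plays the role of zone->lock */

static void release_item(struct item *it, bool trylock_only)
{
	unsigned long flags;

	if (!spin_trylock_irqsave(&big_lock, flags)) {
		if (trylock_only) {
			/* Cannot spin here: stash the item for a later caller. */
			llist_add(&it->node, &deferred_head);
			return;
		}
		spin_lock_irqsave(&big_lock, flags);
	}

	/* Lock held: drain anything stashed by earlier trylock-only callers. */
	if (!trylock_only && !llist_empty(&deferred_head)) {
		struct llist_node *first = llist_del_all(&deferred_head);
		struct item *p, *tmp;

		llist_for_each_entry_safe(p, tmp, first, node)
			kfree(p);
	}
	kfree(it);
	spin_unlock_irqrestore(&big_lock, flags);
}
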

mm/page_owner.c

Lines changed: 7 additions & 1 deletion

@@ -294,7 +294,13 @@ void __reset_page_owner(struct page *page, unsigned short order)
 	page_owner = get_page_owner(page_ext);
 	alloc_handle = page_owner->handle;
 
-	handle = save_stack(GFP_NOWAIT | __GFP_NOWARN);
+	/*
+	 * Do not specify GFP_NOWAIT to make gfpflags_allow_spinning() == false
+	 * to prevent issues in stack_depot_save().
+	 * This is similar to try_alloc_pages() gfp flags, but only used
+	 * to signal stack_depot to avoid spin_locks.
+	 */
+	handle = save_stack(__GFP_NOWARN);
 	__update_page_owner_free_handle(page_ext, handle, order, current->pid,
 					current->tgid, free_ts_nsec);
 	page_ext_put(page_ext);
