
Commit 31639fd

melver authored and akpm00 committed
stackdepot: use variable size records for non-evictable entries
With the introduction of stack depot evictions, each stack record is now
fixed size, so that future reuse after an eviction can safely store
differently sized stack traces. In all cases that do not make use of
evictions, this wastes lots of space.

Fix it by re-introducing variable size stack records (up to the max
allowed size) for entries that will never be evicted. We know if an
entry will never be evicted if the flag STACK_DEPOT_FLAG_GET is not
provided, since a later stack_depot_put() attempt is undefined behavior.

With my current kernel config that enables KASAN and also SLUB owner
tracking, I observe (after a kernel boot) a whopping reduction of 296
stack depot pools, which translates into 4736 KiB saved. The savings
here are from SLUB owner tracking only, because KASAN generic mode
still uses refcounting.

Before:

  pools: 893
  allocations: 29841
  frees: 6524
  in_use: 23317
  freelist_size: 3454

After:

  pools: 597
  refcounted_allocations: 17547
  refcounted_frees: 6477
  refcounted_in_use: 11070
  freelist_size: 3497
  persistent_count: 12163
  persistent_bytes: 1717008

[elver@google.com: fix -Wstringop-overflow warning]
  Link: https://lore.kernel.org/all/20240201135747.18eca98e@canb.auug.org.au/
  Link: https://lkml.kernel.org/r/20240201090434.1762340-1-elver@google.com
  Link: https://lore.kernel.org/all/CABXGCsOzpRPZGg23QqJAzKnqkZPKzvieeg=W7sgjgi3q0pBo0g@mail.gmail.com/
Link: https://lkml.kernel.org/r/20240129100708.39460-1-elver@google.com
Link: https://lore.kernel.org/all/CABXGCsOzpRPZGg23QqJAzKnqkZPKzvieeg=W7sgjgi3q0pBo0g@mail.gmail.com/
Fixes: 108be8d ("lib/stackdepot: allow users to evict stack traces")
Signed-off-by: Marco Elver <elver@google.com>
Reviewed-by: Andrey Konovalov <andreyknvl@gmail.com>
Tested-by: Mikhail Gavrilov <mikhail.v.gavrilov@gmail.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
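For context, a minimal usage sketch of the distinction the patch relies on (not part of this commit; the wrapper function and frame count below are made up for illustration, the stack depot calls are the existing API from <linux/stackdepot.h>): passing STACK_DEPOT_FLAG_GET yields a refcounted, evictable record that must eventually be released with stack_depot_put(), while omitting the flag yields a persistent record which, after this patch, is stored at its actual (aligned) size rather than in a fixed maximum-size slot.

/*
 * Illustrative sketch only -- save_example() and the depth of 16 are
 * assumptions, not code from this commit.
 */
#include <linux/kernel.h>
#include <linux/stackdepot.h>
#include <linux/stacktrace.h>

static depot_stack_handle_t save_example(gfp_t gfp, bool evictable)
{
        unsigned long entries[16];      /* arbitrary example depth */
        unsigned int nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
        depot_flags_t flags = STACK_DEPOT_FLAG_CAN_ALLOC;

        if (evictable)
                /* Refcounted record; must later be paired with stack_depot_put(). */
                flags |= STACK_DEPOT_FLAG_GET;

        /*
         * Without STACK_DEPOT_FLAG_GET the record can never be evicted, so
         * after this patch it only occupies the (aligned) space its
         * nr_entries frames actually need.
         */
        return stack_depot_save_flags(entries, nr_entries, gfp, flags);
}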
1 parent 2597c99 commit 31639fd

File tree: 2 files changed, +130 / -123 lines changed


include/linux/poison.h
Lines changed: 3 additions & 0 deletions

@@ -92,4 +92,7 @@
 /********** VFS **********/
 #define VFS_PTR_POISON ((void *)(0xF5 + POISON_POINTER_DELTA))
 
+/********** lib/stackdepot.c **********/
+#define STACK_DEPOT_POISON ((void *)(0xD390 + POISON_POINTER_DELTA))
+
 #endif

lib/stackdepot.c
Lines changed: 127 additions & 123 deletions

@@ -22,6 +22,7 @@
 #include <linux/list.h>
 #include <linux/mm.h>
 #include <linux/mutex.h>
+#include <linux/poison.h>
 #include <linux/printk.h>
 #include <linux/rculist.h>
 #include <linux/rcupdate.h>
@@ -43,17 +44,7 @@
 #define DEPOT_OFFSET_BITS (DEPOT_POOL_ORDER + PAGE_SHIFT - DEPOT_STACK_ALIGN)
 #define DEPOT_POOL_INDEX_BITS (DEPOT_HANDLE_BITS - DEPOT_OFFSET_BITS - \
                                STACK_DEPOT_EXTRA_BITS)
-#if IS_ENABLED(CONFIG_KMSAN) && CONFIG_STACKDEPOT_MAX_FRAMES >= 32
-/*
- * KMSAN is frequently used in fuzzing scenarios and thus saves a lot of stack
- * traces. As KMSAN does not support evicting stack traces from the stack
- * depot, the stack depot capacity might be reached quickly with large stack
- * records. Adjust the maximum number of stack depot pools for this case.
- */
-#define DEPOT_POOLS_CAP (8192 * (CONFIG_STACKDEPOT_MAX_FRAMES / 16))
-#else
 #define DEPOT_POOLS_CAP 8192
-#endif
 #define DEPOT_MAX_POOLS \
         (((1LL << (DEPOT_POOL_INDEX_BITS)) < DEPOT_POOLS_CAP) ? \
          (1LL << (DEPOT_POOL_INDEX_BITS)) : DEPOT_POOLS_CAP)
@@ -93,9 +84,6 @@ struct stack_record {
         };
 };
 
-#define DEPOT_STACK_RECORD_SIZE \
-        ALIGN(sizeof(struct stack_record), 1 << DEPOT_STACK_ALIGN)
-
 static bool stack_depot_disabled;
 static bool __stack_depot_early_init_requested __initdata = IS_ENABLED(CONFIG_STACKDEPOT_ALWAYS_INIT);
 static bool __stack_depot_early_init_passed __initdata;
@@ -121,32 +109,31 @@ static void *stack_pools[DEPOT_MAX_POOLS];
 static void *new_pool;
 /* Number of pools in stack_pools. */
 static int pools_num;
+/* Offset to the unused space in the currently used pool. */
+static size_t pool_offset = DEPOT_POOL_SIZE;
 /* Freelist of stack records within stack_pools. */
 static LIST_HEAD(free_stacks);
-/*
- * Stack depot tries to keep an extra pool allocated even before it runs out
- * of space in the currently used pool. This flag marks whether this extra pool
- * needs to be allocated. It has the value 0 when either an extra pool is not
- * yet allocated or if the limit on the number of pools is reached.
- */
-static bool new_pool_required = true;
 /* The lock must be held when performing pool or freelist modifications. */
 static DEFINE_RAW_SPINLOCK(pool_lock);
 
 /* Statistics counters for debugfs. */
 enum depot_counter_id {
-        DEPOT_COUNTER_ALLOCS,
-        DEPOT_COUNTER_FREES,
-        DEPOT_COUNTER_INUSE,
+        DEPOT_COUNTER_REFD_ALLOCS,
+        DEPOT_COUNTER_REFD_FREES,
+        DEPOT_COUNTER_REFD_INUSE,
         DEPOT_COUNTER_FREELIST_SIZE,
+        DEPOT_COUNTER_PERSIST_COUNT,
+        DEPOT_COUNTER_PERSIST_BYTES,
         DEPOT_COUNTER_COUNT,
 };
 static long counters[DEPOT_COUNTER_COUNT];
 static const char *const counter_names[] = {
-        [DEPOT_COUNTER_ALLOCS]          = "allocations",
-        [DEPOT_COUNTER_FREES]           = "frees",
-        [DEPOT_COUNTER_INUSE]           = "in_use",
+        [DEPOT_COUNTER_REFD_ALLOCS]     = "refcounted_allocations",
+        [DEPOT_COUNTER_REFD_FREES]      = "refcounted_frees",
+        [DEPOT_COUNTER_REFD_INUSE]      = "refcounted_in_use",
         [DEPOT_COUNTER_FREELIST_SIZE]   = "freelist_size",
+        [DEPOT_COUNTER_PERSIST_COUNT]   = "persistent_count",
+        [DEPOT_COUNTER_PERSIST_BYTES]   = "persistent_bytes",
 };
 static_assert(ARRAY_SIZE(counter_names) == DEPOT_COUNTER_COUNT);
 
@@ -294,48 +281,52 @@ int stack_depot_init(void)
 EXPORT_SYMBOL_GPL(stack_depot_init);
 
 /*
- * Initializes new stack depot @pool, release all its entries to the freelist,
- * and update the list of pools.
+ * Initializes new stack pool, and updates the list of pools.
  */
-static void depot_init_pool(void *pool)
+static bool depot_init_pool(void **prealloc)
 {
-        int offset;
-
         lockdep_assert_held(&pool_lock);
 
-        /* Initialize handles and link stack records into the freelist. */
-        for (offset = 0; offset <= DEPOT_POOL_SIZE - DEPOT_STACK_RECORD_SIZE;
-             offset += DEPOT_STACK_RECORD_SIZE) {
-                struct stack_record *stack = pool + offset;
-
-                stack->handle.pool_index = pools_num;
-                stack->handle.offset = offset >> DEPOT_STACK_ALIGN;
-                stack->handle.extra = 0;
-
-                /*
-                 * Stack traces of size 0 are never saved, and we can simply use
-                 * the size field as an indicator if this is a new unused stack
-                 * record in the freelist.
-                 */
-                stack->size = 0;
+        if (unlikely(pools_num >= DEPOT_MAX_POOLS)) {
+                /* Bail out if we reached the pool limit. */
+                WARN_ON_ONCE(pools_num > DEPOT_MAX_POOLS); /* should never happen */
+                WARN_ON_ONCE(!new_pool); /* to avoid unnecessary pre-allocation */
+                WARN_ONCE(1, "Stack depot reached limit capacity");
+                return false;
+        }
 
-                INIT_LIST_HEAD(&stack->hash_list);
-                /*
-                 * Add to the freelist front to prioritize never-used entries:
-                 * required in case there are entries in the freelist, but their
-                 * RCU cookie still belongs to the current RCU grace period
-                 * (there can still be concurrent readers).
-                 */
-                list_add(&stack->free_list, &free_stacks);
-                counters[DEPOT_COUNTER_FREELIST_SIZE]++;
+        if (!new_pool && *prealloc) {
+                /* We have preallocated memory, use it. */
+                WRITE_ONCE(new_pool, *prealloc);
+                *prealloc = NULL;
         }
 
+        if (!new_pool)
+                return false; /* new_pool and *prealloc are NULL */
+
         /* Save reference to the pool to be used by depot_fetch_stack(). */
-        stack_pools[pools_num] = pool;
+        stack_pools[pools_num] = new_pool;
+
+        /*
+         * Stack depot tries to keep an extra pool allocated even before it runs
+         * out of space in the currently used pool.
+         *
+         * To indicate that a new preallocation is needed new_pool is reset to
+         * NULL; do not reset to NULL if we have reached the maximum number of
+         * pools.
+         */
+        if (pools_num < DEPOT_MAX_POOLS)
+                WRITE_ONCE(new_pool, NULL);
+        else
+                WRITE_ONCE(new_pool, STACK_DEPOT_POISON);
 
         /* Pairs with concurrent READ_ONCE() in depot_fetch_stack(). */
         WRITE_ONCE(pools_num, pools_num + 1);
         ASSERT_EXCLUSIVE_WRITER(pools_num);
+
+        pool_offset = 0;
+
+        return true;
 }
 
 /* Keeps the preallocated memory to be used for a new stack depot pool. */
@@ -347,63 +338,51 @@ static void depot_keep_new_pool(void **prealloc)
          * If a new pool is already saved or the maximum number of
          * pools is reached, do not use the preallocated memory.
          */
-        if (!new_pool_required)
+        if (new_pool)
                 return;
 
-        /*
-         * Use the preallocated memory for the new pool
-         * as long as we do not exceed the maximum number of pools.
-         */
-        if (pools_num < DEPOT_MAX_POOLS) {
-                new_pool = *prealloc;
-                *prealloc = NULL;
-        }
-
-        /*
-         * At this point, either a new pool is kept or the maximum
-         * number of pools is reached. In either case, take note that
-         * keeping another pool is not required.
-         */
-        WRITE_ONCE(new_pool_required, false);
+        WRITE_ONCE(new_pool, *prealloc);
+        *prealloc = NULL;
 }
 
 /*
- * Try to initialize a new stack depot pool from either a previous or the
- * current pre-allocation, and release all its entries to the freelist.
+ * Try to initialize a new stack record from the current pool, a cached pool, or
+ * the current pre-allocation.
  */
-static bool depot_try_init_pool(void **prealloc)
+static struct stack_record *depot_pop_free_pool(void **prealloc, size_t size)
 {
+        struct stack_record *stack;
+        void *current_pool;
+        u32 pool_index;
+
         lockdep_assert_held(&pool_lock);
 
-        /* Check if we have a new pool saved and use it. */
-        if (new_pool) {
-                depot_init_pool(new_pool);
-                new_pool = NULL;
+        if (pool_offset + size > DEPOT_POOL_SIZE) {
+                if (!depot_init_pool(prealloc))
+                        return NULL;
+        }
 
-                /* Take note that we might need a new new_pool. */
-                if (pools_num < DEPOT_MAX_POOLS)
-                        WRITE_ONCE(new_pool_required, true);
+        if (WARN_ON_ONCE(pools_num < 1))
+                return NULL;
+        pool_index = pools_num - 1;
+        current_pool = stack_pools[pool_index];
+        if (WARN_ON_ONCE(!current_pool))
+                return NULL;
 
-                return true;
-        }
+        stack = current_pool + pool_offset;
 
-        /* Bail out if we reached the pool limit. */
-        if (unlikely(pools_num >= DEPOT_MAX_POOLS)) {
-                WARN_ONCE(1, "Stack depot reached limit capacity");
-                return false;
-        }
+        /* Pre-initialize handle once. */
+        stack->handle.pool_index = pool_index;
+        stack->handle.offset = pool_offset >> DEPOT_STACK_ALIGN;
+        stack->handle.extra = 0;
+        INIT_LIST_HEAD(&stack->hash_list);
 
-        /* Check if we have preallocated memory and use it. */
-        if (*prealloc) {
-                depot_init_pool(*prealloc);
-                *prealloc = NULL;
-                return true;
-        }
+        pool_offset += size;
 
-        return false;
+        return stack;
 }
 
-/* Try to find next free usable entry. */
+/* Try to find next free usable entry from the freelist. */
 static struct stack_record *depot_pop_free(void)
 {
         struct stack_record *stack;
@@ -420,7 +399,7 @@ static struct stack_record *depot_pop_free(void)
          * check the first entry.
          */
         stack = list_first_entry(&free_stacks, struct stack_record, free_list);
-        if (stack->size && !poll_state_synchronize_rcu(stack->rcu_state))
+        if (!poll_state_synchronize_rcu(stack->rcu_state))
                 return NULL;
 
         list_del(&stack->free_list);
@@ -429,48 +408,73 @@ static struct stack_record *depot_pop_free(void)
         return stack;
 }
 
+static inline size_t depot_stack_record_size(struct stack_record *s, unsigned int nr_entries)
+{
+        const size_t used = flex_array_size(s, entries, nr_entries);
+        const size_t unused = sizeof(s->entries) - used;
+
+        WARN_ON_ONCE(sizeof(s->entries) < used);
+
+        return ALIGN(sizeof(struct stack_record) - unused, 1 << DEPOT_STACK_ALIGN);
+}
+
 /* Allocates a new stack in a stack depot pool. */
 static struct stack_record *
-depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc)
+depot_alloc_stack(unsigned long *entries, unsigned int nr_entries, u32 hash, depot_flags_t flags, void **prealloc)
 {
-        struct stack_record *stack;
+        struct stack_record *stack = NULL;
+        size_t record_size;
 
         lockdep_assert_held(&pool_lock);
 
         /* This should already be checked by public API entry points. */
-        if (WARN_ON_ONCE(!size))
+        if (WARN_ON_ONCE(!nr_entries))
                 return NULL;
 
-        /* Check if we have a stack record to save the stack trace. */
-        stack = depot_pop_free();
-        if (!stack) {
-                /* No usable entries on the freelist - try to refill the freelist. */
-                if (!depot_try_init_pool(prealloc))
-                        return NULL;
+        /* Limit number of saved frames to CONFIG_STACKDEPOT_MAX_FRAMES. */
+        if (nr_entries > CONFIG_STACKDEPOT_MAX_FRAMES)
+                nr_entries = CONFIG_STACKDEPOT_MAX_FRAMES;
+
+        if (flags & STACK_DEPOT_FLAG_GET) {
+                /*
+                 * Evictable entries have to allocate the max. size so they may
+                 * safely be re-used by differently sized allocations.
+                 */
+                record_size = depot_stack_record_size(stack, CONFIG_STACKDEPOT_MAX_FRAMES);
                 stack = depot_pop_free();
-                if (WARN_ON(!stack))
-                        return NULL;
+        } else {
+                record_size = depot_stack_record_size(stack, nr_entries);
         }
 
-        /* Limit number of saved frames to CONFIG_STACKDEPOT_MAX_FRAMES. */
-        if (size > CONFIG_STACKDEPOT_MAX_FRAMES)
-                size = CONFIG_STACKDEPOT_MAX_FRAMES;
+        if (!stack) {
+                stack = depot_pop_free_pool(prealloc, record_size);
+                if (!stack)
+                        return NULL;
+        }
 
         /* Save the stack trace. */
         stack->hash = hash;
-        stack->size = size;
-        /* stack->handle is already filled in by depot_init_pool(). */
-        refcount_set(&stack->count, 1);
-        memcpy(stack->entries, entries, flex_array_size(stack, entries, size));
+        stack->size = nr_entries;
+        /* stack->handle is already filled in by depot_pop_free_pool(). */
+        memcpy(stack->entries, entries, flex_array_size(stack, entries, nr_entries));
+
+        if (flags & STACK_DEPOT_FLAG_GET) {
+                refcount_set(&stack->count, 1);
+                counters[DEPOT_COUNTER_REFD_ALLOCS]++;
+                counters[DEPOT_COUNTER_REFD_INUSE]++;
+        } else {
+                /* Warn on attempts to switch to refcounting this entry. */
+                refcount_set(&stack->count, REFCOUNT_SATURATED);
+                counters[DEPOT_COUNTER_PERSIST_COUNT]++;
+                counters[DEPOT_COUNTER_PERSIST_BYTES] += record_size;
+        }
 
         /*
          * Let KMSAN know the stored stack record is initialized. This shall
          * prevent false positive reports if instrumented code accesses it.
          */
-        kmsan_unpoison_memory(stack, DEPOT_STACK_RECORD_SIZE);
+        kmsan_unpoison_memory(stack, record_size);
 
-        counters[DEPOT_COUNTER_ALLOCS]++;
-        counters[DEPOT_COUNTER_INUSE]++;
         return stack;
 }
 
@@ -538,8 +542,8 @@ static void depot_free_stack(struct stack_record *stack)
         list_add_tail(&stack->free_list, &free_stacks);
 
         counters[DEPOT_COUNTER_FREELIST_SIZE]++;
-        counters[DEPOT_COUNTER_FREES]++;
-        counters[DEPOT_COUNTER_INUSE]--;
+        counters[DEPOT_COUNTER_REFD_FREES]++;
+        counters[DEPOT_COUNTER_REFD_INUSE]--;
 
         printk_deferred_exit();
         raw_spin_unlock_irqrestore(&pool_lock, flags);
@@ -660,7 +664,7 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries,
          * Allocate memory for a new pool if required now:
          * we won't be able to do that under the lock.
          */
-        if (unlikely(can_alloc && READ_ONCE(new_pool_required))) {
+        if (unlikely(can_alloc && !READ_ONCE(new_pool))) {
                 /*
                  * Zero out zone modifiers, as we don't have specific zone
                  * requirements. Keep the flags related to allocation in atomic
@@ -681,7 +685,7 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries,
         found = find_stack(bucket, entries, nr_entries, hash, depot_flags);
         if (!found) {
                 struct stack_record *new =
-                        depot_alloc_stack(entries, nr_entries, hash, &prealloc);
+                        depot_alloc_stack(entries, nr_entries, hash, depot_flags, &prealloc);
 
                 if (new) {
                         /*