
Commit fef55c4

fdmanana authored and gregkh committed
btrfs: make the extent map shrinker run asynchronously as a work queue job
[ Upstream commit 1020443 ]

Currently the extent map shrinker is run synchronously for kswapd tasks that
end up calling the fs shrinker (fs/super.c:super_cache_scan()). This has some
disadvantages, and for some heavy workloads with memory pressure it can cause
delays and stalls that make a machine unresponsive for some periods. This
happens because:

1) We can have several kswapd tasks on machines with multiple NUMA zones, and
   running the extent map shrinker concurrently can cause high contention on
   some spin locks, namely the spin locks that protect the radix tree that
   tracks roots, the per root xarray that tracks open inodes and the list of
   delayed iputs. This not only delays the shrinker but also causes high CPU
   consumption and makes the task running the shrinker monopolize a core,
   resulting in the symptoms of an unresponsive system. This was noted in
   previous commits such as commit ae1e766 ("btrfs: only run the extent map
   shrinker from kswapd tasks");

2) The extent map shrinker's iteration over inodes can often be slow, even
   after changing the data structure that tracks open inodes for a root from
   a red black tree (up to kernel 6.10) to an xarray (kernel 6.10+). While the
   transition to the xarray made things a bit faster, it is still somewhat
   slow - for example in a test scenario with 10000 inodes that have no extent
   maps loaded, the extent map shrinker took between 5ms and 8ms, using a
   release, non-debug kernel. Iterating over the extent maps of an inode can
   also be slow if we have an inode with many thousands of extent maps, since
   we use a red black tree to track and search extent maps. So having the
   extent map shrinker run synchronously adds extra delay for other things a
   kswapd task does.

So make the extent map shrinker run asynchronously as a job for the system
unbounded workqueue, just like what we do for data and metadata space reclaim
jobs.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
1 parent c223f37 commit fef55c4

5 files changed, +52 -19 lines changed
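
The commit message above describes moving the scan out of the shrinker callback and deferring it to the system unbound workqueue. As a rough, self-contained sketch of that lifecycle (placeholder names, not the btrfs code), the pattern looks like this:

#include <linux/workqueue.h>

/* Placeholder structure standing in for btrfs_fs_info. */
struct demo_fs_info {
	struct work_struct reclaim_work;
};

/* Runs later on a worker thread, not in the caller's (e.g. kswapd's) context. */
static void demo_reclaim_worker(struct work_struct *work)
{
	struct demo_fs_info *info = container_of(work, struct demo_fs_info, reclaim_work);

	/* ... do the slow scanning and dropping here ... */
	(void)info;
}

/* Called once at mount/init time. */
static void demo_init_reclaim_work(struct demo_fs_info *info)
{
	INIT_WORK(&info->reclaim_work, demo_reclaim_worker);
}

/* Called from the shrinker path: returns immediately after queueing the job. */
static void demo_request_reclaim(struct demo_fs_info *info)
{
	queue_work(system_unbound_wq, &info->reclaim_work);
}

/* Called at unmount: make sure no worker is still running before teardown. */
static void demo_shutdown(struct demo_fs_info *info)
{
	cancel_work_sync(&info->reclaim_work);
}

The hunks below wire up exactly these steps for fs_info->extent_map_shrinker_work: INIT_WORK() during fs_info initialization, queue_work() from the shrinker entry point, and cancel_work_sync() in close_ctree().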

fs/btrfs/disk-io.c

Lines changed: 2 additions & 0 deletions
@@ -2785,6 +2785,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
 	btrfs_init_scrub(fs_info);
 	btrfs_init_balance(fs_info);
 	btrfs_init_async_reclaim_work(fs_info);
+	btrfs_init_extent_map_shrinker_work(fs_info);
 
 	rwlock_init(&fs_info->block_group_cache_lock);
 	fs_info->block_group_cache_tree = RB_ROOT_CACHED;
@@ -4334,6 +4335,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
 	cancel_work_sync(&fs_info->async_reclaim_work);
 	cancel_work_sync(&fs_info->async_data_reclaim_work);
 	cancel_work_sync(&fs_info->preempt_reclaim_work);
+	cancel_work_sync(&fs_info->extent_map_shrinker_work);
 
 	/* Cancel or finish ongoing discard work */
 	btrfs_discard_cleanup(fs_info);

fs/btrfs/extent_map.c

Lines changed: 43 additions & 8 deletions
@@ -1128,7 +1128,8 @@ struct btrfs_em_shrink_ctx {
 
 static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_ctx *ctx)
 {
-	const u64 cur_fs_gen = btrfs_get_fs_generation(inode->root->fs_info);
+	struct btrfs_fs_info *fs_info = inode->root->fs_info;
+	const u64 cur_fs_gen = btrfs_get_fs_generation(fs_info);
 	struct extent_map_tree *tree = &inode->extent_tree;
 	long nr_dropped = 0;
 	struct rb_node *node;
@@ -1187,7 +1188,8 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c
 		 * lock. This is to avoid slowing other tasks trying to take the
 		 * lock.
 		 */
-		if (need_resched() || rwlock_needbreak(&tree->lock))
+		if (need_resched() || rwlock_needbreak(&tree->lock) ||
+		    btrfs_fs_closing(fs_info))
 			break;
 		node = next;
 	}
@@ -1261,7 +1263,8 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx
 		ctx->last_ino = btrfs_ino(inode);
 		btrfs_add_delayed_iput(inode);
 
-		if (ctx->scanned >= ctx->nr_to_scan)
+		if (ctx->scanned >= ctx->nr_to_scan ||
+		    btrfs_fs_closing(inode->root->fs_info))
 			break;
 
 		cond_resched();
@@ -1290,16 +1293,19 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx
 	return nr_dropped;
 }
 
-long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
+static void btrfs_extent_map_shrinker_worker(struct work_struct *work)
 {
+	struct btrfs_fs_info *fs_info;
 	struct btrfs_em_shrink_ctx ctx;
 	u64 start_root_id;
 	u64 next_root_id;
 	bool cycled = false;
 	long nr_dropped = 0;
 
+	fs_info = container_of(work, struct btrfs_fs_info, extent_map_shrinker_work);
+
 	ctx.scanned = 0;
-	ctx.nr_to_scan = nr_to_scan;
+	ctx.nr_to_scan = atomic64_read(&fs_info->extent_map_shrinker_nr_to_scan);
 
 	/*
 	 * In case we have multiple tasks running this shrinker, make the next
@@ -1317,12 +1323,12 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
 	if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) {
 		s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
 
-		trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan,
+		trace_btrfs_extent_map_shrinker_scan_enter(fs_info, ctx.nr_to_scan,
							   nr, ctx.last_root,
							   ctx.last_ino);
 	}
 
-	while (ctx.scanned < ctx.nr_to_scan) {
+	while (ctx.scanned < ctx.nr_to_scan && !btrfs_fs_closing(fs_info)) {
 		struct btrfs_root *root;
 		unsigned long count;
 
@@ -1380,5 +1386,34 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
						  ctx.last_ino);
 	}
 
-	return nr_dropped;
+	atomic64_set(&fs_info->extent_map_shrinker_nr_to_scan, 0);
+}
+
+void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
+{
+	/*
+	 * Do nothing if the shrinker is already running. In case of high memory
+	 * pressure we can have a lot of tasks calling us and all passing the
+	 * same nr_to_scan value, but in reality we may need only to free
+	 * nr_to_scan extent maps (or less). In case we need to free more than
+	 * that, we will be called again by the fs shrinker, so no worries about
+	 * not doing enough work to reclaim memory from extent maps.
+	 * We can also be repeatedly called with the same nr_to_scan value
+	 * simply because the shrinker runs asynchronously and multiple calls
+	 * to this function are made before the shrinker does enough progress.
+	 *
+	 * That's why we set the atomic counter to nr_to_scan only if its
+	 * current value is zero, instead of incrementing the counter by
+	 * nr_to_scan.
+	 */
+	if (atomic64_cmpxchg(&fs_info->extent_map_shrinker_nr_to_scan, 0, nr_to_scan) != 0)
+		return;
+
+	queue_work(system_unbound_wq, &fs_info->extent_map_shrinker_work);
+}
+
+void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info)
+{
+	atomic64_set(&fs_info->extent_map_shrinker_nr_to_scan, 0);
+	INIT_WORK(&fs_info->extent_map_shrinker_work, btrfs_extent_map_shrinker_worker);
 }
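
The comment in btrfs_free_extent_maps() above explains why the pending scan amount is set with a compare-and-exchange from zero rather than accumulated. A minimal userspace illustration of that "claim the work only if idle" gate (hypothetical names, C11 atomics standing in for the kernel's atomic64_cmpxchg()) is:

#include <stdatomic.h>
#include <stdio.h>

static _Atomic long pending_nr_to_scan;

/* Returns 1 if this caller claimed the work (and would queue the job),
 * 0 if a scan request is already pending and nothing needs to be done. */
static int request_scan(long nr_to_scan)
{
	long expected = 0;

	/* Only succeeds when the counter is 0, i.e. no scan is pending. */
	if (!atomic_compare_exchange_strong(&pending_nr_to_scan, &expected, nr_to_scan))
		return 0;
	/* ... the real code would call queue_work() here ... */
	return 1;
}

int main(void)
{
	printf("%d\n", request_scan(128)); /* 1: first caller queues the job */
	printf("%d\n", request_scan(256)); /* 0: already pending, skipped */
	atomic_store(&pending_nr_to_scan, 0); /* the worker resets it when done */
	printf("%d\n", request_scan(64));  /* 1: idle again, queues once more */
	return 0;
}

Because the worker reads the counter once and resets it to zero when it finishes, later shrinker calls simply re-arm it; as the comment notes, under-scanning is fine since the fs shrinker will call btrfs_free_extent_maps() again.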

fs/btrfs/extent_map.h

Lines changed: 2 additions & 1 deletion
@@ -189,6 +189,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode,
 int btrfs_replace_extent_map_range(struct btrfs_inode *inode,
				    struct extent_map *new_em,
				    bool modified);
-long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan);
+void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan);
+void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info);
 
 #endif

fs/btrfs/fs.h

Lines changed: 2 additions & 0 deletions
@@ -639,6 +639,8 @@ struct btrfs_fs_info {
 	spinlock_t extent_map_shrinker_lock;
 	u64 extent_map_shrinker_last_root;
 	u64 extent_map_shrinker_last_ino;
+	atomic64_t extent_map_shrinker_nr_to_scan;
+	struct work_struct extent_map_shrinker_work;
 
 	/* Protected by 'trans_lock'. */
 	struct list_head dirty_cowonly_roots;

fs/btrfs/super.c

Lines changed: 3 additions & 10 deletions
@@ -28,7 +28,6 @@
 #include <linux/btrfs.h>
 #include <linux/security.h>
 #include <linux/fs_parser.h>
-#include <linux/swap.h>
 #include "messages.h"
 #include "delayed-inode.h"
 #include "ctree.h"
@@ -2399,16 +2398,10 @@ static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_cont
 	const long nr_to_scan = min_t(unsigned long, LONG_MAX, sc->nr_to_scan);
 	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
 
-	/*
-	 * We may be called from any task trying to allocate memory and we don't
-	 * want to slow it down with scanning and dropping extent maps. It would
-	 * also cause heavy lock contention if many tasks concurrently enter
-	 * here. Therefore only allow kswapd tasks to scan and drop extent maps.
-	 */
-	if (!current_is_kswapd())
-		return 0;
+	btrfs_free_extent_maps(fs_info, nr_to_scan);
 
-	return btrfs_free_extent_maps(fs_info, nr_to_scan);
+	/* The extent map shrinker runs asynchronously, so always return 0. */
+	return 0;
 }
 
 static const struct super_operations btrfs_super_ops = {
