Skip to content

Commit c223f37

Browse files
fdmanana authored and gregkh committed
btrfs: skip inodes without loaded extent maps when shrinking extent maps
[ Upstream commit c6c9c4d ]

If there are inodes that don't have any loaded extent maps, we end up grabbing a reference on them and later adding a delayed iput, which wakes up the cleaner and makes it do unnecessary work. This is common when for example the inodes were open only to run stat(2) or all their extent maps were already released through the folio release callback (btrfs_release_folio()) or released by a previous run of the shrinker, or directories which never have extent maps.

Reported-by: Ivan Shapovalov <intelfx@intelfx.name>
Tested-by: Ivan Shapovalov <intelfx@intelfx.name>
Link: https://lore.kernel.org/linux-btrfs/0414d690ac5680d0d77dfc930606cdc36e42e12f.camel@intelfx.name/
CC: stable@vger.kernel.org # 6.13+
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
1 parent 0d087de commit c223f37

File tree

1 file changed

+57
-21
lines changed

1 file changed

+57
-21
lines changed

fs/btrfs/extent_map.c

Lines changed: 57 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -1133,6 +1133,8 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c
11331133
long nr_dropped = 0;
11341134
struct rb_node *node;
11351135

1136+
lockdep_assert_held_write(&tree->lock);
1137+
11361138
/*
11371139
* Take the mmap lock so that we serialize with the inode logging phase
11381140
* of fsync because we may need to set the full sync flag on the inode,
@@ -1144,28 +1146,12 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c
11441146
* to find new extents, which may not be there yet because ordered
11451147
* extents haven't completed yet.
11461148
*
1147-
* We also do a try lock because otherwise we could deadlock. This is
1148-
* because the shrinker for this filesystem may be invoked while we are
1149-
* in a path that is holding the mmap lock in write mode. For example in
1150-
* a reflink operation while COWing an extent buffer, when allocating
1151-
* pages for a new extent buffer and under memory pressure, the shrinker
1152-
* may be invoked, and therefore we would deadlock by attempting to read
1153-
* lock the mmap lock while we are holding already a write lock on it.
1149+
* We also do a try lock because we don't want to block for too long and
1150+
* we are holding the extent map tree's lock in write mode.
11541151
*/
11551152
if (!down_read_trylock(&inode->i_mmap_lock))
11561153
return 0;
11571154

1158-
/*
1159-
* We want to be fast so if the lock is busy we don't want to spend time
1160-
* waiting for it - either some task is about to do IO for the inode or
1161-
* we may have another task shrinking extent maps, here in this code, so
1162-
* skip this inode.
1163-
*/
1164-
if (!write_trylock(&tree->lock)) {
1165-
up_read(&inode->i_mmap_lock);
1166-
return 0;
1167-
}
1168-
11691155
node = rb_first(&tree->root);
11701156
while (node) {
11711157
struct rb_node *next = rb_next(node);
@@ -1205,21 +1191,71 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c
12051191
break;
12061192
node = next;
12071193
}
1208-
write_unlock(&tree->lock);
12091194
up_read(&inode->i_mmap_lock);
12101195

12111196
return nr_dropped;
12121197
}
12131198

1199+
static struct btrfs_inode *find_first_inode_to_shrink(struct btrfs_root *root,
1200+
u64 min_ino)
1201+
{
1202+
struct btrfs_inode *inode;
1203+
unsigned long from = min_ino;
1204+
1205+
xa_lock(&root->inodes);
1206+
while (true) {
1207+
struct extent_map_tree *tree;
1208+
1209+
inode = xa_find(&root->inodes, &from, ULONG_MAX, XA_PRESENT);
1210+
if (!inode)
1211+
break;
1212+
1213+
tree = &inode->extent_tree;
1214+
1215+
/*
1216+
* We want to be fast so if the lock is busy we don't want to
1217+
* spend time waiting for it (some task is about to do IO for
1218+
* the inode).
1219+
*/
1220+
if (!write_trylock(&tree->lock))
1221+
goto next;
1222+
1223+
/*
1224+
* Skip inode if it doesn't have loaded extent maps, so we avoid
1225+
* getting a reference and doing an iput later. This includes
1226+
* cases like files that were opened for things like stat(2), or
1227+
* files with all extent maps previously released through the
1228+
* release folio callback (btrfs_release_folio()) or released in
1229+
* a previous run, or directories which never have extent maps.
1230+
*/
1231+
if (RB_EMPTY_ROOT(&tree->root)) {
1232+
write_unlock(&tree->lock);
1233+
goto next;
1234+
}
1235+
1236+
if (igrab(&inode->vfs_inode))
1237+
break;
1238+
1239+
write_unlock(&tree->lock);
1240+
next:
1241+
from = btrfs_ino(inode) + 1;
1242+
cond_resched_lock(&root->inodes.xa_lock);
1243+
}
1244+
xa_unlock(&root->inodes);
1245+
1246+
return inode;
1247+
}
1248+
12141249
static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx)
12151250
{
12161251
struct btrfs_inode *inode;
12171252
long nr_dropped = 0;
12181253
u64 min_ino = ctx->last_ino + 1;
12191254

1220-
inode = btrfs_find_first_inode(root, min_ino);
1255+
inode = find_first_inode_to_shrink(root, min_ino);
12211256
while (inode) {
12221257
nr_dropped += btrfs_scan_inode(inode, ctx);
1258+
write_unlock(&inode->extent_tree.lock);
12231259

12241260
min_ino = btrfs_ino(inode) + 1;
12251261
ctx->last_ino = btrfs_ino(inode);
@@ -1230,7 +1266,7 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx
12301266

12311267
cond_resched();
12321268

1233-
inode = btrfs_find_first_inode(root, min_ino);
1269+
inode = find_first_inode_to_shrink(root, min_ino);
12341270
}
12351271

12361272
if (inode) {

0 commit comments

Comments
 (0)