Commit 01cd390

Revert "btrfs: turn fs_info member buffer_radix into XArray"
This reverts commit 8ee9226.

Revert the xarray conversion, there's a problem with potential
sleep-inside-spinlock [1] when calling xa_insert that triggers GFP_NOFS
allocation. The radix tree used the preloading mechanism to avoid
sleeping but this is not available in xarray.

Conversion from spin lock to mutex is possible but at time of rc6 is
riskier than a clean revert.

[1] https://lore.kernel.org/linux-btrfs/cover.1657097693.git.fdmanana@suse.com/

Reported-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
1 parent fc7cbcd commit 01cd390
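
The reasoning in the commit message can be illustrated with a short sketch of the insertion pattern the revert restores. This is not code from the commit: demo_tree, demo_lock and demo_insert() are hypothetical stand-ins for fs_info->buffer_radix, fs_info->buffer_lock and the extent buffer insertion paths; only the radix tree and spinlock APIs referenced are the real kernel ones.

/*
 * Minimal sketch (not part of this commit) of the preload pattern.
 * demo_tree, demo_lock and demo_insert() are made-up names.
 */
#include <linux/radix-tree.h>
#include <linux/spinlock.h>

static RADIX_TREE(demo_tree, GFP_ATOMIC);       /* plays the role of buffer_radix */
static DEFINE_SPINLOCK(demo_lock);              /* plays the role of buffer_lock */

static int demo_insert(unsigned long index, void *entry)
{
        int ret;

        /*
         * Preload while sleeping is still allowed: the radix tree nodes
         * are allocated up front, so the insertion below never allocates
         * while demo_lock is held.  Per the commit message, xa_insert()
         * has no preload equivalent, so its GFP_NOFS allocation can end
         * up sleeping where sleeping is not allowed.
         */
        ret = radix_tree_preload(GFP_NOFS);
        if (ret)
                return ret;

        spin_lock(&demo_lock);
        ret = radix_tree_insert(&demo_tree, index, entry);
        spin_unlock(&demo_lock);
        radix_tree_preload_end();

        return ret;     /* 0, or -EEXIST if the index is already populated */
}

Converting buffer_lock from a spinlock to a mutex would also sidestep the sleeping allocation, but as the message notes, at rc6 that is riskier than reverting to the preload pattern shown above.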

File tree

4 files changed, +97 -55 lines changed


fs/btrfs/ctree.h

Lines changed: 2 additions & 2 deletions
@@ -994,10 +994,10 @@ struct btrfs_fs_info {
 
         struct btrfs_delayed_root *delayed_root;
 
-        /* Extent buffer xarray */
+        /* Extent buffer radix tree */
         spinlock_t buffer_lock;
         /* Entries are eb->start / sectorsize */
-        struct xarray extent_buffers;
+        struct radix_tree_root buffer_radix;
 
         /* next backup root to be overwritten */
         int backup_root_index;

fs/btrfs/disk-io.c

Lines changed: 2 additions & 2 deletions
@@ -486,7 +486,7 @@ static int csum_dirty_subpage_buffers(struct btrfs_fs_info *fs_info,
                 uptodate = btrfs_subpage_test_uptodate(fs_info, page, cur,
                                                        fs_info->nodesize);
 
-                /* A dirty eb shouldn't disappear from extent_buffers */
+                /* A dirty eb shouldn't disappear from buffer_radix */
                 if (WARN_ON(!eb))
                         return -EUCLEAN;
 
@@ -3150,7 +3150,7 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
 void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
 {
         INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
-        xa_init_flags(&fs_info->extent_buffers, GFP_ATOMIC);
+        INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
         INIT_LIST_HEAD(&fs_info->trans_list);
         INIT_LIST_HEAD(&fs_info->dead_roots);
         INIT_LIST_HEAD(&fs_info->delayed_iputs);

fs/btrfs/extent_io.c

Lines changed: 74 additions & 48 deletions
@@ -2966,7 +2966,7 @@ static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page)
 }
 
 /*
- * Find extent buffer for a given bytenr.
+ * Find extent buffer for a givne bytenr.
  *
  * This is for end_bio_extent_readpage(), thus we can't do any unsafe locking
  * in endio context.
@@ -2985,9 +2985,11 @@ static struct extent_buffer *find_extent_buffer_readpage(
                 return (struct extent_buffer *)page->private;
         }
 
-        /* For subpage case, we need to lookup extent buffer xarray */
-        eb = xa_load(&fs_info->extent_buffers,
-                     bytenr >> fs_info->sectorsize_bits);
+        /* For subpage case, we need to lookup buffer radix tree */
+        rcu_read_lock();
+        eb = radix_tree_lookup(&fs_info->buffer_radix,
+                               bytenr >> fs_info->sectorsize_bits);
+        rcu_read_unlock();
         ASSERT(eb);
         return eb;
 }
@@ -4434,8 +4436,8 @@ static struct extent_buffer *find_extent_buffer_nolock(
         struct extent_buffer *eb;
 
         rcu_read_lock();
-        eb = xa_load(&fs_info->extent_buffers,
-                     start >> fs_info->sectorsize_bits);
+        eb = radix_tree_lookup(&fs_info->buffer_radix,
+                               start >> fs_info->sectorsize_bits);
         if (eb && atomic_inc_not_zero(&eb->refs)) {
                 rcu_read_unlock();
                 return eb;
@@ -6128,22 +6130,24 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
         if (!eb)
                 return ERR_PTR(-ENOMEM);
         eb->fs_info = fs_info;
-
-        do {
-                ret = xa_insert(&fs_info->extent_buffers,
-                                start >> fs_info->sectorsize_bits,
-                                eb, GFP_NOFS);
-                if (ret == -ENOMEM) {
-                        exists = ERR_PTR(ret);
+again:
+        ret = radix_tree_preload(GFP_NOFS);
+        if (ret) {
+                exists = ERR_PTR(ret);
+                goto free_eb;
+        }
+        spin_lock(&fs_info->buffer_lock);
+        ret = radix_tree_insert(&fs_info->buffer_radix,
+                                start >> fs_info->sectorsize_bits, eb);
+        spin_unlock(&fs_info->buffer_lock);
+        radix_tree_preload_end();
+        if (ret == -EEXIST) {
+                exists = find_extent_buffer(fs_info, start);
+                if (exists)
                         goto free_eb;
-                }
-                if (ret == -EBUSY) {
-                        exists = find_extent_buffer(fs_info, start);
-                        if (exists)
-                                goto free_eb;
-                }
-        } while (ret);
-
+                else
+                        goto again;
+        }
         check_buffer_tree_ref(eb);
         set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
 
@@ -6318,22 +6322,25 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
         }
         if (uptodate)
                 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
-
-        do {
-                ret = xa_insert(&fs_info->extent_buffers,
-                                start >> fs_info->sectorsize_bits,
-                                eb, GFP_NOFS);
-                if (ret == -ENOMEM) {
-                        exists = ERR_PTR(ret);
+again:
+        ret = radix_tree_preload(GFP_NOFS);
+        if (ret) {
+                exists = ERR_PTR(ret);
+                goto free_eb;
+        }
+
+        spin_lock(&fs_info->buffer_lock);
+        ret = radix_tree_insert(&fs_info->buffer_radix,
+                                start >> fs_info->sectorsize_bits, eb);
+        spin_unlock(&fs_info->buffer_lock);
+        radix_tree_preload_end();
+        if (ret == -EEXIST) {
+                exists = find_extent_buffer(fs_info, start);
+                if (exists)
                         goto free_eb;
-                }
-                if (ret == -EBUSY) {
-                        exists = find_extent_buffer(fs_info, start);
-                        if (exists)
-                                goto free_eb;
-                }
-        } while (ret);
-
+                else
+                        goto again;
+        }
         /* add one reference for the tree */
         check_buffer_tree_ref(eb);
         set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
@@ -6378,8 +6385,10 @@ static int release_extent_buffer(struct extent_buffer *eb)
 
                 spin_unlock(&eb->refs_lock);
 
-                xa_erase(&fs_info->extent_buffers,
-                         eb->start >> fs_info->sectorsize_bits);
+                spin_lock(&fs_info->buffer_lock);
+                radix_tree_delete(&fs_info->buffer_radix,
+                                  eb->start >> fs_info->sectorsize_bits);
+                spin_unlock(&fs_info->buffer_lock);
         } else {
                 spin_unlock(&eb->refs_lock);
         }
@@ -7324,25 +7333,42 @@ void memmove_extent_buffer(const struct extent_buffer *dst,
         }
 }
 
+#define GANG_LOOKUP_SIZE 16
 static struct extent_buffer *get_next_extent_buffer(
                 struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
 {
-        struct extent_buffer *eb;
-        unsigned long index;
+        struct extent_buffer *gang[GANG_LOOKUP_SIZE];
+        struct extent_buffer *found = NULL;
         u64 page_start = page_offset(page);
+        u64 cur = page_start;
 
         ASSERT(in_range(bytenr, page_start, PAGE_SIZE));
         lockdep_assert_held(&fs_info->buffer_lock);
 
-        xa_for_each_start(&fs_info->extent_buffers, index, eb,
-                          page_start >> fs_info->sectorsize_bits) {
-                if (in_range(eb->start, page_start, PAGE_SIZE))
-                        return eb;
-                else if (eb->start >= page_start + PAGE_SIZE)
-                        /* Already beyond page end */
-                        return NULL;
+        while (cur < page_start + PAGE_SIZE) {
+                int ret;
+                int i;
+
+                ret = radix_tree_gang_lookup(&fs_info->buffer_radix,
+                                (void **)gang, cur >> fs_info->sectorsize_bits,
+                                min_t(unsigned int, GANG_LOOKUP_SIZE,
+                                      PAGE_SIZE / fs_info->nodesize));
+                if (ret == 0)
+                        goto out;
+                for (i = 0; i < ret; i++) {
+                        /* Already beyond page end */
+                        if (gang[i]->start >= page_start + PAGE_SIZE)
+                                goto out;
+                        /* Found one */
+                        if (gang[i]->start >= bytenr) {
+                                found = gang[i];
+                                goto out;
+                        }
+                }
+                cur = gang[ret - 1]->start + gang[ret - 1]->len;
         }
-        return NULL;
+out:
+        return found;
 }
 
 static int try_release_subpage_extent_buffer(struct page *page)

fs/btrfs/tests/btrfs-tests.c

Lines changed: 19 additions & 3 deletions
@@ -150,8 +150,8 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize)
 
 void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
 {
-        unsigned long index;
-        struct extent_buffer *eb;
+        struct radix_tree_iter iter;
+        void **slot;
         struct btrfs_device *dev, *tmp;
 
         if (!fs_info)
@@ -163,9 +163,25 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
 
         test_mnt->mnt_sb->s_fs_info = NULL;
 
-        xa_for_each(&fs_info->extent_buffers, index, eb) {
+        spin_lock(&fs_info->buffer_lock);
+        radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) {
+                struct extent_buffer *eb;
+
+                eb = radix_tree_deref_slot_protected(slot, &fs_info->buffer_lock);
+                if (!eb)
+                        continue;
+                /* Shouldn't happen but that kind of thinking creates CVE's */
+                if (radix_tree_exception(eb)) {
+                        if (radix_tree_deref_retry(eb))
+                                slot = radix_tree_iter_retry(&iter);
+                        continue;
+                }
+                slot = radix_tree_iter_resume(slot, &iter);
+                spin_unlock(&fs_info->buffer_lock);
                 free_extent_buffer_stale(eb);
+                spin_lock(&fs_info->buffer_lock);
         }
+        spin_unlock(&fs_info->buffer_lock);
 
         btrfs_mapping_tree_free(&fs_info->mapping_tree);
         list_for_each_entry_safe(dev, tmp, &fs_info->fs_devices->devices,
