Skip to content

Commit 82708bb

Browse files
committed
Merge tag 'for-5.19-rc3-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs fixes from David Sterba:

 - zoned relocation fixes:
     - fix critical section end for extent writeback, this could lead to
       out of order write
     - prevent writing to previous data relocation block group if space
       gets low

 - reflink fixes:
     - fix race between reflinking and ordered extent completion
     - proper error handling when block reserve migration fails
     - add missing inode iversion/mtime/ctime updates on each iteration
       when replacing extents

 - fix deadlock when running fsync/fiemap/commit at the same time

 - fix false-positive KCSAN report regarding pid tracking for read locks
   and data race

 - minor documentation update and link to new site

* tag 'for-5.19-rc3-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  Documentation: update btrfs list of features and link to readthedocs.io
  btrfs: fix deadlock with fsync+fiemap+transaction commit
  btrfs: don't set lock_owner when locking extent buffer for reading
  btrfs: zoned: fix critical section of relocation inode writeback
  btrfs: zoned: prevent allocation from previous data relocation BG
  btrfs: do not BUG_ON() on failure to migrate space when replacing extents
  btrfs: add missing inode updates on each iteration when replacing extents
  btrfs: fix race between reflinking and ordered extent completion
2 parents c898c67 + 037e127 commit 82708bb

File tree

11 files changed

+158
-30
lines changed

11 files changed

+158
-30
lines changed

Documentation/filesystems/btrfs.rst

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,23 @@ The main Btrfs features include:
1919
* Subvolumes (separate internal filesystem roots)
2020
* Object level mirroring and striping
2121
* Checksums on data and metadata (multiple algorithms available)
22-
* Compression
22+
* Compression (multiple algorithms available)
23+
* Reflink, deduplication
24+
* Scrub (on-line checksum verification)
25+
* Hierarchical quota groups (subvolume and snapshot support)
2326
* Integrated multiple device support, with several raid algorithms
2427
* Offline filesystem check
25-
* Efficient incremental backup and FS mirroring
28+
* Efficient incremental backup and FS mirroring (send/receive)
29+
* Trim/discard
2630
* Online filesystem defragmentation
31+
* Swapfile support
32+
* Zoned mode
33+
* Read/write metadata verification
34+
* Online resize (shrink, grow)
2735

28-
For more information please refer to the wiki
36+
For more information please refer to the documentation site or wiki
37+
38+
https://btrfs.readthedocs.io
2939

3040
https://btrfs.wiki.kernel.org
3141

fs/btrfs/block-group.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@ struct btrfs_block_group {
104104
unsigned int relocating_repair:1;
105105
unsigned int chunk_item_inserted:1;
106106
unsigned int zone_is_active:1;
107+
unsigned int zoned_data_reloc_ongoing:1;
107108

108109
int disk_cache_state;
109110

fs/btrfs/ctree.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1330,6 +1330,8 @@ struct btrfs_replace_extent_info {
13301330
* existing extent into a file range.
13311331
*/
13321332
bool is_new_extent;
1333+
/* Indicate if we should update the inode's mtime and ctime. */
1334+
bool update_times;
13331335
/* Meaningful only if is_new_extent is true. */
13341336
int qgroup_reserved;
13351337
/*

fs/btrfs/extent-tree.c

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3832,7 +3832,7 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
38323832
block_group->start == fs_info->data_reloc_bg ||
38333833
fs_info->data_reloc_bg == 0);
38343834

3835-
if (block_group->ro) {
3835+
if (block_group->ro || block_group->zoned_data_reloc_ongoing) {
38363836
ret = 1;
38373837
goto out;
38383838
}
@@ -3894,8 +3894,24 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
38943894
out:
38953895
if (ret && ffe_ctl->for_treelog)
38963896
fs_info->treelog_bg = 0;
3897-
if (ret && ffe_ctl->for_data_reloc)
3897+
if (ret && ffe_ctl->for_data_reloc &&
3898+
fs_info->data_reloc_bg == block_group->start) {
3899+
/*
3900+
* Do not allow further allocations from this block group.
3901+
* Compared to increasing the ->ro, setting the
3902+
* ->zoned_data_reloc_ongoing flag still allows nocow
3903+
* writers to come in. See btrfs_inc_nocow_writers().
3904+
*
3905+
* We need to disable allocation here to avoid allocating a
3906+
* regular (non-relocation data) extent. With a mix of relocation
3907+
* extents and regular extents, we can dispatch WRITE commands
3908+
* (for relocation extents) and ZONE APPEND commands (for
3909+
* regular extents) at the same time to the same zone, which
3910+
* can easily break the write pointer.
3911+
*/
3912+
block_group->zoned_data_reloc_ongoing = 1;
38983913
fs_info->data_reloc_bg = 0;
3914+
}
38993915
spin_unlock(&fs_info->relocation_bg_lock);
39003916
spin_unlock(&fs_info->treelog_bg_lock);
39013917
spin_unlock(&block_group->lock);

fs/btrfs/extent_io.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5241,13 +5241,14 @@ int extent_writepages(struct address_space *mapping,
52415241
*/
52425242
btrfs_zoned_data_reloc_lock(BTRFS_I(inode));
52435243
ret = extent_write_cache_pages(mapping, wbc, &epd);
5244-
btrfs_zoned_data_reloc_unlock(BTRFS_I(inode));
52455244
ASSERT(ret <= 0);
52465245
if (ret < 0) {
5246+
btrfs_zoned_data_reloc_unlock(BTRFS_I(inode));
52475247
end_write_bio(&epd, ret);
52485248
return ret;
52495249
}
52505250
flush_write_bio(&epd);
5251+
btrfs_zoned_data_reloc_unlock(BTRFS_I(inode));
52515252
return ret;
52525253
}
52535254

fs/btrfs/file.c

Lines changed: 75 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -2323,25 +2323,62 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
23232323
*/
23242324
btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
23252325

2326-
if (ret != BTRFS_NO_LOG_SYNC) {
2326+
if (ret == BTRFS_NO_LOG_SYNC) {
2327+
ret = btrfs_end_transaction(trans);
2328+
goto out;
2329+
}
2330+
2331+
/* We successfully logged the inode, attempt to sync the log. */
2332+
if (!ret) {
2333+
ret = btrfs_sync_log(trans, root, &ctx);
23272334
if (!ret) {
2328-
ret = btrfs_sync_log(trans, root, &ctx);
2329-
if (!ret) {
2330-
ret = btrfs_end_transaction(trans);
2331-
goto out;
2332-
}
2333-
}
2334-
if (!full_sync) {
2335-
ret = btrfs_wait_ordered_range(inode, start, len);
2336-
if (ret) {
2337-
btrfs_end_transaction(trans);
2338-
goto out;
2339-
}
2335+
ret = btrfs_end_transaction(trans);
2336+
goto out;
23402337
}
2341-
ret = btrfs_commit_transaction(trans);
2342-
} else {
2338+
}
2339+
2340+
/*
2341+
* At this point we need to commit the transaction because we had
2342+
* btrfs_need_log_full_commit() or some other error.
2343+
*
2344+
* If we didn't do a full sync we have to stop the trans handle, wait on
2345+
* the ordered extents, start it again and commit the transaction. If
2346+
* we attempt to wait on the ordered extents here we could deadlock with
2347+
* something like fallocate() that is holding the extent lock trying to
2348+
* start a transaction while some other thread is trying to commit the
2349+
* transaction while we (fsync) are currently holding the transaction
2350+
* open.
2351+
*/
2352+
if (!full_sync) {
23432353
ret = btrfs_end_transaction(trans);
2354+
if (ret)
2355+
goto out;
2356+
ret = btrfs_wait_ordered_range(inode, start, len);
2357+
if (ret)
2358+
goto out;
2359+
2360+
/*
2361+
* This is safe to use here because we're only interested in
2362+
* making sure the transaction that had the ordered extents is
2363+
* committed. We aren't waiting on anything past this point,
2364+
* we're purely getting the transaction and committing it.
2365+
*/
2366+
trans = btrfs_attach_transaction_barrier(root);
2367+
if (IS_ERR(trans)) {
2368+
ret = PTR_ERR(trans);
2369+
2370+
/*
2371+
* We committed the transaction and there's no currently
2372+
* running transaction, this means everything we care
2373+
* about made it to disk and we are done.
2374+
*/
2375+
if (ret == -ENOENT)
2376+
ret = 0;
2377+
goto out;
2378+
}
23442379
}
2380+
2381+
ret = btrfs_commit_transaction(trans);
23452382
out:
23462383
ASSERT(list_empty(&ctx.list));
23472384
err = file_check_and_advance_wb_err(file);
@@ -2719,7 +2756,8 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
27192756

27202757
ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
27212758
min_size, false);
2722-
BUG_ON(ret);
2759+
if (WARN_ON(ret))
2760+
goto out_trans;
27232761
trans->block_rsv = rsv;
27242762

27252763
cur_offset = start;
@@ -2803,6 +2841,25 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
28032841
extent_info->file_offset += replace_len;
28042842
}
28052843

2844+
/*
2845+
* We are releasing our handle on the transaction, balance the
2846+
* dirty pages of the btree inode and flush delayed items, and
2847+
* then get a new transaction handle, which may now point to a
2848+
* new transaction in case someone else may have committed the
2849+
* transaction we used to replace/drop file extent items. So
2850+
* bump the inode's iversion and update mtime and ctime except
2851+
* if we are called from a dedupe context. This is because a
2852+
* power failure/crash may happen after the transaction is
2853+
* committed and before we finish replacing/dropping all the
2854+
* file extent items we need.
2855+
*/
2856+
inode_inc_iversion(&inode->vfs_inode);
2857+
2858+
if (!extent_info || extent_info->update_times) {
2859+
inode->vfs_inode.i_mtime = current_time(&inode->vfs_inode);
2860+
inode->vfs_inode.i_ctime = inode->vfs_inode.i_mtime;
2861+
}
2862+
28062863
ret = btrfs_update_inode(trans, root, inode);
28072864
if (ret)
28082865
break;
@@ -2819,7 +2876,8 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
28192876

28202877
ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
28212878
rsv, min_size, false);
2822-
BUG_ON(ret); /* shouldn't happen */
2879+
if (WARN_ON(ret))
2880+
break;
28232881
trans->block_rsv = rsv;
28242882

28252883
cur_offset = drop_args.drop_end;

fs/btrfs/inode.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3195,6 +3195,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
31953195
ordered_extent->file_offset,
31963196
ordered_extent->file_offset +
31973197
logical_len);
3198+
btrfs_zoned_release_data_reloc_bg(fs_info, ordered_extent->disk_bytenr,
3199+
ordered_extent->disk_num_bytes);
31983200
} else {
31993201
BUG_ON(root == fs_info->tree_root);
32003202
ret = insert_ordered_extent_file_extent(trans, ordered_extent);
@@ -9897,6 +9899,7 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent(
98979899
extent_info.file_offset = file_offset;
98989900
extent_info.extent_buf = (char *)&stack_fi;
98999901
extent_info.is_new_extent = true;
9902+
extent_info.update_times = true;
99009903
extent_info.qgroup_reserved = qgroup_released;
99019904
extent_info.insertions = 0;
99029905

fs/btrfs/locking.c

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,6 @@ void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting ne
4545
start_ns = ktime_get_ns();
4646

4747
down_read_nested(&eb->lock, nest);
48-
eb->lock_owner = current->pid;
4948
trace_btrfs_tree_read_lock(eb, start_ns);
5049
}
5150

@@ -62,7 +61,6 @@ void btrfs_tree_read_lock(struct extent_buffer *eb)
6261
int btrfs_try_tree_read_lock(struct extent_buffer *eb)
6362
{
6463
if (down_read_trylock(&eb->lock)) {
65-
eb->lock_owner = current->pid;
6664
trace_btrfs_try_tree_read_lock(eb);
6765
return 1;
6866
}
@@ -90,7 +88,6 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb)
9088
void btrfs_tree_read_unlock(struct extent_buffer *eb)
9189
{
9290
trace_btrfs_tree_read_unlock(eb);
93-
eb->lock_owner = 0;
9491
up_read(&eb->lock);
9592
}
9693

fs/btrfs/reflink.c

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -344,6 +344,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
344344
int ret;
345345
const u64 len = olen_aligned;
346346
u64 last_dest_end = destoff;
347+
u64 prev_extent_end = off;
347348

348349
ret = -ENOMEM;
349350
buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
@@ -363,7 +364,6 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
363364
key.offset = off;
364365

365366
while (1) {
366-
u64 next_key_min_offset = key.offset + 1;
367367
struct btrfs_file_extent_item *extent;
368368
u64 extent_gen;
369369
int type;
@@ -431,14 +431,21 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
431431
* The first search might have left us at an extent item that
432432
* ends before our target range's start, can happen if we have
433433
* holes and NO_HOLES feature enabled.
434+
*
435+
* Subsequent searches may leave us on a file range we have
436+
* processed before - this happens due to a race with ordered
437+
* extent completion for a file range that is outside our source
438+
* range, but that range was part of a file extent item that
439+
* also covered a leading part of our source range.
434440
*/
435-
if (key.offset + datal <= off) {
441+
if (key.offset + datal <= prev_extent_end) {
436442
path->slots[0]++;
437443
goto process_slot;
438444
} else if (key.offset >= off + len) {
439445
break;
440446
}
441-
next_key_min_offset = key.offset + datal;
447+
448+
prev_extent_end = key.offset + datal;
442449
size = btrfs_item_size(leaf, slot);
443450
read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot),
444451
size);
@@ -489,6 +496,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
489496
clone_info.file_offset = new_key.offset;
490497
clone_info.extent_buf = buf;
491498
clone_info.is_new_extent = false;
499+
clone_info.update_times = !no_time_update;
492500
ret = btrfs_replace_file_extents(BTRFS_I(inode), path,
493501
drop_start, new_key.offset + datal - 1,
494502
&clone_info, &trans);
@@ -550,7 +558,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
550558
break;
551559

552560
btrfs_release_path(path);
553-
key.offset = next_key_min_offset;
561+
key.offset = prev_extent_end;
554562

555563
if (fatal_signal_pending(current)) {
556564
ret = -EINTR;

fs/btrfs/zoned.c

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2139,3 +2139,30 @@ bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
21392139
factor = div64_u64(used * 100, total);
21402140
return factor >= fs_info->bg_reclaim_threshold;
21412141
}
2142+
2143+
void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical,
2144+
u64 length)
2145+
{
2146+
struct btrfs_block_group *block_group;
2147+
2148+
if (!btrfs_is_zoned(fs_info))
2149+
return;
2150+
2151+
block_group = btrfs_lookup_block_group(fs_info, logical);
2152+
/* It should be called on a previous data relocation block group. */
2153+
ASSERT(block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA));
2154+
2155+
spin_lock(&block_group->lock);
2156+
if (!block_group->zoned_data_reloc_ongoing)
2157+
goto out;
2158+
2159+
/* All relocation extents are written. */
2160+
if (block_group->start + block_group->alloc_offset == logical + length) {
2161+
/* Now, release this block group for further allocations. */
2162+
block_group->zoned_data_reloc_ongoing = 0;
2163+
}
2164+
2165+
out:
2166+
spin_unlock(&block_group->lock);
2167+
btrfs_put_block_group(block_group);
2168+
}

0 commit comments

Comments
 (0)