Skip to content

Commit d994788

Browse files
fdmanana authored and kdave committed
btrfs: fix lost prealloc extents beyond eof after full fsync
When doing a full fsync, if we have prealloc extents beyond (or at) eof, and the leaves that contain them were not modified in the current transaction, we end up not logging them. This results in losing those extents when we replay the log after a power failure, since the inode is truncated to the current value of the logged i_size.

Just like for the fast fsync path, we need to always log all prealloc extents starting at or beyond i_size. The fast fsync case was fixed in commit 471d557 ("Btrfs: fix loss of prealloc extents past i_size after fsync log replay") but it missed the full fsync path. The problem has existed since the very early days, when the log tree was added by commit e02119d ("Btrfs: Add a write ahead tree log to optimize synchronous operations").

Example reproducer:

  $ mkfs.btrfs -f /dev/sdc
  $ mount /dev/sdc /mnt

  # Create our test file with many file extent items, so that they span
  # several leaves of metadata, even if the node/page size is 64K. Use
  # direct IO and not fsync/O_SYNC because it's both faster and it avoids
  # clearing the full sync flag from the inode - we want the fsync below
  # to trigger the slow full sync code path.
  $ xfs_io -f -d -c "pwrite -b 4K 0 16M" /mnt/foo

  # Now add two preallocated extents to our file without extending the
  # file's size. One right at i_size, and another further beyond, leaving
  # a gap between the two prealloc extents.
  $ xfs_io -c "falloc -k 16M 1M" /mnt/foo
  $ xfs_io -c "falloc -k 20M 1M" /mnt/foo

  # Make sure everything is durably persisted and the transaction is
  # committed. This makes all created extents have a generation lower
  # than the generation of the transaction used by the next write and
  # fsync.
  $ sync

  # Now overwrite only the first extent, which will result in modifying
  # only the first leaf of metadata for our inode. Then fsync it. This
  # fsync will use the slow code path (inode full sync bit is set) because
  # it's the first fsync since the inode was created/loaded.
  $ xfs_io -c "pwrite 0 4K" -c "fsync" /mnt/foo

  # Extent list before power failure.
  $ xfs_io -c "fiemap -v" /mnt/foo
  /mnt/foo:
   EXT: FILE-OFFSET      BLOCK-RANGE          TOTAL FLAGS
     0: [0..7]:          2178048..2178055         8   0x0
     1: [8..16383]:      26632..43007         16376   0x0
     2: [16384..32767]:  2156544..2172927     16384   0x0
     3: [32768..34815]:  2172928..2174975      2048 0x800
     4: [34816..40959]:  hole                  6144
     5: [40960..43007]:  2174976..2177023      2048 0x801

  <power fail>

  # Mount fs again, trigger log replay.
  $ mount /dev/sdc /mnt

  # Extent list after power failure and log replay.
  $ xfs_io -c "fiemap -v" /mnt/foo
  /mnt/foo:
   EXT: FILE-OFFSET      BLOCK-RANGE          TOTAL FLAGS
     0: [0..7]:          2178048..2178055         8   0x0
     1: [8..16383]:      26632..43007         16376   0x0
     2: [16384..32767]:  2156544..2172927     16384   0x1

  # The prealloc extents at file offsets 16M and 20M are missing.

So fix this by calling btrfs_log_prealloc_extents() when we are doing a full fsync, so that we always log all prealloc extents beyond eof.

A test case for fstests will follow soon.

CC: stable@vger.kernel.org # 4.19+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
1 parent c992fa1 commit d994788

File tree

1 file changed

+31
-12
lines changed

1 file changed

+31
-12
lines changed

fs/btrfs/tree-log.c

Lines changed: 31 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -4635,7 +4635,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
46354635

46364636
/*
46374637
* Log all prealloc extents beyond the inode's i_size to make sure we do not
4638-
* lose them after doing a fast fsync and replaying the log. We scan the
4638+
* lose them after doing a full/fast fsync and replaying the log. We scan the
46394639
* subvolume's root instead of iterating the inode's extent map tree because
46404640
* otherwise we can log incorrect extent items based on extent map conversion.
46414641
* That can happen due to the fact that extent maps are merged when they
@@ -5414,6 +5414,7 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
54145414
struct btrfs_log_ctx *ctx,
54155415
bool *need_log_inode_item)
54165416
{
5417+
const u64 i_size = i_size_read(&inode->vfs_inode);
54175418
struct btrfs_root *root = inode->root;
54185419
int ins_start_slot = 0;
54195420
int ins_nr = 0;
@@ -5434,13 +5435,21 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
54345435
if (min_key->type > max_key->type)
54355436
break;
54365437

5437-
if (min_key->type == BTRFS_INODE_ITEM_KEY)
5438+
if (min_key->type == BTRFS_INODE_ITEM_KEY) {
54385439
*need_log_inode_item = false;
5439-
5440-
if ((min_key->type == BTRFS_INODE_REF_KEY ||
5441-
min_key->type == BTRFS_INODE_EXTREF_KEY) &&
5442-
inode->generation == trans->transid &&
5443-
!recursive_logging) {
5440+
} else if (min_key->type == BTRFS_EXTENT_DATA_KEY &&
5441+
min_key->offset >= i_size) {
5442+
/*
5443+
* Extents at and beyond eof are logged with
5444+
* btrfs_log_prealloc_extents().
5445+
* Only regular files have BTRFS_EXTENT_DATA_KEY keys,
5446+
* and no keys greater than that, so bail out.
5447+
*/
5448+
break;
5449+
} else if ((min_key->type == BTRFS_INODE_REF_KEY ||
5450+
min_key->type == BTRFS_INODE_EXTREF_KEY) &&
5451+
inode->generation == trans->transid &&
5452+
!recursive_logging) {
54445453
u64 other_ino = 0;
54455454
u64 other_parent = 0;
54465455

@@ -5471,10 +5480,8 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
54715480
btrfs_release_path(path);
54725481
goto next_key;
54735482
}
5474-
}
5475-
5476-
/* Skip xattrs, we log them later with btrfs_log_all_xattrs() */
5477-
if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
5483+
} else if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
5484+
/* Skip xattrs, logged later with btrfs_log_all_xattrs() */
54785485
if (ins_nr == 0)
54795486
goto next_slot;
54805487
ret = copy_items(trans, inode, dst_path, path,
@@ -5527,9 +5534,21 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
55275534
break;
55285535
}
55295536
}
5530-
if (ins_nr)
5537+
if (ins_nr) {
55315538
ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
55325539
ins_nr, inode_only, logged_isize);
5540+
if (ret)
5541+
return ret;
5542+
}
5543+
5544+
if (inode_only == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) {
5545+
/*
5546+
* Release the path because otherwise we might attempt to double
5547+
* lock the same leaf with btrfs_log_prealloc_extents() below.
5548+
*/
5549+
btrfs_release_path(path);
5550+
ret = btrfs_log_prealloc_extents(trans, inode, dst_path);
5551+
}
55335552

55345553
return ret;
55355554
}

0 commit comments

Comments
 (0)