Skip to content
This repository was archived by the owner on Nov 8, 2023. It is now read-only.

Commit 19ca0d8

Browse files
committed
Merge tag 'for-6.10-rc2-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs fix from David Sterba: "A fix for fast fsync that needs to handle errors during writes after some COW failure so it does not lead to an inconsistent state" * tag 'for-6.10-rc2-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: btrfs: ensure fast fsync waits for ordered extents after a write failure
2 parents e20b269 + f13e01b commit 19ca0d8

File tree

3 files changed

+57
-0
lines changed

3 files changed

+57
-0
lines changed

fs/btrfs/btrfs_inode.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,16 @@ enum {
8989
BTRFS_INODE_FREE_SPACE_INODE,
9090
/* Set when there are no capabilities in XATTs for the inode. */
9191
BTRFS_INODE_NO_CAP_XATTR,
92+
/*
93+
* Set if an error happened when doing a COW write before submitting a
94+
* bio or during writeback. Used for both buffered writes and direct IO
95+
* writes. This is to signal a fast fsync that it has to wait for
96+
* ordered extents to complete and therefore not log extent maps that
97+
* point to unwritten extents (when an ordered extent completes and it
98+
* has the BTRFS_ORDERED_IOERR flag set, it drops extent maps in its
99+
* range).
100+
*/
101+
BTRFS_INODE_COW_WRITE_ERROR,
92102
};
93103

94104
/* in memory btrfs inode */

fs/btrfs/file.c

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1885,6 +1885,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
18851885
*/
18861886
if (full_sync || btrfs_is_zoned(fs_info)) {
18871887
ret = btrfs_wait_ordered_range(inode, start, len);
1888+
clear_bit(BTRFS_INODE_COW_WRITE_ERROR, &BTRFS_I(inode)->runtime_flags);
18881889
} else {
18891890
/*
18901891
* Get our ordered extents as soon as possible to avoid doing
@@ -1894,6 +1895,21 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
18941895
btrfs_get_ordered_extents_for_logging(BTRFS_I(inode),
18951896
&ctx.ordered_extents);
18961897
ret = filemap_fdatawait_range(inode->i_mapping, start, end);
1898+
if (ret)
1899+
goto out_release_extents;
1900+
1901+
/*
1902+
* Check and clear the BTRFS_INODE_COW_WRITE_ERROR now after
1903+
* starting and waiting for writeback, because for buffered IO
1904+
* it may have been set during the end IO callback
1905+
* (end_bbio_data_write() -> btrfs_finish_ordered_extent()) in
1906+
* case an error happened and we need to wait for ordered
1907+
* extents to complete so that any extent maps that point to
1908+
* unwritten locations are dropped and we don't log them.
1909+
*/
1910+
if (test_and_clear_bit(BTRFS_INODE_COW_WRITE_ERROR,
1911+
&BTRFS_I(inode)->runtime_flags))
1912+
ret = btrfs_wait_ordered_range(inode, start, len);
18971913
}
18981914

18991915
if (ret)

fs/btrfs/ordered-data.c

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -388,6 +388,37 @@ bool btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
388388
ret = can_finish_ordered_extent(ordered, page, file_offset, len, uptodate);
389389
spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
390390

391+
/*
392+
* If this is a COW write it means we created new extent maps for the
393+
* range and they point to unwritten locations if we got an error either
394+
* before submitting a bio or during IO.
395+
*
396+
* We have marked the ordered extent with BTRFS_ORDERED_IOERR, and we
397+
* are queuing its completion below. During completion, at
398+
* btrfs_finish_one_ordered(), we will drop the extent maps for the
399+
* unwritten extents.
400+
*
401+
* However because completion runs in a work queue we can end up having
402+
* a fast fsync running before that. In the case of direct IO, once we
403+
* unlock the inode the fsync might start, and we queue the completion
404+
* before unlocking the inode. In the case of buffered IO when writeback
405+
* finishes (end_bbio_data_write()) we queue the completion, so if the
406+
* writeback was triggered by a fast fsync, the fsync might start
407+
* logging before ordered extent completion runs in the work queue.
408+
*
409+
* The fast fsync will log file extent items based on the extent maps it
410+
* finds, so if by the time it collects extent maps the ordered extent
411+
* completion didn't happen yet, it will log file extent items that
412+
* point to unwritten extents, resulting in a corruption if a crash
413+
* happens and the log tree is replayed. Note that a fast fsync does not
414+
* wait for completion of ordered extents in order to reduce latency.
415+
*
416+
* Set a flag in the inode so that the next fast fsync will wait for
417+
* ordered extents to complete before starting to log.
418+
*/
419+
if (!uptodate && !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
420+
set_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags);
421+
391422
if (ret)
392423
btrfs_queue_ordered_fn(ordered);
393424
return ret;

0 commit comments

Comments
 (0)