
Commit 3ee65c0

Merge tag 'for-5.17-rc6-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs fixes from David Sterba:
 "A few more fixes for various problems that have user visible effects
  or seem to be urgent:

  - fix corruption when combining DIO and non-blocking io_uring over
    multiple extents (seen on MariaDB)

  - fix relocation crash due to premature return from commit

  - fix quota deadlock between rescan and qgroup removal

  - fix item data bounds checks in tree-checker (found on a fuzzed image)

  - fix fsync of prealloc extents after EOF

  - add missing run of delayed items after unlink during log replay

  - don't start relocation until snapshot drop is finished

  - fix reversed condition for subpage writers locking

  - fix warning on page error"

* tag 'for-5.17-rc6-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  btrfs: fallback to blocking mode when doing async dio over multiple extents
  btrfs: add missing run of delayed items after unlink during log replay
  btrfs: qgroup: fix deadlock between rescan worker and remove qgroup
  btrfs: fix relocation crash due to premature return from btrfs_commit_transaction()
  btrfs: do not start relocation until in progress drops are done
  btrfs: tree-checker: use u64 for item data end to avoid overflow
  btrfs: do not WARN_ON() if we have PageError set
  btrfs: fix lost prealloc extents beyond eof after full fsync
  btrfs: subpage: fix a wrong check on subpage->writers
2 parents f81664f + ca93e44 commit 3ee65c0

13 files changed: +230 -28 lines changed


fs/btrfs/ctree.h

Lines changed: 10 additions & 0 deletions
@@ -602,6 +602,9 @@ enum {
         /* Indicate that we want the transaction kthread to commit right now. */
         BTRFS_FS_COMMIT_TRANS,
 
+        /* Indicate we have half completed snapshot deletions pending. */
+        BTRFS_FS_UNFINISHED_DROPS,
+
 #if BITS_PER_LONG == 32
         /* Indicate if we have error/warn message printed on 32bit systems */
         BTRFS_FS_32BIT_ERROR,
@@ -1106,8 +1109,15 @@ enum {
         BTRFS_ROOT_QGROUP_FLUSHING,
         /* We started the orphan cleanup for this root. */
         BTRFS_ROOT_ORPHAN_CLEANUP,
+        /* This root has a drop operation that was started previously. */
+        BTRFS_ROOT_UNFINISHED_DROP,
 };
 
+static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info)
+{
+        clear_and_wake_up_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags);
+}
+
 /*
  * Record swapped tree blocks of a subvolume tree for delayed subtree trace
  * code. For detail check comment in fs/btrfs/qgroup.c.

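The BTRFS_FS_UNFINISHED_DROPS bit and btrfs_wake_unfinished_drop() added here form a flag/wake pair: whoever finishes (or abandons) the pending snapshot drops clears the bit with clear_and_wake_up_bit(), and relocation sleeps on the same bit with wait_on_bit() (see fs/btrfs/relocation.c below). The following is a minimal userspace sketch of that pattern using pthreads rather than the kernel wait-bit API; the thread names and the sleep() are illustrative only, not btrfs code.

/* Userspace analogue (illustrative only): a "pending drops" flag that one
 * thread clears and wakes, while another thread blocks until it is clear.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static bool unfinished_drops = true; /* set at "mount" when a half-done drop is found */

/* Cleaner side: finish the pending work, then clear the flag and wake waiters. */
static void *cleaner(void *arg)
{
        sleep(1); /* stand-in for finishing the snapshot drop */
        pthread_mutex_lock(&lock);
        unfinished_drops = false;
        pthread_cond_broadcast(&cond);
        pthread_mutex_unlock(&lock);
        return NULL;
}

/* Relocation side: refuse to start until the flag is clear. */
static void *relocation(void *arg)
{
        pthread_mutex_lock(&lock);
        while (unfinished_drops)
                pthread_cond_wait(&cond, &lock);
        pthread_mutex_unlock(&lock);
        printf("drops finished, relocation may start\n");
        return NULL;
}

int main(void)
{
        pthread_t c, r;

        pthread_create(&r, NULL, relocation, NULL);
        pthread_create(&c, NULL, cleaner, NULL);
        pthread_join(r, NULL);
        pthread_join(c, NULL);
        return 0;
}
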
fs/btrfs/disk-io.c

Lines changed: 10 additions & 0 deletions
@@ -3813,6 +3813,10 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 
         set_bit(BTRFS_FS_OPEN, &fs_info->flags);
 
+        /* Kick the cleaner thread so it'll start deleting snapshots. */
+        if (test_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags))
+                wake_up_process(fs_info->cleaner_kthread);
+
 clear_oneshot:
         btrfs_clear_oneshot_options(fs_info);
         return 0;
@@ -4538,6 +4542,12 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
          */
         kthread_park(fs_info->cleaner_kthread);
 
+        /*
+         * If we had UNFINISHED_DROPS we could still be processing them, so
+         * clear that bit and wake up relocation so it can stop.
+         */
+        btrfs_wake_unfinished_drop(fs_info);
+
         /* wait for the qgroup rescan worker to stop */
         btrfs_qgroup_wait_for_completion(fs_info, false);

fs/btrfs/extent-tree.c

Lines changed: 10 additions & 0 deletions
@@ -5622,6 +5622,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
         int ret;
         int level;
         bool root_dropped = false;
+        bool unfinished_drop = false;
 
         btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);
 
@@ -5664,6 +5665,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
          * already dropped.
          */
         set_bit(BTRFS_ROOT_DELETING, &root->state);
+        unfinished_drop = test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state);
+
         if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
                 level = btrfs_header_level(root->node);
                 path->nodes[level] = btrfs_lock_root_node(root);
@@ -5838,6 +5841,13 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
         kfree(wc);
         btrfs_free_path(path);
 out:
+        /*
+         * We were an unfinished drop root, check to see if there are any
+         * pending, and if not clear and wake up any waiters.
+         */
+        if (!err && unfinished_drop)
+                btrfs_maybe_wake_unfinished_drop(fs_info);
+
         /*
          * So if we need to stop dropping the snapshot for whatever reason we
          * need to make sure to add it back to the dead root list so that we

fs/btrfs/extent_io.c

Lines changed: 13 additions & 3 deletions
@@ -6841,14 +6841,24 @@ static void assert_eb_page_uptodate(const struct extent_buffer *eb,
 {
         struct btrfs_fs_info *fs_info = eb->fs_info;
 
+        /*
+         * If we are using the commit root we could potentially clear a page
+         * Uptodate while we're using the extent buffer that we've previously
+         * looked up. We don't want to complain in this case, as the page was
+         * valid before, we just didn't write it out. Instead we want to catch
+         * the case where we didn't actually read the block properly, which
+         * would have !PageUptodate && !PageError, as we clear PageError before
+         * reading.
+         */
         if (fs_info->sectorsize < PAGE_SIZE) {
-                bool uptodate;
+                bool uptodate, error;
 
                 uptodate = btrfs_subpage_test_uptodate(fs_info, page,
                                                        eb->start, eb->len);
-                WARN_ON(!uptodate);
+                error = btrfs_subpage_test_error(fs_info, page, eb->start, eb->len);
+                WARN_ON(!uptodate && !error);
         } else {
-                WARN_ON(!PageUptodate(page));
+                WARN_ON(!PageUptodate(page) && !PageError(page));
         }
 }
 

fs/btrfs/inode.c

Lines changed: 28 additions & 0 deletions
@@ -7600,6 +7600,34 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
         }
 
         len = min(len, em->len - (start - em->start));
+
+        /*
+         * If we have a NOWAIT request and the range contains multiple extents
+         * (or a mix of extents and holes), then we return -EAGAIN to make the
+         * caller fallback to a context where it can do a blocking (without
+         * NOWAIT) request. This way we avoid doing partial IO and returning
+         * success to the caller, which is not optimal for writes and for reads
+         * it can result in unexpected behaviour for an application.
+         *
+         * When doing a read, because we use IOMAP_DIO_PARTIAL when calling
+         * iomap_dio_rw(), we can end up returning less data then what the caller
+         * asked for, resulting in an unexpected, and incorrect, short read.
+         * That is, the caller asked to read N bytes and we return less than that,
+         * which is wrong unless we are crossing EOF. This happens if we get a
+         * page fault error when trying to fault in pages for the buffer that is
+         * associated to the struct iov_iter passed to iomap_dio_rw(), and we
+         * have previously submitted bios for other extents in the range, in
+         * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of
+         * those bios have completed by the time we get the page fault error,
+         * which we return back to our caller - we should only return EIOCBQUEUED
+         * after we have submitted bios for all the extents in the range.
+         */
+        if ((flags & IOMAP_NOWAIT) && len < length) {
+                free_extent_map(em);
+                ret = -EAGAIN;
+                goto unlock_err;
+        }
+
         if (write) {
                 ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
                                                     start, len);

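The fix above makes btrfs return -EAGAIN for a NOWAIT direct IO request that would span more than one extent, so the caller (io_uring in the reported MariaDB case) retries in a blocking context instead of getting a short or corrupted result. The same try-nonblocking-then-fall-back idea can be shown at the syscall level; the sketch below is illustrative userspace code, not the io_uring or btrfs path itself, and assumes a glibc that exposes preadv2() and RWF_NOWAIT (Linux 4.14+).

/* Try a non-blocking read first and fall back to a blocking one on EAGAIN,
 * mirroring the fallback the DIO code now forces for multi-extent ranges.
 */
#define _GNU_SOURCE
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/uio.h>
#include <unistd.h>

static ssize_t read_prefer_nowait(int fd, void *buf, size_t len, off_t off)
{
        struct iovec iov = { .iov_base = buf, .iov_len = len };
        ssize_t ret = preadv2(fd, &iov, 1, off, RWF_NOWAIT);

        if (ret >= 0)
                return ret;                      /* fast, non-blocking path worked */
        if (errno != EAGAIN && errno != EOPNOTSUPP)
                return -1;                       /* real error, give up */

        /* Fall back to an ordinary blocking read. */
        return pread(fd, buf, len, off);
}

int main(int argc, char **argv)
{
        char buf[4096];
        FILE *f = argc > 1 ? fopen(argv[1], "r") : NULL;

        if (!f)
                return 1;
        ssize_t n = read_prefer_nowait(fileno(f), buf, sizeof(buf), 0);
        printf("read %zd bytes (%s)\n", n, n < 0 ? strerror(errno) : "ok");
        fclose(f);
        return 0;
}
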
fs/btrfs/qgroup.c

Lines changed: 8 additions & 1 deletion
@@ -1196,14 +1196,21 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
         if (!fs_info->quota_root)
                 goto out;
 
+        /*
+         * Unlock the qgroup_ioctl_lock mutex before waiting for the rescan worker to
+         * complete. Otherwise we can deadlock because btrfs_remove_qgroup() needs
+         * to lock that mutex while holding a transaction handle and the rescan
+         * worker needs to commit a transaction.
+         */
+        mutex_unlock(&fs_info->qgroup_ioctl_lock);
+
         /*
          * Request qgroup rescan worker to complete and wait for it. This wait
          * must be done before transaction start for quota disable since it may
          * deadlock with transaction by the qgroup rescan worker.
          */
         clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
         btrfs_qgroup_wait_for_completion(fs_info, false);
-        mutex_unlock(&fs_info->qgroup_ioctl_lock);
 
         /*
          * 1 For the root item

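The deadlock fixed here is a wait cycle: btrfs_quota_disable() held qgroup_ioctl_lock while waiting for the rescan worker, the rescan worker needed to commit a transaction, and btrfs_remove_qgroup() needed qgroup_ioctl_lock while holding a transaction handle. The simplified userspace sketch below is illustrative only and collapses the transaction leg into a single lock; it shows the shape of the bug and of the fix, which is to release the lock before waiting for the worker.

/* Do not wait for a worker to finish while still holding a lock that the
 * worker itself needs to make progress.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t ioctl_lock = PTHREAD_MUTEX_INITIALIZER;

static void *rescan_worker(void *arg)
{
        /* The worker needs the same lock to finish its final step. */
        pthread_mutex_lock(&ioctl_lock);
        puts("worker: finished");
        pthread_mutex_unlock(&ioctl_lock);
        return NULL;
}

int main(void)
{
        pthread_t worker;

        pthread_create(&worker, NULL, rescan_worker, NULL);

        pthread_mutex_lock(&ioctl_lock);
        /* ... do the part that genuinely needs the lock ... */

        /*
         * Fixed ordering: drop the lock *before* waiting for the worker.
         * Calling pthread_join() while still holding ioctl_lock can deadlock:
         * we would wait for a worker that is itself waiting for our lock.
         */
        pthread_mutex_unlock(&ioctl_lock);
        pthread_join(worker, NULL);

        puts("disable: done");
        return 0;
}
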
fs/btrfs/relocation.c

Lines changed: 13 additions & 0 deletions
@@ -3960,6 +3960,19 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
         int rw = 0;
         int err = 0;
 
+        /*
+         * This only gets set if we had a half-deleted snapshot on mount. We
+         * cannot allow relocation to start while we're still trying to clean up
+         * these pending deletions.
+         */
+        ret = wait_on_bit(&fs_info->flags, BTRFS_FS_UNFINISHED_DROPS, TASK_INTERRUPTIBLE);
+        if (ret)
+                return ret;
+
+        /* We may have been woken up by close_ctree, so bail if we're closing. */
+        if (btrfs_fs_closing(fs_info))
+                return -EINTR;
+
         bg = btrfs_lookup_block_group(fs_info, group_start);
         if (!bg)
                 return -ENOENT;

fs/btrfs/root-tree.c

Lines changed: 15 additions & 0 deletions
@@ -278,6 +278,21 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info)
 
                 WARN_ON(!test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state));
                 if (btrfs_root_refs(&root->root_item) == 0) {
+                        struct btrfs_key drop_key;
+
+                        btrfs_disk_key_to_cpu(&drop_key, &root->root_item.drop_progress);
+                        /*
+                         * If we have a non-zero drop_progress then we know we
+                         * made it partly through deleting this snapshot, and
+                         * thus we need to make sure we block any balance from
+                         * happening until this snapshot is completely dropped.
+                         */
+                        if (drop_key.objectid != 0 || drop_key.type != 0 ||
+                            drop_key.offset != 0) {
+                                set_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags);
+                                set_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state);
+                        }
+
                         set_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
                         btrfs_add_dead_root(root);
                 }

fs/btrfs/subpage.c

Lines changed: 1 addition & 1 deletion
@@ -736,7 +736,7 @@ void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page,
          * Since we own the page lock, no one else could touch subpage::writers
          * and we are safe to do several atomic operations without spinlock.
          */
-        if (atomic_read(&subpage->writers))
+        if (atomic_read(&subpage->writers) == 0)
                 /* No writers, locked by plain lock_page() */
                 return unlock_page(page);
 

fs/btrfs/transaction.c

Lines changed: 63 additions & 2 deletions
@@ -854,7 +854,37 @@ btrfs_attach_transaction_barrier(struct btrfs_root *root)
 static noinline void wait_for_commit(struct btrfs_transaction *commit,
                                      const enum btrfs_trans_state min_state)
 {
-        wait_event(commit->commit_wait, commit->state >= min_state);
+        struct btrfs_fs_info *fs_info = commit->fs_info;
+        u64 transid = commit->transid;
+        bool put = false;
+
+        while (1) {
+                wait_event(commit->commit_wait, commit->state >= min_state);
+                if (put)
+                        btrfs_put_transaction(commit);
+
+                if (min_state < TRANS_STATE_COMPLETED)
+                        break;
+
+                /*
+                 * A transaction isn't really completed until all of the
+                 * previous transactions are completed, but with fsync we can
+                 * end up with SUPER_COMMITTED transactions before a COMPLETED
+                 * transaction. Wait for those.
+                 */
+
+                spin_lock(&fs_info->trans_lock);
+                commit = list_first_entry_or_null(&fs_info->trans_list,
+                                                  struct btrfs_transaction,
+                                                  list);
+                if (!commit || commit->transid > transid) {
+                        spin_unlock(&fs_info->trans_lock);
+                        break;
+                }
+                refcount_inc(&commit->use_count);
+                put = true;
+                spin_unlock(&fs_info->trans_lock);
+        }
 }
 
 int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid)
@@ -1319,6 +1349,32 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans)
         return 0;
 }
 
+/*
+ * If we had a pending drop we need to see if there are any others left in our
+ * dead roots list, and if not clear our bit and wake any waiters.
+ */
+void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info)
+{
+        /*
+         * We put the drop in progress roots at the front of the list, so if the
+         * first entry doesn't have UNFINISHED_DROP set we can wake everybody
+         * up.
+         */
+        spin_lock(&fs_info->trans_lock);
+        if (!list_empty(&fs_info->dead_roots)) {
+                struct btrfs_root *root = list_first_entry(&fs_info->dead_roots,
+                                                           struct btrfs_root,
+                                                           root_list);
+                if (test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state)) {
+                        spin_unlock(&fs_info->trans_lock);
+                        return;
+                }
+        }
+        spin_unlock(&fs_info->trans_lock);
+
+        btrfs_wake_unfinished_drop(fs_info);
+}
+
 /*
  * dead roots are old snapshots that need to be deleted. This allocates
  * a dirty root struct and adds it into the list of dead roots that need to
@@ -1331,7 +1387,12 @@ void btrfs_add_dead_root(struct btrfs_root *root)
         spin_lock(&fs_info->trans_lock);
         if (list_empty(&root->root_list)) {
                 btrfs_grab_root(root);
-                list_add_tail(&root->root_list, &fs_info->dead_roots);
+
+                /* We want to process the partially complete drops first. */
+                if (test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state))
+                        list_add(&root->root_list, &fs_info->dead_roots);
+                else
+                        list_add_tail(&root->root_list, &fs_info->dead_roots);
         }
         spin_unlock(&fs_info->trans_lock);
 }

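The new loop in wait_for_commit() encodes the rule that a transaction is only truly completed once every earlier transaction has completed as well, since fsync can leave SUPER_COMMITTED transactions ahead of a COMPLETED one. Below is a reduced userspace sketch of that rule, illustrative only: the kernel walks fs_info->trans_list under trans_lock, whereas the sketch uses a fixed array and a condition variable.

/* A waiter asking for "transaction T completed" must not return while any
 * transaction with id <= T is still pending.
 */
#include <pthread.h>
#include <stdbool.h>

#define NR_TRANS 8

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static bool completed[NR_TRANS]; /* completed[i]: transaction i reached COMPLETED */

/* Called by the committer as each transaction completes, in any order. */
static void complete_transaction(int i)
{
        pthread_mutex_lock(&lock);
        completed[i] = true;
        pthread_cond_broadcast(&cond);
        pthread_mutex_unlock(&lock);
}

/* Returns only once 'transid' AND all earlier transactions have completed. */
static void wait_for_commit_completed(int transid)
{
        pthread_mutex_lock(&lock);
        for (int i = 0; i <= transid; i++) {
                while (!completed[i])
                        pthread_cond_wait(&cond, &lock);
        }
        pthread_mutex_unlock(&lock);
}

int main(void)
{
        /* Example: complete 0..3 out of order, then wait for transid 3. */
        complete_transaction(2);
        complete_transaction(0);
        complete_transaction(1);
        complete_transaction(3);
        wait_for_commit_completed(3);
        return 0;
}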