Skip to content

Commit a4c7631

Browse files
committed
Merge tag 'bcachefs-2024-08-21' of https://github.com/koverstreet/bcachefs
Push bcachefs fixes from Kent Overstreet: "The data corruption in the buffered write path is troubling; inode lock should not have been able to cause that... - Fix a rare data corruption in the rebalance path, caught as a nonce inconsistency on encrypted filesystems - Revert lockless buffered write path - Mark more errors as autofix" * tag 'bcachefs-2024-08-21' of https://github.com/koverstreet/bcachefs: bcachefs: Mark more errors as autofix bcachefs: Revert lockless buffered IO path bcachefs: Fix bch2_extents_match() false positive bcachefs: Fix failure to return error in data_update_index_update()
2 parents 6cd90e5 + 3d3020c commit a4c7631

File tree

5 files changed

+68
-116
lines changed

5 files changed

+68
-116
lines changed

fs/bcachefs/data_update.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -337,6 +337,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
337337
printbuf_exit(&buf);
338338

339339
bch2_fatal_error(c);
340+
ret = -EIO;
340341
goto out;
341342
}
342343

fs/bcachefs/errcode.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -257,7 +257,6 @@
257257
x(BCH_ERR_nopromote, nopromote_in_flight) \
258258
x(BCH_ERR_nopromote, nopromote_no_writes) \
259259
x(BCH_ERR_nopromote, nopromote_enomem) \
260-
x(0, need_inode_lock) \
261260
x(0, invalid_snapshot_node) \
262261
x(0, option_needs_open_fs)
263262

fs/bcachefs/extents.c

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -929,8 +929,29 @@ bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2)
929929
bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
930930
if (p1.ptr.dev == p2.ptr.dev &&
931931
p1.ptr.gen == p2.ptr.gen &&
932+
933+
/*
934+
* This checks that the two pointers point
935+
* to the same region on disk - adjusting
936+
* for the difference in where the extents
937+
* start, since one may have been trimmed:
938+
*/
932939
(s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
933-
(s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k))
940+
(s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k) &&
941+
942+
/*
943+
* This additionally checks that the
944+
* extents overlap on disk, since the
945+
* previous check may trigger spuriously
946+
* when one extent is immediately partially
947+
* overwritten with another extent (so that
948+
* on disk they are adjacent) and
949+
* compression is in use:
950+
*/
951+
((p1.ptr.offset >= p2.ptr.offset &&
952+
p1.ptr.offset < p2.ptr.offset + p2.crc.compressed_size) ||
953+
(p2.ptr.offset >= p1.ptr.offset &&
954+
p2.ptr.offset < p1.ptr.offset + p1.crc.compressed_size)))
934955
return true;
935956

936957
return false;

fs/bcachefs/fs-io-buffered.c

Lines changed: 40 additions & 109 deletions
Original file line numberDiff line numberDiff line change
@@ -802,8 +802,7 @@ static noinline void folios_trunc(folios *fs, struct folio **fi)
802802
static int __bch2_buffered_write(struct bch_inode_info *inode,
803803
struct address_space *mapping,
804804
struct iov_iter *iter,
805-
loff_t pos, unsigned len,
806-
bool inode_locked)
805+
loff_t pos, unsigned len)
807806
{
808807
struct bch_fs *c = inode->v.i_sb->s_fs_info;
809808
struct bch2_folio_reservation res;
@@ -827,15 +826,6 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
827826

828827
BUG_ON(!fs.nr);
829828

830-
/*
831-
* If we're not using the inode lock, we need to lock all the folios for
832-
* atomiticity of writes vs. other writes:
833-
*/
834-
if (!inode_locked && folio_end_pos(darray_last(fs)) < end) {
835-
ret = -BCH_ERR_need_inode_lock;
836-
goto out;
837-
}
838-
839829
f = darray_first(fs);
840830
if (pos != folio_pos(f) && !folio_test_uptodate(f)) {
841831
ret = bch2_read_single_folio(f, mapping);
@@ -932,10 +922,8 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
932922
end = pos + copied;
933923

934924
spin_lock(&inode->v.i_lock);
935-
if (end > inode->v.i_size) {
936-
BUG_ON(!inode_locked);
925+
if (end > inode->v.i_size)
937926
i_size_write(&inode->v, end);
938-
}
939927
spin_unlock(&inode->v.i_lock);
940928

941929
f_pos = pos;
@@ -979,68 +967,12 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
979967
struct file *file = iocb->ki_filp;
980968
struct address_space *mapping = file->f_mapping;
981969
struct bch_inode_info *inode = file_bch_inode(file);
982-
loff_t pos;
983-
bool inode_locked = false;
984-
ssize_t written = 0, written2 = 0, ret = 0;
985-
986-
/*
987-
* We don't take the inode lock unless i_size will be changing. Folio
988-
* locks provide exclusion with other writes, and the pagecache add lock
989-
* provides exclusion with truncate and hole punching.
990-
*
991-
* There is one nasty corner case where atomicity would be broken
992-
* without great care: when copying data from userspace to the page
993-
* cache, we do that with faults disable - a page fault would recurse
994-
* back into the filesystem, taking filesystem locks again, and
995-
* deadlock; so it's done with faults disabled, and we fault in the user
996-
* buffer when we aren't holding locks.
997-
*
998-
* If we do part of the write, but we then race and in the userspace
999-
* buffer have been evicted and are no longer resident, then we have to
1000-
* drop our folio locks to re-fault them in, breaking write atomicity.
1001-
*
1002-
* To fix this, we restart the write from the start, if we weren't
1003-
* holding the inode lock.
1004-
*
1005-
* There is another wrinkle after that; if we restart the write from the
1006-
* start, and then get an unrecoverable error, we _cannot_ claim to
1007-
* userspace that we did not write data we actually did - so we must
1008-
* track (written2) the most we ever wrote.
1009-
*/
1010-
1011-
if ((iocb->ki_flags & IOCB_APPEND) ||
1012-
(iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v))) {
1013-
inode_lock(&inode->v);
1014-
inode_locked = true;
1015-
}
1016-
1017-
ret = generic_write_checks(iocb, iter);
1018-
if (ret <= 0)
1019-
goto unlock;
1020-
1021-
ret = file_remove_privs_flags(file, !inode_locked ? IOCB_NOWAIT : 0);
1022-
if (ret) {
1023-
if (!inode_locked) {
1024-
inode_lock(&inode->v);
1025-
inode_locked = true;
1026-
ret = file_remove_privs_flags(file, 0);
1027-
}
1028-
if (ret)
1029-
goto unlock;
1030-
}
1031-
1032-
ret = file_update_time(file);
1033-
if (ret)
1034-
goto unlock;
1035-
1036-
pos = iocb->ki_pos;
970+
loff_t pos = iocb->ki_pos;
971+
ssize_t written = 0;
972+
int ret = 0;
1037973

1038974
bch2_pagecache_add_get(inode);
1039975

1040-
if (!inode_locked &&
1041-
(iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v)))
1042-
goto get_inode_lock;
1043-
1044976
do {
1045977
unsigned offset = pos & (PAGE_SIZE - 1);
1046978
unsigned bytes = iov_iter_count(iter);
@@ -1065,17 +997,12 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
1065997
}
1066998
}
1067999

1068-
if (unlikely(bytes != iov_iter_count(iter) && !inode_locked))
1069-
goto get_inode_lock;
1070-
10711000
if (unlikely(fatal_signal_pending(current))) {
10721001
ret = -EINTR;
10731002
break;
10741003
}
10751004

1076-
ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes, inode_locked);
1077-
if (ret == -BCH_ERR_need_inode_lock)
1078-
goto get_inode_lock;
1005+
ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes);
10791006
if (unlikely(ret < 0))
10801007
break;
10811008

@@ -1096,46 +1023,50 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
10961023
}
10971024
pos += ret;
10981025
written += ret;
1099-
written2 = max(written, written2);
1100-
1101-
if (ret != bytes && !inode_locked)
1102-
goto get_inode_lock;
11031026
ret = 0;
11041027

11051028
balance_dirty_pages_ratelimited(mapping);
1106-
1107-
if (0) {
1108-
get_inode_lock:
1109-
bch2_pagecache_add_put(inode);
1110-
inode_lock(&inode->v);
1111-
inode_locked = true;
1112-
bch2_pagecache_add_get(inode);
1113-
1114-
iov_iter_revert(iter, written);
1115-
pos -= written;
1116-
written = 0;
1117-
ret = 0;
1118-
}
11191029
} while (iov_iter_count(iter));
1120-
bch2_pagecache_add_put(inode);
1121-
unlock:
1122-
if (inode_locked)
1123-
inode_unlock(&inode->v);
11241030

1125-
iocb->ki_pos += written;
1031+
bch2_pagecache_add_put(inode);
11261032

1127-
ret = max(written, written2) ?: ret;
1128-
if (ret > 0)
1129-
ret = generic_write_sync(iocb, ret);
1130-
return ret;
1033+
return written ? written : ret;
11311034
}
11321035

1133-
ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *iter)
1036+
ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
11341037
{
1135-
ssize_t ret = iocb->ki_flags & IOCB_DIRECT
1136-
? bch2_direct_write(iocb, iter)
1137-
: bch2_buffered_write(iocb, iter);
1038+
struct file *file = iocb->ki_filp;
1039+
struct bch_inode_info *inode = file_bch_inode(file);
1040+
ssize_t ret;
1041+
1042+
if (iocb->ki_flags & IOCB_DIRECT) {
1043+
ret = bch2_direct_write(iocb, from);
1044+
goto out;
1045+
}
1046+
1047+
inode_lock(&inode->v);
1048+
1049+
ret = generic_write_checks(iocb, from);
1050+
if (ret <= 0)
1051+
goto unlock;
1052+
1053+
ret = file_remove_privs(file);
1054+
if (ret)
1055+
goto unlock;
1056+
1057+
ret = file_update_time(file);
1058+
if (ret)
1059+
goto unlock;
1060+
1061+
ret = bch2_buffered_write(iocb, from);
1062+
if (likely(ret > 0))
1063+
iocb->ki_pos += ret;
1064+
unlock:
1065+
inode_unlock(&inode->v);
11381066

1067+
if (ret > 0)
1068+
ret = generic_write_sync(iocb, ret);
1069+
out:
11391070
return bch2_err_class(ret);
11401071
}
11411072

fs/bcachefs/sb-errors_format.h

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ enum bch_fsck_flags {
2323
x(jset_past_bucket_end, 9, 0) \
2424
x(jset_seq_blacklisted, 10, 0) \
2525
x(journal_entries_missing, 11, 0) \
26-
x(journal_entry_replicas_not_marked, 12, 0) \
26+
x(journal_entry_replicas_not_marked, 12, FSCK_AUTOFIX) \
2727
x(journal_entry_past_jset_end, 13, 0) \
2828
x(journal_entry_replicas_data_mismatch, 14, 0) \
2929
x(journal_entry_bkey_u64s_0, 15, 0) \
@@ -288,10 +288,10 @@ enum bch_fsck_flags {
288288
x(invalid_btree_id, 274, 0) \
289289
x(alloc_key_io_time_bad, 275, 0) \
290290
x(alloc_key_fragmentation_lru_wrong, 276, FSCK_AUTOFIX) \
291-
x(accounting_key_junk_at_end, 277, 0) \
292-
x(accounting_key_replicas_nr_devs_0, 278, 0) \
293-
x(accounting_key_replicas_nr_required_bad, 279, 0) \
294-
x(accounting_key_replicas_devs_unsorted, 280, 0) \
291+
x(accounting_key_junk_at_end, 277, FSCK_AUTOFIX) \
292+
x(accounting_key_replicas_nr_devs_0, 278, FSCK_AUTOFIX) \
293+
x(accounting_key_replicas_nr_required_bad, 279, FSCK_AUTOFIX) \
294+
x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \
295295

296296
enum bch_sb_error_id {
297297
#define x(t, n, ...) BCH_FSCK_ERR_##t = n,

0 commit comments

Comments
 (0)