Skip to content

Commit e3e6940

Browse files
author
Kent Overstreet
committed
bcachefs: Revert lockless buffered IO path
We had a report of data corruption on nixos when building installer images. NixOS/nixpkgs#321055 (comment) It seems that writes are being dropped, but only when issued by QEMU, and possibly only in snapshot mode. It's undetermined if it's write calls are being dropped or dirty folios. Further testing, via minimizing the original patch to just the change that skips the inode lock on non appends/truncates, reveals that it really is just not taking the inode lock that causes the corruption: it has nothing to do with the other logic changes for preserving write atomicity in corner cases. It's also kernel config dependent: it doesn't reproduce with the minimal kernel config that ktest uses, but it does reproduce with nixos's distro config. Bisection the kernel config initially pointer the finger at page migration or compaction, but it appears that was erroneous; we haven't yet determined what kernel config option actually triggers it. Sadly it appears this will have to be reverted since we're getting too close to release and my plate is full, but we'd _really_ like to fully debug it. My suspicion is that this patch is exposing a preexisting bug - the inode lock actually covers very little in IO paths, and we have a different lock (the pagecache add lock) that guards against races with truncate here. Fixes: 7e64c86 ("bcachefs: Buffered write path now can avoid the inode lock") Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
1 parent d269356 commit e3e6940

File tree

2 files changed

+40
-110
lines changed

2 files changed

+40
-110
lines changed

fs/bcachefs/errcode.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -257,7 +257,6 @@
257257
x(BCH_ERR_nopromote, nopromote_in_flight) \
258258
x(BCH_ERR_nopromote, nopromote_no_writes) \
259259
x(BCH_ERR_nopromote, nopromote_enomem) \
260-
x(0, need_inode_lock) \
261260
x(0, invalid_snapshot_node) \
262261
x(0, option_needs_open_fs)
263262

fs/bcachefs/fs-io-buffered.c

Lines changed: 40 additions & 109 deletions
Original file line numberDiff line numberDiff line change
@@ -802,8 +802,7 @@ static noinline void folios_trunc(folios *fs, struct folio **fi)
802802
static int __bch2_buffered_write(struct bch_inode_info *inode,
803803
struct address_space *mapping,
804804
struct iov_iter *iter,
805-
loff_t pos, unsigned len,
806-
bool inode_locked)
805+
loff_t pos, unsigned len)
807806
{
808807
struct bch_fs *c = inode->v.i_sb->s_fs_info;
809808
struct bch2_folio_reservation res;
@@ -827,15 +826,6 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
827826

828827
BUG_ON(!fs.nr);
829828

830-
/*
831-
* If we're not using the inode lock, we need to lock all the folios for
832-
* atomiticity of writes vs. other writes:
833-
*/
834-
if (!inode_locked && folio_end_pos(darray_last(fs)) < end) {
835-
ret = -BCH_ERR_need_inode_lock;
836-
goto out;
837-
}
838-
839829
f = darray_first(fs);
840830
if (pos != folio_pos(f) && !folio_test_uptodate(f)) {
841831
ret = bch2_read_single_folio(f, mapping);
@@ -932,10 +922,8 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
932922
end = pos + copied;
933923

934924
spin_lock(&inode->v.i_lock);
935-
if (end > inode->v.i_size) {
936-
BUG_ON(!inode_locked);
925+
if (end > inode->v.i_size)
937926
i_size_write(&inode->v, end);
938-
}
939927
spin_unlock(&inode->v.i_lock);
940928

941929
f_pos = pos;
@@ -979,68 +967,12 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
979967
struct file *file = iocb->ki_filp;
980968
struct address_space *mapping = file->f_mapping;
981969
struct bch_inode_info *inode = file_bch_inode(file);
982-
loff_t pos;
983-
bool inode_locked = false;
984-
ssize_t written = 0, written2 = 0, ret = 0;
985-
986-
/*
987-
* We don't take the inode lock unless i_size will be changing. Folio
988-
* locks provide exclusion with other writes, and the pagecache add lock
989-
* provides exclusion with truncate and hole punching.
990-
*
991-
* There is one nasty corner case where atomicity would be broken
992-
* without great care: when copying data from userspace to the page
993-
* cache, we do that with faults disable - a page fault would recurse
994-
* back into the filesystem, taking filesystem locks again, and
995-
* deadlock; so it's done with faults disabled, and we fault in the user
996-
* buffer when we aren't holding locks.
997-
*
998-
* If we do part of the write, but we then race and in the userspace
999-
* buffer have been evicted and are no longer resident, then we have to
1000-
* drop our folio locks to re-fault them in, breaking write atomicity.
1001-
*
1002-
* To fix this, we restart the write from the start, if we weren't
1003-
* holding the inode lock.
1004-
*
1005-
* There is another wrinkle after that; if we restart the write from the
1006-
* start, and then get an unrecoverable error, we _cannot_ claim to
1007-
* userspace that we did not write data we actually did - so we must
1008-
* track (written2) the most we ever wrote.
1009-
*/
1010-
1011-
if ((iocb->ki_flags & IOCB_APPEND) ||
1012-
(iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v))) {
1013-
inode_lock(&inode->v);
1014-
inode_locked = true;
1015-
}
1016-
1017-
ret = generic_write_checks(iocb, iter);
1018-
if (ret <= 0)
1019-
goto unlock;
1020-
1021-
ret = file_remove_privs_flags(file, !inode_locked ? IOCB_NOWAIT : 0);
1022-
if (ret) {
1023-
if (!inode_locked) {
1024-
inode_lock(&inode->v);
1025-
inode_locked = true;
1026-
ret = file_remove_privs_flags(file, 0);
1027-
}
1028-
if (ret)
1029-
goto unlock;
1030-
}
1031-
1032-
ret = file_update_time(file);
1033-
if (ret)
1034-
goto unlock;
1035-
1036-
pos = iocb->ki_pos;
970+
loff_t pos = iocb->ki_pos;
971+
ssize_t written = 0;
972+
int ret = 0;
1037973

1038974
bch2_pagecache_add_get(inode);
1039975

1040-
if (!inode_locked &&
1041-
(iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v)))
1042-
goto get_inode_lock;
1043-
1044976
do {
1045977
unsigned offset = pos & (PAGE_SIZE - 1);
1046978
unsigned bytes = iov_iter_count(iter);
@@ -1065,17 +997,12 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
1065997
}
1066998
}
1067999

1068-
if (unlikely(bytes != iov_iter_count(iter) && !inode_locked))
1069-
goto get_inode_lock;
1070-
10711000
if (unlikely(fatal_signal_pending(current))) {
10721001
ret = -EINTR;
10731002
break;
10741003
}
10751004

1076-
ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes, inode_locked);
1077-
if (ret == -BCH_ERR_need_inode_lock)
1078-
goto get_inode_lock;
1005+
ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes);
10791006
if (unlikely(ret < 0))
10801007
break;
10811008

@@ -1096,46 +1023,50 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
10961023
}
10971024
pos += ret;
10981025
written += ret;
1099-
written2 = max(written, written2);
1100-
1101-
if (ret != bytes && !inode_locked)
1102-
goto get_inode_lock;
11031026
ret = 0;
11041027

11051028
balance_dirty_pages_ratelimited(mapping);
1106-
1107-
if (0) {
1108-
get_inode_lock:
1109-
bch2_pagecache_add_put(inode);
1110-
inode_lock(&inode->v);
1111-
inode_locked = true;
1112-
bch2_pagecache_add_get(inode);
1113-
1114-
iov_iter_revert(iter, written);
1115-
pos -= written;
1116-
written = 0;
1117-
ret = 0;
1118-
}
11191029
} while (iov_iter_count(iter));
1120-
bch2_pagecache_add_put(inode);
1121-
unlock:
1122-
if (inode_locked)
1123-
inode_unlock(&inode->v);
11241030

1125-
iocb->ki_pos += written;
1031+
bch2_pagecache_add_put(inode);
11261032

1127-
ret = max(written, written2) ?: ret;
1128-
if (ret > 0)
1129-
ret = generic_write_sync(iocb, ret);
1130-
return ret;
1033+
return written ? written : ret;
11311034
}
11321035

1133-
ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *iter)
1036+
ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
11341037
{
1135-
ssize_t ret = iocb->ki_flags & IOCB_DIRECT
1136-
? bch2_direct_write(iocb, iter)
1137-
: bch2_buffered_write(iocb, iter);
1038+
struct file *file = iocb->ki_filp;
1039+
struct bch_inode_info *inode = file_bch_inode(file);
1040+
ssize_t ret;
1041+
1042+
if (iocb->ki_flags & IOCB_DIRECT) {
1043+
ret = bch2_direct_write(iocb, from);
1044+
goto out;
1045+
}
1046+
1047+
inode_lock(&inode->v);
1048+
1049+
ret = generic_write_checks(iocb, from);
1050+
if (ret <= 0)
1051+
goto unlock;
1052+
1053+
ret = file_remove_privs(file);
1054+
if (ret)
1055+
goto unlock;
1056+
1057+
ret = file_update_time(file);
1058+
if (ret)
1059+
goto unlock;
1060+
1061+
ret = bch2_buffered_write(iocb, from);
1062+
if (likely(ret > 0))
1063+
iocb->ki_pos += ret;
1064+
unlock:
1065+
inode_unlock(&inode->v);
11381066

1067+
if (ret > 0)
1068+
ret = generic_write_sync(iocb, ret);
1069+
out:
11391070
return bch2_err_class(ret);
11401071
}
11411072

0 commit comments

Comments
 (0)