Commit d1b0f9a

Author: Kent Overstreet

bcachefs: Rework fiemap transaction restart handling

Restart handling in the previous patch was incorrect, so: move btree
operations into a separate helper, and run it with a lockrestart_do().

Additionally, clarify whether pagecache or the btree takes precedence.
Right now, the btree takes precedence: this is incorrect, but it's
needed to pass fstests. Add a giant comment explaining why.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>

1 parent b9b0494, commit d1b0f9a
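
For readers unfamiliar with the helper named in the message:
lockrestart_do() is bcachefs's retry wrapper for btree transactions. A
simplified sketch of its shape, based on the definition in
fs/bcachefs/btree_iter.h (the real macro also does restart-count
bookkeeping; this is an illustration, not the verbatim source):

/*
 * Simplified sketch: begin the transaction, run the btree operations,
 * and retry from the top whenever they fail with a transaction-restart
 * error.  Anything that must not be rerun on restart (such as copying
 * extents out to userspace) stays outside the wrapped expression.
 */
#define lockrestart_do(_trans, _do)					\
({									\
	int _ret;							\
									\
	do {								\
		bch2_trans_begin(_trans);				\
		_ret = (_do);						\
	} while (bch2_err_matches(_ret, BCH_ERR_transaction_restart));	\
									\
	_ret;								\
})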


fs/bcachefs/fs.c: 112 additions, 113 deletions

@@ -1274,6 +1274,8 @@ static int bch2_fill_extent(struct bch_fs *c,
 	struct bkey_s_c k = bkey_i_to_s_c(fe->kbuf.k);
 	unsigned flags = fe->flags;
 
+	BUG_ON(!k.k->size);
+
 	if (bkey_extent_is_direct_data(k.k)) {
 		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
 		const union bch_extent_entry *entry;
@@ -1326,36 +1328,6 @@ static int bch2_fill_extent(struct bch_fs *c,
 	}
 }
 
-static int bch2_fiemap_extent(struct btree_trans *trans,
-			      struct btree_iter *iter, struct bkey_s_c k,
-			      struct bch_fiemap_extent *cur)
-{
-	s64 offset_into_extent = iter->pos.offset - bkey_start_offset(k.k);
-	unsigned sectors = k.k->size - offset_into_extent;
-
-	bch2_bkey_buf_reassemble(&cur->kbuf, trans->c, k);
-
-	enum btree_id data_btree = BTREE_ID_extents;
-	int ret = bch2_read_indirect_extent(trans, &data_btree, &offset_into_extent,
-					    &cur->kbuf);
-	if (ret)
-		return ret;
-
-	k = bkey_i_to_s_c(cur->kbuf.k);
-	sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent);
-
-	bch2_cut_front(POS(k.k->p.inode,
-			   bkey_start_offset(k.k) + offset_into_extent),
-		       cur->kbuf.k);
-	bch2_key_resize(&cur->kbuf.k->k, sectors);
-	cur->kbuf.k->k.p = iter->pos;
-	cur->kbuf.k->k.p.offset += cur->kbuf.k->k.size;
-
-	cur->flags = 0;
-
-	return 0;
-}
-
 /*
  * Scan a range of an inode for data in pagecache.
  *
@@ -1371,13 +1343,19 @@ bch2_fiemap_hole_pagecache(struct inode *vinode, u64 *start, u64 *end,
 	dstart = bch2_seek_pagecache_data(vinode, *start, *end, 0, nonblock);
 	if (dstart < 0)
 		return dstart;
-	if (dstart >= *end)
-		return -ENOENT;
+
+	if (dstart == *end) {
+		*start = dstart;
+		return 0;
+	}
 
 	dend = bch2_seek_pagecache_hole(vinode, dstart, *end, 0, nonblock);
 	if (dend < 0)
 		return dend;
 
+	/* race */
+	BUG_ON(dstart == dend);
+
 	*start = dstart;
 	*end = dend;
 	return 0;
@@ -1387,18 +1365,15 @@ bch2_fiemap_hole_pagecache(struct inode *vinode, u64 *start, u64 *end,
  * Scan a range of pagecache that corresponds to a file mapping hole in the
  * extent btree. If data is found, fake up an extent key so it looks like a
  * delalloc extent to the rest of the fiemap processing code.
- *
- * Returns 0 if cached data was found, -ENOENT if not.
  */
 static int
-bch2_fiemap_hole(struct btree_trans *trans, struct inode *vinode, u64 start,
-		 u64 end, struct bch_fiemap_extent *cur)
+bch2_next_fiemap_pagecache_extent(struct btree_trans *trans, struct bch_inode_info *inode,
+				  u64 start, u64 end, struct bch_fiemap_extent *cur)
 {
-	struct bch_fs *c = vinode->i_sb->s_fs_info;
-	struct bch_inode_info *ei = to_bch_ei(vinode);
+	struct bch_fs *c = trans->c;
 	struct bkey_i_extent *delextent;
 	struct bch_extent_ptr ptr = {};
-	loff_t dstart = start, dend = end;
+	loff_t dstart = start << 9, dend = end << 9;
 	int ret;
 
 	/*
@@ -1411,13 +1386,10 @@ bch2_fiemap_hole(struct btree_trans *trans, struct inode *vinode, u64 start,
 	 * fundamentally racy with writeback anyways. Therefore, just report the
 	 * range as delalloc regardless of whether we have to cycle trans locks.
 	 */
-	ret = bch2_fiemap_hole_pagecache(vinode, &dstart, &dend, true);
-	if (ret == -EAGAIN) {
-		/* open coded drop_locks_do() to relock even on error */
-		bch2_trans_unlock(trans);
-		ret = bch2_fiemap_hole_pagecache(vinode, &dstart, &dend, false);
-		bch2_trans_relock(trans);
-	}
+	ret = bch2_fiemap_hole_pagecache(&inode->v, &dstart, &dend, true);
+	if (ret == -EAGAIN)
+		ret = drop_locks_do(trans,
+			bch2_fiemap_hole_pagecache(&inode->v, &dstart, &dend, false));
 	if (ret < 0)
 		return ret;
 
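An aside on the helper adopted in the hunk above: drop_locks_do() drops
btree locks around a blocking operation, then retakes them. Around this
era of the code its definition in fs/bcachefs/btree_iter.h is roughly
(a sketch, not guaranteed verbatim):

/*
 * Sketch: unlock, run the blocking operation, then relock.  Note the
 * ?: - if _do returned an error, the relock is skipped.  That is the
 * behavioral difference the removed "open coded drop_locks_do() to
 * relock even on error" comment was calling out; after this patch the
 * error simply propagates out instead.
 */
#define drop_locks_do(_trans, _do)					\
({									\
	bch2_trans_unlock(_trans);					\
	_do ?: bch2_trans_relock(_trans);				\
})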
@@ -1428,124 +1400,151 @@ bch2_fiemap_hole(struct btree_trans *trans, struct inode *vinode, u64 start,
 	 */
 	bch2_bkey_buf_realloc(&cur->kbuf, c, sizeof(*delextent) / sizeof(u64));
 	delextent = bkey_extent_init(cur->kbuf.k);
-	delextent->k.p = POS(ei->v.i_ino, dstart >> 9);
-	bch2_key_resize(&delextent->k, (dend - dstart) >> 9);
+	delextent->k.p = POS(inode->ei_inum.inum, dend >> 9);
+	delextent->k.size = (dend - dstart) >> 9;
 	bch2_bkey_append_ptr(&delextent->k_i, ptr);
 
 	cur->flags = FIEMAP_EXTENT_DELALLOC;
 
 	return 0;
 }
 
+static int bch2_next_fiemap_extent(struct btree_trans *trans,
+				   struct bch_inode_info *inode,
+				   u64 start, u64 end,
+				   struct bch_fiemap_extent *cur)
+{
+	u32 snapshot;
+	int ret = bch2_subvolume_get_snapshot(trans, inode->ei_inum.subvol, &snapshot);
+	if (ret)
+		return ret;
+
+	struct btree_iter iter;
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+			     SPOS(inode->ei_inum.inum, start, snapshot), 0);
+
+	struct bkey_s_c k =
+		bch2_btree_iter_peek_max(trans, &iter, POS(inode->ei_inum.inum, end));
+	ret = bkey_err(k);
+	if (ret)
+		goto err;
+
+	ret = bch2_next_fiemap_pagecache_extent(trans, inode, start, end, cur);
+	if (ret)
+		goto err;
+
+	struct bpos pagecache_start = bkey_start_pos(&cur->kbuf.k->k);
+
+	/*
+	 * Does the pagecache or the btree take precedence?
+	 *
+	 * It _should_ be the pagecache, so that we correctly report delalloc
+	 * extents when dirty in the pagecache (we're COW, after all).
+	 *
+	 * But we'd have to add per-sector writeback tracking to
+	 * bch_folio_state, otherwise we report delalloc extents for clean
+	 * cached data in the pagecache.
+	 *
+	 * We should do this, but even then fiemap won't report stable mappings:
+	 * on bcachefs data moves around in the background (copygc, rebalance)
+	 * and we don't provide a way for userspace to lock that out.
+	 */
+	if (k.k &&
+	    bkey_le(bpos_max(iter.pos, bkey_start_pos(k.k)),
+		    pagecache_start)) {
+		bch2_bkey_buf_reassemble(&cur->kbuf, trans->c, k);
+		bch2_cut_front(iter.pos, cur->kbuf.k);
+		bch2_cut_back(POS(inode->ei_inum.inum, end), cur->kbuf.k);
+		cur->flags = 0;
+	} else if (k.k) {
+		bch2_cut_back(bkey_start_pos(k.k), cur->kbuf.k);
+	}
+
+	if (cur->kbuf.k->k.type == KEY_TYPE_reflink_p) {
+		unsigned sectors = cur->kbuf.k->k.size;
+		s64 offset_into_extent = 0;
+		enum btree_id data_btree = BTREE_ID_extents;
+		int ret = bch2_read_indirect_extent(trans, &data_btree, &offset_into_extent,
+						    &cur->kbuf);
+		if (ret)
+			goto err;
+
+		struct bkey_i *k = cur->kbuf.k;
+		sectors = min_t(unsigned, sectors, k->k.size - offset_into_extent);
+
+		bch2_cut_front(POS(k->k.p.inode,
+				   bkey_start_offset(&k->k) + offset_into_extent),
+			       k);
+		bch2_key_resize(&k->k, sectors);
+		k->k.p = iter.pos;
+		k->k.p.offset += k->k.size;
+	}
+err:
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
 static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
 		       u64 start, u64 len)
 {
 	struct bch_fs *c = vinode->i_sb->s_fs_info;
 	struct bch_inode_info *ei = to_bch_ei(vinode);
 	struct btree_trans *trans;
-	struct btree_iter iter;
-	struct bkey_s_c k;
 	struct bch_fiemap_extent cur, prev;
-	bool have_extent = false;
 	int ret = 0;
 
 	ret = fiemap_prep(&ei->v, info, start, &len, 0);
 	if (ret)
 		return ret;
 
-	struct bpos end = POS(ei->v.i_ino, (start + len) >> 9);
 	if (start + len < start)
 		return -EINVAL;
 
 	start >>= 9;
+	u64 end = (start + len) >> 9;
 
 	bch2_bkey_buf_init(&cur.kbuf);
 	bch2_bkey_buf_init(&prev.kbuf);
-	trans = bch2_trans_get(c);
-
-	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
-			     POS(ei->v.i_ino, start), 0);
+	bkey_init(&prev.kbuf.k->k);
 
-	while (!ret || bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
-		bool have_delalloc = false;
-
-		bch2_trans_begin(trans);
+	trans = bch2_trans_get(c);
 
-		u32 snapshot;
-		ret = bch2_subvolume_get_snapshot(trans, ei->ei_inum.subvol, &snapshot);
+	while (start < end) {
+		ret = lockrestart_do(trans,
+			bch2_next_fiemap_extent(trans, ei, start, end, &cur));
 		if (ret)
-			continue;
-
-		bch2_btree_iter_set_snapshot(trans, &iter, snapshot);
+			goto err;
 
-		k = bch2_btree_iter_peek_max(trans, &iter, end);
-		ret = bkey_err(k);
-		if (ret)
-			continue;
+		BUG_ON(bkey_start_offset(&cur.kbuf.k->k) < start);
+		BUG_ON(cur.kbuf.k->k.p.offset > end);
 
-		if (!k.k)
+		if (bkey_start_offset(&cur.kbuf.k->k) == end)
 			break;
 
-		/*
-		 * If a hole exists before the start of the extent key, scan the
-		 * range for pagecache data that might be pending writeback and
-		 * thus not yet exist in the extent tree.
-		 */
-		if (iter.pos.offset > start) {
-			ret = bch2_fiemap_hole(trans, vinode, start << 9,
-					       iter.pos.offset << 9, &cur);
-			if (!ret)
-				have_delalloc = true;
-			else if (ret != -ENOENT)
-				break;
-		}
-
-		/* process the current key if there's no delalloc to report */
-		if (!have_delalloc) {
-			if (!bkey_extent_is_data(k.k) &&
-			    k.k->type != KEY_TYPE_reservation) {
-				start = bkey_start_offset(k.k) + k.k->size;
-				bch2_btree_iter_advance(trans, &iter);
-				continue;
-			}
-
-			ret = bch2_fiemap_extent(trans, &iter, k, &cur);
-			if (ret)
-				break;
-		}
-
-		/*
-		 * Store the current extent in prev so we can flag the last
-		 * extent on the way out.
-		 */
-		bch2_bkey_buf_realloc(&prev.kbuf, c, cur.kbuf.k->k.u64s);
 		start = cur.kbuf.k->k.p.offset;
 
-		if (have_extent) {
+		if (!bkey_deleted(&prev.kbuf.k->k)) {
 			bch2_trans_unlock(trans);
 			ret = bch2_fill_extent(c, info, &prev);
 			if (ret)
-				break;
+				goto err;
 		}
 
-		bkey_copy(prev.kbuf.k, cur.kbuf.k);
+		bch2_bkey_buf_copy(&prev.kbuf, c, cur.kbuf.k);
 		prev.flags = cur.flags;
-		have_extent = true;
-
-		bch2_btree_iter_set_pos(trans, &iter, POS(iter.pos.inode, start));
 	}
-	bch2_trans_iter_exit(trans, &iter);
 
-	if (!ret && have_extent) {
+	if (!bkey_deleted(&prev.kbuf.k->k)) {
 		bch2_trans_unlock(trans);
 		prev.flags |= FIEMAP_EXTENT_LAST;
 		ret = bch2_fill_extent(c, info, &prev);
 	}
-
+err:
 	bch2_trans_put(trans);
 	bch2_bkey_buf_exit(&cur.kbuf, c);
 	bch2_bkey_buf_exit(&prev.kbuf, c);
-	return ret < 0 ? ret : 0;
+
+	return bch2_err_class(ret < 0 ? ret : 0);
 }
 
 static const struct vm_operations_struct bch_vm_ops = {
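
To see the behavior this patch affects from the other side of the
syscall boundary, here is a minimal userspace sketch (not part of the
patch; the program is hypothetical, but the FS_IOC_FIEMAP ioctl and the
flag names are standard uapi): a range that is dirty only in the
pagecache should come back flagged FIEMAP_EXTENT_DELALLOC, while extents
already in the btree come back unflagged.

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fiemap.h>
#include <linux/fs.h>

int main(int argc, char **argv)
{
	if (argc != 2)
		return 1;

	int fd = open(argv[1], O_RDONLY);
	if (fd < 0)
		return 1;

	/* room for the fiemap header plus 32 extent records */
	struct fiemap *fm = calloc(1, sizeof(*fm) +
				   32 * sizeof(struct fiemap_extent));
	if (!fm)
		return 1;

	fm->fm_start	    = 0;
	fm->fm_length	    = FIEMAP_MAX_OFFSET;
	fm->fm_extent_count = 32;

	if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0)
		return 1;

	for (unsigned i = 0; i < fm->fm_mapped_extents; i++) {
		struct fiemap_extent *e = &fm->fm_extents[i];

		/* print each logical range plus the flags this patch influences */
		printf("%llu..%llu%s%s\n",
		       (unsigned long long) e->fe_logical,
		       (unsigned long long) (e->fe_logical + e->fe_length),
		       (e->fe_flags & FIEMAP_EXTENT_DELALLOC) ? " delalloc" : "",
		       (e->fe_flags & FIEMAP_EXTENT_LAST)     ? " last" : "");
	}

	free(fm);
	close(fd);
	return 0;
}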
