
Commit b9b0494

Brian Foster authored and Kent Overstreet committed
bcachefs: add fiemap delalloc extent detection
bcachefs currently populates fiemap data from the extents btree. This works correctly when the fiemap sync flag is provided, but if not, it skips all delalloc extents that have not yet been flushed. This is because delalloc extents from buffered writes are first stored as a reservation in the pagecache, and only become resident in the extents btree after writeback completes.

Update the fiemap implementation to process holes between extents by scanning the pagecache for data, via seek data/hole. If a valid data range is found over a hole in the extents btree, fake up an extent key and flag the extent as delalloc for reporting to userspace.

Note that this does not necessarily change behavior for the case where there is dirty pagecache over already written extents, where writeback in COW mode will allocate new blocks for the underlying ranges. The existing behavior is consistent with btrfs, and it is recommended to use the sync flag for the most up to date extent state from fiemap.

Signed-off-by: Brian Foster <bfoster@redhat.com>
1 parent 2d55a63 commit b9b0494
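
For context (not part of the commit): a minimal userspace sketch of how the new reporting becomes visible. It issues FS_IOC_FIEMAP without FIEMAP_FLAG_SYNC, so unflushed buffered writes over holes can show up as extents flagged FIEMAP_EXTENT_DELALLOC. This is an illustrative assumption about usage, not code from the kernel tree.

/* Sketch: dump fiemap extents and mark delalloc ones (assumed usage, not from the patch). */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

int main(int argc, char **argv)
{
	if (argc < 2)
		return 1;

	int fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* room for up to 32 extents; fm_flags left at 0 (no FIEMAP_FLAG_SYNC) */
	unsigned int nr = 32;
	struct fiemap *fm = calloc(1, sizeof(*fm) + nr * sizeof(struct fiemap_extent));
	if (!fm)
		return 1;
	fm->fm_start = 0;
	fm->fm_length = ~0ULL;		/* whole file */
	fm->fm_extent_count = nr;

	if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0) {
		perror("FS_IOC_FIEMAP");
		return 1;
	}

	for (unsigned int i = 0; i < fm->fm_mapped_extents; i++) {
		struct fiemap_extent *fe = &fm->fm_extents[i];

		printf("logical %llu len %llu%s\n",
		       (unsigned long long)fe->fe_logical,
		       (unsigned long long)fe->fe_length,
		       (fe->fe_flags & FIEMAP_EXTENT_DELALLOC) ? " (delalloc)" : "");
	}
	return 0;
}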


fs/bcachefs/fs.c

Lines changed: 112 additions & 7 deletions
@@ -1356,6 +1356,87 @@ static int bch2_fiemap_extent(struct btree_trans *trans,
 	return 0;
 }
 
+/*
+ * Scan a range of an inode for data in pagecache.
+ *
+ * Intended to be retryable, so don't modify the output params until success is
+ * imminent.
+ */
+static int
+bch2_fiemap_hole_pagecache(struct inode *vinode, u64 *start, u64 *end,
+			   bool nonblock)
+{
+	loff_t dstart, dend;
+
+	dstart = bch2_seek_pagecache_data(vinode, *start, *end, 0, nonblock);
+	if (dstart < 0)
+		return dstart;
+	if (dstart >= *end)
+		return -ENOENT;
+
+	dend = bch2_seek_pagecache_hole(vinode, dstart, *end, 0, nonblock);
+	if (dend < 0)
+		return dend;
+
+	*start = dstart;
+	*end = dend;
+	return 0;
+}
+
+/*
+ * Scan a range of pagecache that corresponds to a file mapping hole in the
+ * extent btree. If data is found, fake up an extent key so it looks like a
+ * delalloc extent to the rest of the fiemap processing code.
+ *
+ * Returns 0 if cached data was found, -ENOENT if not.
+ */
+static int
+bch2_fiemap_hole(struct btree_trans *trans, struct inode *vinode, u64 start,
+		 u64 end, struct bch_fiemap_extent *cur)
+{
+	struct bch_fs *c = vinode->i_sb->s_fs_info;
+	struct bch_inode_info *ei = to_bch_ei(vinode);
+	struct bkey_i_extent *delextent;
+	struct bch_extent_ptr ptr = {};
+	loff_t dstart = start, dend = end;
+	int ret;
+
+	/*
+	 * We hold btree locks here so we cannot block on folio locks without
+	 * dropping trans locks first. Run a nonblocking scan for the common
+	 * case of no folios over holes and fall back on failure.
+	 *
+	 * Note that dropping locks like this is technically racy against
+	 * writeback inserting to the extent tree, but a non-sync fiemap scan is
+	 * fundamentally racy with writeback anyways. Therefore, just report the
+	 * range as delalloc regardless of whether we have to cycle trans locks.
+	 */
+	ret = bch2_fiemap_hole_pagecache(vinode, &dstart, &dend, true);
+	if (ret == -EAGAIN) {
+		/* open coded drop_locks_do() to relock even on error */
+		bch2_trans_unlock(trans);
+		ret = bch2_fiemap_hole_pagecache(vinode, &dstart, &dend, false);
+		bch2_trans_relock(trans);
+	}
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * Create a fake extent key in the buffer. We have to add a dummy extent
+	 * pointer for the fill code to add an extent entry. It's explicitly
+	 * zeroed to reflect delayed allocation (i.e. phys offset 0).
+	 */
+	bch2_bkey_buf_realloc(&cur->kbuf, c, sizeof(*delextent) / sizeof(u64));
+	delextent = bkey_extent_init(cur->kbuf.k);
+	delextent->k.p = POS(ei->v.i_ino, dstart >> 9);
+	bch2_key_resize(&delextent->k, (dend - dstart) >> 9);
+	bch2_bkey_append_ptr(&delextent->k_i, ptr);
+
+	cur->flags = FIEMAP_EXTENT_DELALLOC;
+
+	return 0;
+}
+
 static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
 		       u64 start, u64 len)
 {
@@ -1386,6 +1467,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
 			     POS(ei->v.i_ino, start), 0);
 
 	while (!ret || bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
+		bool have_delalloc = false;
 
 		bch2_trans_begin(trans);
 
@@ -1404,15 +1486,38 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
 		if (!k.k)
 			break;
 
-		if (!bkey_extent_is_data(k.k) &&
-		    k.k->type != KEY_TYPE_reservation) {
-			bch2_btree_iter_advance(trans, &iter);
-			continue;
+		/*
+		 * If a hole exists before the start of the extent key, scan the
+		 * range for pagecache data that might be pending writeback and
+		 * thus not yet exist in the extent tree.
+		 */
+		if (iter.pos.offset > start) {
+			ret = bch2_fiemap_hole(trans, vinode, start << 9,
+					       iter.pos.offset << 9, &cur);
+			if (!ret)
+				have_delalloc = true;
+			else if (ret != -ENOENT)
+				break;
 		}
 
-		ret = bch2_fiemap_extent(trans, &iter, k, &cur);
-		if (ret)
-			break;
+		/* process the current key if there's no delalloc to report */
+		if (!have_delalloc) {
+			if (!bkey_extent_is_data(k.k) &&
+			    k.k->type != KEY_TYPE_reservation) {
+				start = bkey_start_offset(k.k) + k.k->size;
+				bch2_btree_iter_advance(trans, &iter);
+				continue;
+			}
+
+			ret = bch2_fiemap_extent(trans, &iter, k, &cur);
+			if (ret)
+				break;
+		}
+
+		/*
+		 * Store the current extent in prev so we can flag the last
+		 * extent on the way out.
+		 */
 		bch2_bkey_buf_realloc(&prev.kbuf, c, cur.kbuf.k->k.u64s);
 		start = cur.kbuf.k->k.p.offset;
 