@@ -1274,6 +1274,8 @@ static int bch2_fill_extent(struct bch_fs *c,
	struct bkey_s_c k = bkey_i_to_s_c(fe->kbuf.k);
	unsigned flags = fe->flags;

+	BUG_ON(!k.k->size);
+
	if (bkey_extent_is_direct_data(k.k)) {
		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
		const union bch_extent_entry *entry;
@@ -1326,36 +1328,6 @@ static int bch2_fill_extent(struct bch_fs *c,
	}
}

-static int bch2_fiemap_extent(struct btree_trans *trans,
-			      struct btree_iter *iter, struct bkey_s_c k,
-			      struct bch_fiemap_extent *cur)
-{
-	s64 offset_into_extent = iter->pos.offset - bkey_start_offset(k.k);
-	unsigned sectors = k.k->size - offset_into_extent;
-
-	bch2_bkey_buf_reassemble(&cur->kbuf, trans->c, k);
-
-	enum btree_id data_btree = BTREE_ID_extents;
-	int ret = bch2_read_indirect_extent(trans, &data_btree, &offset_into_extent,
-					    &cur->kbuf);
-	if (ret)
-		return ret;
-
-	k = bkey_i_to_s_c(cur->kbuf.k);
-	sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent);
-
-	bch2_cut_front(POS(k.k->p.inode,
-			   bkey_start_offset(k.k) + offset_into_extent),
-		       cur->kbuf.k);
-	bch2_key_resize(&cur->kbuf.k->k, sectors);
-	cur->kbuf.k->k.p = iter->pos;
-	cur->kbuf.k->k.p.offset += cur->kbuf.k->k.size;
-
-	cur->flags = 0;
-
-	return 0;
-}
-
/*
 * Scan a range of an inode for data in pagecache.
 *
@@ -1371,13 +1343,19 @@ bch2_fiemap_hole_pagecache(struct inode *vinode, u64 *start, u64 *end,
	dstart = bch2_seek_pagecache_data(vinode, *start, *end, 0, nonblock);
	if (dstart < 0)
		return dstart;
-	if (dstart >= *end)
-		return -ENOENT;
+
+	if (dstart == *end) {
+		*start = dstart;
+		return 0;
+	}

	dend = bch2_seek_pagecache_hole(vinode, dstart, *end, 0, nonblock);
	if (dend < 0)
		return dend;

+	/* race */
+	BUG_ON(dstart == dend);
+
	*start = dstart;
	*end = dend;
	return 0;
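
These helpers mirror the lseek(2) SEEK_DATA/SEEK_HOLE interface: find the first data offset in a range, then the end of that run of data. A userspace sketch of the same walk, for readers who want to observe cached data ranges directly (my illustration, assuming only standard lseek semantics; nothing here comes from the patch):

#define _GNU_SOURCE	/* SEEK_DATA/SEEK_HOLE */
#include <stdio.h>
#include <unistd.h>

/*
 * Print every data range in [start, end), advancing dstart/dend the
 * same way the kernel-side pagecache walk above does.
 */
static void walk_data_ranges(int fd, off_t start, off_t end)
{
	while (start < end) {
		off_t dstart = lseek(fd, start, SEEK_DATA);
		if (dstart < 0 || dstart >= end)
			break;		/* no more data in the range */

		off_t dend = lseek(fd, dstart, SEEK_HOLE);
		if (dend < 0)
			break;

		printf("data: [%lld, %lld)\n",
		       (long long)dstart, (long long)dend);
		start = dend;
	}
}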
@@ -1387,18 +1365,15 @@ bch2_fiemap_hole_pagecache(struct inode *vinode, u64 *start, u64 *end,
 * Scan a range of pagecache that corresponds to a file mapping hole in the
 * extent btree. If data is found, fake up an extent key so it looks like a
 * delalloc extent to the rest of the fiemap processing code.
- *
- * Returns 0 if cached data was found, -ENOENT if not.
 */
static int
-bch2_fiemap_hole(struct btree_trans *trans, struct inode *vinode, u64 start,
-		 u64 end, struct bch_fiemap_extent *cur)
+bch2_next_fiemap_pagecache_extent(struct btree_trans *trans, struct bch_inode_info *inode,
+				  u64 start, u64 end, struct bch_fiemap_extent *cur)
{
-	struct bch_fs *c = vinode->i_sb->s_fs_info;
-	struct bch_inode_info *ei = to_bch_ei(vinode);
+	struct bch_fs *c = trans->c;
	struct bkey_i_extent *delextent;
	struct bch_extent_ptr ptr = {};
-	loff_t dstart = start, dend = end;
+	loff_t dstart = start << 9, dend = end << 9;
	int ret;

	/*
@@ -1411,13 +1386,10 @@ bch2_fiemap_hole(struct btree_trans *trans, struct inode *vinode, u64 start,
	 * fundamentally racy with writeback anyways. Therefore, just report the
	 * range as delalloc regardless of whether we have to cycle trans locks.
	 */
-	ret = bch2_fiemap_hole_pagecache(vinode, &dstart, &dend, true);
-	if (ret == -EAGAIN) {
-		/* open coded drop_locks_do() to relock even on error */
-		bch2_trans_unlock(trans);
-		ret = bch2_fiemap_hole_pagecache(vinode, &dstart, &dend, false);
-		bch2_trans_relock(trans);
-	}
+	ret = bch2_fiemap_hole_pagecache(&inode->v, &dstart, &dend, true);
+	if (ret == -EAGAIN)
+		ret = drop_locks_do(trans,
+			bch2_fiemap_hole_pagecache(&inode->v, &dstart, &dend, false));
	if (ret < 0)
		return ret;
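
The removed comment implies drop_locks_do() relocks only when the inner expression succeeds, which is why the old code open-coded it. A sketch of the helper's likely shape, inferred from that comment (the authoritative definition lives in fs/bcachefs/btree_iter.h):

#define drop_locks_do(_trans, _do)					\
({									\
	bch2_trans_unlock(_trans);					\
	(_do) ?: bch2_trans_relock(_trans);				\
})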
@@ -1428,124 +1400,151 @@ bch2_fiemap_hole(struct btree_trans *trans, struct inode *vinode, u64 start,
	 */
	bch2_bkey_buf_realloc(&cur->kbuf, c, sizeof(*delextent) / sizeof(u64));
	delextent = bkey_extent_init(cur->kbuf.k);
-	delextent->k.p = POS(ei->v.i_ino, dstart >> 9);
-	bch2_key_resize(&delextent->k, (dend - dstart) >> 9);
+	delextent->k.p = POS(inode->ei_inum.inum, dend >> 9);
+	delextent->k.size = (dend - dstart) >> 9;
	bch2_bkey_append_ptr(&delextent->k_i, ptr);

	cur->flags = FIEMAP_EXTENT_DELALLOC;

	return 0;
}

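The FIEMAP_EXTENT_DELALLOC flag set here is what userspace ultimately sees for dirty, not-yet-written ranges. A minimal userspace sketch exercising this path through the FIEMAP ioctl (my illustration; dump_fiemap and its buffer sizing are assumptions, only the ioctl and flag names come from the kernel ABI):

#include <linux/fiemap.h>
#include <linux/fs.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>

static int dump_fiemap(int fd, __u64 start, __u64 len)
{
	unsigned nr = 32;	/* arbitrary batch size */
	struct fiemap *fm = calloc(1, sizeof(*fm) +
				   nr * sizeof(struct fiemap_extent));
	if (!fm)
		return -1;

	fm->fm_start		= start;
	fm->fm_length		= len;
	fm->fm_extent_count	= nr;

	if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0) {
		free(fm);
		return -1;
	}

	for (unsigned i = 0; i < fm->fm_mapped_extents; i++) {
		struct fiemap_extent *fe = &fm->fm_extents[i];

		printf("%llu+%llu%s%s\n",
		       (unsigned long long)fe->fe_logical,
		       (unsigned long long)fe->fe_length,
		       fe->fe_flags & FIEMAP_EXTENT_DELALLOC ? " delalloc" : "",
		       fe->fe_flags & FIEMAP_EXTENT_LAST ? " last" : "");
	}

	free(fm);
	return 0;
}
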
+static int bch2_next_fiemap_extent(struct btree_trans *trans,
+				   struct bch_inode_info *inode,
+				   u64 start, u64 end,
+				   struct bch_fiemap_extent *cur)
+{
+	u32 snapshot;
+	int ret = bch2_subvolume_get_snapshot(trans, inode->ei_inum.subvol, &snapshot);
+	if (ret)
+		return ret;
+
+	struct btree_iter iter;
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+			     SPOS(inode->ei_inum.inum, start, snapshot), 0);
+
+	struct bkey_s_c k =
+		bch2_btree_iter_peek_max(trans, &iter, POS(inode->ei_inum.inum, end));
+	ret = bkey_err(k);
+	if (ret)
+		goto err;
+
+	ret = bch2_next_fiemap_pagecache_extent(trans, inode, start, end, cur);
+	if (ret)
+		goto err;
+
+	struct bpos pagecache_start = bkey_start_pos(&cur->kbuf.k->k);
+
+	/*
+	 * Does the pagecache or the btree take precedence?
+	 *
+	 * It _should_ be the pagecache, so that we correctly report delalloc
+	 * extents when dirty in the pagecache (we're COW, after all).
+	 *
+	 * But we'd have to add per-sector writeback tracking to
+	 * bch_folio_state, otherwise we report delalloc extents for clean
+	 * cached data in the pagecache.
+	 *
+	 * We should do this, but even then fiemap won't report stable mappings:
+	 * on bcachefs data moves around in the background (copygc, rebalance)
+	 * and we don't provide a way for userspace to lock that out.
+	 */
+	if (k.k &&
+	    bkey_le(bpos_max(iter.pos, bkey_start_pos(k.k)),
+		    pagecache_start)) {
+		bch2_bkey_buf_reassemble(&cur->kbuf, trans->c, k);
+		bch2_cut_front(iter.pos, cur->kbuf.k);
+		bch2_cut_back(POS(inode->ei_inum.inum, end), cur->kbuf.k);
+		cur->flags = 0;
+	} else if (k.k) {
+		bch2_cut_back(bkey_start_pos(k.k), cur->kbuf.k);
+	}
+
+	if (cur->kbuf.k->k.type == KEY_TYPE_reflink_p) {
+		unsigned sectors = cur->kbuf.k->k.size;
+		s64 offset_into_extent = 0;
+		enum btree_id data_btree = BTREE_ID_extents;
+		int ret = bch2_read_indirect_extent(trans, &data_btree, &offset_into_extent,
+						    &cur->kbuf);
+		if (ret)
+			goto err;
+
+		struct bkey_i *k = cur->kbuf.k;
+		sectors = min_t(unsigned, sectors, k->k.size - offset_into_extent);
+
+		bch2_cut_front(POS(k->k.p.inode,
+				   bkey_start_offset(&k->k) + offset_into_extent),
+			       k);
+		bch2_key_resize(&k->k, sectors);
+		k->k.p = iter.pos;
+		k->k.p.offset += k->k.size;
+	}
+err:
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
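The precedence logic above reduces to: report whichever candidate extent starts first, and when the pagecache (delalloc) extent wins, clip it at the btree key's start so the key is picked up on the next iteration. A simplified model of that decision with plain offsets (hypothetical types, illustration only):

#include <stdbool.h>
#include <stdint.h>

struct ext { uint64_t start, end; bool delalloc; };

/* Choose the extent to report next; clip the later-starting one. */
static struct ext pick_next_extent(struct ext btree, struct ext cached)
{
	if (btree.start <= cached.start)
		return btree;	/* mapped data wins; reported with flags 0 */

	/* delalloc wins; stop where the btree key begins */
	if (cached.end > btree.start)
		cached.end = btree.start;
	return cached;
}
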
static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
		       u64 start, u64 len)
{
	struct bch_fs *c = vinode->i_sb->s_fs_info;
	struct bch_inode_info *ei = to_bch_ei(vinode);
	struct btree_trans *trans;
-	struct btree_iter iter;
-	struct bkey_s_c k;
	struct bch_fiemap_extent cur, prev;
-	bool have_extent = false;
	int ret = 0;

	ret = fiemap_prep(&ei->v, info, start, &len, 0);
	if (ret)
		return ret;

-	struct bpos end = POS(ei->v.i_ino, (start + len) >> 9);
	if (start + len < start)
		return -EINVAL;

	start >>= 9;
+	u64 end = (start + len) >> 9;

	bch2_bkey_buf_init(&cur.kbuf);
	bch2_bkey_buf_init(&prev.kbuf);
-	trans = bch2_trans_get(c);
-
-	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
-			     POS(ei->v.i_ino, start), 0);
+	bkey_init(&prev.kbuf.k->k);

-	while (!ret || bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
-		bool have_delalloc = false;
-
-		bch2_trans_begin(trans);
+	trans = bch2_trans_get(c);

-		u32 snapshot;
-		ret = bch2_subvolume_get_snapshot(trans, ei->ei_inum.subvol, &snapshot);
+	while (start < end) {
+		ret = lockrestart_do(trans,
+			bch2_next_fiemap_extent(trans, ei, start, end, &cur));
		if (ret)
-			continue;
-
-		bch2_btree_iter_set_snapshot(trans, &iter, snapshot);
+			goto err;

-		k = bch2_btree_iter_peek_max(trans, &iter, end);
-		ret = bkey_err(k);
-		if (ret)
-			continue;
+		BUG_ON(bkey_start_offset(&cur.kbuf.k->k) < start);
+		BUG_ON(cur.kbuf.k->k.p.offset > end);

-		if (!k.k)
+		if (bkey_start_offset(&cur.kbuf.k->k) == end)
			break;

-		/*
-		 * If a hole exists before the start of the extent key, scan the
-		 * range for pagecache data that might be pending writeback and
-		 * thus not yet exist in the extent tree.
-		 */
-		if (iter.pos.offset > start) {
-			ret = bch2_fiemap_hole(trans, vinode, start << 9,
-					       iter.pos.offset << 9, &cur);
-			if (!ret)
-				have_delalloc = true;
-			else if (ret != -ENOENT)
-				break;
-		}
-
-		/* process the current key if there's no delalloc to report */
-		if (!have_delalloc) {
-			if (!bkey_extent_is_data(k.k) &&
-			    k.k->type != KEY_TYPE_reservation) {
-				start = bkey_start_offset(k.k) + k.k->size;
-				bch2_btree_iter_advance(trans, &iter);
-				continue;
-			}
-
-			ret = bch2_fiemap_extent(trans, &iter, k, &cur);
-			if (ret)
-				break;
-		}
-
-		/*
-		 * Store the current extent in prev so we can flag the last
-		 * extent on the way out.
-		 */
-		bch2_bkey_buf_realloc(&prev.kbuf, c, cur.kbuf.k->k.u64s);
		start = cur.kbuf.k->k.p.offset;

-		if (have_extent) {
+		if (!bkey_deleted(&prev.kbuf.k->k)) {
			bch2_trans_unlock(trans);
			ret = bch2_fill_extent(c, info, &prev);
			if (ret)
-				break;
+				goto err;
		}

-		bkey_copy(prev.kbuf.k, cur.kbuf.k);
+		bch2_bkey_buf_copy(&prev.kbuf, c, cur.kbuf.k);
		prev.flags = cur.flags;
-		have_extent = true;
-
-		bch2_btree_iter_set_pos(trans, &iter, POS(iter.pos.inode, start));
	}
-	bch2_trans_iter_exit(trans, &iter);

-	if (!ret && have_extent) {
+	if (!bkey_deleted(&prev.kbuf.k->k)) {
		bch2_trans_unlock(trans);
		prev.flags |= FIEMAP_EXTENT_LAST;
		ret = bch2_fill_extent(c, info, &prev);
	}
-
+err:
	bch2_trans_put(trans);
	bch2_bkey_buf_exit(&cur.kbuf, c);
	bch2_bkey_buf_exit(&prev.kbuf, c);
-	return ret < 0 ? ret : 0;
+
+	return bch2_err_class(ret < 0 ? ret : 0);
}
static const struct vm_operations_struct bch_vm_ops = {
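
The rewritten bch2_fiemap() loop delegates restart handling to lockrestart_do(), where the old version open-coded the retry in its while condition. Inferring from that removed loop, the wrapper's shape is roughly as follows (a sketch only; the real macro lives in fs/bcachefs/btree_iter.h):

#define lockrestart_do(_trans, _do)					\
({									\
	int _ret;							\
	do {								\
		bch2_trans_begin(_trans);				\
		_ret = (_do);						\
	} while (bch2_err_matches(_ret, BCH_ERR_transaction_restart));	\
	_ret;								\
})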