Commit 53f7eed

Merge patch series "fs/buffer: split pagecache lookups into atomic or blocking"
Davidlohr Bueso <dave@stgolabs.net> says:

This is a respin of the series[0] to address the sleep in atomic scenarios
for noref migration with large folios, introduced in:

  3c20917 ("block/bdev: enable large folio support for large logical block sizes")

The main difference is that it removes the first patch and moves the fix
(reducing the i_private_lock critical region in the migration path) to the
final patch, which also introduces the new BH_Migrate flag. It also
simplifies the locking scheme in patch 1 to avoid folio trylocking in the
atomic lookup cases. So essentially blocking users will take the folio lock
and hence wait for migration, while nonblocking callers will bail the lookup
if a noref migration is ongoing. Blocking callers will also benefit from
potential performance gains by reducing contention on the spinlock for bdev
mappings.

* patches from https://lore.kernel.org/20250418015921.132400-1-dave@stgolabs.net:
  mm/migrate: fix sleep in atomic for large folios and buffer heads
  fs/ext4: use sleeping version of sb_find_get_block()
  fs/jbd2: use sleeping version of __find_get_block()
  fs/ocfs2: use sleeping version of __find_get_block()
  fs/buffer: use sleeping version of __find_get_block()
  fs/buffer: introduce sleeping flavors for pagecache lookups
  fs/buffer: split locking for pagecache lookups

Link: https://lore.kernel.org/20250418015921.132400-1-dave@stgolabs.net
Signed-off-by: Christian Brauner <brauner@kernel.org>
2 parents 559a0d7 + 2d900ef commit 53f7eed
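
To make the resulting contract concrete, here is a minimal caller-side sketch. The two lookup functions are the ones introduced by this series; the wrapper and its name are illustrative only, not part of the patches:

	#include <linux/buffer_head.h>

	/*
	 * Illustrative helper (hypothetical): pick a pagecache lookup
	 * flavor by caller context. Blocking callers take the folio lock
	 * and therefore wait out a noref migration; atomic callers
	 * serialize against try_to_free_buffers() via i_private_lock and
	 * get NULL while a migration is in flight.
	 */
	static struct buffer_head *
	lookup_bh(struct block_device *bdev, sector_t block, unsigned size,
		  bool may_sleep)
	{
		if (may_sleep)
			return __find_get_block_nonatomic(bdev, block, size);
		return __find_get_block(bdev, block, size);
	}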

File tree

7 files changed: +82 -31 lines changed

fs/buffer.c

Lines changed: 54 additions & 19 deletions
@@ -176,18 +176,8 @@ void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 }
 EXPORT_SYMBOL(end_buffer_write_sync);
 
-/*
- * Various filesystems appear to want __find_get_block to be non-blocking.
- * But it's the page lock which protects the buffers. To get around this,
- * we get exclusion from try_to_free_buffers with the blockdev mapping's
- * i_private_lock.
- *
- * Hack idea: for the blockdev mapping, i_private_lock contention
- * may be quite high. This code could TryLock the page, and if that
- * succeeds, there is no need to take i_private_lock.
- */
 static struct buffer_head *
-__find_get_block_slow(struct block_device *bdev, sector_t block)
+__find_get_block_slow(struct block_device *bdev, sector_t block, bool atomic)
 {
 	struct address_space *bd_mapping = bdev->bd_mapping;
 	const int blkbits = bd_mapping->host->i_blkbits;
@@ -204,10 +194,28 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
 	if (IS_ERR(folio))
 		goto out;
 
-	spin_lock(&bd_mapping->i_private_lock);
+	/*
+	 * Folio lock protects the buffers. Callers that cannot block
+	 * will fallback to serializing vs try_to_free_buffers() via
+	 * the i_private_lock.
+	 */
+	if (atomic)
+		spin_lock(&bd_mapping->i_private_lock);
+	else
+		folio_lock(folio);
+
 	head = folio_buffers(folio);
 	if (!head)
 		goto out_unlock;
+	/*
+	 * Upon a noref migration, the folio lock serializes here;
+	 * otherwise bail.
+	 */
+	if (test_bit_acquire(BH_Migrate, &head->b_state)) {
+		WARN_ON(!atomic);
+		goto out_unlock;
+	}
+
 	bh = head;
 	do {
 		if (!buffer_mapped(bh))
@@ -236,7 +244,10 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
 			1 << blkbits);
 	}
 out_unlock:
-	spin_unlock(&bd_mapping->i_private_lock);
+	if (atomic)
+		spin_unlock(&bd_mapping->i_private_lock);
+	else
+		folio_unlock(folio);
 	folio_put(folio);
 out:
 	return ret;
@@ -656,7 +667,9 @@ EXPORT_SYMBOL(generic_buffers_fsync);
 void write_boundary_block(struct block_device *bdev,
 			sector_t bblock, unsigned blocksize)
 {
-	struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
+	struct buffer_head *bh;
+
+	bh = __find_get_block_nonatomic(bdev, bblock + 1, blocksize);
 	if (bh) {
 		if (buffer_dirty(bh))
 			write_dirty_buffer(bh, 0);
@@ -1386,25 +1399,42 @@ lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
 /*
  * Perform a pagecache lookup for the matching buffer. If it's there, refresh
  * it in the LRU and mark it as accessed. If it is not present then return
- * NULL
+ * NULL. Atomic context callers may also return NULL if the buffer is being
+ * migrated; similarly the page is not marked accessed either.
  */
-struct buffer_head *
-__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
+static struct buffer_head *
+find_get_block_common(struct block_device *bdev, sector_t block,
+			unsigned size, bool atomic)
 {
 	struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
 
 	if (bh == NULL) {
 		/* __find_get_block_slow will mark the page accessed */
-		bh = __find_get_block_slow(bdev, block);
+		bh = __find_get_block_slow(bdev, block, atomic);
 		if (bh)
 			bh_lru_install(bh);
 	} else
 		touch_buffer(bh);
 
 	return bh;
 }
+
+struct buffer_head *
+__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
+{
+	return find_get_block_common(bdev, block, size, true);
+}
 EXPORT_SYMBOL(__find_get_block);
 
+/* same as __find_get_block() but allows sleeping contexts */
+struct buffer_head *
+__find_get_block_nonatomic(struct block_device *bdev, sector_t block,
+			unsigned size)
+{
+	return find_get_block_common(bdev, block, size, false);
+}
+EXPORT_SYMBOL(__find_get_block_nonatomic);
+
 /**
  * bdev_getblk - Get a buffer_head in a block device's buffer cache.
  * @bdev: The block device.
@@ -1422,7 +1452,12 @@ EXPORT_SYMBOL(__find_get_block);
 struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block,
 		unsigned size, gfp_t gfp)
 {
-	struct buffer_head *bh = __find_get_block(bdev, block, size);
+	struct buffer_head *bh;
+
+	if (gfpflags_allow_blocking(gfp))
+		bh = __find_get_block_nonatomic(bdev, block, size);
+	else
+		bh = __find_get_block(bdev, block, size);
 
 	might_alloc(gfp);
 	if (bh)
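
One consequence of the bdev_getblk() hunk above: the gfp mask now doubles as the context hint. A hedged usage sketch follows; the surrounding function is hypothetical, while bdev_getblk(), the gfp flags, and brelse() are the real kernel APIs:

	#include <linux/buffer_head.h>

	static void example_lookup(struct block_device *bdev, sector_t blk,
				   unsigned size)
	{
		/*
		 * GFP_NOWAIT cannot block: routed to __find_get_block(),
		 * so NULL may also mean "noref migration in progress".
		 */
		struct buffer_head *bh = bdev_getblk(bdev, blk, size, GFP_NOWAIT);

		if (!bh)
			/*
			 * From sleepable context, GFP_KERNEL allows blocking:
			 * the lookup waits on the folio lock instead of
			 * failing during a migration.
			 */
			bh = bdev_getblk(bdev, blk, size, GFP_KERNEL);
		if (bh)
			brelse(bh);
	}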

fs/ext4/ialloc.c

Lines changed: 2 additions & 1 deletion
@@ -691,7 +691,8 @@ static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino)
 	if (!bh || !buffer_uptodate(bh))
 		/*
 		 * If the block is not in the buffer cache, then it
-		 * must have been written out.
+		 * must have been written out, or, most unlikely, is
+		 * being migrated - false failure should be OK here.
 		 */
 		goto out;
 

fs/ext4/mballoc.c

Lines changed: 2 additions & 1 deletion
@@ -6642,7 +6642,8 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 	for (i = 0; i < count; i++) {
 		cond_resched();
 		if (is_metadata)
-			bh = sb_find_get_block(inode->i_sb, block + i);
+			bh = sb_find_get_block_nonatomic(inode->i_sb,
+							 block + i);
 		ext4_forget(handle, is_metadata, inode, bh, block + i);
 	}
 }

fs/jbd2/revoke.c

Lines changed: 9 additions & 6 deletions
@@ -345,7 +345,8 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr,
 		bh = bh_in;
 
 	if (!bh) {
-		bh = __find_get_block(bdev, blocknr, journal->j_blocksize);
+		bh = __find_get_block_nonatomic(bdev, blocknr,
+						journal->j_blocksize);
 		if (bh)
 			BUFFER_TRACE(bh, "found on hash");
 	}
@@ -355,7 +356,8 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr,
 
 		/* If there is a different buffer_head lying around in
 		 * memory anywhere... */
-		bh2 = __find_get_block(bdev, blocknr, journal->j_blocksize);
+		bh2 = __find_get_block_nonatomic(bdev, blocknr,
+						 journal->j_blocksize);
 		if (bh2) {
 			/* ... and it has RevokeValid status... */
 			if (bh2 != bh && buffer_revokevalid(bh2))
@@ -464,7 +466,8 @@ void jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
 	 * state machine will get very upset later on. */
 	if (need_cancel) {
 		struct buffer_head *bh2;
-		bh2 = __find_get_block(bh->b_bdev, bh->b_blocknr, bh->b_size);
+		bh2 = __find_get_block_nonatomic(bh->b_bdev, bh->b_blocknr,
+						 bh->b_size);
 		if (bh2) {
 			if (bh2 != bh)
 				clear_buffer_revoked(bh2);
@@ -492,9 +495,9 @@ void jbd2_clear_buffer_revoked_flags(journal_t *journal)
 		struct jbd2_revoke_record_s *record;
 		struct buffer_head *bh;
 		record = (struct jbd2_revoke_record_s *)list_entry;
-		bh = __find_get_block(journal->j_fs_dev,
-				      record->blocknr,
-				      journal->j_blocksize);
+		bh = __find_get_block_nonatomic(journal->j_fs_dev,
+						record->blocknr,
+						journal->j_blocksize);
 		if (bh) {
 			clear_buffer_revoked(bh);
 			__brelse(bh);

fs/ocfs2/journal.c

Lines changed: 1 addition & 1 deletion
@@ -1249,7 +1249,7 @@ static int ocfs2_force_read_journal(struct inode *inode)
 	}
 
 	for (i = 0; i < p_blocks; i++, p_blkno++) {
-		bh = __find_get_block(osb->sb->s_bdev, p_blkno,
+		bh = __find_get_block_nonatomic(osb->sb->s_bdev, p_blkno,
 				osb->sb->s_blocksize);
 		/* block not cached. */
 		if (!bh)

include/linux/buffer_head.h

Lines changed: 9 additions & 0 deletions
@@ -34,6 +34,7 @@ enum bh_state_bits {
 	BH_Meta,	/* Buffer contains metadata */
 	BH_Prio,	/* Buffer should be submitted with REQ_PRIO */
 	BH_Defer_Completion, /* Defer AIO completion to workqueue */
+	BH_Migrate,	/* Buffer is being migrated (norefs) */
 
 	BH_PrivateStart,/* not a state bit, but the first bit available
 			 * for private allocation by other entities
@@ -222,6 +223,8 @@ void __wait_on_buffer(struct buffer_head *);
 wait_queue_head_t *bh_waitq_head(struct buffer_head *bh);
 struct buffer_head *__find_get_block(struct block_device *bdev, sector_t block,
 			unsigned size);
+struct buffer_head *__find_get_block_nonatomic(struct block_device *bdev,
+			sector_t block, unsigned size);
 struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block,
 		unsigned size, gfp_t gfp);
 void __brelse(struct buffer_head *);
@@ -397,6 +400,12 @@ sb_find_get_block(struct super_block *sb, sector_t block)
 	return __find_get_block(sb->s_bdev, block, sb->s_blocksize);
 }
 
+static inline struct buffer_head *
+sb_find_get_block_nonatomic(struct super_block *sb, sector_t block)
+{
+	return __find_get_block_nonatomic(sb->s_bdev, block, sb->s_blocksize);
+}
+
 static inline void
 map_bh(struct buffer_head *bh, struct super_block *sb, sector_t block)
 {
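
For completeness, a short sketch of the new super_block helper in a sleepable filesystem path, mirroring the ext4 call sites above; the surrounding function and its action are hypothetical:

	#include <linux/buffer_head.h>

	static void drop_cached_block(struct super_block *sb, sector_t block)
	{
		/*
		 * Process context: safe to sleep on the folio lock, so a
		 * concurrent noref migration is simply waited out.
		 */
		struct buffer_head *bh = sb_find_get_block_nonatomic(sb, block);

		if (bh) {
			clear_buffer_uptodate(bh);	/* example action only */
			brelse(bh);
		}
	}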

mm/migrate.c

Lines changed: 5 additions & 3 deletions
@@ -845,9 +845,11 @@ static int __buffer_migrate_folio(struct address_space *mapping,
 		return -EAGAIN;
 
 	if (check_refs) {
-		bool busy;
+		bool busy, migrating;
 		bool invalidated = false;
 
+		migrating = test_and_set_bit_lock(BH_Migrate, &head->b_state);
+		VM_WARN_ON_ONCE(migrating);
 recheck_buffers:
 		busy = false;
 		spin_lock(&mapping->i_private_lock);
@@ -859,12 +861,12 @@ static int __buffer_migrate_folio(struct address_space *mapping,
 			}
 			bh = bh->b_this_page;
 		} while (bh != head);
+		spin_unlock(&mapping->i_private_lock);
 		if (busy) {
 			if (invalidated) {
 				rc = -EAGAIN;
 				goto unlock_buffers;
 			}
-			spin_unlock(&mapping->i_private_lock);
 			invalidate_bh_lrus();
 			invalidated = true;
 			goto recheck_buffers;
@@ -883,7 +885,7 @@ static int __buffer_migrate_folio(struct address_space *mapping,
 
 unlock_buffers:
 	if (check_refs)
-		spin_unlock(&mapping->i_private_lock);
+		clear_bit_unlock(BH_Migrate, &head->b_state);
 	bh = head;
 	do {
 		unlock_buffer(bh);
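
Taken together with the fs/buffer.c hunks, the migration side completes an acquire/release handshake around BH_Migrate. A condensed sketch of the protocol, abridged from the diffs above (these are fragments of the real functions, not standalone code):

	/* Writer, in __buffer_migrate_folio() (noref migration): */
	migrating = test_and_set_bit_lock(BH_Migrate, &head->b_state);
	VM_WARN_ON_ONCE(migrating);	/* only one migrator per buffer ring */
	/* ... recheck buffer references, migrate the folio ... */
	clear_bit_unlock(BH_Migrate, &head->b_state);	/* release */

	/* Reader, in __find_get_block_slow() (atomic lookup only): */
	if (test_bit_acquire(BH_Migrate, &head->b_state)) {
		WARN_ON(!atomic);
		goto out_unlock;	/* caller sees NULL and may retry */
	}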
