Skip to content

Commit eb98f30

Browse files
committed
Merge tag 'vfs-6.15-rc4.fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull vfs fixes from Christian Brauner: - For some reason we went from zero to three maintainers for HFS/HFS+ in a matter of days. The lesson to learn from this might just be that we need to threaten code removal more often!? - Fix a regression introduced by enabling large folios for large logical block sizes. This has caused issues for noref migration with large folios due to sleeping while in an atomic context. New sleeping variants of pagecache lookup helpers are introduced. These helpers take the folio lock instead of the mapping's private spinlock. The problematic users are converted to the sleeping variants and serialize against noref migration. Atomic users will bail on seeing the new BH_Migrate flag. This also shrinks the critical region of the mapping's private lock and the new blocking callers reduce contention on the spinlock for bdev mappings. - Fix two bugs in do_move_mount() when used with MOVE_MOUNT_BENEATH. The first bug is using a mountpoint that is located on a mount we're not holding a reference to. The second bug is putting the mountpoint after we've called namespace_unlock() as it's no longer guaranteed that it does stay a mountpoint. - Remove a pointless call to vfs_getattr_nosec() in the devtmpfs code just to query i_mode instead of simply querying the inode directly. This also avoids lifetime issues for the dm code by an earlier bugfix this cycle that moved bdev_statx() handling into vfs_getattr_nosec(). - Fix AT_FDCWD handling with getname_maybe_null() in the xattr code. - Fix a performance regression for files when multiple callers issue a close when it's not the last reference. - Remove a duplicate noinline annotation from pipe_clear_nowait(). 
* tag 'vfs-6.15-rc4.fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: fs/xattr: Fix handling of AT_FDCWD in setxattrat(2) and getxattrat(2) MAINTAINERS: hfs/hfsplus: add myself as maintainer splice: remove duplicate noinline from pipe_clear_nowait devtmpfs: don't use vfs_getattr_nosec to query i_mode fix a couple of races in MNT_TREE_BENEATH handling by do_move_mount() fs: fall back to file_ref_put() for non-last reference mm/migrate: fix sleep in atomic for large folios and buffer heads fs/ext4: use sleeping version of sb_find_get_block() fs/jbd2: use sleeping version of __find_get_block() fs/ocfs2: use sleeping version of __find_get_block() fs/buffer: use sleeping version of __find_get_block() fs/buffer: introduce sleeping flavors for pagecache lookups MAINTAINERS: add HFS/HFS+ maintainers fs/buffer: split locking for pagecache lookups
2 parents 349b7d7 + f520bed commit eb98f30

File tree

14 files changed

+145
-96
lines changed

14 files changed

+145
-96
lines changed

MAINTAINERS

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10464,14 +10464,20 @@ S: Supported
1046410464
F: drivers/infiniband/hw/hfi1
1046510465

1046610466
HFS FILESYSTEM
10467+
M: Viacheslav Dubeyko <slava@dubeyko.com>
10468+
M: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
10469+
M: Yangtao Li <frank.li@vivo.com>
1046710470
L: linux-fsdevel@vger.kernel.org
10468-
S: Orphan
10471+
S: Maintained
1046910472
F: Documentation/filesystems/hfs.rst
1047010473
F: fs/hfs/
1047110474

1047210475
HFSPLUS FILESYSTEM
10476+
M: Viacheslav Dubeyko <slava@dubeyko.com>
10477+
M: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
10478+
M: Yangtao Li <frank.li@vivo.com>
1047310479
L: linux-fsdevel@vger.kernel.org
10474-
S: Orphan
10480+
S: Maintained
1047510481
F: Documentation/filesystems/hfsplus.rst
1047610482
F: fs/hfsplus/
1047710483

drivers/base/devtmpfs.c

Lines changed: 9 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -296,21 +296,21 @@ static int delete_path(const char *nodepath)
296296
return err;
297297
}
298298

299-
static int dev_mynode(struct device *dev, struct inode *inode, struct kstat *stat)
299+
static int dev_mynode(struct device *dev, struct inode *inode)
300300
{
301301
/* did we create it */
302302
if (inode->i_private != &thread)
303303
return 0;
304304

305305
/* does the dev_t match */
306306
if (is_blockdev(dev)) {
307-
if (!S_ISBLK(stat->mode))
307+
if (!S_ISBLK(inode->i_mode))
308308
return 0;
309309
} else {
310-
if (!S_ISCHR(stat->mode))
310+
if (!S_ISCHR(inode->i_mode))
311311
return 0;
312312
}
313-
if (stat->rdev != dev->devt)
313+
if (inode->i_rdev != dev->devt)
314314
return 0;
315315

316316
/* ours */
@@ -321,28 +321,24 @@ static int handle_remove(const char *nodename, struct device *dev)
321321
{
322322
struct path parent;
323323
struct dentry *dentry;
324-
struct kstat stat;
325-
struct path p;
324+
struct inode *inode;
326325
int deleted = 0;
327-
int err;
326+
int err = 0;
328327

329328
dentry = kern_path_locked(nodename, &parent);
330329
if (IS_ERR(dentry))
331330
return PTR_ERR(dentry);
332331

333-
p.mnt = parent.mnt;
334-
p.dentry = dentry;
335-
err = vfs_getattr(&p, &stat, STATX_TYPE | STATX_MODE,
336-
AT_STATX_SYNC_AS_STAT);
337-
if (!err && dev_mynode(dev, d_inode(dentry), &stat)) {
332+
inode = d_inode(dentry);
333+
if (dev_mynode(dev, inode)) {
338334
struct iattr newattrs;
339335
/*
340336
* before unlinking this node, reset permissions
341337
* of possible references like hardlinks
342338
*/
343339
newattrs.ia_uid = GLOBAL_ROOT_UID;
344340
newattrs.ia_gid = GLOBAL_ROOT_GID;
345-
newattrs.ia_mode = stat.mode & ~0777;
341+
newattrs.ia_mode = inode->i_mode & ~0777;
346342
newattrs.ia_valid =
347343
ATTR_UID|ATTR_GID|ATTR_MODE;
348344
inode_lock(d_inode(dentry));

fs/buffer.c

Lines changed: 54 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -176,18 +176,8 @@ void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
176176
}
177177
EXPORT_SYMBOL(end_buffer_write_sync);
178178

179-
/*
180-
* Various filesystems appear to want __find_get_block to be non-blocking.
181-
* But it's the page lock which protects the buffers. To get around this,
182-
* we get exclusion from try_to_free_buffers with the blockdev mapping's
183-
* i_private_lock.
184-
*
185-
* Hack idea: for the blockdev mapping, i_private_lock contention
186-
* may be quite high. This code could TryLock the page, and if that
187-
* succeeds, there is no need to take i_private_lock.
188-
*/
189179
static struct buffer_head *
190-
__find_get_block_slow(struct block_device *bdev, sector_t block)
180+
__find_get_block_slow(struct block_device *bdev, sector_t block, bool atomic)
191181
{
192182
struct address_space *bd_mapping = bdev->bd_mapping;
193183
const int blkbits = bd_mapping->host->i_blkbits;
@@ -204,10 +194,28 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
204194
if (IS_ERR(folio))
205195
goto out;
206196

207-
spin_lock(&bd_mapping->i_private_lock);
197+
/*
198+
* Folio lock protects the buffers. Callers that cannot block
199+
* will fallback to serializing vs try_to_free_buffers() via
200+
* the i_private_lock.
201+
*/
202+
if (atomic)
203+
spin_lock(&bd_mapping->i_private_lock);
204+
else
205+
folio_lock(folio);
206+
208207
head = folio_buffers(folio);
209208
if (!head)
210209
goto out_unlock;
210+
/*
211+
* Upon a noref migration, the folio lock serializes here;
212+
* otherwise bail.
213+
*/
214+
if (test_bit_acquire(BH_Migrate, &head->b_state)) {
215+
WARN_ON(!atomic);
216+
goto out_unlock;
217+
}
218+
211219
bh = head;
212220
do {
213221
if (!buffer_mapped(bh))
@@ -236,7 +244,10 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
236244
1 << blkbits);
237245
}
238246
out_unlock:
239-
spin_unlock(&bd_mapping->i_private_lock);
247+
if (atomic)
248+
spin_unlock(&bd_mapping->i_private_lock);
249+
else
250+
folio_unlock(folio);
240251
folio_put(folio);
241252
out:
242253
return ret;
@@ -656,7 +667,9 @@ EXPORT_SYMBOL(generic_buffers_fsync);
656667
void write_boundary_block(struct block_device *bdev,
657668
sector_t bblock, unsigned blocksize)
658669
{
659-
struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
670+
struct buffer_head *bh;
671+
672+
bh = __find_get_block_nonatomic(bdev, bblock + 1, blocksize);
660673
if (bh) {
661674
if (buffer_dirty(bh))
662675
write_dirty_buffer(bh, 0);
@@ -1386,25 +1399,42 @@ lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
13861399
/*
13871400
* Perform a pagecache lookup for the matching buffer. If it's there, refresh
13881401
* it in the LRU and mark it as accessed. If it is not present then return
1389-
* NULL
1402+
* NULL. Atomic context callers may also return NULL if the buffer is being
1403+
* migrated; similarly the page is not marked accessed either.
13901404
*/
1391-
struct buffer_head *
1392-
__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
1405+
static struct buffer_head *
1406+
find_get_block_common(struct block_device *bdev, sector_t block,
1407+
unsigned size, bool atomic)
13931408
{
13941409
struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
13951410

13961411
if (bh == NULL) {
13971412
/* __find_get_block_slow will mark the page accessed */
1398-
bh = __find_get_block_slow(bdev, block);
1413+
bh = __find_get_block_slow(bdev, block, atomic);
13991414
if (bh)
14001415
bh_lru_install(bh);
14011416
} else
14021417
touch_buffer(bh);
14031418

14041419
return bh;
14051420
}
1421+
1422+
struct buffer_head *
1423+
__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
1424+
{
1425+
return find_get_block_common(bdev, block, size, true);
1426+
}
14061427
EXPORT_SYMBOL(__find_get_block);
14071428

1429+
/* same as __find_get_block() but allows sleeping contexts */
1430+
struct buffer_head *
1431+
__find_get_block_nonatomic(struct block_device *bdev, sector_t block,
1432+
unsigned size)
1433+
{
1434+
return find_get_block_common(bdev, block, size, false);
1435+
}
1436+
EXPORT_SYMBOL(__find_get_block_nonatomic);
1437+
14081438
/**
14091439
* bdev_getblk - Get a buffer_head in a block device's buffer cache.
14101440
* @bdev: The block device.
@@ -1422,7 +1452,12 @@ EXPORT_SYMBOL(__find_get_block);
14221452
struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block,
14231453
unsigned size, gfp_t gfp)
14241454
{
1425-
struct buffer_head *bh = __find_get_block(bdev, block, size);
1455+
struct buffer_head *bh;
1456+
1457+
if (gfpflags_allow_blocking(gfp))
1458+
bh = __find_get_block_nonatomic(bdev, block, size);
1459+
else
1460+
bh = __find_get_block(bdev, block, size);
14261461

14271462
might_alloc(gfp);
14281463
if (bh)

fs/ext4/ialloc.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -691,7 +691,8 @@ static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino)
691691
if (!bh || !buffer_uptodate(bh))
692692
/*
693693
* If the block is not in the buffer cache, then it
694-
* must have been written out.
694+
* must have been written out, or, most unlikely, is
695+
* being migrated - false failure should be OK here.
695696
*/
696697
goto out;
697698

fs/ext4/mballoc.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6642,7 +6642,8 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
66426642
for (i = 0; i < count; i++) {
66436643
cond_resched();
66446644
if (is_metadata)
6645-
bh = sb_find_get_block(inode->i_sb, block + i);
6645+
bh = sb_find_get_block_nonatomic(inode->i_sb,
6646+
block + i);
66466647
ext4_forget(handle, is_metadata, inode, bh, block + i);
66476648
}
66486649
}

fs/file.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626

2727
#include "internal.h"
2828

29-
bool __file_ref_put_badval(file_ref_t *ref, unsigned long cnt)
29+
static noinline bool __file_ref_put_badval(file_ref_t *ref, unsigned long cnt)
3030
{
3131
/*
3232
* If the reference count was already in the dead zone, then this

fs/jbd2/revoke.c

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -345,7 +345,8 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr,
345345
bh = bh_in;
346346

347347
if (!bh) {
348-
bh = __find_get_block(bdev, blocknr, journal->j_blocksize);
348+
bh = __find_get_block_nonatomic(bdev, blocknr,
349+
journal->j_blocksize);
349350
if (bh)
350351
BUFFER_TRACE(bh, "found on hash");
351352
}
@@ -355,7 +356,8 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr,
355356

356357
/* If there is a different buffer_head lying around in
357358
* memory anywhere... */
358-
bh2 = __find_get_block(bdev, blocknr, journal->j_blocksize);
359+
bh2 = __find_get_block_nonatomic(bdev, blocknr,
360+
journal->j_blocksize);
359361
if (bh2) {
360362
/* ... and it has RevokeValid status... */
361363
if (bh2 != bh && buffer_revokevalid(bh2))
@@ -464,7 +466,8 @@ void jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
464466
* state machine will get very upset later on. */
465467
if (need_cancel) {
466468
struct buffer_head *bh2;
467-
bh2 = __find_get_block(bh->b_bdev, bh->b_blocknr, bh->b_size);
469+
bh2 = __find_get_block_nonatomic(bh->b_bdev, bh->b_blocknr,
470+
bh->b_size);
468471
if (bh2) {
469472
if (bh2 != bh)
470473
clear_buffer_revoked(bh2);
@@ -492,9 +495,9 @@ void jbd2_clear_buffer_revoked_flags(journal_t *journal)
492495
struct jbd2_revoke_record_s *record;
493496
struct buffer_head *bh;
494497
record = (struct jbd2_revoke_record_s *)list_entry;
495-
bh = __find_get_block(journal->j_fs_dev,
496-
record->blocknr,
497-
journal->j_blocksize);
498+
bh = __find_get_block_nonatomic(journal->j_fs_dev,
499+
record->blocknr,
500+
journal->j_blocksize);
498501
if (bh) {
499502
clear_buffer_revoked(bh);
500503
__brelse(bh);

fs/namespace.c

Lines changed: 36 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -2826,56 +2826,62 @@ static struct mountpoint *do_lock_mount(struct path *path, bool beneath)
28262826
struct vfsmount *mnt = path->mnt;
28272827
struct dentry *dentry;
28282828
struct mountpoint *mp = ERR_PTR(-ENOENT);
2829+
struct path under = {};
28292830

28302831
for (;;) {
2831-
struct mount *m;
2832+
struct mount *m = real_mount(mnt);
28322833

28332834
if (beneath) {
2834-
m = real_mount(mnt);
2835+
path_put(&under);
28352836
read_seqlock_excl(&mount_lock);
2836-
dentry = dget(m->mnt_mountpoint);
2837+
under.mnt = mntget(&m->mnt_parent->mnt);
2838+
under.dentry = dget(m->mnt_mountpoint);
28372839
read_sequnlock_excl(&mount_lock);
2840+
dentry = under.dentry;
28382841
} else {
28392842
dentry = path->dentry;
28402843
}
28412844

28422845
inode_lock(dentry->d_inode);
2843-
if (unlikely(cant_mount(dentry))) {
2844-
inode_unlock(dentry->d_inode);
2845-
goto out;
2846-
}
2847-
28482846
namespace_lock();
28492847

2850-
if (beneath && (!is_mounted(mnt) || m->mnt_mountpoint != dentry)) {
2848+
if (unlikely(cant_mount(dentry) || !is_mounted(mnt)))
2849+
break; // not to be mounted on
2850+
2851+
if (beneath && unlikely(m->mnt_mountpoint != dentry ||
2852+
&m->mnt_parent->mnt != under.mnt)) {
28512853
namespace_unlock();
28522854
inode_unlock(dentry->d_inode);
2853-
goto out;
2855+
continue; // got moved
28542856
}
28552857

28562858
mnt = lookup_mnt(path);
2857-
if (likely(!mnt))
2859+
if (unlikely(mnt)) {
2860+
namespace_unlock();
2861+
inode_unlock(dentry->d_inode);
2862+
path_put(path);
2863+
path->mnt = mnt;
2864+
path->dentry = dget(mnt->mnt_root);
2865+
continue; // got overmounted
2866+
}
2867+
mp = get_mountpoint(dentry);
2868+
if (IS_ERR(mp))
28582869
break;
2859-
2860-
namespace_unlock();
2861-
inode_unlock(dentry->d_inode);
2862-
if (beneath)
2863-
dput(dentry);
2864-
path_put(path);
2865-
path->mnt = mnt;
2866-
path->dentry = dget(mnt->mnt_root);
2867-
}
2868-
2869-
mp = get_mountpoint(dentry);
2870-
if (IS_ERR(mp)) {
2871-
namespace_unlock();
2872-
inode_unlock(dentry->d_inode);
2870+
if (beneath) {
2871+
/*
2872+
* @under duplicates the references that will stay
2873+
* at least until namespace_unlock(), so the path_put()
2874+
* below is safe (and OK to do under namespace_lock -
2875+
* we are not dropping the final references here).
2876+
*/
2877+
path_put(&under);
2878+
}
2879+
return mp;
28732880
}
2874-
2875-
out:
2881+
namespace_unlock();
2882+
inode_unlock(dentry->d_inode);
28762883
if (beneath)
2877-
dput(dentry);
2878-
2884+
path_put(&under);
28792885
return mp;
28802886
}
28812887

@@ -2886,14 +2892,11 @@ static inline struct mountpoint *lock_mount(struct path *path)
28862892

28872893
static void unlock_mount(struct mountpoint *where)
28882894
{
2889-
struct dentry *dentry = where->m_dentry;
2890-
2895+
inode_unlock(where->m_dentry->d_inode);
28912896
read_seqlock_excl(&mount_lock);
28922897
put_mountpoint(where);
28932898
read_sequnlock_excl(&mount_lock);
2894-
28952899
namespace_unlock();
2896-
inode_unlock(dentry->d_inode);
28972900
}
28982901

28992902
static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp)

0 commit comments

Comments
 (0)