Skip to content

Commit 910202f

Browse files
committed
Merge tag 'vfs-6.9.super' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull block handle updates from Christian Brauner: "Last cycle we changed opening of block devices, and opening a block device would return a bdev_handle. This allowed us to implement support for restricting and forbidding writes to mounted block devices. It was accompanied by converting and adding helpers to operate on bdev_handles instead of plain block devices. That was already a good step forward, but ultimately it isn't necessary to have special-purpose helpers for opening block devices internally that return a bdev_handle. Fundamentally, opening a block device internally should just be equivalent to opening files. So now all internal opens of block devices return files just as a userspace open would. Instead of introducing a separate indirection into bdev_open_by_*() via struct bdev_handle, bdev_file_open_by_*() is made to just return a struct file. Opening and closing a block device just becomes equivalent to opening and closing a file. This all works well because internally we already have a pseudo fs for block devices and so opening block devices is simple. There are a few places where we needed to be careful, such as during boot when the kernel is supposed to mount the rootfs directly without init doing it. Here we need to take care to ensure that we flush out any asynchronous file close. That's what we already do for opening, unpacking, and closing the initramfs. So nothing new here. The equivalence of opening and closing block devices to regular files is a win in and of itself. But it also has various other advantages. We can remove struct bdev_handle completely. Various low-level helpers are now private to the block layer. Other helpers were simply removable completely. A follow-up series that is already reviewed builds on this, makes it possible to remove bdev->bd_inode, and allows various cleanups of the buffer head code as well.
All places where we stashed a bdev_handle now just stash a file and use simple accessors to get to the actual block device, which was already the case for bdev_handle." * tag 'vfs-6.9.super' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: (35 commits) block: remove bdev_handle completely block: don't rely on BLK_OPEN_RESTRICT_WRITES when yielding write access bdev: remove bdev pointer from struct bdev_handle bdev: make struct bdev_handle private to the block layer bdev: make bdev_{release, open_by_dev}() private to block layer bdev: remove bdev_open_by_path() reiserfs: port block device access to file ocfs2: port block device access to file nfs: port block device access to files jfs: port block device access to file f2fs: port block device access to files ext4: port block device access to file erofs: port device access to file btrfs: port device access to file bcachefs: port block device access to file target: port block device access to file s390: port block device access to file nvme: port block device access to file block2mtd: port device access to files bcache: port block device access to files ...
2 parents 0c75001 + 40ebc18 commit 910202f

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

72 files changed

+812
-703
lines changed

block/bdev.c

Lines changed: 155 additions & 97 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,12 @@ struct block_device *I_BDEV(struct inode *inode)
4949
}
5050
EXPORT_SYMBOL(I_BDEV);
5151

52+
struct block_device *file_bdev(struct file *bdev_file)
53+
{
54+
return I_BDEV(bdev_file->f_mapping->host);
55+
}
56+
EXPORT_SYMBOL(file_bdev);
57+
5258
static void bdev_write_inode(struct block_device *bdev)
5359
{
5460
struct inode *inode = bdev->bd_inode;
@@ -368,12 +374,12 @@ static struct file_system_type bd_type = {
368374
};
369375

370376
struct super_block *blockdev_superblock __ro_after_init;
377+
struct vfsmount *blockdev_mnt __ro_after_init;
371378
EXPORT_SYMBOL_GPL(blockdev_superblock);
372379

373380
void __init bdev_cache_init(void)
374381
{
375382
int err;
376-
static struct vfsmount *bd_mnt __ro_after_init;
377383

378384
bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
379385
0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
@@ -382,10 +388,10 @@ void __init bdev_cache_init(void)
382388
err = register_filesystem(&bd_type);
383389
if (err)
384390
panic("Cannot register bdev pseudo-fs");
385-
bd_mnt = kern_mount(&bd_type);
386-
if (IS_ERR(bd_mnt))
391+
blockdev_mnt = kern_mount(&bd_type);
392+
if (IS_ERR(blockdev_mnt))
387393
panic("Cannot create bdev pseudo-fs");
388-
blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */
394+
blockdev_superblock = blockdev_mnt->mnt_sb; /* For writeback */
389395
}
390396

391397
struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
@@ -696,6 +702,31 @@ static int blkdev_get_part(struct block_device *part, blk_mode_t mode)
696702
return ret;
697703
}
698704

705+
int bdev_permission(dev_t dev, blk_mode_t mode, void *holder)
706+
{
707+
int ret;
708+
709+
ret = devcgroup_check_permission(DEVCG_DEV_BLOCK,
710+
MAJOR(dev), MINOR(dev),
711+
((mode & BLK_OPEN_READ) ? DEVCG_ACC_READ : 0) |
712+
((mode & BLK_OPEN_WRITE) ? DEVCG_ACC_WRITE : 0));
713+
if (ret)
714+
return ret;
715+
716+
/* Blocking writes requires exclusive opener */
717+
if (mode & BLK_OPEN_RESTRICT_WRITES && !holder)
718+
return -EINVAL;
719+
720+
/*
721+
* We're using error pointers to indicate to ->release() when we
722+
* failed to open that block device. Also this doesn't make sense.
723+
*/
724+
if (WARN_ON_ONCE(IS_ERR(holder)))
725+
return -EINVAL;
726+
727+
return 0;
728+
}
729+
699730
static void blkdev_put_part(struct block_device *part)
700731
{
701732
struct block_device *whole = bdev_whole(part);
@@ -775,83 +806,55 @@ static void bdev_claim_write_access(struct block_device *bdev, blk_mode_t mode)
775806
bdev->bd_writers++;
776807
}
777808

778-
static void bdev_yield_write_access(struct block_device *bdev, blk_mode_t mode)
809+
static void bdev_yield_write_access(struct file *bdev_file)
779810
{
811+
struct block_device *bdev;
812+
780813
if (bdev_allow_write_mounted)
781814
return;
782815

816+
bdev = file_bdev(bdev_file);
783817
/* Yield exclusive or shared write access. */
784-
if (mode & BLK_OPEN_RESTRICT_WRITES)
785-
bdev_unblock_writes(bdev);
786-
else if (mode & BLK_OPEN_WRITE)
787-
bdev->bd_writers--;
818+
if (bdev_file->f_mode & FMODE_WRITE) {
819+
if (bdev_writes_blocked(bdev))
820+
bdev_unblock_writes(bdev);
821+
else
822+
bdev->bd_writers--;
823+
}
788824
}
789825

790826
/**
791-
* bdev_open_by_dev - open a block device by device number
792-
* @dev: device number of block device to open
827+
* bdev_open - open a block device
828+
* @bdev: block device to open
793829
* @mode: open mode (BLK_OPEN_*)
794830
* @holder: exclusive holder identifier
795831
* @hops: holder operations
832+
* @bdev_file: file for the block device
796833
*
797-
* Open the block device described by device number @dev. If @holder is not
798-
* %NULL, the block device is opened with exclusive access. Exclusive opens may
799-
* nest for the same @holder.
800-
*
801-
* Use this interface ONLY if you really do not have anything better - i.e. when
802-
* you are behind a truly sucky interface and all you are given is a device
803-
* number. Everything else should use bdev_open_by_path().
834+
* Open the block device. If @holder is not %NULL, the block device is opened
835+
* with exclusive access. Exclusive opens may nest for the same @holder.
804836
*
805837
* CONTEXT:
806838
* Might sleep.
807839
*
808840
* RETURNS:
809-
* Handle with a reference to the block_device on success, ERR_PTR(-errno) on
810-
* failure.
841+
* zero on success, -errno on failure.
811842
*/
812-
struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
813-
const struct blk_holder_ops *hops)
843+
int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder,
844+
const struct blk_holder_ops *hops, struct file *bdev_file)
814845
{
815-
struct bdev_handle *handle = kmalloc(sizeof(struct bdev_handle),
816-
GFP_KERNEL);
817-
struct block_device *bdev;
818846
bool unblock_events = true;
819-
struct gendisk *disk;
847+
struct gendisk *disk = bdev->bd_disk;
820848
int ret;
821849

822-
if (!handle)
823-
return ERR_PTR(-ENOMEM);
824-
825-
ret = devcgroup_check_permission(DEVCG_DEV_BLOCK,
826-
MAJOR(dev), MINOR(dev),
827-
((mode & BLK_OPEN_READ) ? DEVCG_ACC_READ : 0) |
828-
((mode & BLK_OPEN_WRITE) ? DEVCG_ACC_WRITE : 0));
829-
if (ret)
830-
goto free_handle;
831-
832-
/* Blocking writes requires exclusive opener */
833-
if (mode & BLK_OPEN_RESTRICT_WRITES && !holder) {
834-
ret = -EINVAL;
835-
goto free_handle;
836-
}
837-
838-
bdev = blkdev_get_no_open(dev);
839-
if (!bdev) {
840-
ret = -ENXIO;
841-
goto free_handle;
842-
}
843-
disk = bdev->bd_disk;
844-
845850
if (holder) {
846851
mode |= BLK_OPEN_EXCL;
847852
ret = bd_prepare_to_claim(bdev, holder, hops);
848853
if (ret)
849-
goto put_blkdev;
854+
return ret;
850855
} else {
851-
if (WARN_ON_ONCE(mode & BLK_OPEN_EXCL)) {
852-
ret = -EIO;
853-
goto put_blkdev;
854-
}
856+
if (WARN_ON_ONCE(mode & BLK_OPEN_EXCL))
857+
return -EIO;
855858
}
856859

857860
disk_block_events(disk);
@@ -892,70 +895,126 @@ struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
892895

893896
if (unblock_events)
894897
disk_unblock_events(disk);
895-
handle->bdev = bdev;
896-
handle->holder = holder;
897-
handle->mode = mode;
898-
return handle;
898+
899+
bdev_file->f_flags |= O_LARGEFILE;
900+
bdev_file->f_mode |= FMODE_BUF_RASYNC | FMODE_CAN_ODIRECT;
901+
if (bdev_nowait(bdev))
902+
bdev_file->f_mode |= FMODE_NOWAIT;
903+
bdev_file->f_mapping = bdev->bd_inode->i_mapping;
904+
bdev_file->f_wb_err = filemap_sample_wb_err(bdev_file->f_mapping);
905+
bdev_file->private_data = holder;
906+
907+
return 0;
899908
put_module:
900909
module_put(disk->fops->owner);
901910
abort_claiming:
902911
if (holder)
903912
bd_abort_claiming(bdev, holder);
904913
mutex_unlock(&disk->open_mutex);
905914
disk_unblock_events(disk);
906-
put_blkdev:
907-
blkdev_put_no_open(bdev);
908-
free_handle:
909-
kfree(handle);
910-
return ERR_PTR(ret);
915+
return ret;
911916
}
912-
EXPORT_SYMBOL(bdev_open_by_dev);
913917

914-
/**
915-
* bdev_open_by_path - open a block device by name
916-
* @path: path to the block device to open
917-
* @mode: open mode (BLK_OPEN_*)
918-
* @holder: exclusive holder identifier
919-
* @hops: holder operations
920-
*
921-
* Open the block device described by the device file at @path. If @holder is
922-
* not %NULL, the block device is opened with exclusive access. Exclusive opens
923-
* may nest for the same @holder.
924-
*
925-
* CONTEXT:
926-
* Might sleep.
918+
/*
919+
* If BLK_OPEN_WRITE_IOCTL is set then this is a historical quirk
920+
* associated with the floppy driver where it has allowed ioctls if the
921+
* file was opened for writing, but does not allow reads or writes.
922+
* Make sure that this quirk is reflected in @f_flags.
927923
*
928-
* RETURNS:
929-
* Handle with a reference to the block_device on success, ERR_PTR(-errno) on
930-
* failure.
924+
* It can also happen if a block device is opened as O_RDWR | O_WRONLY.
931925
*/
932-
struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode,
933-
void *holder, const struct blk_holder_ops *hops)
926+
static unsigned blk_to_file_flags(blk_mode_t mode)
927+
{
928+
unsigned int flags = 0;
929+
930+
if ((mode & (BLK_OPEN_READ | BLK_OPEN_WRITE)) ==
931+
(BLK_OPEN_READ | BLK_OPEN_WRITE))
932+
flags |= O_RDWR;
933+
else if (mode & BLK_OPEN_WRITE_IOCTL)
934+
flags |= O_RDWR | O_WRONLY;
935+
else if (mode & BLK_OPEN_WRITE)
936+
flags |= O_WRONLY;
937+
else if (mode & BLK_OPEN_READ)
938+
flags |= O_RDONLY; /* homeopathic, because O_RDONLY is 0 */
939+
else
940+
WARN_ON_ONCE(true);
941+
942+
if (mode & BLK_OPEN_NDELAY)
943+
flags |= O_NDELAY;
944+
945+
return flags;
946+
}
947+
948+
struct file *bdev_file_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
949+
const struct blk_holder_ops *hops)
934950
{
935-
struct bdev_handle *handle;
951+
struct file *bdev_file;
952+
struct block_device *bdev;
953+
unsigned int flags;
954+
int ret;
955+
956+
ret = bdev_permission(dev, mode, holder);
957+
if (ret)
958+
return ERR_PTR(ret);
959+
960+
bdev = blkdev_get_no_open(dev);
961+
if (!bdev)
962+
return ERR_PTR(-ENXIO);
963+
964+
flags = blk_to_file_flags(mode);
965+
bdev_file = alloc_file_pseudo_noaccount(bdev->bd_inode,
966+
blockdev_mnt, "", flags | O_LARGEFILE, &def_blk_fops);
967+
if (IS_ERR(bdev_file)) {
968+
blkdev_put_no_open(bdev);
969+
return bdev_file;
970+
}
971+
ihold(bdev->bd_inode);
972+
973+
ret = bdev_open(bdev, mode, holder, hops, bdev_file);
974+
if (ret) {
975+
/* We failed to open the block device. Let ->release() know. */
976+
bdev_file->private_data = ERR_PTR(ret);
977+
fput(bdev_file);
978+
return ERR_PTR(ret);
979+
}
980+
return bdev_file;
981+
}
982+
EXPORT_SYMBOL(bdev_file_open_by_dev);
983+
984+
struct file *bdev_file_open_by_path(const char *path, blk_mode_t mode,
985+
void *holder,
986+
const struct blk_holder_ops *hops)
987+
{
988+
struct file *file;
936989
dev_t dev;
937990
int error;
938991

939992
error = lookup_bdev(path, &dev);
940993
if (error)
941994
return ERR_PTR(error);
942995

943-
handle = bdev_open_by_dev(dev, mode, holder, hops);
944-
if (!IS_ERR(handle) && (mode & BLK_OPEN_WRITE) &&
945-
bdev_read_only(handle->bdev)) {
946-
bdev_release(handle);
947-
return ERR_PTR(-EACCES);
996+
file = bdev_file_open_by_dev(dev, mode, holder, hops);
997+
if (!IS_ERR(file) && (mode & BLK_OPEN_WRITE)) {
998+
if (bdev_read_only(file_bdev(file))) {
999+
fput(file);
1000+
file = ERR_PTR(-EACCES);
1001+
}
9481002
}
9491003

950-
return handle;
1004+
return file;
9511005
}
952-
EXPORT_SYMBOL(bdev_open_by_path);
1006+
EXPORT_SYMBOL(bdev_file_open_by_path);
9531007

954-
void bdev_release(struct bdev_handle *handle)
1008+
void bdev_release(struct file *bdev_file)
9551009
{
956-
struct block_device *bdev = handle->bdev;
1010+
struct block_device *bdev = file_bdev(bdev_file);
1011+
void *holder = bdev_file->private_data;
9571012
struct gendisk *disk = bdev->bd_disk;
9581013

1014+
/* We failed to open that block device. */
1015+
if (IS_ERR(holder))
1016+
goto put_no_open;
1017+
9591018
/*
9601019
* Sync early if it looks like we're the last one. If someone else
9611020
* opens the block device between now and the decrement of bd_openers
@@ -967,10 +1026,10 @@ void bdev_release(struct bdev_handle *handle)
9671026
sync_blockdev(bdev);
9681027

9691028
mutex_lock(&disk->open_mutex);
970-
bdev_yield_write_access(bdev, handle->mode);
1029+
bdev_yield_write_access(bdev_file);
9711030

972-
if (handle->holder)
973-
bd_end_claim(bdev, handle->holder);
1031+
if (holder)
1032+
bd_end_claim(bdev, holder);
9741033

9751034
/*
9761035
* Trigger event checking and tell drivers to flush MEDIA_CHANGE
@@ -986,10 +1045,9 @@ void bdev_release(struct bdev_handle *handle)
9861045
mutex_unlock(&disk->open_mutex);
9871046

9881047
module_put(disk->fops->owner);
1048+
put_no_open:
9891049
blkdev_put_no_open(bdev);
990-
kfree(handle);
9911050
}
992-
EXPORT_SYMBOL(bdev_release);
9931051

9941052
/**
9951053
* lookup_bdev() - Look up a struct block_device by name.

block/blk.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -516,4 +516,8 @@ static inline int req_ref_read(struct request *req)
516516
return atomic_read(&req->ref);
517517
}
518518

519+
void bdev_release(struct file *bdev_file);
520+
int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder,
521+
const struct blk_holder_ops *hops, struct file *bdev_file);
522+
int bdev_permission(dev_t dev, blk_mode_t mode, void *holder);
519523
#endif /* BLK_INTERNAL_H */

0 commit comments

Comments
 (0)