Skip to content

Commit 40ebc18

Browse files
committed
Pull block devices as files from Christian Brauner: This opens block devices as files. Instead of introducing a separate indirection into bdev_open_by_*() via struct bdev_handle we can just make bdev_file_open_by_*() return a struct file. Opening and closing a block device from setup_bdev_super() and in all other places just becomes equivalent to opening and closing a file. This has held up in xfstests and in blktests so far and it seems stable and clean. The equivalence of opening and closing block devices to regular files is a win in and of itself imho. Added to that is the ability to do away with struct bdev_handle completely and make various low-level helpers private to the block layer. In all places where we currently stash a struct bdev_handle we just stash a file and use an accessor such as file_bdev() akin to I_BDEV() to get to the block device. It's now also possible to use file->f_mapping as a replacement for bdev->bd_inode->i_mapping and file->f_inode or file->f_mapping->host as an alternative to bdev->bd_inode allowing us to significantly reduce or even fully remove bdev->bd_inode in follow-up patches. In addition, we could get rid of sb->s_bdev and various other places that stash the block device directly and instead stash the block device file. Again, this is follow-up work if we want this. 
* series 'Open block devices as files' of https://lore.kernel.org/r/20240123-vfs-bdev-file-v2-0-adbd023e19cc@kernel.org: (35 commits) file: add alloc_file_pseudo_noaccount() file: prepare for new helper init: flush async file closing block: remove bdev_handle completely block: don't rely on BLK_OPEN_RESTRICT_WRITES when yielding write access bdev: remove bdev pointer from struct bdev_handle bdev: make struct bdev_handle private to the block layer bdev: make bdev_{release, open_by_dev}() private to block layer bdev: remove bdev_open_by_path() reiserfs: port block device access to file ocfs2: port block device access to file nfs: port block device access to files jfs: port block device access to file f2fs: port block device access to files ext4: port block device access to file erofs: port device access to file btrfs: port device access to file bcachefs: port block device access to file target: port block device access to file s390: port block device access to file nvme: port block device access to file block2mtd: port device access to files bcache: port block device access to files ... Signed-off-by: Christian Brauner <brauner@kernel.org>
2 parents 6613476 + ab838b3 commit 40ebc18

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

72 files changed

+812
-703
lines changed

block/bdev.c

Lines changed: 155 additions & 97 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,12 @@ struct block_device *I_BDEV(struct inode *inode)
4949
}
5050
EXPORT_SYMBOL(I_BDEV);
5151

52+
struct block_device *file_bdev(struct file *bdev_file)
53+
{
54+
return I_BDEV(bdev_file->f_mapping->host);
55+
}
56+
EXPORT_SYMBOL(file_bdev);
57+
5258
static void bdev_write_inode(struct block_device *bdev)
5359
{
5460
struct inode *inode = bdev->bd_inode;
@@ -368,12 +374,12 @@ static struct file_system_type bd_type = {
368374
};
369375

370376
struct super_block *blockdev_superblock __ro_after_init;
377+
struct vfsmount *blockdev_mnt __ro_after_init;
371378
EXPORT_SYMBOL_GPL(blockdev_superblock);
372379

373380
void __init bdev_cache_init(void)
374381
{
375382
int err;
376-
static struct vfsmount *bd_mnt __ro_after_init;
377383

378384
bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
379385
0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
@@ -382,10 +388,10 @@ void __init bdev_cache_init(void)
382388
err = register_filesystem(&bd_type);
383389
if (err)
384390
panic("Cannot register bdev pseudo-fs");
385-
bd_mnt = kern_mount(&bd_type);
386-
if (IS_ERR(bd_mnt))
391+
blockdev_mnt = kern_mount(&bd_type);
392+
if (IS_ERR(blockdev_mnt))
387393
panic("Cannot create bdev pseudo-fs");
388-
blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */
394+
blockdev_superblock = blockdev_mnt->mnt_sb; /* For writeback */
389395
}
390396

391397
struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
@@ -696,6 +702,31 @@ static int blkdev_get_part(struct block_device *part, blk_mode_t mode)
696702
return ret;
697703
}
698704

705+
int bdev_permission(dev_t dev, blk_mode_t mode, void *holder)
706+
{
707+
int ret;
708+
709+
ret = devcgroup_check_permission(DEVCG_DEV_BLOCK,
710+
MAJOR(dev), MINOR(dev),
711+
((mode & BLK_OPEN_READ) ? DEVCG_ACC_READ : 0) |
712+
((mode & BLK_OPEN_WRITE) ? DEVCG_ACC_WRITE : 0));
713+
if (ret)
714+
return ret;
715+
716+
/* Blocking writes requires exclusive opener */
717+
if (mode & BLK_OPEN_RESTRICT_WRITES && !holder)
718+
return -EINVAL;
719+
720+
/*
721+
* We're using error pointers to indicate to ->release() when we
722+
* failed to open that block device. Also this doesn't make sense.
723+
*/
724+
if (WARN_ON_ONCE(IS_ERR(holder)))
725+
return -EINVAL;
726+
727+
return 0;
728+
}
729+
699730
static void blkdev_put_part(struct block_device *part)
700731
{
701732
struct block_device *whole = bdev_whole(part);
@@ -775,83 +806,55 @@ static void bdev_claim_write_access(struct block_device *bdev, blk_mode_t mode)
775806
bdev->bd_writers++;
776807
}
777808

778-
static void bdev_yield_write_access(struct block_device *bdev, blk_mode_t mode)
809+
static void bdev_yield_write_access(struct file *bdev_file)
779810
{
811+
struct block_device *bdev;
812+
780813
if (bdev_allow_write_mounted)
781814
return;
782815

816+
bdev = file_bdev(bdev_file);
783817
/* Yield exclusive or shared write access. */
784-
if (mode & BLK_OPEN_RESTRICT_WRITES)
785-
bdev_unblock_writes(bdev);
786-
else if (mode & BLK_OPEN_WRITE)
787-
bdev->bd_writers--;
818+
if (bdev_file->f_mode & FMODE_WRITE) {
819+
if (bdev_writes_blocked(bdev))
820+
bdev_unblock_writes(bdev);
821+
else
822+
bdev->bd_writers--;
823+
}
788824
}
789825

790826
/**
791-
* bdev_open_by_dev - open a block device by device number
792-
* @dev: device number of block device to open
827+
* bdev_open - open a block device
828+
* @bdev: block device to open
793829
* @mode: open mode (BLK_OPEN_*)
794830
* @holder: exclusive holder identifier
795831
* @hops: holder operations
832+
* @bdev_file: file for the block device
796833
*
797-
* Open the block device described by device number @dev. If @holder is not
798-
* %NULL, the block device is opened with exclusive access. Exclusive opens may
799-
* nest for the same @holder.
800-
*
801-
* Use this interface ONLY if you really do not have anything better - i.e. when
802-
* you are behind a truly sucky interface and all you are given is a device
803-
* number. Everything else should use bdev_open_by_path().
834+
* Open the block device. If @holder is not %NULL, the block device is opened
835+
* with exclusive access. Exclusive opens may nest for the same @holder.
804836
*
805837
* CONTEXT:
806838
* Might sleep.
807839
*
808840
* RETURNS:
809-
* Handle with a reference to the block_device on success, ERR_PTR(-errno) on
810-
* failure.
841+
* zero on success, -errno on failure.
811842
*/
812-
struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
813-
const struct blk_holder_ops *hops)
843+
int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder,
844+
const struct blk_holder_ops *hops, struct file *bdev_file)
814845
{
815-
struct bdev_handle *handle = kmalloc(sizeof(struct bdev_handle),
816-
GFP_KERNEL);
817-
struct block_device *bdev;
818846
bool unblock_events = true;
819-
struct gendisk *disk;
847+
struct gendisk *disk = bdev->bd_disk;
820848
int ret;
821849

822-
if (!handle)
823-
return ERR_PTR(-ENOMEM);
824-
825-
ret = devcgroup_check_permission(DEVCG_DEV_BLOCK,
826-
MAJOR(dev), MINOR(dev),
827-
((mode & BLK_OPEN_READ) ? DEVCG_ACC_READ : 0) |
828-
((mode & BLK_OPEN_WRITE) ? DEVCG_ACC_WRITE : 0));
829-
if (ret)
830-
goto free_handle;
831-
832-
/* Blocking writes requires exclusive opener */
833-
if (mode & BLK_OPEN_RESTRICT_WRITES && !holder) {
834-
ret = -EINVAL;
835-
goto free_handle;
836-
}
837-
838-
bdev = blkdev_get_no_open(dev);
839-
if (!bdev) {
840-
ret = -ENXIO;
841-
goto free_handle;
842-
}
843-
disk = bdev->bd_disk;
844-
845850
if (holder) {
846851
mode |= BLK_OPEN_EXCL;
847852
ret = bd_prepare_to_claim(bdev, holder, hops);
848853
if (ret)
849-
goto put_blkdev;
854+
return ret;
850855
} else {
851-
if (WARN_ON_ONCE(mode & BLK_OPEN_EXCL)) {
852-
ret = -EIO;
853-
goto put_blkdev;
854-
}
856+
if (WARN_ON_ONCE(mode & BLK_OPEN_EXCL))
857+
return -EIO;
855858
}
856859

857860
disk_block_events(disk);
@@ -892,70 +895,126 @@ struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
892895

893896
if (unblock_events)
894897
disk_unblock_events(disk);
895-
handle->bdev = bdev;
896-
handle->holder = holder;
897-
handle->mode = mode;
898-
return handle;
898+
899+
bdev_file->f_flags |= O_LARGEFILE;
900+
bdev_file->f_mode |= FMODE_BUF_RASYNC | FMODE_CAN_ODIRECT;
901+
if (bdev_nowait(bdev))
902+
bdev_file->f_mode |= FMODE_NOWAIT;
903+
bdev_file->f_mapping = bdev->bd_inode->i_mapping;
904+
bdev_file->f_wb_err = filemap_sample_wb_err(bdev_file->f_mapping);
905+
bdev_file->private_data = holder;
906+
907+
return 0;
899908
put_module:
900909
module_put(disk->fops->owner);
901910
abort_claiming:
902911
if (holder)
903912
bd_abort_claiming(bdev, holder);
904913
mutex_unlock(&disk->open_mutex);
905914
disk_unblock_events(disk);
906-
put_blkdev:
907-
blkdev_put_no_open(bdev);
908-
free_handle:
909-
kfree(handle);
910-
return ERR_PTR(ret);
915+
return ret;
911916
}
912-
EXPORT_SYMBOL(bdev_open_by_dev);
913917

914-
/**
915-
* bdev_open_by_path - open a block device by name
916-
* @path: path to the block device to open
917-
* @mode: open mode (BLK_OPEN_*)
918-
* @holder: exclusive holder identifier
919-
* @hops: holder operations
920-
*
921-
* Open the block device described by the device file at @path. If @holder is
922-
* not %NULL, the block device is opened with exclusive access. Exclusive opens
923-
* may nest for the same @holder.
924-
*
925-
* CONTEXT:
926-
* Might sleep.
918+
/*
919+
* If BLK_OPEN_WRITE_IOCTL is set then this is a historical quirk
920+
* associated with the floppy driver where it has allowed ioctls if the
921+
* file was opened for writing, but does not allow reads or writes.
922+
* Make sure that this quirk is reflected in @f_flags.
927923
*
928-
* RETURNS:
929-
* Handle with a reference to the block_device on success, ERR_PTR(-errno) on
930-
* failure.
924+
* It can also happen if a block device is opened as O_RDWR | O_WRONLY.
931925
*/
932-
struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode,
933-
void *holder, const struct blk_holder_ops *hops)
926+
static unsigned blk_to_file_flags(blk_mode_t mode)
927+
{
928+
unsigned int flags = 0;
929+
930+
if ((mode & (BLK_OPEN_READ | BLK_OPEN_WRITE)) ==
931+
(BLK_OPEN_READ | BLK_OPEN_WRITE))
932+
flags |= O_RDWR;
933+
else if (mode & BLK_OPEN_WRITE_IOCTL)
934+
flags |= O_RDWR | O_WRONLY;
935+
else if (mode & BLK_OPEN_WRITE)
936+
flags |= O_WRONLY;
937+
else if (mode & BLK_OPEN_READ)
938+
flags |= O_RDONLY; /* homeopathic, because O_RDONLY is 0 */
939+
else
940+
WARN_ON_ONCE(true);
941+
942+
if (mode & BLK_OPEN_NDELAY)
943+
flags |= O_NDELAY;
944+
945+
return flags;
946+
}
947+
948+
struct file *bdev_file_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
949+
const struct blk_holder_ops *hops)
934950
{
935-
struct bdev_handle *handle;
951+
struct file *bdev_file;
952+
struct block_device *bdev;
953+
unsigned int flags;
954+
int ret;
955+
956+
ret = bdev_permission(dev, mode, holder);
957+
if (ret)
958+
return ERR_PTR(ret);
959+
960+
bdev = blkdev_get_no_open(dev);
961+
if (!bdev)
962+
return ERR_PTR(-ENXIO);
963+
964+
flags = blk_to_file_flags(mode);
965+
bdev_file = alloc_file_pseudo_noaccount(bdev->bd_inode,
966+
blockdev_mnt, "", flags | O_LARGEFILE, &def_blk_fops);
967+
if (IS_ERR(bdev_file)) {
968+
blkdev_put_no_open(bdev);
969+
return bdev_file;
970+
}
971+
ihold(bdev->bd_inode);
972+
973+
ret = bdev_open(bdev, mode, holder, hops, bdev_file);
974+
if (ret) {
975+
/* We failed to open the block device. Let ->release() know. */
976+
bdev_file->private_data = ERR_PTR(ret);
977+
fput(bdev_file);
978+
return ERR_PTR(ret);
979+
}
980+
return bdev_file;
981+
}
982+
EXPORT_SYMBOL(bdev_file_open_by_dev);
983+
984+
struct file *bdev_file_open_by_path(const char *path, blk_mode_t mode,
985+
void *holder,
986+
const struct blk_holder_ops *hops)
987+
{
988+
struct file *file;
936989
dev_t dev;
937990
int error;
938991

939992
error = lookup_bdev(path, &dev);
940993
if (error)
941994
return ERR_PTR(error);
942995

943-
handle = bdev_open_by_dev(dev, mode, holder, hops);
944-
if (!IS_ERR(handle) && (mode & BLK_OPEN_WRITE) &&
945-
bdev_read_only(handle->bdev)) {
946-
bdev_release(handle);
947-
return ERR_PTR(-EACCES);
996+
file = bdev_file_open_by_dev(dev, mode, holder, hops);
997+
if (!IS_ERR(file) && (mode & BLK_OPEN_WRITE)) {
998+
if (bdev_read_only(file_bdev(file))) {
999+
fput(file);
1000+
file = ERR_PTR(-EACCES);
1001+
}
9481002
}
9491003

950-
return handle;
1004+
return file;
9511005
}
952-
EXPORT_SYMBOL(bdev_open_by_path);
1006+
EXPORT_SYMBOL(bdev_file_open_by_path);
9531007

954-
void bdev_release(struct bdev_handle *handle)
1008+
void bdev_release(struct file *bdev_file)
9551009
{
956-
struct block_device *bdev = handle->bdev;
1010+
struct block_device *bdev = file_bdev(bdev_file);
1011+
void *holder = bdev_file->private_data;
9571012
struct gendisk *disk = bdev->bd_disk;
9581013

1014+
/* We failed to open that block device. */
1015+
if (IS_ERR(holder))
1016+
goto put_no_open;
1017+
9591018
/*
9601019
* Sync early if it looks like we're the last one. If someone else
9611020
* opens the block device between now and the decrement of bd_openers
@@ -967,10 +1026,10 @@ void bdev_release(struct bdev_handle *handle)
9671026
sync_blockdev(bdev);
9681027

9691028
mutex_lock(&disk->open_mutex);
970-
bdev_yield_write_access(bdev, handle->mode);
1029+
bdev_yield_write_access(bdev_file);
9711030

972-
if (handle->holder)
973-
bd_end_claim(bdev, handle->holder);
1031+
if (holder)
1032+
bd_end_claim(bdev, holder);
9741033

9751034
/*
9761035
* Trigger event checking and tell drivers to flush MEDIA_CHANGE
@@ -986,10 +1045,9 @@ void bdev_release(struct bdev_handle *handle)
9861045
mutex_unlock(&disk->open_mutex);
9871046

9881047
module_put(disk->fops->owner);
1048+
put_no_open:
9891049
blkdev_put_no_open(bdev);
990-
kfree(handle);
9911050
}
992-
EXPORT_SYMBOL(bdev_release);
9931051

9941052
/**
9951053
* lookup_bdev() - Look up a struct block_device by name.

block/blk.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -516,4 +516,8 @@ static inline int req_ref_read(struct request *req)
516516
return atomic_read(&req->ref);
517517
}
518518

519+
void bdev_release(struct file *bdev_file);
520+
int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder,
521+
const struct blk_holder_ops *hops, struct file *bdev_file);
522+
int bdev_permission(dev_t dev, blk_mode_t mode, void *holder);
519523
#endif /* BLK_INTERNAL_H */

0 commit comments

Comments
 (0)