Skip to content

Commit fe9da61

Browse files
committed
zonefs: fix synchronous direct writes to sequential files
Commit 16d7fd3 ("zonefs: use iomap for synchronous direct writes") changes zonefs code from a self-built zone append BIO to using iomap for synchronous direct writes. This change relies on iomap submit BIO callback to change the write BIO built by iomap to a zone append BIO. However, this change overlooked the fact that a write BIO may be very large as it is split when issued. The change from a regular write to a zone append operation for the built BIO can result in a block layer warning as zone append BIO are not allowed to be split. WARNING: CPU: 18 PID: 202210 at block/bio.c:1644 bio_split+0x288/0x350 Call Trace: ? __warn+0xc9/0x2b0 ? bio_split+0x288/0x350 ? report_bug+0x2e6/0x390 ? handle_bug+0x41/0x80 ? exc_invalid_op+0x13/0x40 ? asm_exc_invalid_op+0x16/0x20 ? bio_split+0x288/0x350 bio_split_rw+0x4bc/0x810 ? __pfx_bio_split_rw+0x10/0x10 ? lockdep_unlock+0xf2/0x250 __bio_split_to_limits+0x1d8/0x900 blk_mq_submit_bio+0x1cf/0x18a0 ? __pfx_iov_iter_extract_pages+0x10/0x10 ? __pfx_blk_mq_submit_bio+0x10/0x10 ? find_held_lock+0x2d/0x110 ? lock_release+0x362/0x620 ? mark_held_locks+0x9e/0xe0 __submit_bio+0x1ea/0x290 ? __pfx___submit_bio+0x10/0x10 ? seqcount_lockdep_reader_access.constprop.0+0x82/0x90 submit_bio_noacct_nocheck+0x675/0xa20 ? __pfx_bio_iov_iter_get_pages+0x10/0x10 ? __pfx_submit_bio_noacct_nocheck+0x10/0x10 iomap_dio_bio_iter+0x624/0x1280 __iomap_dio_rw+0xa22/0x18a0 ? lock_is_held_type+0xe3/0x140 ? __pfx___iomap_dio_rw+0x10/0x10 ? lock_release+0x362/0x620 ? zonefs_file_write_iter+0x74c/0xc80 [zonefs] ? down_write+0x13d/0x1e0 iomap_dio_rw+0xe/0x40 zonefs_file_write_iter+0x5ea/0xc80 [zonefs] do_iter_readv_writev+0x18b/0x2c0 ? __pfx_do_iter_readv_writev+0x10/0x10 ? inode_security+0x54/0xf0 do_iter_write+0x13b/0x7c0 ? lock_is_held_type+0xe3/0x140 vfs_writev+0x185/0x550 ? __pfx_vfs_writev+0x10/0x10 ? __handle_mm_fault+0x9bd/0x1c90 ? find_held_lock+0x2d/0x110 ? lock_release+0x362/0x620 ? find_held_lock+0x2d/0x110 ? lock_release+0x362/0x620 ? __up_read+0x1ea/0x720 ? do_pwritev+0x136/0x1f0 do_pwritev+0x136/0x1f0 ? __pfx_do_pwritev+0x10/0x10 ? syscall_enter_from_user_mode+0x22/0x90 ? lockdep_hardirqs_on+0x7d/0x100 do_syscall_64+0x58/0x80 This error depends on the hardware used, specifically on the max zone append bytes and max_[hw_]sectors limits. Tests using AMD Epyc machines that have low limits did not reveal this issue while runs on Intel Xeon machines with larger limits trigger it. Manually splitting the zone append BIO using bio_split_rw() can solve this issue but also requires issuing the fragment BIOs synchronously with submit_bio_wait(), to avoid potential reordering of the zone append BIO fragments, which would lead to data corruption. That is, this solution is not better than using regular write BIOs which are subject to serialization using zone write locking at the IO scheduler level. Given this, fix the issue by removing zone append support and using regular write BIOs for synchronous direct writes. This allows preseving the use of iomap and having identical synchronous and asynchronous sequential file write path. Zone append support will be reintroduced later through io_uring commands to ensure that the needed special handling is done correctly. Reported-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com> Fixes: 16d7fd3 ("zonefs: use iomap for synchronous direct writes") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal <dlemoal@kernel.org> Tested-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com> Reviewed-by: Christoph Hellwig <hch@lst.de>
1 parent 52a93d3 commit fe9da61

File tree

3 files changed

+4
-118
lines changed

3 files changed

+4
-118
lines changed

fs/zonefs/file.c

Lines changed: 3 additions & 108 deletions
Original file line numberDiff line numberDiff line change
@@ -341,77 +341,6 @@ static loff_t zonefs_file_llseek(struct file *file, loff_t offset, int whence)
341341
return generic_file_llseek_size(file, offset, whence, isize, isize);
342342
}
343343

344-
struct zonefs_zone_append_bio {
345-
/* The target inode of the BIO */
346-
struct inode *inode;
347-
348-
/* For sync writes, the target append write offset */
349-
u64 append_offset;
350-
351-
/*
352-
* This member must come last, bio_alloc_bioset will allocate enough
353-
* bytes for entire zonefs_bio but relies on bio being last.
354-
*/
355-
struct bio bio;
356-
};
357-
358-
static inline struct zonefs_zone_append_bio *
359-
zonefs_zone_append_bio(struct bio *bio)
360-
{
361-
return container_of(bio, struct zonefs_zone_append_bio, bio);
362-
}
363-
364-
static void zonefs_file_zone_append_dio_bio_end_io(struct bio *bio)
365-
{
366-
struct zonefs_zone_append_bio *za_bio = zonefs_zone_append_bio(bio);
367-
struct zonefs_zone *z = zonefs_inode_zone(za_bio->inode);
368-
sector_t za_sector;
369-
370-
if (bio->bi_status != BLK_STS_OK)
371-
goto bio_end;
372-
373-
/*
374-
* If the file zone was written underneath the file system, the zone
375-
* append operation can still succedd (if the zone is not full) but
376-
* the write append location will not be where we expect it to be.
377-
* Check that we wrote where we intended to, that is, at z->z_wpoffset.
378-
*/
379-
za_sector = z->z_sector + (za_bio->append_offset >> SECTOR_SHIFT);
380-
if (bio->bi_iter.bi_sector != za_sector) {
381-
zonefs_warn(za_bio->inode->i_sb,
382-
"Invalid write sector %llu for zone at %llu\n",
383-
bio->bi_iter.bi_sector, z->z_sector);
384-
bio->bi_status = BLK_STS_IOERR;
385-
}
386-
387-
bio_end:
388-
iomap_dio_bio_end_io(bio);
389-
}
390-
391-
static void zonefs_file_zone_append_dio_submit_io(const struct iomap_iter *iter,
392-
struct bio *bio,
393-
loff_t file_offset)
394-
{
395-
struct zonefs_zone_append_bio *za_bio = zonefs_zone_append_bio(bio);
396-
struct inode *inode = iter->inode;
397-
struct zonefs_zone *z = zonefs_inode_zone(inode);
398-
399-
/*
400-
* Issue a zone append BIO to process sync dio writes. The append
401-
* file offset is saved to check the zone append write location
402-
* on completion of the BIO.
403-
*/
404-
za_bio->inode = inode;
405-
za_bio->append_offset = file_offset;
406-
407-
bio->bi_opf &= ~REQ_OP_WRITE;
408-
bio->bi_opf |= REQ_OP_ZONE_APPEND;
409-
bio->bi_iter.bi_sector = z->z_sector;
410-
bio->bi_end_io = zonefs_file_zone_append_dio_bio_end_io;
411-
412-
submit_bio(bio);
413-
}
414-
415344
static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size,
416345
int error, unsigned int flags)
417346
{
@@ -442,14 +371,6 @@ static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size,
442371
return 0;
443372
}
444373

445-
static struct bio_set zonefs_zone_append_bio_set;
446-
447-
static const struct iomap_dio_ops zonefs_zone_append_dio_ops = {
448-
.submit_io = zonefs_file_zone_append_dio_submit_io,
449-
.end_io = zonefs_file_write_dio_end_io,
450-
.bio_set = &zonefs_zone_append_bio_set,
451-
};
452-
453374
static const struct iomap_dio_ops zonefs_write_dio_ops = {
454375
.end_io = zonefs_file_write_dio_end_io,
455376
};
@@ -533,17 +454,15 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
533454
struct zonefs_inode_info *zi = ZONEFS_I(inode);
534455
struct zonefs_zone *z = zonefs_inode_zone(inode);
535456
struct super_block *sb = inode->i_sb;
536-
const struct iomap_dio_ops *dio_ops;
537-
bool sync = is_sync_kiocb(iocb);
538-
bool append = false;
539457
ssize_t ret, count;
540458

541459
/*
542460
* For async direct IOs to sequential zone files, refuse IOCB_NOWAIT
543461
* as this can cause write reordering (e.g. the first aio gets EAGAIN
544462
* on the inode lock but the second goes through but is now unaligned).
545463
*/
546-
if (zonefs_zone_is_seq(z) && !sync && (iocb->ki_flags & IOCB_NOWAIT))
464+
if (zonefs_zone_is_seq(z) && !is_sync_kiocb(iocb) &&
465+
(iocb->ki_flags & IOCB_NOWAIT))
547466
return -EOPNOTSUPP;
548467

549468
if (iocb->ki_flags & IOCB_NOWAIT) {
@@ -573,18 +492,6 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
573492
goto inode_unlock;
574493
}
575494
mutex_unlock(&zi->i_truncate_mutex);
576-
append = sync;
577-
}
578-
579-
if (append) {
580-
unsigned int max = bdev_max_zone_append_sectors(sb->s_bdev);
581-
582-
max = ALIGN_DOWN(max << SECTOR_SHIFT, sb->s_blocksize);
583-
iov_iter_truncate(from, max);
584-
585-
dio_ops = &zonefs_zone_append_dio_ops;
586-
} else {
587-
dio_ops = &zonefs_write_dio_ops;
588495
}
589496

590497
/*
@@ -593,7 +500,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
593500
* the user can make sense of the error.
594501
*/
595502
ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops,
596-
dio_ops, 0, NULL, 0);
503+
&zonefs_write_dio_ops, 0, NULL, 0);
597504
if (ret == -ENOTBLK)
598505
ret = -EBUSY;
599506

@@ -938,15 +845,3 @@ const struct file_operations zonefs_file_operations = {
938845
.splice_write = iter_file_splice_write,
939846
.iopoll = iocb_bio_iopoll,
940847
};
941-
942-
int zonefs_file_bioset_init(void)
943-
{
944-
return bioset_init(&zonefs_zone_append_bio_set, BIO_POOL_SIZE,
945-
offsetof(struct zonefs_zone_append_bio, bio),
946-
BIOSET_NEED_BVECS);
947-
}
948-
949-
void zonefs_file_bioset_exit(void)
950-
{
951-
bioset_exit(&zonefs_zone_append_bio_set);
952-
}

fs/zonefs/super.c

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1412,13 +1412,9 @@ static int __init zonefs_init(void)
14121412

14131413
BUILD_BUG_ON(sizeof(struct zonefs_super) != ZONEFS_SUPER_SIZE);
14141414

1415-
ret = zonefs_file_bioset_init();
1416-
if (ret)
1417-
return ret;
1418-
14191415
ret = zonefs_init_inodecache();
14201416
if (ret)
1421-
goto destroy_bioset;
1417+
return ret;
14221418

14231419
ret = zonefs_sysfs_init();
14241420
if (ret)
@@ -1434,8 +1430,6 @@ static int __init zonefs_init(void)
14341430
zonefs_sysfs_exit();
14351431
destroy_inodecache:
14361432
zonefs_destroy_inodecache();
1437-
destroy_bioset:
1438-
zonefs_file_bioset_exit();
14391433

14401434
return ret;
14411435
}
@@ -1445,7 +1439,6 @@ static void __exit zonefs_exit(void)
14451439
unregister_filesystem(&zonefs_type);
14461440
zonefs_sysfs_exit();
14471441
zonefs_destroy_inodecache();
1448-
zonefs_file_bioset_exit();
14491442
}
14501443

14511444
MODULE_AUTHOR("Damien Le Moal");

fs/zonefs/zonefs.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -279,8 +279,6 @@ extern const struct file_operations zonefs_dir_operations;
279279
extern const struct address_space_operations zonefs_file_aops;
280280
extern const struct file_operations zonefs_file_operations;
281281
int zonefs_file_truncate(struct inode *inode, loff_t isize);
282-
int zonefs_file_bioset_init(void);
283-
void zonefs_file_bioset_exit(void);
284282

285283
/* In sysfs.c */
286284
int zonefs_sysfs_register(struct super_block *sb);

0 commit comments

Comments
 (0)