@@ -342,6 +342,77 @@ static loff_t zonefs_file_llseek(struct file *file, loff_t offset, int whence)
342
342
return generic_file_llseek_size (file , offset , whence , isize , isize );
343
343
}
344
344
345
+ struct zonefs_zone_append_bio {
346
+ /* The target inode of the BIO */
347
+ struct inode * inode ;
348
+
349
+ /* For sync writes, the target append write offset */
350
+ u64 append_offset ;
351
+
352
+ /*
353
+ * This member must come last, bio_alloc_bioset will allocate enough
354
+ * bytes for entire zonefs_bio but relies on bio being last.
355
+ */
356
+ struct bio bio ;
357
+ };
358
+
359
+ static inline struct zonefs_zone_append_bio *
360
+ zonefs_zone_append_bio (struct bio * bio )
361
+ {
362
+ return container_of (bio , struct zonefs_zone_append_bio , bio );
363
+ }
364
+
365
+ static void zonefs_file_zone_append_dio_bio_end_io (struct bio * bio )
366
+ {
367
+ struct zonefs_zone_append_bio * za_bio = zonefs_zone_append_bio (bio );
368
+ struct zonefs_zone * z = zonefs_inode_zone (za_bio -> inode );
369
+ sector_t za_sector ;
370
+
371
+ if (bio -> bi_status != BLK_STS_OK )
372
+ goto bio_end ;
373
+
374
+ /*
375
+ * If the file zone was written underneath the file system, the zone
376
+ * append operation can still succedd (if the zone is not full) but
377
+ * the write append location will not be where we expect it to be.
378
+ * Check that we wrote where we intended to, that is, at z->z_wpoffset.
379
+ */
380
+ za_sector = z -> z_sector + (za_bio -> append_offset >> SECTOR_SHIFT );
381
+ if (bio -> bi_iter .bi_sector != za_sector ) {
382
+ zonefs_warn (za_bio -> inode -> i_sb ,
383
+ "Invalid write sector %llu for zone at %llu\n" ,
384
+ bio -> bi_iter .bi_sector , z -> z_sector );
385
+ bio -> bi_status = BLK_STS_IOERR ;
386
+ }
387
+
388
+ bio_end :
389
+ iomap_dio_bio_end_io (bio );
390
+ }
391
+
392
+ static void zonefs_file_zone_append_dio_submit_io (const struct iomap_iter * iter ,
393
+ struct bio * bio ,
394
+ loff_t file_offset )
395
+ {
396
+ struct zonefs_zone_append_bio * za_bio = zonefs_zone_append_bio (bio );
397
+ struct inode * inode = iter -> inode ;
398
+ struct zonefs_zone * z = zonefs_inode_zone (inode );
399
+
400
+ /*
401
+ * Issue a zone append BIO to process sync dio writes. The append
402
+ * file offset is saved to check the zone append write location
403
+ * on completion of the BIO.
404
+ */
405
+ za_bio -> inode = inode ;
406
+ za_bio -> append_offset = file_offset ;
407
+
408
+ bio -> bi_opf &= ~REQ_OP_WRITE ;
409
+ bio -> bi_opf |= REQ_OP_ZONE_APPEND ;
410
+ bio -> bi_iter .bi_sector = z -> z_sector ;
411
+ bio -> bi_end_io = zonefs_file_zone_append_dio_bio_end_io ;
412
+
413
+ submit_bio (bio );
414
+ }
415
+
345
416
static int zonefs_file_write_dio_end_io (struct kiocb * iocb , ssize_t size ,
346
417
int error , unsigned int flags )
347
418
{
@@ -372,93 +443,17 @@ static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size,
372
443
return 0 ;
373
444
}
374
445
375
- static const struct iomap_dio_ops zonefs_write_dio_ops = {
376
- .end_io = zonefs_file_write_dio_end_io ,
377
- };
446
+ static struct bio_set zonefs_zone_append_bio_set ;
378
447
379
- static ssize_t zonefs_file_dio_append (struct kiocb * iocb , struct iov_iter * from )
380
- {
381
- struct inode * inode = file_inode (iocb -> ki_filp );
382
- struct zonefs_zone * z = zonefs_inode_zone (inode );
383
- struct block_device * bdev = inode -> i_sb -> s_bdev ;
384
- unsigned int max = bdev_max_zone_append_sectors (bdev );
385
- pgoff_t start , end ;
386
- struct bio * bio ;
387
- ssize_t size = 0 ;
388
- int nr_pages ;
389
- ssize_t ret ;
390
-
391
- max = ALIGN_DOWN (max << SECTOR_SHIFT , inode -> i_sb -> s_blocksize );
392
- iov_iter_truncate (from , max );
393
-
394
- /*
395
- * If the inode block size (zone write granularity) is smaller than the
396
- * page size, we may be appending data belonging to the last page of the
397
- * inode straddling inode->i_size, with that page already cached due to
398
- * a buffered read or readahead. So make sure to invalidate that page.
399
- * This will always be a no-op for the case where the block size is
400
- * equal to the page size.
401
- */
402
- start = iocb -> ki_pos >> PAGE_SHIFT ;
403
- end = (iocb -> ki_pos + iov_iter_count (from ) - 1 ) >> PAGE_SHIFT ;
404
- if (invalidate_inode_pages2_range (inode -> i_mapping , start , end ))
405
- return - EBUSY ;
406
-
407
- nr_pages = iov_iter_npages (from , BIO_MAX_VECS );
408
- if (!nr_pages )
409
- return 0 ;
410
-
411
- bio = bio_alloc (bdev , nr_pages ,
412
- REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE , GFP_NOFS );
413
- bio -> bi_iter .bi_sector = z -> z_sector ;
414
- bio -> bi_ioprio = iocb -> ki_ioprio ;
415
- if (iocb_is_dsync (iocb ))
416
- bio -> bi_opf |= REQ_FUA ;
417
-
418
- ret = bio_iov_iter_get_pages (bio , from );
419
- if (unlikely (ret ))
420
- goto out_release ;
421
-
422
- size = bio -> bi_iter .bi_size ;
423
- task_io_account_write (size );
424
-
425
- if (iocb -> ki_flags & IOCB_HIPRI )
426
- bio_set_polled (bio , iocb );
427
-
428
- ret = submit_bio_wait (bio );
429
-
430
- /*
431
- * If the file zone was written underneath the file system, the zone
432
- * write pointer may not be where we expect it to be, but the zone
433
- * append write can still succeed. So check manually that we wrote where
434
- * we intended to, that is, at zi->i_wpoffset.
435
- */
436
- if (!ret ) {
437
- sector_t wpsector =
438
- z -> z_sector + (z -> z_wpoffset >> SECTOR_SHIFT );
439
-
440
- if (bio -> bi_iter .bi_sector != wpsector ) {
441
- zonefs_warn (inode -> i_sb ,
442
- "Corrupted write pointer %llu for zone at %llu\n" ,
443
- bio -> bi_iter .bi_sector , z -> z_sector );
444
- ret = - EIO ;
445
- }
446
- }
447
-
448
- zonefs_file_write_dio_end_io (iocb , size , ret , 0 );
449
- trace_zonefs_file_dio_append (inode , size , ret );
450
-
451
- out_release :
452
- bio_release_pages (bio , false);
453
- bio_put (bio );
454
-
455
- if (ret >= 0 ) {
456
- iocb -> ki_pos += size ;
457
- return size ;
458
- }
448
+ static const struct iomap_dio_ops zonefs_zone_append_dio_ops = {
449
+ .submit_io = zonefs_file_zone_append_dio_submit_io ,
450
+ .end_io = zonefs_file_write_dio_end_io ,
451
+ .bio_set = & zonefs_zone_append_bio_set ,
452
+ };
459
453
460
- return ret ;
461
- }
454
+ static const struct iomap_dio_ops zonefs_write_dio_ops = {
455
+ .end_io = zonefs_file_write_dio_end_io ,
456
+ };
462
457
463
458
/*
464
459
* Do not exceed the LFS limits nor the file zone size. If pos is under the
@@ -539,6 +534,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
539
534
struct zonefs_inode_info * zi = ZONEFS_I (inode );
540
535
struct zonefs_zone * z = zonefs_inode_zone (inode );
541
536
struct super_block * sb = inode -> i_sb ;
537
+ const struct iomap_dio_ops * dio_ops ;
542
538
bool sync = is_sync_kiocb (iocb );
543
539
bool append = false;
544
540
ssize_t ret , count ;
@@ -582,20 +578,26 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
582
578
}
583
579
584
580
if (append ) {
585
- ret = zonefs_file_dio_append (iocb , from );
581
+ unsigned int max = bdev_max_zone_append_sectors (sb -> s_bdev );
582
+
583
+ max = ALIGN_DOWN (max << SECTOR_SHIFT , sb -> s_blocksize );
584
+ iov_iter_truncate (from , max );
585
+
586
+ dio_ops = & zonefs_zone_append_dio_ops ;
586
587
} else {
587
- /*
588
- * iomap_dio_rw() may return ENOTBLK if there was an issue with
589
- * page invalidation. Overwrite that error code with EBUSY to
590
- * be consistent with zonefs_file_dio_append() return value for
591
- * similar issues.
592
- */
593
- ret = iomap_dio_rw (iocb , from , & zonefs_write_iomap_ops ,
594
- & zonefs_write_dio_ops , 0 , NULL , 0 );
595
- if (ret == - ENOTBLK )
596
- ret = - EBUSY ;
588
+ dio_ops = & zonefs_write_dio_ops ;
597
589
}
598
590
591
+ /*
592
+ * iomap_dio_rw() may return ENOTBLK if there was an issue with
593
+ * page invalidation. Overwrite that error code with EBUSY so that
594
+ * the user can make sense of the error.
595
+ */
596
+ ret = iomap_dio_rw (iocb , from , & zonefs_write_iomap_ops ,
597
+ dio_ops , 0 , NULL , 0 );
598
+ if (ret == - ENOTBLK )
599
+ ret = - EBUSY ;
600
+
599
601
if (zonefs_zone_is_seq (z ) &&
600
602
(ret > 0 || ret == - EIOCBQUEUED )) {
601
603
if (ret > 0 )
@@ -900,3 +902,15 @@ const struct file_operations zonefs_file_operations = {
900
902
.splice_write = iter_file_splice_write ,
901
903
.iopoll = iocb_bio_iopoll ,
902
904
};
905
+
906
+ int zonefs_file_bioset_init (void )
907
+ {
908
+ return bioset_init (& zonefs_zone_append_bio_set , BIO_POOL_SIZE ,
909
+ offsetof(struct zonefs_zone_append_bio , bio ),
910
+ BIOSET_NEED_BVECS );
911
+ }
912
+
913
+ void zonefs_file_bioset_exit (void )
914
+ {
915
+ bioset_exit (& zonefs_zone_append_bio_set );
916
+ }
0 commit comments