Skip to content

Commit efb0b63

Browse files
committed
Merge tag 'zonefs-6.8-rc5' of git://git.kernel.org/pub/scm/linux/kernel/git/dlemoal/zonefs
Pull zonefs fix from Damien Le Moal: - Fix direct write error handling to avoid a race between failed IO completion and the submission path itself which can result in an invalid file size exposed to the user after the failed IO. * tag 'zonefs-6.8-rc5' of git://git.kernel.org/pub/scm/linux/kernel/git/dlemoal/zonefs: zonefs: Improve error handling
2 parents 0f1dd5e + 14db5f6 commit efb0b63

File tree

2 files changed

+65
-43
lines changed

2 files changed

+65
-43
lines changed

fs/zonefs/file.c

Lines changed: 27 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -348,7 +348,12 @@ static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size,
348348
struct zonefs_inode_info *zi = ZONEFS_I(inode);
349349

350350
if (error) {
351-
zonefs_io_error(inode, true);
351+
/*
352+
* For Sync IOs, error recovery is called from
353+
* zonefs_file_dio_write().
354+
*/
355+
if (!is_sync_kiocb(iocb))
356+
zonefs_io_error(inode, true);
352357
return error;
353358
}
354359

@@ -491,6 +496,14 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
491496
ret = -EINVAL;
492497
goto inode_unlock;
493498
}
499+
/*
500+
* Advance the zone write pointer offset. This assumes that the
501+
* IO will succeed, which is OK to do because we do not allow
502+
* partial writes (IOMAP_DIO_PARTIAL is not set) and if the IO
503+
* fails, the error path will correct the write pointer offset.
504+
*/
505+
z->z_wpoffset += count;
506+
zonefs_inode_account_active(inode);
494507
mutex_unlock(&zi->i_truncate_mutex);
495508
}
496509

@@ -504,20 +517,19 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
504517
if (ret == -ENOTBLK)
505518
ret = -EBUSY;
506519

507-
if (zonefs_zone_is_seq(z) &&
508-
(ret > 0 || ret == -EIOCBQUEUED)) {
509-
if (ret > 0)
510-
count = ret;
511-
512-
/*
513-
* Update the zone write pointer offset assuming the write
514-
* operation succeeded. If it did not, the error recovery path
515-
* will correct it. Also do active seq file accounting.
516-
*/
517-
mutex_lock(&zi->i_truncate_mutex);
518-
z->z_wpoffset += count;
519-
zonefs_inode_account_active(inode);
520-
mutex_unlock(&zi->i_truncate_mutex);
520+
/*
521+
* For a failed IO or partial completion, trigger error recovery
522+
* to update the zone write pointer offset to a correct value.
523+
* For asynchronous IOs, zonefs_file_write_dio_end_io() may already
524+
* have executed error recovery if the IO already completed when we
525+
* reach here. However, we cannot know that and execute error recovery
526+
* again (that will not change anything).
527+
*/
528+
if (zonefs_zone_is_seq(z)) {
529+
if (ret > 0 && ret != count)
530+
ret = -EIO;
531+
if (ret < 0 && ret != -EIOCBQUEUED)
532+
zonefs_io_error(inode, true);
521533
}
522534

523535
inode_unlock:

fs/zonefs/super.c

Lines changed: 38 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -246,16 +246,18 @@ static void zonefs_inode_update_mode(struct inode *inode)
246246
z->z_mode = inode->i_mode;
247247
}
248248

249-
struct zonefs_ioerr_data {
250-
struct inode *inode;
251-
bool write;
252-
};
253-
254249
static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
255250
void *data)
256251
{
257-
struct zonefs_ioerr_data *err = data;
258-
struct inode *inode = err->inode;
252+
struct blk_zone *z = data;
253+
254+
*z = *zone;
255+
return 0;
256+
}
257+
258+
static void zonefs_handle_io_error(struct inode *inode, struct blk_zone *zone,
259+
bool write)
260+
{
259261
struct zonefs_zone *z = zonefs_inode_zone(inode);
260262
struct super_block *sb = inode->i_sb;
261263
struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
@@ -270,8 +272,8 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
270272
data_size = zonefs_check_zone_condition(sb, z, zone);
271273
isize = i_size_read(inode);
272274
if (!(z->z_flags & (ZONEFS_ZONE_READONLY | ZONEFS_ZONE_OFFLINE)) &&
273-
!err->write && isize == data_size)
274-
return 0;
275+
!write && isize == data_size)
276+
return;
275277

276278
/*
277279
* At this point, we detected either a bad zone or an inconsistency
@@ -292,7 +294,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
292294
* In all cases, warn about inode size inconsistency and handle the
293295
* IO error according to the zone condition and to the mount options.
294296
*/
295-
if (zonefs_zone_is_seq(z) && isize != data_size)
297+
if (isize != data_size)
296298
zonefs_warn(sb,
297299
"inode %lu: invalid size %lld (should be %lld)\n",
298300
inode->i_ino, isize, data_size);
@@ -352,8 +354,6 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
352354
zonefs_i_size_write(inode, data_size);
353355
z->z_wpoffset = data_size;
354356
zonefs_inode_account_active(inode);
355-
356-
return 0;
357357
}
358358

359359
/*
@@ -367,23 +367,25 @@ void __zonefs_io_error(struct inode *inode, bool write)
367367
{
368368
struct zonefs_zone *z = zonefs_inode_zone(inode);
369369
struct super_block *sb = inode->i_sb;
370-
struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
371370
unsigned int noio_flag;
372-
unsigned int nr_zones = 1;
373-
struct zonefs_ioerr_data err = {
374-
.inode = inode,
375-
.write = write,
376-
};
371+
struct blk_zone zone;
377372
int ret;
378373

379374
/*
380-
* The only files that have more than one zone are conventional zone
381-
* files with aggregated conventional zones, for which the inode zone
382-
* size is always larger than the device zone size.
375+
* Conventional zone have no write pointer and cannot become read-only
376+
* or offline. So simply fake a report for a single or aggregated zone
377+
* and let zonefs_handle_io_error() correct the zone inode information
378+
* according to the mount options.
383379
*/
384-
if (z->z_size > bdev_zone_sectors(sb->s_bdev))
385-
nr_zones = z->z_size >>
386-
(sbi->s_zone_sectors_shift + SECTOR_SHIFT);
380+
if (!zonefs_zone_is_seq(z)) {
381+
zone.start = z->z_sector;
382+
zone.len = z->z_size >> SECTOR_SHIFT;
383+
zone.wp = zone.start + zone.len;
384+
zone.type = BLK_ZONE_TYPE_CONVENTIONAL;
385+
zone.cond = BLK_ZONE_COND_NOT_WP;
386+
zone.capacity = zone.len;
387+
goto handle_io_error;
388+
}
387389

388390
/*
389391
* Memory allocations in blkdev_report_zones() can trigger a memory
@@ -394,12 +396,20 @@ void __zonefs_io_error(struct inode *inode, bool write)
394396
* the GFP_NOIO context avoids both problems.
395397
*/
396398
noio_flag = memalloc_noio_save();
397-
ret = blkdev_report_zones(sb->s_bdev, z->z_sector, nr_zones,
398-
zonefs_io_error_cb, &err);
399-
if (ret != nr_zones)
399+
ret = blkdev_report_zones(sb->s_bdev, z->z_sector, 1,
400+
zonefs_io_error_cb, &zone);
401+
memalloc_noio_restore(noio_flag);
402+
403+
if (ret != 1) {
400404
zonefs_err(sb, "Get inode %lu zone information failed %d\n",
401405
inode->i_ino, ret);
402-
memalloc_noio_restore(noio_flag);
406+
zonefs_warn(sb, "remounting filesystem read-only\n");
407+
sb->s_flags |= SB_RDONLY;
408+
return;
409+
}
410+
411+
handle_io_error:
412+
zonefs_handle_io_error(inode, &zone, write);
403413
}
404414

405415
static struct kmem_cache *zonefs_inode_cachep;

0 commit comments

Comments
 (0)