Skip to content

Commit a6aa36e

Browse files
damien-lemoal authored and axboe committed
block: Remove zone write plugs when handling native zone append writes
For devices that natively support zone append operations, REQ_OP_ZONE_APPEND BIOs are not processed through zone write plugging and are immediately issued to the zoned device. This means that there is no write pointer offset tracking done for these operations and that a zone write plug is not necessary. However, when receiving a zone append BIO, we may already have a zone write plug for the target zone if that zone was previously partially written using regular write operations. In such case, since the write pointer offset of the zone write plug is not incremented by the amount of sectors appended to the zone, 2 issues arise: 1) we risk leaving the plug in the disk hash table if the zone is fully written using zone append or regular write operations, because the write pointer offset will never reach the "zone full" state. 2) Regular write operations that are issued after zone append operations will always be failed by blk_zone_wplug_prepare_bio() as the write pointer alignment check will fail, even if the user correctly accounted for the zone append operations and issued the regular writes with a correct sector. Avoid these issues by immediately removing the zone write plug of zones that are the target of zone append operations when blk_zone_plug_bio() is called. The new function blk_zone_wplug_handle_native_zone_append() implements this for devices that natively support zone append. The removal of the zone write plug using disk_remove_zone_wplug() requires aborting all plugged regular write using disk_zone_wplug_abort() as otherwise the plugged write BIOs would never be executed (with the plug removed, the completion path will never see again the zone write plug as disk_get_zone_wplug() will return NULL). Rate-limited warnings are added to blk_zone_wplug_handle_native_zone_append() and to disk_zone_wplug_abort() to signal this. 
Since blk_zone_wplug_handle_native_zone_append() is called in the hot path for operations that will not be plugged, disk_get_zone_wplug() is optimized under the assumption that a user issuing zone append operations is not at the same time issuing regular writes and that there are no hashed zone write plugs. The struct gendisk atomic counter nr_zone_wplugs is added to check this, with this counter incremented in disk_insert_zone_wplug() and decremented in disk_remove_zone_wplug(). To be consistent with this fix, we do not need to fill the zone write plug hash table with zone write plugs for zones that are partially written for a device that supports native zone append operations. So modify blk_revalidate_seq_zone() to return early to avoid allocating and inserting a zone write plug for partially written sequential zones if the device natively supports zone append. Reported-by: Jorgen Hansen <Jorgen.Hansen@wdc.com> Fixes: 9b1ce7f ("block: Implement zone append emulation") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal <dlemoal@kernel.org> Tested-by: Jorgen Hansen <Jorgen.Hansen@wdc.com> Link: https://lore.kernel.org/r/20250214041434.82564-1-dlemoal@kernel.org Signed-off-by: Jens Axboe <axboe@kernel.dk>
1 parent 889c570 commit a6aa36e

File tree

2 files changed

+73
-10
lines changed

2 files changed

+73
-10
lines changed

block/blk-zoned.c

Lines changed: 69 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -410,13 +410,14 @@ static bool disk_insert_zone_wplug(struct gendisk *disk,
410410
}
411411
}
412412
hlist_add_head_rcu(&zwplug->node, &disk->zone_wplugs_hash[idx]);
413+
atomic_inc(&disk->nr_zone_wplugs);
413414
spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
414415

415416
return true;
416417
}
417418

418-
static struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk,
419-
sector_t sector)
419+
static struct blk_zone_wplug *disk_get_hashed_zone_wplug(struct gendisk *disk,
420+
sector_t sector)
420421
{
421422
unsigned int zno = disk_zone_no(disk, sector);
422423
unsigned int idx = hash_32(zno, disk->zone_wplugs_hash_bits);
@@ -437,6 +438,15 @@ static struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk,
437438
return NULL;
438439
}
439440

441+
static inline struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk,
442+
sector_t sector)
443+
{
444+
if (!atomic_read(&disk->nr_zone_wplugs))
445+
return NULL;
446+
447+
return disk_get_hashed_zone_wplug(disk, sector);
448+
}
449+
440450
static void disk_free_zone_wplug_rcu(struct rcu_head *rcu_head)
441451
{
442452
struct blk_zone_wplug *zwplug =
@@ -503,6 +513,7 @@ static void disk_remove_zone_wplug(struct gendisk *disk,
503513
zwplug->flags |= BLK_ZONE_WPLUG_UNHASHED;
504514
spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
505515
hlist_del_init_rcu(&zwplug->node);
516+
atomic_dec(&disk->nr_zone_wplugs);
506517
spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
507518
disk_put_zone_wplug(zwplug);
508519
}
@@ -593,6 +604,11 @@ static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug)
593604
{
594605
struct bio *bio;
595606

607+
if (bio_list_empty(&zwplug->bio_list))
608+
return;
609+
610+
pr_warn_ratelimited("%s: zone %u: Aborting plugged BIOs\n",
611+
zwplug->disk->disk_name, zwplug->zone_no);
596612
while ((bio = bio_list_pop(&zwplug->bio_list)))
597613
blk_zone_wplug_bio_io_error(zwplug, bio);
598614
}
@@ -1040,6 +1056,47 @@ static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs)
10401056
return true;
10411057
}
10421058

1059+
static void blk_zone_wplug_handle_native_zone_append(struct bio *bio)
1060+
{
1061+
struct gendisk *disk = bio->bi_bdev->bd_disk;
1062+
struct blk_zone_wplug *zwplug;
1063+
unsigned long flags;
1064+
1065+
/*
1066+
* We have native support for zone append operations, so we are not
1067+
* going to handle @bio through plugging. However, we may already have a
1068+
* zone write plug for the target zone if that zone was previously
1069+
* partially written using regular writes. In such case, we risk leaving
1070+
* the plug in the disk hash table if the zone is fully written using
1071+
* zone append operations. Avoid this by removing the zone write plug.
1072+
*/
1073+
zwplug = disk_get_zone_wplug(disk, bio->bi_iter.bi_sector);
1074+
if (likely(!zwplug))
1075+
return;
1076+
1077+
spin_lock_irqsave(&zwplug->lock, flags);
1078+
1079+
/*
1080+
* We are about to remove the zone write plug. But if the user
1081+
* (mistakenly) has issued regular writes together with native zone
1082+
* append, we must aborts the writes as otherwise the plugged BIOs would
1083+
* not be executed by the plug BIO work as disk_get_zone_wplug() will
1084+
* return NULL after the plug is removed. Aborting the plugged write
1085+
* BIOs is consistent with the fact that these writes will most likely
1086+
* fail anyway as there is no ordering guarantees between zone append
1087+
* operations and regular write operations.
1088+
*/
1089+
if (!bio_list_empty(&zwplug->bio_list)) {
1090+
pr_warn_ratelimited("%s: zone %u: Invalid mix of zone append and regular writes\n",
1091+
disk->disk_name, zwplug->zone_no);
1092+
disk_zone_wplug_abort(zwplug);
1093+
}
1094+
disk_remove_zone_wplug(disk, zwplug);
1095+
spin_unlock_irqrestore(&zwplug->lock, flags);
1096+
1097+
disk_put_zone_wplug(zwplug);
1098+
}
1099+
10431100
/**
10441101
* blk_zone_plug_bio - Handle a zone write BIO with zone write plugging
10451102
* @bio: The BIO being submitted
@@ -1096,8 +1153,10 @@ bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs)
10961153
*/
10971154
switch (bio_op(bio)) {
10981155
case REQ_OP_ZONE_APPEND:
1099-
if (!bdev_emulates_zone_append(bdev))
1156+
if (!bdev_emulates_zone_append(bdev)) {
1157+
blk_zone_wplug_handle_native_zone_append(bio);
11001158
return false;
1159+
}
11011160
fallthrough;
11021161
case REQ_OP_WRITE:
11031162
case REQ_OP_WRITE_ZEROES:
@@ -1284,6 +1343,7 @@ static int disk_alloc_zone_resources(struct gendisk *disk,
12841343
{
12851344
unsigned int i;
12861345

1346+
atomic_set(&disk->nr_zone_wplugs, 0);
12871347
disk->zone_wplugs_hash_bits =
12881348
min(ilog2(pool_size) + 1, BLK_ZONE_WPLUG_MAX_HASH_BITS);
12891349

@@ -1338,6 +1398,7 @@ static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk)
13381398
}
13391399
}
13401400

1401+
WARN_ON_ONCE(atomic_read(&disk->nr_zone_wplugs));
13411402
kfree(disk->zone_wplugs_hash);
13421403
disk->zone_wplugs_hash = NULL;
13431404
disk->zone_wplugs_hash_bits = 0;
@@ -1550,11 +1611,12 @@ static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx,
15501611
}
15511612

15521613
/*
1553-
* We need to track the write pointer of all zones that are not
1554-
* empty nor full. So make sure we have a zone write plug for
1555-
* such zone if the device has a zone write plug hash table.
1614+
* If the device needs zone append emulation, we need to track the
1615+
* write pointer of all zones that are not empty nor full. So make sure
1616+
* we have a zone write plug for such zone if the device has a zone
1617+
* write plug hash table.
15561618
*/
1557-
if (!disk->zone_wplugs_hash)
1619+
if (!queue_emulates_zone_append(disk->queue) || !disk->zone_wplugs_hash)
15581620
return 0;
15591621

15601622
disk_zone_wplug_sync_wp_offset(disk, zone);

include/linux/blkdev.h

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -196,10 +196,11 @@ struct gendisk {
196196
unsigned int zone_capacity;
197197
unsigned int last_zone_capacity;
198198
unsigned long __rcu *conv_zones_bitmap;
199-
unsigned int zone_wplugs_hash_bits;
200-
spinlock_t zone_wplugs_lock;
199+
unsigned int zone_wplugs_hash_bits;
200+
atomic_t nr_zone_wplugs;
201+
spinlock_t zone_wplugs_lock;
201202
struct mempool_s *zone_wplugs_pool;
202-
struct hlist_head *zone_wplugs_hash;
203+
struct hlist_head *zone_wplugs_hash;
203204
struct workqueue_struct *zone_wplugs_wq;
204205
#endif /* CONFIG_BLK_DEV_ZONED */
205206

0 commit comments

Comments
 (0)