Skip to content

Commit cd5fc65

Browse files
YuKuai-huaweiliu-song-6
authored andcommitted
md/md-bitmap: move bitmap_{start, end}write to md upper layer
There are two BUG reports that raid5 will hang at bitmap_startwrite([1],[2]), root cause is that bitmap start write and end write is unbalanced, it's not quite clear where, and while reviewing raid5 code, it's found that bitmap operations can be optimized. For example, for a 4 disks raid5, with chunksize=8k, if user issue a IO (0 + 48k) to the array: ┌────────────────────────────────────────────────────────────┐ │chunk 0 │ │ ┌────────────┬─────────────┬─────────────┬────────────┼ │ sh0 │A0: 0 + 4k │A1: 8k + 4k │A2: 16k + 4k │A3: P │ │ ┼────────────┼─────────────┼─────────────┼────────────┼ │ sh1 │B0: 4k + 4k │B1: 12k + 4k │B2: 20k + 4k │B3: P │ ┼──────┴────────────┴─────────────┴─────────────┴────────────┼ │chunk 1 │ │ ┌────────────┬─────────────┬─────────────┬────────────┤ │ sh2 │C0: 24k + 4k│C1: 32k + 4k │C2: P │C3: 40k + 4k│ │ ┼────────────┼─────────────┼─────────────┼────────────┼ │ sh3 │D0: 28k + 4k│D1: 36k + 4k │D2: P │D3: 44k + 4k│ └──────┴────────────┴─────────────┴─────────────┴────────────┘ Before this patch, 4 stripe head will be used, and each sh will attach bio for 3 disks, and each attached bio will trigger bitmap_startwrite() once, which means total 12 times. - 3 times (0 + 4k), for (A0, A1 and A2) - 3 times (4 + 4k), for (B0, B1 and B2) - 3 times (8 + 4k), for (C0, C1 and C3) - 3 times (12 + 4k), for (D0, D1 and D3) After this patch, md upper layer will calculate that IO range (0 + 48k) is corresponding to the bitmap (0 + 16k), and call bitmap_startwrite() just once. Noted that this patch will align bitmap ranges to the chunks, for example, if user issue a IO (0 + 4k) to array: - Before this patch, 1 time (0 + 4k), for A0; - After this patch, 1 time (0 + 8k) for chunk 0; Usually, one bitmap bit will represent more than one disk chunk, and this doesn't have any difference. And even if user really created a array that one chunk contain multiple bits, the overhead is that more data will be recovered after power failure. Also remove STRIPE_BITMAP_PENDING since it's not used anymore. [1] https://lore.kernel.org/all/CAJpMwyjmHQLvm6zg1cmQErttNNQPDAAXPKM3xgTjMhbfts986Q@mail.gmail.com/ [2] https://lore.kernel.org/all/ADF7D720-5764-4AF3-B68E-1845988737AA@flyingcircus.io/ Signed-off-by: Yu Kuai <yukuai3@huawei.com> Link: https://lore.kernel.org/r/20250109015145.158868-6-yukuai1@huaweicloud.com Signed-off-by: Song Liu <song@kernel.org>
1 parent 9c89f60 commit cd5fc65

File tree

7 files changed

+37
-56
lines changed

7 files changed

+37
-56
lines changed

drivers/md/md.c

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8745,12 +8745,32 @@ void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
87458745
}
87468746
EXPORT_SYMBOL_GPL(md_submit_discard_bio);
87478747

8748+
static void md_bitmap_start(struct mddev *mddev,
8749+
struct md_io_clone *md_io_clone)
8750+
{
8751+
if (mddev->pers->bitmap_sector)
8752+
mddev->pers->bitmap_sector(mddev, &md_io_clone->offset,
8753+
&md_io_clone->sectors);
8754+
8755+
mddev->bitmap_ops->startwrite(mddev, md_io_clone->offset,
8756+
md_io_clone->sectors);
8757+
}
8758+
8759+
static void md_bitmap_end(struct mddev *mddev, struct md_io_clone *md_io_clone)
8760+
{
8761+
mddev->bitmap_ops->endwrite(mddev, md_io_clone->offset,
8762+
md_io_clone->sectors);
8763+
}
8764+
87488765
static void md_end_clone_io(struct bio *bio)
87498766
{
87508767
struct md_io_clone *md_io_clone = bio->bi_private;
87518768
struct bio *orig_bio = md_io_clone->orig_bio;
87528769
struct mddev *mddev = md_io_clone->mddev;
87538770

8771+
if (bio_data_dir(orig_bio) == WRITE && mddev->bitmap)
8772+
md_bitmap_end(mddev, md_io_clone);
8773+
87548774
if (bio->bi_status && !orig_bio->bi_status)
87558775
orig_bio->bi_status = bio->bi_status;
87568776

@@ -8775,6 +8795,12 @@ static void md_clone_bio(struct mddev *mddev, struct bio **bio)
87758795
if (blk_queue_io_stat(bdev->bd_disk->queue))
87768796
md_io_clone->start_time = bio_start_io_acct(*bio);
87778797

8798+
if (bio_data_dir(*bio) == WRITE && mddev->bitmap) {
8799+
md_io_clone->offset = (*bio)->bi_iter.bi_sector;
8800+
md_io_clone->sectors = bio_sectors(*bio);
8801+
md_bitmap_start(mddev, md_io_clone);
8802+
}
8803+
87788804
clone->bi_end_io = md_end_clone_io;
87798805
clone->bi_private = md_io_clone;
87808806
*bio = clone;
@@ -8793,6 +8819,9 @@ void md_free_cloned_bio(struct bio *bio)
87938819
struct bio *orig_bio = md_io_clone->orig_bio;
87948820
struct mddev *mddev = md_io_clone->mddev;
87958821

8822+
if (bio_data_dir(orig_bio) == WRITE && mddev->bitmap)
8823+
md_bitmap_end(mddev, md_io_clone);
8824+
87968825
if (bio->bi_status && !orig_bio->bi_status)
87978826
orig_bio->bi_status = bio->bi_status;
87988827

drivers/md/md.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -831,6 +831,8 @@ struct md_io_clone {
831831
struct mddev *mddev;
832832
struct bio *orig_bio;
833833
unsigned long start_time;
834+
sector_t offset;
835+
unsigned long sectors;
834836
struct bio bio_clone;
835837
};
836838

drivers/md/raid1.c

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -422,8 +422,6 @@ static void close_write(struct r1bio *r1_bio)
422422

423423
if (test_bit(R1BIO_BehindIO, &r1_bio->state))
424424
mddev->bitmap_ops->end_behind_write(mddev);
425-
/* clear the bitmap if all writes complete successfully */
426-
mddev->bitmap_ops->endwrite(mddev, r1_bio->sector, r1_bio->sectors);
427425
md_write_end(mddev);
428426
}
429427

@@ -1632,8 +1630,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
16321630

16331631
if (test_bit(R1BIO_BehindIO, &r1_bio->state))
16341632
mddev->bitmap_ops->start_behind_write(mddev);
1635-
mddev->bitmap_ops->startwrite(mddev, r1_bio->sector,
1636-
r1_bio->sectors);
16371633
first_clone = 0;
16381634
}
16391635

drivers/md/raid10.c

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -428,8 +428,6 @@ static void close_write(struct r10bio *r10_bio)
428428
{
429429
struct mddev *mddev = r10_bio->mddev;
430430

431-
/* clear the bitmap if all writes complete successfully */
432-
mddev->bitmap_ops->endwrite(mddev, r10_bio->sector, r10_bio->sectors);
433431
md_write_end(mddev);
434432
}
435433

@@ -1506,7 +1504,6 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
15061504
md_account_bio(mddev, &bio);
15071505
r10_bio->master_bio = bio;
15081506
atomic_set(&r10_bio->remaining, 1);
1509-
mddev->bitmap_ops->startwrite(mddev, r10_bio->sector, r10_bio->sectors);
15101507

15111508
for (i = 0; i < conf->copies; i++) {
15121509
if (r10_bio->devs[i].bio)

drivers/md/raid5-cache.c

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -313,8 +313,6 @@ void r5c_handle_cached_data_endio(struct r5conf *conf,
313313
if (sh->dev[i].written) {
314314
set_bit(R5_UPTODATE, &sh->dev[i].flags);
315315
r5c_return_dev_pending_writes(conf, &sh->dev[i]);
316-
conf->mddev->bitmap_ops->endwrite(conf->mddev,
317-
sh->sector, RAID5_STRIPE_SECTORS(conf));
318316
}
319317
}
320318
}

drivers/md/raid5.c

Lines changed: 6 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -906,8 +906,7 @@ static bool stripe_can_batch(struct stripe_head *sh)
906906
if (raid5_has_log(conf) || raid5_has_ppl(conf))
907907
return false;
908908
return test_bit(STRIPE_BATCH_READY, &sh->state) &&
909-
!test_bit(STRIPE_BITMAP_PENDING, &sh->state) &&
910-
is_full_stripe_write(sh);
909+
is_full_stripe_write(sh);
911910
}
912911

913912
/* we only do back search */
@@ -3545,29 +3544,9 @@ static void __add_stripe_bio(struct stripe_head *sh, struct bio *bi,
35453544
(*bip)->bi_iter.bi_sector, sh->sector, dd_idx,
35463545
sh->dev[dd_idx].sector);
35473546

3548-
if (conf->mddev->bitmap && firstwrite) {
3549-
/* Cannot hold spinlock over bitmap_startwrite,
3550-
* but must ensure this isn't added to a batch until
3551-
* we have added to the bitmap and set bm_seq.
3552-
* So set STRIPE_BITMAP_PENDING to prevent
3553-
* batching.
3554-
* If multiple __add_stripe_bio() calls race here they
3555-
* much all set STRIPE_BITMAP_PENDING. So only the first one
3556-
* to complete "bitmap_startwrite" gets to set
3557-
* STRIPE_BIT_DELAY. This is important as once a stripe
3558-
* is added to a batch, STRIPE_BIT_DELAY cannot be changed
3559-
* any more.
3560-
*/
3561-
set_bit(STRIPE_BITMAP_PENDING, &sh->state);
3562-
spin_unlock_irq(&sh->stripe_lock);
3563-
conf->mddev->bitmap_ops->startwrite(conf->mddev, sh->sector,
3564-
RAID5_STRIPE_SECTORS(conf));
3565-
spin_lock_irq(&sh->stripe_lock);
3566-
clear_bit(STRIPE_BITMAP_PENDING, &sh->state);
3567-
if (!sh->batch_head) {
3568-
sh->bm_seq = conf->seq_flush+1;
3569-
set_bit(STRIPE_BIT_DELAY, &sh->state);
3570-
}
3547+
if (conf->mddev->bitmap && firstwrite && !sh->batch_head) {
3548+
sh->bm_seq = conf->seq_flush+1;
3549+
set_bit(STRIPE_BIT_DELAY, &sh->state);
35713550
}
35723551
}
35733552

@@ -3618,7 +3597,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
36183597
BUG_ON(sh->batch_head);
36193598
for (i = disks; i--; ) {
36203599
struct bio *bi;
3621-
int bitmap_end = 0;
36223600

36233601
if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
36243602
struct md_rdev *rdev = conf->disks[i].rdev;
@@ -3643,8 +3621,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
36433621
sh->dev[i].towrite = NULL;
36443622
sh->overwrite_disks = 0;
36453623
spin_unlock_irq(&sh->stripe_lock);
3646-
if (bi)
3647-
bitmap_end = 1;
36483624

36493625
log_stripe_write_finished(sh);
36503626

@@ -3659,10 +3635,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
36593635
bio_io_error(bi);
36603636
bi = nextbi;
36613637
}
3662-
if (bitmap_end)
3663-
conf->mddev->bitmap_ops->endwrite(conf->mddev,
3664-
sh->sector, RAID5_STRIPE_SECTORS(conf));
3665-
bitmap_end = 0;
36663638
/* and fail all 'written' */
36673639
bi = sh->dev[i].written;
36683640
sh->dev[i].written = NULL;
@@ -3671,7 +3643,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
36713643
sh->dev[i].page = sh->dev[i].orig_page;
36723644
}
36733645

3674-
if (bi) bitmap_end = 1;
36753646
while (bi && bi->bi_iter.bi_sector <
36763647
sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
36773648
struct bio *bi2 = r5_next_bio(conf, bi, sh->dev[i].sector);
@@ -3705,9 +3676,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
37053676
bi = nextbi;
37063677
}
37073678
}
3708-
if (bitmap_end)
3709-
conf->mddev->bitmap_ops->endwrite(conf->mddev,
3710-
sh->sector, RAID5_STRIPE_SECTORS(conf));
37113679
/* If we were in the middle of a write the parity block might
37123680
* still be locked - so just clear all R5_LOCKED flags
37133681
*/
@@ -4056,8 +4024,7 @@ static void handle_stripe_clean_event(struct r5conf *conf,
40564024
bio_endio(wbi);
40574025
wbi = wbi2;
40584026
}
4059-
conf->mddev->bitmap_ops->endwrite(conf->mddev,
4060-
sh->sector, RAID5_STRIPE_SECTORS(conf));
4027+
40614028
if (head_sh->batch_head) {
40624029
sh = list_first_entry(&sh->batch_list,
40634030
struct stripe_head,
@@ -4882,8 +4849,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
48824849
(1 << STRIPE_COMPUTE_RUN) |
48834850
(1 << STRIPE_DISCARD) |
48844851
(1 << STRIPE_BATCH_READY) |
4885-
(1 << STRIPE_BATCH_ERR) |
4886-
(1 << STRIPE_BITMAP_PENDING)),
4852+
(1 << STRIPE_BATCH_ERR)),
48874853
"stripe state: %lx\n", sh->state);
48884854
WARN_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) |
48894855
(1 << STRIPE_REPLACED)),
@@ -5774,10 +5740,6 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
57745740
}
57755741
spin_unlock_irq(&sh->stripe_lock);
57765742
if (conf->mddev->bitmap) {
5777-
for (d = 0; d < conf->raid_disks - conf->max_degraded;
5778-
d++)
5779-
mddev->bitmap_ops->startwrite(mddev, sh->sector,
5780-
RAID5_STRIPE_SECTORS(conf));
57815743
sh->bm_seq = conf->seq_flush + 1;
57825744
set_bit(STRIPE_BIT_DELAY, &sh->state);
57835745
}

drivers/md/raid5.h

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -371,9 +371,6 @@ enum {
371371
STRIPE_ON_RELEASE_LIST,
372372
STRIPE_BATCH_READY,
373373
STRIPE_BATCH_ERR,
374-
STRIPE_BITMAP_PENDING, /* Being added to bitmap, don't add
375-
* to batch yet.
376-
*/
377374
STRIPE_LOG_TRAPPED, /* trapped into log (see raid5-cache.c)
378375
* this bit is used in two scenarios:
379376
*

0 commit comments

Comments
 (0)