Skip to content

Commit b391133

Browse files
YuKuai-huaweiliu-song-6
authored andcommitted
md: split MD_RECOVERY_NEEDED out of mddev_resume
New mddev_resume() calls are added to synchronize IO with array reconfiguration, however, this introduces a performance regression while adding it in md_start_sync(): 1) someone sets MD_RECOVERY_NEEDED first; 2) daemon thread grabs reconfig_mutex, then clears MD_RECOVERY_NEEDED and queues a new sync work; 3) daemon thread releases reconfig_mutex; 4) in md_start_sync a) check that there are spares that can be added/removed, then suspend the array; b) remove_and_add_spares may not be called, or called without really add/remove spares; c) resume the array, then set MD_RECOVERY_NEEDED again! Loop between 2 - 4, then mddev_suspend() will be called quite often, for consequence, normal IO will be quite slow. Fix this problem by don't set MD_RECOVERY_NEEDED again in md_start_sync(), hence the loop will be broken. Fixes: bc08041 ("md: suspend array in md_start_sync() if array need reconfiguration") Suggested-by: Song Liu <song@kernel.org> Reported-by: Janpieter Sollie <janpieter.sollie@edpnet.be> Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218200 Signed-off-by: Yu Kuai <yukuai3@huawei.com> Signed-off-by: Song Liu <song@kernel.org> Link: https://lore.kernel.org/r/20231207020724.2797445-1-yukuai1@huaweicloud.com
1 parent f52f5c7 commit b391133

File tree

1 file changed

+26
-4
lines changed

1 file changed

+26
-4
lines changed

drivers/md/md.c

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -490,7 +490,7 @@ int mddev_suspend(struct mddev *mddev, bool interruptible)
490490
}
491491
EXPORT_SYMBOL_GPL(mddev_suspend);
492492

493-
void mddev_resume(struct mddev *mddev)
493+
static void __mddev_resume(struct mddev *mddev, bool recovery_needed)
494494
{
495495
lockdep_assert_not_held(&mddev->reconfig_mutex);
496496

@@ -507,12 +507,18 @@ void mddev_resume(struct mddev *mddev)
507507
percpu_ref_resurrect(&mddev->active_io);
508508
wake_up(&mddev->sb_wait);
509509

510-
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
510+
if (recovery_needed)
511+
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
511512
md_wakeup_thread(mddev->thread);
512513
md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
513514

514515
mutex_unlock(&mddev->suspend_mutex);
515516
}
517+
518+
void mddev_resume(struct mddev *mddev)
519+
{
520+
return __mddev_resume(mddev, true);
521+
}
516522
EXPORT_SYMBOL_GPL(mddev_resume);
517523

518524
/*
@@ -9389,7 +9395,15 @@ static void md_start_sync(struct work_struct *ws)
93899395
goto not_running;
93909396
}
93919397

9392-
suspend ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev);
9398+
mddev_unlock(mddev);
9399+
/*
9400+
* md_start_sync was triggered by MD_RECOVERY_NEEDED, so we should
9401+
* not set it again. Otherwise, we may cause issue like this one:
9402+
* https://bugzilla.kernel.org/show_bug.cgi?id=218200
9403+
* Therefore, use __mddev_resume(mddev, false).
9404+
*/
9405+
if (suspend)
9406+
__mddev_resume(mddev, false);
93939407
md_wakeup_thread(mddev->sync_thread);
93949408
sysfs_notify_dirent_safe(mddev->sysfs_action);
93959409
md_new_event();
@@ -9401,7 +9415,15 @@ static void md_start_sync(struct work_struct *ws)
94019415
clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
94029416
clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
94039417
clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
9404-
suspend ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev);
9418+
mddev_unlock(mddev);
9419+
/*
9420+
* md_start_sync was triggered by MD_RECOVERY_NEEDED, so we should
9421+
* not set it again. Otherwise, we may cause issue like this one:
9422+
* https://bugzilla.kernel.org/show_bug.cgi?id=218200
9423+
* Therefore, use __mddev_resume(mddev, false).
9424+
*/
9425+
if (suspend)
9426+
__mddev_resume(mddev, false);
94059427

94069428
wake_up(&resync_wait);
94079429
if (test_and_clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery) &&

0 commit comments

Comments
 (0)