Skip to content

Commit f52f5c7

Browse files
YuKuai-huawei and liu-song-6
authored and committed
md: fix stopping sync thread
Currently the sync thread is stopped from multiple contexts: - idle_sync_thread - frozen_sync_thread - __md_stop_writes - md_set_readonly - do_md_stop And there are some problems: 1) sync_work is flushed while reconfig_mutex is held; this can deadlock because the work function will grab reconfig_mutex as well. 2) md_reap_sync_thread() can't be called directly while md_do_sync() is not finished yet; for example, see commit 130443d ("md: refactor idle/frozen_sync_thread() to fix deadlock"). 3) If MD_RECOVERY_RUNNING is not set, there is no need to stop sync_thread at all because sync_thread must not be registered. Factor out a helper stop_sync_thread(), so that the above contexts will behave the same. Fix 1) by flushing sync_work after reconfig_mutex is released, before waiting for sync_thread to be done; Fix 2) by letting the daemon thread unregister sync_thread; Fix 3) by always checking MD_RECOVERY_RUNNING first. Fixes: db5e653 ("md: delay choosing sync action to md_start_sync()") Signed-off-by: Yu Kuai <yukuai3@huawei.com> Signed-off-by: Song Liu <song@kernel.org> Link: https://lore.kernel.org/r/20231205094215.1824240-4-yukuai1@huaweicloud.com
1 parent c9f7cb5 commit f52f5c7

File tree

1 file changed

+37
-53
lines changed

1 file changed

+37
-53
lines changed

drivers/md/md.c

Lines changed: 37 additions & 53 deletions
Original file line number | Diff line number | Diff line change
@@ -4840,59 +4840,72 @@ action_show(struct mddev *mddev, char *page)
48404840
return sprintf(page, "%s\n", type);
48414841
}
48424842

4843-
static void stop_sync_thread(struct mddev *mddev)
4843+
/**
4844+
* stop_sync_thread() - wait for sync_thread to stop if it's running.
4845+
* @mddev: the array.
4846+
* @locked: if set, reconfig_mutex will still be held after this function
4847+
* returns; if not set, reconfig_mutex will be released after this
4848+
* function returns.
4849+
* @check_seq: if set, only wait for the currently running sync_thread to stop; note
4850+
* that new sync_thread can still start.
4851+
*/
4852+
static void stop_sync_thread(struct mddev *mddev, bool locked, bool check_seq)
48444853
{
4845-
if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4846-
return;
4854+
int sync_seq;
48474855

4848-
if (mddev_lock(mddev))
4849-
return;
4856+
if (check_seq)
4857+
sync_seq = atomic_read(&mddev->sync_seq);
48504858

4851-
/*
4852-
* Check again in case MD_RECOVERY_RUNNING is cleared before lock is
4853-
* held.
4854-
*/
48554859
if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
4856-
mddev_unlock(mddev);
4860+
if (!locked)
4861+
mddev_unlock(mddev);
48574862
return;
48584863
}
48594864

4860-
if (work_pending(&mddev->sync_work))
4861-
flush_workqueue(md_misc_wq);
4865+
mddev_unlock(mddev);
48624866

48634867
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
48644868
/*
48654869
* Thread might be blocked waiting for metadata update which will now
48664870
* never happen
48674871
*/
48684872
md_wakeup_thread_directly(mddev->sync_thread);
4873+
if (work_pending(&mddev->sync_work))
4874+
flush_work(&mddev->sync_work);
48694875

4870-
mddev_unlock(mddev);
4876+
wait_event(resync_wait,
4877+
!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
4878+
(check_seq && sync_seq != atomic_read(&mddev->sync_seq)));
4879+
4880+
if (locked)
4881+
mddev_lock_nointr(mddev);
48714882
}
48724883

48734884
static void idle_sync_thread(struct mddev *mddev)
48744885
{
4875-
int sync_seq = atomic_read(&mddev->sync_seq);
4876-
48774886
mutex_lock(&mddev->sync_mutex);
48784887
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4879-
stop_sync_thread(mddev);
48804888

4881-
wait_event(resync_wait, sync_seq != atomic_read(&mddev->sync_seq) ||
4882-
!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery));
4889+
if (mddev_lock(mddev)) {
4890+
mutex_unlock(&mddev->sync_mutex);
4891+
return;
4892+
}
48834893

4894+
stop_sync_thread(mddev, false, true);
48844895
mutex_unlock(&mddev->sync_mutex);
48854896
}
48864897

48874898
static void frozen_sync_thread(struct mddev *mddev)
48884899
{
48894900
mutex_lock(&mddev->sync_mutex);
48904901
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4891-
stop_sync_thread(mddev);
48924902

4893-
wait_event(resync_wait, mddev->sync_thread == NULL &&
4894-
!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery));
4903+
if (mddev_lock(mddev)) {
4904+
mutex_unlock(&mddev->sync_mutex);
4905+
return;
4906+
}
48954907

4908+
stop_sync_thread(mddev, false, false);
48964909
mutex_unlock(&mddev->sync_mutex);
48974910
}
48984911

@@ -6264,14 +6277,7 @@ static void md_clean(struct mddev *mddev)
62646277

62656278
static void __md_stop_writes(struct mddev *mddev)
62666279
{
6267-
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6268-
if (work_pending(&mddev->sync_work))
6269-
flush_workqueue(md_misc_wq);
6270-
if (mddev->sync_thread) {
6271-
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
6272-
md_reap_sync_thread(mddev);
6273-
}
6274-
6280+
stop_sync_thread(mddev, true, false);
62756281
del_timer_sync(&mddev->safemode_timer);
62766282

62776283
if (mddev->pers && mddev->pers->quiesce) {
@@ -6363,18 +6369,8 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
63636369
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
63646370
md_wakeup_thread(mddev->thread);
63656371
}
6366-
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
6367-
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
6368-
6369-
/*
6370-
* Thread might be blocked waiting for metadata update which will now
6371-
* never happen
6372-
*/
6373-
md_wakeup_thread_directly(mddev->sync_thread);
63746372

6375-
mddev_unlock(mddev);
6376-
wait_event(resync_wait, !test_bit(MD_RECOVERY_RUNNING,
6377-
&mddev->recovery));
6373+
stop_sync_thread(mddev, false, false);
63786374
wait_event(mddev->sb_wait,
63796375
!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
63806376
mddev_lock_nointr(mddev);
@@ -6428,20 +6424,8 @@ static int do_md_stop(struct mddev *mddev, int mode,
64286424
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
64296425
md_wakeup_thread(mddev->thread);
64306426
}
6431-
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
6432-
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
64336427

6434-
/*
6435-
* Thread might be blocked waiting for metadata update which will now
6436-
* never happen
6437-
*/
6438-
md_wakeup_thread_directly(mddev->sync_thread);
6439-
6440-
mddev_unlock(mddev);
6441-
wait_event(resync_wait, (mddev->sync_thread == NULL &&
6442-
!test_bit(MD_RECOVERY_RUNNING,
6443-
&mddev->recovery)));
6444-
mddev_lock_nointr(mddev);
6428+
stop_sync_thread(mddev, true, false);
64456429

64466430
mutex_lock(&mddev->open_mutex);
64476431
if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||

0 commit comments

Comments
 (0)