Skip to content

Commit 35a0a40

Browse files
zhaohemliu-song-6
authored andcommitted
md-cluster: fix no recovery job when adding/re-adding a disk
The commit db5e653 ("md: delay choosing sync action to md_start_sync()") delays the start of the sync action. In a clustered environment, this will cause another node to first activate the spare disk and skip recovery. As a result, no nodes will perform recovery when a disk is added or re-added. Before db5e653: ``` node1 node2 ---------------------------------------------------------------- md_check_recovery + md_update_sb | sendmsg: METADATA_UPDATED + md_choose_sync_action process_metadata_update | remove_and_add_spares //node1 has not finished adding + call mddev->sync_work //the spare disk:do nothing md_start_sync starts md_do_sync md_do_sync + grabbed resync_lockres:DLM_LOCK_EX + do syncing job md_check_recovery sendmsg: METADATA_UPDATED process_metadata_update //activate spare disk ... ... md_do_sync waiting to grab resync_lockres:EX ``` After db5e653: (note: if 'cmd:idle' sets MD_RECOVERY_INTR after md_check_recovery starts md_start_sync, setting the INTR action will exacerbate the delay in node1 calling the md_do_sync function.) ``` node1 node2 ---------------------------------------------------------------- md_check_recovery + md_update_sb | sendmsg: METADATA_UPDATED + calls mddev->sync_work process_metadata_update //node1 has not finished adding //the spare disk:do nothing md_start_sync + md_choose_sync_action | remove_and_add_spares + calls md_do_sync md_check_recovery md_update_sb sendmsg: METADATA_UPDATED process_metadata_update //activate spare disk ... ... ... ... md_do_sync + grabbed resync_lockres:EX + raid1_sync_request skip sync under conf->fullsync:0 md_do_sync 1. waiting to grab resync_lockres:EX 2. when node1 could grab EX lock, node1 will skip resync under recovery_offset:MaxSector ``` How to trigger: ```(commands @Node1) # to easily watch the recovery status echo 2000 > /proc/sys/dev/raid/speed_limit_max ssh root@node2 "echo 2000 > /proc/sys/dev/raid/speed_limit_max" mdadm -CR /dev/md0 -l1 -b clustered -n 2 /dev/sda /dev/sdb --assume-clean ssh root@node2 mdadm -A /dev/md0 /dev/sda /dev/sdb mdadm --manage /dev/md0 --fail /dev/sda --remove /dev/sda mdadm --manage /dev/md0 --add /dev/sdc === "cat /proc/mdstat" on both node, there are no recovery action. === ``` How to fix: because md layer code logic is hard to restore for speeding up sync job on local node, we add new cluster msg to pending the another node to active disk. Signed-off-by: Heming Zhao <heming.zhao@suse.com> Reviewed-by: Su Yue <glass.su@suse.com> Acked-by: Yu Kuai <yukuai3@huawei.com> Signed-off-by: Song Liu <song@kernel.org> Link: https://lore.kernel.org/r/20240709104120.22243-2-heming.zhao@suse.com
1 parent fff42f2 commit 35a0a40

File tree

3 files changed

+43
-3
lines changed

3 files changed

+43
-3
lines changed

drivers/md/md-cluster.c

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ struct resync_info {
5757
#define MD_CLUSTER_ALREADY_IN_CLUSTER 6
5858
#define MD_CLUSTER_PENDING_RECV_EVENT 7
5959
#define MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD 8
60+
#define MD_CLUSTER_WAITING_FOR_SYNC 9
6061

6162
struct md_cluster_info {
6263
struct mddev *mddev; /* the md device which md_cluster_info belongs to */
@@ -92,6 +93,7 @@ struct md_cluster_info {
9293
sector_t sync_hi;
9394
};
9495

96+
/* For compatibility, add the new msg_type at the end. */
9597
enum msg_type {
9698
METADATA_UPDATED = 0,
9799
RESYNCING,
@@ -101,6 +103,7 @@ enum msg_type {
101103
BITMAP_NEEDS_SYNC,
102104
CHANGE_CAPACITY,
103105
BITMAP_RESIZE,
106+
RESYNCING_START,
104107
};
105108

106109
struct cluster_msg {
@@ -461,6 +464,7 @@ static void process_suspend_info(struct mddev *mddev,
461464
clear_bit(MD_RESYNCING_REMOTE, &mddev->recovery);
462465
remove_suspend_info(mddev, slot);
463466
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
467+
clear_bit(MD_CLUSTER_WAITING_FOR_SYNC, &cinfo->state);
464468
md_wakeup_thread(mddev->thread);
465469
return;
466470
}
@@ -531,6 +535,7 @@ static int process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
531535
res = -1;
532536
}
533537
clear_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state);
538+
set_bit(MD_CLUSTER_WAITING_FOR_SYNC, &cinfo->state);
534539
return res;
535540
}
536541

@@ -599,6 +604,9 @@ static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
599604
case CHANGE_CAPACITY:
600605
set_capacity_and_notify(mddev->gendisk, mddev->array_sectors);
601606
break;
607+
case RESYNCING_START:
608+
clear_bit(MD_CLUSTER_WAITING_FOR_SYNC, &mddev->cluster_info->state);
609+
break;
602610
case RESYNCING:
603611
set_bit(MD_RESYNCING_REMOTE, &mddev->recovery);
604612
process_suspend_info(mddev, le32_to_cpu(msg->slot),
@@ -1345,6 +1353,23 @@ static void resync_info_get(struct mddev *mddev, sector_t *lo, sector_t *hi)
13451353
spin_unlock_irq(&cinfo->suspend_lock);
13461354
}
13471355

1356+
static int resync_status_get(struct mddev *mddev)
1357+
{
1358+
struct md_cluster_info *cinfo = mddev->cluster_info;
1359+
1360+
return test_bit(MD_CLUSTER_WAITING_FOR_SYNC, &cinfo->state);
1361+
}
1362+
1363+
static int resync_start_notify(struct mddev *mddev)
1364+
{
1365+
struct md_cluster_info *cinfo = mddev->cluster_info;
1366+
struct cluster_msg cmsg = {0};
1367+
1368+
cmsg.type = cpu_to_le32(RESYNCING_START);
1369+
1370+
return sendmsg(cinfo, &cmsg, 0);
1371+
}
1372+
13481373
static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi)
13491374
{
13501375
struct md_cluster_info *cinfo = mddev->cluster_info;
@@ -1579,6 +1604,8 @@ static const struct md_cluster_operations cluster_ops = {
15791604
.resync_start = resync_start,
15801605
.resync_finish = resync_finish,
15811606
.resync_info_update = resync_info_update,
1607+
.resync_start_notify = resync_start_notify,
1608+
.resync_status_get = resync_status_get,
15821609
.resync_info_get = resync_info_get,
15831610
.metadata_update_start = metadata_update_start,
15841611
.metadata_update_finish = metadata_update_finish,

drivers/md/md-cluster.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@ struct md_cluster_operations {
1414
int (*leave)(struct mddev *mddev);
1515
int (*slot_number)(struct mddev *mddev);
1616
int (*resync_info_update)(struct mddev *mddev, sector_t lo, sector_t hi);
17+
int (*resync_start_notify)(struct mddev *mddev);
18+
int (*resync_status_get)(struct mddev *mddev);
1719
void (*resync_info_get)(struct mddev *mddev, sector_t *lo, sector_t *hi);
1820
int (*metadata_update_start)(struct mddev *mddev);
1921
int (*metadata_update_finish)(struct mddev *mddev);

drivers/md/md.c

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8978,7 +8978,8 @@ void md_do_sync(struct md_thread *thread)
89788978
* This will mean we have to start checking from the beginning again.
89798979
*
89808980
*/
8981-
8981+
if (mddev_is_clustered(mddev))
8982+
md_cluster_ops->resync_start_notify(mddev);
89828983
do {
89838984
int mddev2_minor = -1;
89848985
mddev->curr_resync = MD_RESYNC_DELAYED;
@@ -9992,8 +9993,18 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
99929993
*/
99939994
if (rdev2->raid_disk == -1 && role != MD_DISK_ROLE_SPARE &&
99949995
!(le32_to_cpu(sb->feature_map) &
9995-
MD_FEATURE_RESHAPE_ACTIVE)) {
9996-
rdev2->saved_raid_disk = role;
9996+
MD_FEATURE_RESHAPE_ACTIVE) &&
9997+
!md_cluster_ops->resync_status_get(mddev)) {
9998+
/*
9999+
* -1 to make raid1_add_disk() set conf->fullsync
10000+
* to 1. This could avoid skipping sync when the
10001+
* remote node is down during resyncing.
10002+
*/
10003+
if ((le32_to_cpu(sb->feature_map)
10004+
& MD_FEATURE_RECOVERY_OFFSET))
10005+
rdev2->saved_raid_disk = -1;
10006+
else
10007+
rdev2->saved_raid_disk = role;
999710008
ret = remove_and_add_spares(mddev, rdev2);
999810009
pr_info("Activated spare: %pg\n",
999910010
rdev2->bdev);

0 commit comments

Comments
 (0)