Skip to content

Commit fff42f2

Browse files
zhaohemliu-song-6
authored and committed
md-cluster: fix hanging issue while a new disk adding
Commit 1bbe254 ("md-cluster: check for timeout while a new disk adding") is syntactically correct but does not suit the real clustered code logic. When a timeout occurs while adding a new disk, if recv_daemon() bypasses the unlock for ack_lockres:CR, another node will be left waiting to grab the EX lock. This will cause the cluster to hang indefinitely. How to fix: 1. In dlm_lock_sync(), change the wait behaviour from waiting forever to waiting with a timeout. This avoids the hang when another node fails to handle a cluster msg. Another result of this change is that if another node receives an unknown msg (e.g. a new msg_type), the old code will hang, whereas the new code will time out and fail. This could help cluster_md handle new msg_type values from different nodes with different kernel/module versions (e.g. the user only updates one leg's kernel and monitors the stability of the new kernel). 2. The old code for __sendmsg() always returns 0 (success) under the design (must successfully unlock ->message_lockres). This commit makes this function return an error number when an error occurs. Fixes: 1bbe254 ("md-cluster: check for timeout while a new disk adding") Signed-off-by: Heming Zhao <heming.zhao@suse.com> Reviewed-by: Su Yue <glass.su@suse.com> Acked-by: Yu Kuai <yukuai3@huawei.com> Signed-off-by: Song Liu <song@kernel.org> Link: https://lore.kernel.org/r/20240709104120.22243-1-heming.zhao@suse.com
1 parent 3c1743a commit fff42f2

File tree

1 file changed

+12
-10
lines changed

1 file changed

+12
-10
lines changed

drivers/md/md-cluster.c

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515

1616
#define LVB_SIZE 64
1717
#define NEW_DEV_TIMEOUT 5000
18+
#define WAIT_DLM_LOCK_TIMEOUT (30 * HZ)
1819

1920
struct dlm_lock_resource {
2021
dlm_lockspace_t *ls;
@@ -130,8 +131,13 @@ static int dlm_lock_sync(struct dlm_lock_resource *res, int mode)
130131
0, sync_ast, res, res->bast);
131132
if (ret)
132133
return ret;
133-
wait_event(res->sync_locking, res->sync_locking_done);
134+
ret = wait_event_timeout(res->sync_locking, res->sync_locking_done,
135+
WAIT_DLM_LOCK_TIMEOUT);
134136
res->sync_locking_done = false;
137+
if (!ret) {
138+
pr_err("locking DLM '%s' timeout!\n", res->name);
139+
return -EBUSY;
140+
}
135141
if (res->lksb.sb_status == 0)
136142
res->mode = mode;
137143
return res->lksb.sb_status;
@@ -743,15 +749,15 @@ static void unlock_comm(struct md_cluster_info *cinfo)
743749
*/
744750
static int __sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg)
745751
{
746-
int error;
752+
int error, unlock_error;
747753
int slot = cinfo->slot_number - 1;
748754

749755
cmsg->slot = cpu_to_le32(slot);
750756
/*get EX on Message*/
751757
error = dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_EX);
752758
if (error) {
753759
pr_err("md-cluster: failed to get EX on MESSAGE (%d)\n", error);
754-
goto failed_message;
760+
return error;
755761
}
756762

757763
memcpy(cinfo->message_lockres->lksb.sb_lvbptr, (void *)cmsg,
@@ -781,14 +787,10 @@ static int __sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg)
781787
}
782788

783789
failed_ack:
784-
error = dlm_unlock_sync(cinfo->message_lockres);
785-
if (unlikely(error != 0)) {
790+
while ((unlock_error = dlm_unlock_sync(cinfo->message_lockres)))
786791
pr_err("md-cluster: failed convert to NL on MESSAGE(%d)\n",
787-
error);
788-
/* in case the message can't be released due to some reason */
789-
goto failed_ack;
790-
}
791-
failed_message:
792+
unlock_error);
793+
792794
return error;
793795
}
794796

0 commit comments

Comments
 (0)