Skip to content

Commit ce0d5bd

Browse files
lxbsz authored and idryomov committed
ceph: make num_fwd and num_retry to __u32
The num_fwd in MClientRequestForward is int32_t, while the num_fwd in ceph_mds_request_head is __u8. This is buggy when the num_fwd is larger than 256 it will always be truncate to 0 again. But the client couldn't recoginize this. This will make them to __u32 instead. Because the old cephs will directly copy the raw memories when decoding the reqeust's head, so we need to make sure this kclient will be compatible with old cephs. For newer cephs they will decode the requests depending the version, which will be much simpler and easier to extend new members. Link: https://tracker.ceph.com/issues/62145 Signed-off-by: Xiubo Li <xiubli@redhat.com> Reviewed-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com> Reviewed-by: Milind Changire <mchangir@redhat.com> Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
1 parent 3af5ae2 commit ce0d5bd

File tree

3 files changed

+126
-90
lines changed

3 files changed

+126
-90
lines changed

fs/ceph/mds_client.c

Lines changed: 103 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -2840,6 +2840,18 @@ static void encode_mclientrequest_tail(void **p,
28402840
}
28412841
}
28422842

2843+
static struct ceph_mds_request_head_legacy *
2844+
find_legacy_request_head(void *p, u64 features)
2845+
{
2846+
bool legacy = !(features & CEPH_FEATURE_FS_BTIME);
2847+
struct ceph_mds_request_head_old *ohead;
2848+
2849+
if (legacy)
2850+
return (struct ceph_mds_request_head_legacy *)p;
2851+
ohead = (struct ceph_mds_request_head_old *)p;
2852+
return (struct ceph_mds_request_head_legacy *)&ohead->oldest_client_tid;
2853+
}
2854+
28432855
/*
28442856
* called under mdsc->mutex
28452857
*/
@@ -2850,7 +2862,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
28502862
int mds = session->s_mds;
28512863
struct ceph_mds_client *mdsc = session->s_mdsc;
28522864
struct ceph_msg *msg;
2853-
struct ceph_mds_request_head_old *head;
2865+
struct ceph_mds_request_head_legacy *lhead;
28542866
const char *path1 = NULL;
28552867
const char *path2 = NULL;
28562868
u64 ino1 = 0, ino2 = 0;
@@ -2862,6 +2874,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
28622874
void *p, *end;
28632875
int ret;
28642876
bool legacy = !(session->s_con.peer_features & CEPH_FEATURE_FS_BTIME);
2877+
bool old_version = !test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD,
2878+
&session->s_features);
28652879

28662880
ret = set_request_path_attr(req->r_inode, req->r_dentry,
28672881
req->r_parent, req->r_path1, req->r_ino1.ino,
@@ -2893,7 +2907,19 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
28932907
goto out_free2;
28942908
}
28952909

2896-
len = legacy ? sizeof(*head) : sizeof(struct ceph_mds_request_head);
2910+
/*
2911+
* For old cephs without supporting the 32bit retry/fwd feature
2912+
* it will copy the raw memories directly when decoding the
2913+
* requests. While new cephs will decode the head depending the
2914+
* version member, so we need to make sure it will be compatible
2915+
* with them both.
2916+
*/
2917+
if (legacy)
2918+
len = sizeof(struct ceph_mds_request_head_legacy);
2919+
else if (old_version)
2920+
len = sizeof(struct ceph_mds_request_head_old);
2921+
else
2922+
len = sizeof(struct ceph_mds_request_head);
28972923

28982924
/* filepaths */
28992925
len += 2 * (1 + sizeof(u32) + sizeof(u64));
@@ -2938,33 +2964,40 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
29382964

29392965
msg->hdr.tid = cpu_to_le64(req->r_tid);
29402966

2967+
lhead = find_legacy_request_head(msg->front.iov_base,
2968+
session->s_con.peer_features);
2969+
29412970
/*
2942-
* The old ceph_mds_request_head didn't contain a version field, and
2971+
* The ceph_mds_request_head_legacy didn't contain a version field, and
29432972
* one was added when we moved the message version from 3->4.
29442973
*/
29452974
if (legacy) {
29462975
msg->hdr.version = cpu_to_le16(3);
2947-
head = msg->front.iov_base;
2948-
p = msg->front.iov_base + sizeof(*head);
2976+
p = msg->front.iov_base + sizeof(*lhead);
2977+
} else if (old_version) {
2978+
struct ceph_mds_request_head_old *ohead = msg->front.iov_base;
2979+
2980+
msg->hdr.version = cpu_to_le16(4);
2981+
ohead->version = cpu_to_le16(1);
2982+
p = msg->front.iov_base + sizeof(*ohead);
29492983
} else {
2950-
struct ceph_mds_request_head *new_head = msg->front.iov_base;
2984+
struct ceph_mds_request_head *nhead = msg->front.iov_base;
29512985

29522986
msg->hdr.version = cpu_to_le16(6);
2953-
new_head->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION);
2954-
head = (struct ceph_mds_request_head_old *)&new_head->oldest_client_tid;
2955-
p = msg->front.iov_base + sizeof(*new_head);
2987+
nhead->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION);
2988+
p = msg->front.iov_base + sizeof(*nhead);
29562989
}
29572990

29582991
end = msg->front.iov_base + msg->front.iov_len;
29592992

2960-
head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
2961-
head->op = cpu_to_le32(req->r_op);
2962-
head->caller_uid = cpu_to_le32(from_kuid(&init_user_ns,
2963-
req->r_cred->fsuid));
2964-
head->caller_gid = cpu_to_le32(from_kgid(&init_user_ns,
2965-
req->r_cred->fsgid));
2966-
head->ino = cpu_to_le64(req->r_deleg_ino);
2967-
head->args = req->r_args;
2993+
lhead->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
2994+
lhead->op = cpu_to_le32(req->r_op);
2995+
lhead->caller_uid = cpu_to_le32(from_kuid(&init_user_ns,
2996+
req->r_cred->fsuid));
2997+
lhead->caller_gid = cpu_to_le32(from_kgid(&init_user_ns,
2998+
req->r_cred->fsgid));
2999+
lhead->ino = cpu_to_le64(req->r_deleg_ino);
3000+
lhead->args = req->r_args;
29683001

29693002
ceph_encode_filepath(&p, end, ino1, path1);
29703003
ceph_encode_filepath(&p, end, ino2, path2);
@@ -3006,7 +3039,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
30063039
p = msg->front.iov_base + req->r_request_release_offset;
30073040
}
30083041

3009-
head->num_releases = cpu_to_le16(releases);
3042+
lhead->num_releases = cpu_to_le16(releases);
30103043

30113044
encode_mclientrequest_tail(&p, req);
30123045

@@ -3057,18 +3090,6 @@ static void complete_request(struct ceph_mds_client *mdsc,
30573090
complete_all(&req->r_completion);
30583091
}
30593092

3060-
static struct ceph_mds_request_head_old *
3061-
find_old_request_head(void *p, u64 features)
3062-
{
3063-
bool legacy = !(features & CEPH_FEATURE_FS_BTIME);
3064-
struct ceph_mds_request_head *new_head;
3065-
3066-
if (legacy)
3067-
return (struct ceph_mds_request_head_old *)p;
3068-
new_head = (struct ceph_mds_request_head *)p;
3069-
return (struct ceph_mds_request_head_old *)&new_head->oldest_client_tid;
3070-
}
3071-
30723093
/*
30733094
* called under mdsc->mutex
30743095
*/
@@ -3078,29 +3099,28 @@ static int __prepare_send_request(struct ceph_mds_session *session,
30783099
{
30793100
int mds = session->s_mds;
30803101
struct ceph_mds_client *mdsc = session->s_mdsc;
3081-
struct ceph_mds_request_head_old *rhead;
3102+
struct ceph_mds_request_head_legacy *lhead;
3103+
struct ceph_mds_request_head *nhead;
30823104
struct ceph_msg *msg;
3083-
int flags = 0, max_retry;
3105+
int flags = 0, old_max_retry;
3106+
bool old_version = !test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD,
3107+
&session->s_features);
30843108

30853109
/*
3086-
* The type of 'r_attempts' in kernel 'ceph_mds_request'
3087-
* is 'int', while in 'ceph_mds_request_head' the type of
3088-
* 'num_retry' is '__u8'. So in case the request retries
3089-
* exceeding 256 times, the MDS will receive a incorrect
3090-
* retry seq.
3091-
*
3092-
* In this case it's ususally a bug in MDS and continue
3093-
* retrying the request makes no sense.
3094-
*
3095-
* In future this could be fixed in ceph code, so avoid
3096-
* using the hardcode here.
3110+
* Avoid inifinite retrying after overflow. The client will
3111+
* increase the retry count and if the MDS is old version,
3112+
* so we limit to retry at most 256 times.
30973113
*/
3098-
max_retry = sizeof_field(struct ceph_mds_request_head, num_retry);
3099-
max_retry = 1 << (max_retry * BITS_PER_BYTE);
3100-
if (req->r_attempts >= max_retry) {
3101-
pr_warn_ratelimited("%s request tid %llu seq overflow\n",
3102-
__func__, req->r_tid);
3103-
return -EMULTIHOP;
3114+
if (req->r_attempts) {
3115+
old_max_retry = sizeof_field(struct ceph_mds_request_head_old,
3116+
num_retry);
3117+
old_max_retry = 1 << (old_max_retry * BITS_PER_BYTE);
3118+
if ((old_version && req->r_attempts >= old_max_retry) ||
3119+
((uint32_t)req->r_attempts >= U32_MAX)) {
3120+
pr_warn_ratelimited("%s request tid %llu seq overflow\n",
3121+
__func__, req->r_tid);
3122+
return -EMULTIHOP;
3123+
}
31043124
}
31053125

31063126
req->r_attempts++;
@@ -3126,20 +3146,24 @@ static int __prepare_send_request(struct ceph_mds_session *session,
31263146
* d_move mangles the src name.
31273147
*/
31283148
msg = req->r_request;
3129-
rhead = find_old_request_head(msg->front.iov_base,
3130-
session->s_con.peer_features);
3149+
lhead = find_legacy_request_head(msg->front.iov_base,
3150+
session->s_con.peer_features);
31313151

3132-
flags = le32_to_cpu(rhead->flags);
3152+
flags = le32_to_cpu(lhead->flags);
31333153
flags |= CEPH_MDS_FLAG_REPLAY;
3134-
rhead->flags = cpu_to_le32(flags);
3154+
lhead->flags = cpu_to_le32(flags);
31353155

31363156
if (req->r_target_inode)
3137-
rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
3157+
lhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
31383158

3139-
rhead->num_retry = req->r_attempts - 1;
3159+
lhead->num_retry = req->r_attempts - 1;
3160+
if (!old_version) {
3161+
nhead = (struct ceph_mds_request_head*)msg->front.iov_base;
3162+
nhead->ext_num_retry = cpu_to_le32(req->r_attempts - 1);
3163+
}
31403164

31413165
/* remove cap/dentry releases from message */
3142-
rhead->num_releases = 0;
3166+
lhead->num_releases = 0;
31433167

31443168
p = msg->front.iov_base + req->r_request_release_offset;
31453169
encode_mclientrequest_tail(&p, req);
@@ -3160,18 +3184,23 @@ static int __prepare_send_request(struct ceph_mds_session *session,
31603184
}
31613185
req->r_request = msg;
31623186

3163-
rhead = find_old_request_head(msg->front.iov_base,
3164-
session->s_con.peer_features);
3165-
rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
3187+
lhead = find_legacy_request_head(msg->front.iov_base,
3188+
session->s_con.peer_features);
3189+
lhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
31663190
if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
31673191
flags |= CEPH_MDS_FLAG_REPLAY;
31683192
if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags))
31693193
flags |= CEPH_MDS_FLAG_ASYNC;
31703194
if (req->r_parent)
31713195
flags |= CEPH_MDS_FLAG_WANT_DENTRY;
3172-
rhead->flags = cpu_to_le32(flags);
3173-
rhead->num_fwd = req->r_num_fwd;
3174-
rhead->num_retry = req->r_attempts - 1;
3196+
lhead->flags = cpu_to_le32(flags);
3197+
lhead->num_fwd = req->r_num_fwd;
3198+
lhead->num_retry = req->r_attempts - 1;
3199+
if (!old_version) {
3200+
nhead = (struct ceph_mds_request_head*)msg->front.iov_base;
3201+
nhead->ext_num_fwd = cpu_to_le32(req->r_num_fwd);
3202+
nhead->ext_num_retry = cpu_to_le32(req->r_attempts - 1);
3203+
}
31753204

31763205
dout(" r_parent = %p\n", req->r_parent);
31773206
return 0;
@@ -3830,33 +3859,21 @@ static void handle_forward(struct ceph_mds_client *mdsc,
38303859
if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
38313860
dout("forward tid %llu aborted, unregistering\n", tid);
38323861
__unregister_request(mdsc, req);
3833-
} else if (fwd_seq <= req->r_num_fwd) {
3862+
} else if (fwd_seq <= req->r_num_fwd || (uint32_t)fwd_seq >= U32_MAX) {
38343863
/*
3835-
* The type of 'num_fwd' in ceph 'MClientRequestForward'
3836-
* is 'int32_t', while in 'ceph_mds_request_head' the
3837-
* type is '__u8'. So in case the request bounces between
3838-
* MDSes exceeding 256 times, the client will get stuck.
3839-
*
3840-
* In this case it's ususally a bug in MDS and continue
3841-
* bouncing the request makes no sense.
3864+
* Avoid inifinite retrying after overflow.
38423865
*
3843-
* In future this could be fixed in ceph code, so avoid
3844-
* using the hardcode here.
3866+
* The MDS will increase the fwd count and in client side
3867+
* if the num_fwd is less than the one saved in request
3868+
* that means the MDS is an old version and overflowed of
3869+
* 8 bits.
38453870
*/
3846-
int max = sizeof_field(struct ceph_mds_request_head, num_fwd);
3847-
max = 1 << (max * BITS_PER_BYTE);
3848-
if (req->r_num_fwd >= max) {
3849-
mutex_lock(&req->r_fill_mutex);
3850-
req->r_err = -EMULTIHOP;
3851-
set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
3852-
mutex_unlock(&req->r_fill_mutex);
3853-
aborted = true;
3854-
pr_warn_ratelimited("forward tid %llu seq overflow\n",
3855-
tid);
3856-
} else {
3857-
dout("forward tid %llu to mds%d - old seq %d <= %d\n",
3858-
tid, next_mds, req->r_num_fwd, fwd_seq);
3859-
}
3871+
mutex_lock(&req->r_fill_mutex);
3872+
req->r_err = -EMULTIHOP;
3873+
set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
3874+
mutex_unlock(&req->r_fill_mutex);
3875+
aborted = true;
3876+
pr_warn_ratelimited("forward tid %llu seq overflow\n", tid);
38603877
} else {
38613878
/* resend. forward race not possible; mds would drop */
38623879
dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds);

fs/ceph/mds_client.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,9 @@ enum ceph_feature_type {
3232
CEPHFS_FEATURE_ALTERNATE_NAME,
3333
CEPHFS_FEATURE_NOTIFY_SESSION_STATE,
3434
CEPHFS_FEATURE_OP_GETVXATTR,
35+
CEPHFS_FEATURE_32BITS_RETRY_FWD,
3536

36-
CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_OP_GETVXATTR,
37+
CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_32BITS_RETRY_FWD,
3738
};
3839

3940
#define CEPHFS_FEATURES_CLIENT_SUPPORTED { \
@@ -47,6 +48,7 @@ enum ceph_feature_type {
4748
CEPHFS_FEATURE_ALTERNATE_NAME, \
4849
CEPHFS_FEATURE_NOTIFY_SESSION_STATE, \
4950
CEPHFS_FEATURE_OP_GETVXATTR, \
51+
CEPHFS_FEATURE_32BITS_RETRY_FWD, \
5052
}
5153

5254
/*

include/linux/ceph/ceph_fs.h

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -486,7 +486,7 @@ union ceph_mds_request_args_ext {
486486
#define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */
487487
#define CEPH_MDS_FLAG_ASYNC 4 /* request is asynchronous */
488488

489-
struct ceph_mds_request_head_old {
489+
struct ceph_mds_request_head_legacy {
490490
__le64 oldest_client_tid;
491491
__le32 mdsmap_epoch; /* on client */
492492
__le32 flags; /* CEPH_MDS_FLAG_* */
@@ -499,9 +499,9 @@ struct ceph_mds_request_head_old {
499499
union ceph_mds_request_args args;
500500
} __attribute__ ((packed));
501501

502-
#define CEPH_MDS_REQUEST_HEAD_VERSION 1
502+
#define CEPH_MDS_REQUEST_HEAD_VERSION 2
503503

504-
struct ceph_mds_request_head {
504+
struct ceph_mds_request_head_old {
505505
__le16 version; /* struct version */
506506
__le64 oldest_client_tid;
507507
__le32 mdsmap_epoch; /* on client */
@@ -515,6 +515,23 @@ struct ceph_mds_request_head {
515515
union ceph_mds_request_args_ext args;
516516
} __attribute__ ((packed));
517517

518+
struct ceph_mds_request_head {
519+
__le16 version; /* struct version */
520+
__le64 oldest_client_tid;
521+
__le32 mdsmap_epoch; /* on client */
522+
__le32 flags; /* CEPH_MDS_FLAG_* */
523+
__u8 num_retry, num_fwd; /* legacy count retry and fwd attempts */
524+
__le16 num_releases; /* # include cap/lease release records */
525+
__le32 op; /* mds op code */
526+
__le32 caller_uid, caller_gid;
527+
__le64 ino; /* use this ino for openc, mkdir, mknod,
528+
etc. (if replaying) */
529+
union ceph_mds_request_args_ext args;
530+
531+
__le32 ext_num_retry; /* new count retry attempts */
532+
__le32 ext_num_fwd; /* new count fwd attempts */
533+
} __attribute__ ((packed));
534+
518535
/* cap/lease release record */
519536
struct ceph_mds_request_release {
520537
__le64 ino, cap_id; /* ino and unique cap id */

0 commit comments

Comments
 (0)