@@ -2840,6 +2840,18 @@ static void encode_mclientrequest_tail(void **p,
2840
2840
}
2841
2841
}
2842
2842
2843
+ /*
+  * Locate the legacy portion of a request head inside a message buffer.
+  *
+  * Peers that lack CEPH_FEATURE_FS_BTIME send a bare legacy head at @p.
+  * Newer peers prepend a version field, so for them the legacy fields
+  * begin at the oldest_client_tid member of the "old" head layout.
+  */
+ static struct ceph_mds_request_head_legacy *
+ find_legacy_request_head(void *p, u64 features)
+ {
+ 	struct ceph_mds_request_head_old *ohead = p;
+ 
+ 	if (!(features & CEPH_FEATURE_FS_BTIME))
+ 		return p;	/* legacy peer: head starts right here */
+ 	return (struct ceph_mds_request_head_legacy *)&ohead->oldest_client_tid;
+ }
2854
+
2843
2855
/*
2844
2856
* called under mdsc->mutex
2845
2857
*/
@@ -2850,7 +2862,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
2850
2862
int mds = session -> s_mds ;
2851
2863
struct ceph_mds_client * mdsc = session -> s_mdsc ;
2852
2864
struct ceph_msg * msg ;
2853
- struct ceph_mds_request_head_old * head ;
2865
+ struct ceph_mds_request_head_legacy * lhead ;
2854
2866
const char * path1 = NULL ;
2855
2867
const char * path2 = NULL ;
2856
2868
u64 ino1 = 0 , ino2 = 0 ;
@@ -2862,6 +2874,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
2862
2874
void * p , * end ;
2863
2875
int ret ;
2864
2876
bool legacy = !(session -> s_con .peer_features & CEPH_FEATURE_FS_BTIME );
2877
+ bool old_version = !test_bit (CEPHFS_FEATURE_32BITS_RETRY_FWD ,
2878
+ & session -> s_features );
2865
2879
2866
2880
ret = set_request_path_attr (req -> r_inode , req -> r_dentry ,
2867
2881
req -> r_parent , req -> r_path1 , req -> r_ino1 .ino ,
@@ -2893,7 +2907,19 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
2893
2907
goto out_free2 ;
2894
2908
}
2895
2909
2896
- len = legacy ? sizeof (* head ) : sizeof (struct ceph_mds_request_head );
2910
+ /*
2911
+ * Old MDSes, which lack the 32-bit retry/fwd feature, copy the raw
+ * head memory directly when decoding requests, while newer MDSes
+ * decode the head based on its version member, so the encoding
+ * must stay compatible with both.
2916
+ */
2917
+ if (legacy )
2918
+ len = sizeof (struct ceph_mds_request_head_legacy );
2919
+ else if (old_version )
2920
+ len = sizeof (struct ceph_mds_request_head_old );
2921
+ else
2922
+ len = sizeof (struct ceph_mds_request_head );
2897
2923
2898
2924
/* filepaths */
2899
2925
len += 2 * (1 + sizeof (u32 ) + sizeof (u64 ));
@@ -2938,33 +2964,40 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
2938
2964
2939
2965
msg -> hdr .tid = cpu_to_le64 (req -> r_tid );
2940
2966
2967
+ lhead = find_legacy_request_head (msg -> front .iov_base ,
2968
+ session -> s_con .peer_features );
2969
+
2941
2970
/*
2942
- * The old ceph_mds_request_head didn't contain a version field, and
2971
+ * The ceph_mds_request_head_legacy didn't contain a version field, and
2943
2972
* one was added when we moved the message version from 3->4.
2944
2973
*/
2945
2974
if (legacy ) {
2946
2975
msg -> hdr .version = cpu_to_le16 (3 );
2947
- head = msg -> front .iov_base ;
2948
- p = msg -> front .iov_base + sizeof (* head );
2976
+ p = msg -> front .iov_base + sizeof (* lhead );
2977
+ } else if (old_version ) {
2978
+ struct ceph_mds_request_head_old * ohead = msg -> front .iov_base ;
2979
+
2980
+ msg -> hdr .version = cpu_to_le16 (4 );
2981
+ ohead -> version = cpu_to_le16 (1 );
2982
+ p = msg -> front .iov_base + sizeof (* ohead );
2949
2983
} else {
2950
- struct ceph_mds_request_head * new_head = msg -> front .iov_base ;
2984
+ struct ceph_mds_request_head * nhead = msg -> front .iov_base ;
2951
2985
2952
2986
msg -> hdr .version = cpu_to_le16 (6 );
2953
- new_head -> version = cpu_to_le16 (CEPH_MDS_REQUEST_HEAD_VERSION );
2954
- head = (struct ceph_mds_request_head_old * )& new_head -> oldest_client_tid ;
2955
- p = msg -> front .iov_base + sizeof (* new_head );
2987
+ nhead -> version = cpu_to_le16 (CEPH_MDS_REQUEST_HEAD_VERSION );
2988
+ p = msg -> front .iov_base + sizeof (* nhead );
2956
2989
}
2957
2990
2958
2991
end = msg -> front .iov_base + msg -> front .iov_len ;
2959
2992
2960
- head -> mdsmap_epoch = cpu_to_le32 (mdsc -> mdsmap -> m_epoch );
2961
- head -> op = cpu_to_le32 (req -> r_op );
2962
- head -> caller_uid = cpu_to_le32 (from_kuid (& init_user_ns ,
2963
- req -> r_cred -> fsuid ));
2964
- head -> caller_gid = cpu_to_le32 (from_kgid (& init_user_ns ,
2965
- req -> r_cred -> fsgid ));
2966
- head -> ino = cpu_to_le64 (req -> r_deleg_ino );
2967
- head -> args = req -> r_args ;
2993
+ lhead -> mdsmap_epoch = cpu_to_le32 (mdsc -> mdsmap -> m_epoch );
2994
+ lhead -> op = cpu_to_le32 (req -> r_op );
2995
+ lhead -> caller_uid = cpu_to_le32 (from_kuid (& init_user_ns ,
2996
+ req -> r_cred -> fsuid ));
2997
+ lhead -> caller_gid = cpu_to_le32 (from_kgid (& init_user_ns ,
2998
+ req -> r_cred -> fsgid ));
2999
+ lhead -> ino = cpu_to_le64 (req -> r_deleg_ino );
3000
+ lhead -> args = req -> r_args ;
2968
3001
2969
3002
ceph_encode_filepath (& p , end , ino1 , path1 );
2970
3003
ceph_encode_filepath (& p , end , ino2 , path2 );
@@ -3006,7 +3039,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
3006
3039
p = msg -> front .iov_base + req -> r_request_release_offset ;
3007
3040
}
3008
3041
3009
- head -> num_releases = cpu_to_le16 (releases );
3042
+ lhead -> num_releases = cpu_to_le16 (releases );
3010
3043
3011
3044
encode_mclientrequest_tail (& p , req );
3012
3045
@@ -3057,18 +3090,6 @@ static void complete_request(struct ceph_mds_client *mdsc,
3057
3090
complete_all (& req -> r_completion );
3058
3091
}
3059
3092
3060
- static struct ceph_mds_request_head_old *
3061
- find_old_request_head (void * p , u64 features )
3062
- {
3063
- bool legacy = !(features & CEPH_FEATURE_FS_BTIME );
3064
- struct ceph_mds_request_head * new_head ;
3065
-
3066
- if (legacy )
3067
- return (struct ceph_mds_request_head_old * )p ;
3068
- new_head = (struct ceph_mds_request_head * )p ;
3069
- return (struct ceph_mds_request_head_old * )& new_head -> oldest_client_tid ;
3070
- }
3071
-
3072
3093
/*
3073
3094
* called under mdsc->mutex
3074
3095
*/
@@ -3078,29 +3099,28 @@ static int __prepare_send_request(struct ceph_mds_session *session,
3078
3099
{
3079
3100
int mds = session -> s_mds ;
3080
3101
struct ceph_mds_client * mdsc = session -> s_mdsc ;
3081
- struct ceph_mds_request_head_old * rhead ;
3102
+ struct ceph_mds_request_head_legacy * lhead ;
3103
+ struct ceph_mds_request_head * nhead ;
3082
3104
struct ceph_msg * msg ;
3083
- int flags = 0 , max_retry ;
3105
+ int flags = 0 , old_max_retry ;
3106
+ bool old_version = !test_bit (CEPHFS_FEATURE_32BITS_RETRY_FWD ,
3107
+ & session -> s_features );
3084
3108
3085
3109
/*
3086
- * The type of 'r_attempts' in kernel 'ceph_mds_request'
3087
- * is 'int', while in 'ceph_mds_request_head' the type of
3088
- * 'num_retry' is '__u8'. So in case the request retries
3089
- * exceeding 256 times, the MDS will receive a incorrect
3090
- * retry seq.
3091
- *
3092
- * In this case it's ususally a bug in MDS and continue
3093
- * retrying the request makes no sense.
3094
- *
3095
- * In future this could be fixed in ceph code, so avoid
3096
- * using the hardcode here.
3110
+ * Avoid infinite retrying after overflow. The client keeps
+ * increasing the retry count, and if the MDS is an old version
+ * we limit it to at most 256 retries.
3097
3113
*/
3098
- max_retry = sizeof_field (struct ceph_mds_request_head , num_retry );
3099
- max_retry = 1 << (max_retry * BITS_PER_BYTE );
3100
- if (req -> r_attempts >= max_retry ) {
3101
- pr_warn_ratelimited ("%s request tid %llu seq overflow\n" ,
3102
- __func__ , req -> r_tid );
3103
- return - EMULTIHOP ;
3114
+ if (req -> r_attempts ) {
3115
+ old_max_retry = sizeof_field (struct ceph_mds_request_head_old ,
3116
+ num_retry );
3117
+ old_max_retry = 1 << (old_max_retry * BITS_PER_BYTE );
3118
+ if ((old_version && req -> r_attempts >= old_max_retry ) ||
3119
+ ((uint32_t )req -> r_attempts >= U32_MAX )) {
3120
+ pr_warn_ratelimited ("%s request tid %llu seq overflow\n" ,
3121
+ __func__ , req -> r_tid );
3122
+ return - EMULTIHOP ;
3123
+ }
3104
3124
}
3105
3125
3106
3126
req -> r_attempts ++ ;
@@ -3126,20 +3146,24 @@ static int __prepare_send_request(struct ceph_mds_session *session,
3126
3146
* d_move mangles the src name.
3127
3147
*/
3128
3148
msg = req -> r_request ;
3129
- rhead = find_old_request_head (msg -> front .iov_base ,
3130
- session -> s_con .peer_features );
3149
+ lhead = find_legacy_request_head (msg -> front .iov_base ,
3150
+ session -> s_con .peer_features );
3131
3151
3132
- flags = le32_to_cpu (rhead -> flags );
3152
+ flags = le32_to_cpu (lhead -> flags );
3133
3153
flags |= CEPH_MDS_FLAG_REPLAY ;
3134
- rhead -> flags = cpu_to_le32 (flags );
3154
+ lhead -> flags = cpu_to_le32 (flags );
3135
3155
3136
3156
if (req -> r_target_inode )
3137
- rhead -> ino = cpu_to_le64 (ceph_ino (req -> r_target_inode ));
3157
+ lhead -> ino = cpu_to_le64 (ceph_ino (req -> r_target_inode ));
3138
3158
3139
- rhead -> num_retry = req -> r_attempts - 1 ;
3159
+ lhead -> num_retry = req -> r_attempts - 1 ;
3160
+ if (!old_version ) {
3161
+ nhead = (struct ceph_mds_request_head * )msg -> front .iov_base ;
3162
+ nhead -> ext_num_retry = cpu_to_le32 (req -> r_attempts - 1 );
3163
+ }
3140
3164
3141
3165
/* remove cap/dentry releases from message */
3142
- rhead -> num_releases = 0 ;
3166
+ lhead -> num_releases = 0 ;
3143
3167
3144
3168
p = msg -> front .iov_base + req -> r_request_release_offset ;
3145
3169
encode_mclientrequest_tail (& p , req );
@@ -3160,18 +3184,23 @@ static int __prepare_send_request(struct ceph_mds_session *session,
3160
3184
}
3161
3185
req -> r_request = msg ;
3162
3186
3163
- rhead = find_old_request_head (msg -> front .iov_base ,
3164
- session -> s_con .peer_features );
3165
- rhead -> oldest_client_tid = cpu_to_le64 (__get_oldest_tid (mdsc ));
3187
+ lhead = find_legacy_request_head (msg -> front .iov_base ,
3188
+ session -> s_con .peer_features );
3189
+ lhead -> oldest_client_tid = cpu_to_le64 (__get_oldest_tid (mdsc ));
3166
3190
if (test_bit (CEPH_MDS_R_GOT_UNSAFE , & req -> r_req_flags ))
3167
3191
flags |= CEPH_MDS_FLAG_REPLAY ;
3168
3192
if (test_bit (CEPH_MDS_R_ASYNC , & req -> r_req_flags ))
3169
3193
flags |= CEPH_MDS_FLAG_ASYNC ;
3170
3194
if (req -> r_parent )
3171
3195
flags |= CEPH_MDS_FLAG_WANT_DENTRY ;
3172
- rhead -> flags = cpu_to_le32 (flags );
3173
- rhead -> num_fwd = req -> r_num_fwd ;
3174
- rhead -> num_retry = req -> r_attempts - 1 ;
3196
+ lhead -> flags = cpu_to_le32 (flags );
3197
+ lhead -> num_fwd = req -> r_num_fwd ;
3198
+ lhead -> num_retry = req -> r_attempts - 1 ;
3199
+ if (!old_version ) {
3200
+ nhead = (struct ceph_mds_request_head * )msg -> front .iov_base ;
3201
+ nhead -> ext_num_fwd = cpu_to_le32 (req -> r_num_fwd );
3202
+ nhead -> ext_num_retry = cpu_to_le32 (req -> r_attempts - 1 );
3203
+ }
3175
3204
3176
3205
dout (" r_parent = %p\n" , req -> r_parent );
3177
3206
return 0 ;
@@ -3830,33 +3859,21 @@ static void handle_forward(struct ceph_mds_client *mdsc,
3830
3859
if (test_bit (CEPH_MDS_R_ABORTED , & req -> r_req_flags )) {
3831
3860
dout ("forward tid %llu aborted, unregistering\n" , tid );
3832
3861
__unregister_request (mdsc , req );
3833
- } else if (fwd_seq <= req -> r_num_fwd ) {
3862
+ } else if (fwd_seq <= req -> r_num_fwd || ( uint32_t ) fwd_seq >= U32_MAX ) {
3834
3863
/*
3835
- * The type of 'num_fwd' in ceph 'MClientRequestForward'
3836
- * is 'int32_t', while in 'ceph_mds_request_head' the
3837
- * type is '__u8'. So in case the request bounces between
3838
- * MDSes exceeding 256 times, the client will get stuck.
3839
- *
3840
- * In this case it's ususally a bug in MDS and continue
3841
- * bouncing the request makes no sense.
3864
+ * Avoid infinite retrying after overflow.
3842
3865
*
3843
- * In future this could be fixed in ceph code, so avoid
3844
- * using the hardcode here.
3866
+ * The MDS will increase the fwd count and in client side
3867
+ * if the num_fwd is less than the one saved in request
3868
+ * that means the MDS is an old version and overflowed of
3869
+ * 8 bits.
3845
3870
*/
3846
- int max = sizeof_field (struct ceph_mds_request_head , num_fwd );
3847
- max = 1 << (max * BITS_PER_BYTE );
3848
- if (req -> r_num_fwd >= max ) {
3849
- mutex_lock (& req -> r_fill_mutex );
3850
- req -> r_err = - EMULTIHOP ;
3851
- set_bit (CEPH_MDS_R_ABORTED , & req -> r_req_flags );
3852
- mutex_unlock (& req -> r_fill_mutex );
3853
- aborted = true;
3854
- pr_warn_ratelimited ("forward tid %llu seq overflow\n" ,
3855
- tid );
3856
- } else {
3857
- dout ("forward tid %llu to mds%d - old seq %d <= %d\n" ,
3858
- tid , next_mds , req -> r_num_fwd , fwd_seq );
3859
- }
3871
+ mutex_lock (& req -> r_fill_mutex );
3872
+ req -> r_err = - EMULTIHOP ;
3873
+ set_bit (CEPH_MDS_R_ABORTED , & req -> r_req_flags );
3874
+ mutex_unlock (& req -> r_fill_mutex );
3875
+ aborted = true;
3876
+ pr_warn_ratelimited ("forward tid %llu seq overflow\n" , tid );
3860
3877
} else {
3861
3878
/* resend. forward race not possible; mds would drop */
3862
3879
dout ("forward tid %llu to mds%d (we resend)\n" , tid , next_mds );
0 commit comments