@@ -75,6 +75,7 @@ MONGO_FAIL_POINT_DEFINE(reshardingPauseDonorBeforeCatalogCacheRefresh);
75
75
MONGO_FAIL_POINT_DEFINE (reshardingPauseDonorAfterBlockingReads);
76
76
MONGO_FAIL_POINT_DEFINE (reshardingDonorFailsAfterTransitionToDonatingOplogEntries);
77
77
MONGO_FAIL_POINT_DEFINE (removeDonorDocFailpoint);
78
// Failpoint consulted by the donor right before it generates its minFetchTimestamp; the callback
// registered at that site raises InternalError (optionally with a caller-supplied message).
+ MONGO_FAIL_POINT_DEFINE (reshardingDonorFailsBeforeObtainingTimestamp);
78
79
79
80
using namespace fmt ::literals;
80
81
@@ -119,18 +120,6 @@ Timestamp generateMinFetchTimestamp(OperationContext* opCtx, const NamespaceStri
119
120
return generatedOpTime.getTimestamp ();
120
121
}
121
122
122
- /* *
123
- * Returns whether it is possible for the donor to be in 'state' when resharding will indefinitely
124
- * abort.
125
- */
126
- bool inPotentialAbortScenario (const DonorStateEnum& state) {
127
- // Regardless of whether resharding will abort or commit, the donor will eventually reach state
128
- // kDone.
129
- // Additionally, if the donor is in state kError, it is guaranteed that the coordinator will
130
- // eventually begin the abort process.
131
- return state == DonorStateEnum::kError || state == DonorStateEnum::kDone ;
132
- }
133
-
134
123
/* *
135
124
* Fulfills the promise if it is not already. Otherwise, does nothing.
136
125
*/
@@ -146,6 +135,17 @@ void ensureFulfilledPromise(WithLock lk, SharedPromise<void>& sp, Status error)
146
135
}
147
136
}
148
137
138
+ /* *
139
+ * Returns whether it is possible for the donor to be in 'state' when resharding will indefinitely
140
+ * abort.
141
+ */
142
+ bool inPotentialAbortScenario (const DonorStateEnum& state) {
143
+ // Regardless of whether resharding will abort or commit, the donor will eventually reach state
144
+ // kDone. Additionally, if the donor is in state kError, it is guaranteed that the coordinator
145
+ // will eventually begin the abort process.
146
+ return state == DonorStateEnum::kError || state == DonorStateEnum::kDone ;
147
+ }
148
+
149
149
class ExternalStateImpl : public ReshardingDonorService ::DonorStateMachineExternalState {
150
150
public:
151
151
ShardId myShardId (ServiceContext* serviceContext) const override {
@@ -378,7 +378,7 @@ ExecutorFuture<void> ReshardingDonorService::DonorStateMachine::_finishReshardin
378
378
}
379
379
380
380
// If aborted, the donor must be allowed to transition to done from any state.
381
- _transitionState (DonorStateEnum:: kDone );
381
+ _transitionToDone (aborted );
382
382
}
383
383
384
384
{
@@ -611,6 +611,12 @@ void ReshardingDonorService::DonorStateMachine::
611
611
_externalState->waitForCollectionFlush (opCtx.get (), _metadata.getTempReshardingNss ());
612
612
}
613
613
614
+ reshardingDonorFailsBeforeObtainingTimestamp.execute ([&](const BSONObj& data) {
615
+ auto errmsgElem = data[" errmsg" ];
616
+ StringData errmsg = errmsgElem ? errmsgElem.checkAndGetStringData () : " Failing for test" _sd;
617
+ uasserted (ErrorCodes::InternalError, errmsg);
618
+ });
619
+
614
620
Timestamp minFetchTimestamp = [this ] {
615
621
auto opCtx = _cancelableOpCtxFactory->makeOperationContext (&cc ());
616
622
return generateMinFetchTimestamp (opCtx.get (), _metadata.getSourceNss ());
@@ -847,12 +853,12 @@ void ReshardingDonorService::DonorStateMachine::_dropOriginalCollectionThenTrans
847
853
opCtx.get (), _metadata.getSourceNss (), _metadata.getSourceUUID ());
848
854
}
849
855
850
- _transitionState (DonorStateEnum:: kDone );
856
+ _transitionToDone ( false /* aborted */ );
851
857
}
852
858
853
859
void ReshardingDonorService::DonorStateMachine::_transitionState (DonorStateEnum newState) {
854
860
invariant (newState != DonorStateEnum::kDonatingInitialData &&
855
- newState != DonorStateEnum::kError );
861
+ newState != DonorStateEnum::kError && newState != DonorStateEnum:: kDone );
856
862
857
863
auto newDonorCtx = _donorCtx;
858
864
newDonorCtx.setState (newState);
@@ -864,6 +870,10 @@ void ReshardingDonorService::DonorStateMachine::_transitionState(DonorShardConte
864
870
auto oldState = _donorCtx.getState ();
865
871
auto newState = newDonorCtx.getState ();
866
872
873
+ // The donor state machine enters the kError state on unrecoverable errors and so we don't
874
+ // expect it to ever transition from kError except to kDone.
875
+ invariant (oldState != DonorStateEnum::kError || newState == DonorStateEnum::kDone );
876
+
867
877
_updateDonorDocument (std::move (newDonorCtx));
868
878
869
879
_metrics->onStateTransition (oldState, newState);
@@ -894,6 +904,16 @@ void ReshardingDonorService::DonorStateMachine::_transitionToError(Status abortR
894
904
_transitionState (std::move (newDonorCtx));
895
905
}
896
906
907
+ void ReshardingDonorService::DonorStateMachine::_transitionToDone (bool aborted) {
908
+ auto newDonorCtx = _donorCtx;
909
+ newDonorCtx.setState (DonorStateEnum::kDone );
910
+ if (aborted) {
911
+ resharding::emplaceTruncatedAbortReasonIfExists (newDonorCtx,
912
+ resharding::coordinatorAbortedError ());
913
+ }
914
+ _transitionState (std::move (newDonorCtx));
915
+ }
916
+
897
917
/* *
898
918
* Returns a query filter of the form
899
919
* {
@@ -1069,6 +1089,13 @@ CancellationToken ReshardingDonorService::DonorStateMachine::_initAbortSource(
1069
1089
_abortSource = CancellationSource (stepdownToken);
1070
1090
}
1071
1091
1092
+ if (_donorCtx.getState () == DonorStateEnum::kDone && _donorCtx.getAbortReason ()) {
1093
+ // A donor in state kDone with an abortReason is an indication that the coordinator
1094
+ // has persisted the decision and called abort on all participants. Cancel
1095
+ // _abortSource here to avoid repeating the future chain.
1096
+ _abortSource->cancel ();
1097
+ }
1098
+
1072
1099
if (auto future = _coordinatorHasDecisionPersisted.getFuture (); future.isReady ()) {
1073
1100
if (auto status = future.getNoThrow (); !status.isOK ()) {
1074
1101
// onReshardingFieldsChanges() missed canceling _abortSource because _initAbortSource()
0 commit comments