Skip to content

Commit 62423b2

Browse files
jsteemann and MongoDB Bot
authored and committed
SERVER-98650 Correctly handle StorageUnavailable exceptions during query restore (#30814)
GitOrigin-RevId: dace223539da0ae82a57a790fec9040851033687
1 parent 644cd0c commit 62423b2

16 files changed

+173
-35
lines changed
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
/**
 * Tests server behavior when a StorageUnavailableException is thrown while a PlanExecutor is
 * being restored. Specifically, tests the case where the exception is thrown when the getMore
 * command is restoring the cursor in order to use it (for both 'find' and 'aggregate' cursors),
 * as well as a case where the BatchedDeleteStage throws while restoring.
 */

const conn = MongoRunner.runMongod();
assert.neq(null, conn, "mongod was unable to start up");

const db = conn.getDB("test");

function runTest() {
    load("jstests/libs/fail_point_util.js");

    const collName = jsTestName();
    db[collName].drop();

    // Seed the collection so a batchSize-1 find leaves documents for the getMore to fetch.
    for (let x = 0; x < 5; ++x) {
        assert.commandWorked(db[collName].insert({_id: x, a: 1}));
    }

    // An index on 'a' ensures the plans below use an IXSCAN, which is where the
    // 'throwDuringIndexScanRestore' failpoint fires.
    assert.commandWorked(db[collName].createIndex({a: 1}));

    //
    // Test find command.
    //
    let res = db.runCommand({find: collName, filter: {a: 1}, batchSize: 1});
    assert.eq(1, res.cursor.firstBatch.length, tojson(res));

    // Configure the failpoint to trip once, when the getMore command restores the cursor. The
    // getMore must still succeed and return the remaining documents.
    configureFailPoint(db, "throwDuringIndexScanRestore", {} /* data */, {times: 1});

    let getMoreRes =
        assert.commandWorked(db.runCommand({getMore: res.cursor.id, collection: collName}));
    assert.eq(4, getMoreRes.cursor.nextBatch.length, tojson(getMoreRes));

    //
    // Test aggregate command.
    //
    res =
        db.runCommand({aggregate: collName, pipeline: [{$match: {a: 1}}], cursor: {batchSize: 1}});
    assert.eq(1, res.cursor.firstBatch.length, tojson(res));

    // Configure the failpoint to trip once, when the getMore command restores the cursor.
    configureFailPoint(db, "throwDuringIndexScanRestore", {} /* data */, {times: 1});
    getMoreRes =
        assert.commandWorked(db.runCommand({getMore: res.cursor.id, collection: collName}));
    assert.eq(4, getMoreRes.cursor.nextBatch.length, tojson(getMoreRes));

    //
    // Test batched delete command.
    //

    // Configure the fail point to trip once; the multi-delete must retry and still succeed.
    configureFailPoint(
        db, "batchedDeleteStageThrowTemporarilyUnavailableException", {} /* data */, {times: 1});
    assert.commandWorked(db[collName].remove({}));
}

// Fetch current value of 'internalQueryFrameworkControl' parameter.
const previousQueryFrameworkControlValue =
    assert.commandWorked(db.adminCommand({getParameter: 1, internalQueryFrameworkControl: 1}))
        .internalQueryFrameworkControl;

// Force query engine to classic for this test, as SBE does not yield for the queries tested here.
assert.commandWorked(
    db.adminCommand({setParameter: 1, internalQueryFrameworkControl: "forceClassicEngine"}));

try {
    // Run actual test.
    runTest();
} finally {
    // Restore previous configuration of 'internalQueryFrameworkControl' parameter.
    assert.commandWorked(db.adminCommand(
        {setParameter: 1, internalQueryFrameworkControl: previousQueryFrameworkControlValue}));
}

MongoRunner.stopMongod(conn);

src/mongo/db/exec/batched_delete_stage.cpp

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ namespace mongo {
5555

5656
MONGO_FAIL_POINT_DEFINE(throwWriteConflictExceptionInBatchedDeleteStage);
5757
MONGO_FAIL_POINT_DEFINE(batchedDeleteStageSleepAfterNDocuments);
58+
MONGO_FAIL_POINT_DEFINE(batchedDeleteStageThrowTemporarilyUnavailableException);
5859

5960
namespace {
6061

@@ -228,15 +229,22 @@ PlanStage::StageState BatchedDeleteStage::doWork(WorkingSetID* out) {
228229
_commitStagedDeletes = _passStagingComplete || !_stagedDeletesBuffer.empty();
229230
}
230231

232+
// We need to check if 'planStageState' is 'NEED_YIELD' earlier than we check 'isEOF()'.
233+
// The reason is that 'isEOF()' returns true if we have completed staging and have an empty
234+
// buffer of staged deletes. However, even if 'isEOF()' returns true, 'planStageState' can
235+
// be 'NEED_YIELD' here if inside 'restoreState()' in '_deleteBatch()' a
236+
// 'StorageUnavailableException' was thrown and caught. In this case, 'planStageState' is
237+
// 'NEED_YIELD' and 'isEOF()' already returns true.
238+
if (planStageState == PlanStage::NEED_YIELD) {
239+
*out = idToReturn;
240+
return PlanStage::NEED_YIELD;
241+
}
242+
231243
if (isEOF()) {
232244
invariant(planStageState != PlanStage::NEED_YIELD);
233245
return PlanStage::IS_EOF;
234246
}
235247

236-
if (planStageState == PlanStage::NEED_YIELD) {
237-
*out = idToReturn;
238-
}
239-
240248
return planStageState;
241249
}
242250

@@ -490,6 +498,13 @@ PlanStage::StageState BatchedDeleteStage::_tryRestoreState(WorkingSetID* out) {
490498
"BatchedDeleteStage::_tryRestoreState",
491499
collection()->ns().ns(),
492500
[&] {
501+
if (MONGO_unlikely(
502+
batchedDeleteStageThrowTemporarilyUnavailableException.shouldFail())) {
503+
throwTemporarilyUnavailableException(
504+
str::stream()
505+
<< "Hit failpoint '"
506+
<< batchedDeleteStageThrowTemporarilyUnavailableException.getName() << "'.");
507+
}
493508
child()->restoreState(&collection());
494509
return PlanStage::NEED_TIME;
495510
},

src/mongo/db/exec/cached_plan.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,8 @@ Status CachedPlanStage::tryYield(PlanYieldPolicy* yieldPolicy) {
194194
// In all cases, the actual yielding happens here.
195195
if (yieldPolicy->shouldYieldOrInterrupt(expCtx()->opCtx)) {
196196
// Here's where we yield.
197-
return yieldPolicy->yieldOrInterrupt(expCtx()->opCtx);
197+
return yieldPolicy->yieldOrInterrupt(
198+
expCtx()->opCtx, nullptr, RestoreContext::RestoreType::kYield);
198199
}
199200

200201
return Status::OK();

src/mongo/db/exec/index_scan.cpp

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,8 @@ int sgn(int i) {
6060

6161
namespace mongo {
6262

63+
MONGO_FAIL_POINT_DEFINE(throwDuringIndexScanRestore);
64+
6365
// static
6466
const char* IndexScan::kStageType = "IXSCAN";
6567

@@ -284,8 +286,14 @@ void IndexScan::doSaveStateRequiresIndex() {
284286
}
285287

286288
void IndexScan::doRestoreStateRequiresIndex() {
287-
if (_indexCursor)
289+
if (_indexCursor) {
290+
if (MONGO_unlikely(throwDuringIndexScanRestore.shouldFail())) {
291+
throwTemporarilyUnavailableException(str::stream()
292+
<< "Hit failpoint '"
293+
<< throwDuringIndexScanRestore.getName() << "'.");
294+
}
288295
_indexCursor->restore();
296+
}
289297
}
290298

291299
void IndexScan::doDetachFromOperationContext() {

src/mongo/db/exec/multi_plan.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -189,7 +189,8 @@ void MultiPlanStage::tryYield(PlanYieldPolicy* yieldPolicy) {
189189
// 3) we need to yield and retry due to a WriteConflictException.
190190
// In all cases, the actual yielding happens here.
191191
if (yieldPolicy->shouldYieldOrInterrupt(expCtx()->opCtx)) {
192-
uassertStatusOK(yieldPolicy->yieldOrInterrupt(expCtx()->opCtx));
192+
uassertStatusOK(yieldPolicy->yieldOrInterrupt(
193+
expCtx()->opCtx, nullptr, RestoreContext::RestoreType::kYield));
193194
}
194195
}
195196

src/mongo/db/exec/sbe/stages/stages.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -521,7 +521,8 @@ class CanInterrupt {
521521
opCtx->checkForInterrupt();
522522
}
523523
} else if (_yieldPolicy->shouldYieldOrInterrupt(opCtx)) {
524-
uassertStatusOK(_yieldPolicy->yieldOrInterrupt(opCtx));
524+
uassertStatusOK(_yieldPolicy->yieldOrInterrupt(
525+
opCtx, nullptr, RestoreContext::RestoreType::kYield));
525526
}
526527
}
527528

src/mongo/db/exec/trial_stage.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,8 @@ Status TrialStage::pickBestPlan(PlanYieldPolicy* yieldPolicy) {
8383
throwWriteConflictException(
8484
"Write conflict during TrialStage plan selection and yielding is disabled.");
8585
}
86-
auto yieldStatus = yieldPolicy->yieldOrInterrupt(expCtx()->opCtx);
86+
auto yieldStatus = yieldPolicy->yieldOrInterrupt(
87+
expCtx()->opCtx, nullptr, RestoreContext::RestoreType::kYield);
8788
if (!yieldStatus.isOK()) {
8889
return yieldStatus;
8990
}

src/mongo/db/query/mock_yield_policies.h

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,9 @@ class MockYieldPolicy : public PlanYieldPolicy {
4949
MONGO_UNREACHABLE;
5050
}
5151

52-
void restoreState(OperationContext* opCtx, const Yieldable* yieldable) override final {
52+
void restoreState(OperationContext* opCtx,
53+
const Yieldable* yieldable,
54+
RestoreContext::RestoreType restoreType) final {
5355
MONGO_UNREACHABLE;
5456
}
5557
};
@@ -67,7 +69,9 @@ class AlwaysTimeOutYieldPolicy final : public MockYieldPolicy {
6769
return true;
6870
}
6971

70-
Status yieldOrInterrupt(OperationContext*, std::function<void()> whileYieldingFn) override {
72+
Status yieldOrInterrupt(OperationContext*,
73+
std::function<void()> whileYieldingFn,
74+
RestoreContext::RestoreType restoreType) override {
7175
return {ErrorCodes::ExceededTimeLimit, "Using AlwaysTimeOutYieldPolicy"};
7276
}
7377
};
@@ -85,7 +89,9 @@ class AlwaysPlanKilledYieldPolicy final : public MockYieldPolicy {
8589
return true;
8690
}
8791

88-
Status yieldOrInterrupt(OperationContext*, std::function<void()> whileYieldingFn) override {
92+
Status yieldOrInterrupt(OperationContext*,
93+
std::function<void()> whileYieldingFn,
94+
RestoreContext::RestoreType restoreType) override {
8995
return {ErrorCodes::QueryPlanKilled, "Using AlwaysPlanKilledYieldPolicy"};
9096
}
9197
};
@@ -103,7 +109,9 @@ class NoopYieldPolicy final : public MockYieldPolicy {
103109
return false;
104110
}
105111

106-
Status yieldOrInterrupt(OperationContext*, std::function<void()> whileYieldingFn) override {
112+
Status yieldOrInterrupt(OperationContext*,
113+
std::function<void()> whileYieldingFn,
114+
RestoreContext::RestoreType restoreType) override {
107115
MONGO_UNREACHABLE;
108116
}
109117
};

src/mongo/db/query/plan_executor_impl.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -265,7 +265,8 @@ void PlanExecutorImpl::restoreState(const RestoreContext& context) {
265265
throw;
266266

267267
// Handles retries by calling restoreStateWithoutRetrying() in a loop.
268-
uassertStatusOK(_yieldPolicy->yieldOrInterrupt(getOpCtx()));
268+
uassertStatusOK(_yieldPolicy->yieldOrInterrupt(
269+
getOpCtx(), nullptr /* whileYieldingFn */, context.type()));
269270
}
270271
}
271272

@@ -383,7 +384,8 @@ PlanExecutor::ExecState PlanExecutorImpl::_getNextImpl(Snapshotted<Document>* ob
383384
};
384385

385386
if (_yieldPolicy->shouldYieldOrInterrupt(_opCtx)) {
386-
uassertStatusOK(_yieldPolicy->yieldOrInterrupt(_opCtx, whileYieldingFn));
387+
uassertStatusOK(_yieldPolicy->yieldOrInterrupt(
388+
_opCtx, whileYieldingFn, RestoreContext::RestoreType::kYield));
387389
}
388390

389391
WorkingSetID id = WorkingSet::INVALID_ID;

src/mongo/db/query/plan_insert_listener.cpp

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -114,16 +114,20 @@ void waitForInserts(OperationContext* opCtx,
114114
ON_BLOCK_EXIT([curOp] { curOp->resumeTimer(); });
115115

116116
notifier->prepareForWait(opCtx);
117-
auto yieldResult = yieldPolicy->yieldOrInterrupt(opCtx, [opCtx, &notifier] {
118-
const auto deadline = awaitDataState(opCtx).waitForInsertsDeadline;
119-
notifier->waitUntil(opCtx, deadline);
120-
if (MONGO_unlikely(planExecutorHangWhileYieldedInWaitForInserts.shouldFail())) {
121-
LOGV2(4452903,
122-
"PlanExecutor - planExecutorHangWhileYieldedInWaitForInserts fail point enabled. "
123-
"Blocking until fail point is disabled");
124-
planExecutorHangWhileYieldedInWaitForInserts.pauseWhileSet();
125-
}
126-
});
117+
auto yieldResult = yieldPolicy->yieldOrInterrupt(
118+
opCtx,
119+
[opCtx, &notifier] {
120+
const auto deadline = awaitDataState(opCtx).waitForInsertsDeadline;
121+
notifier->waitUntil(opCtx, deadline);
122+
if (MONGO_unlikely(planExecutorHangWhileYieldedInWaitForInserts.shouldFail())) {
123+
LOGV2(4452903,
124+
"PlanExecutor - planExecutorHangWhileYieldedInWaitForInserts fail point "
125+
"enabled. "
126+
"Blocking until fail point is disabled");
127+
planExecutorHangWhileYieldedInWaitForInserts.pauseWhileSet();
128+
}
129+
},
130+
RestoreContext::RestoreType::kYield);
127131
notifier->doneWaiting(opCtx);
128132

129133
uassertStatusOK(yieldResult);

0 commit comments

Comments
 (0)