Skip to content

Commit b887bba

Browse files
authored
Add new catchup mode to use transaction results to skip failed transaction and signature verification (#4536)
# Description Resolves [#X](#2814 (comment)) Adds a new config option, `CATCHUP_SKIP_KNOWN_RESULTS_FOR_TESTING`. When this config option is enabled, transaction results are downloaded from history archives for the catchup range. Failed transactions are not applied, and signatures are not verified. This mode is only available in test builds. The plan is to make this mode configurable when launching supercluster runs from jenkins, with it being enabled by default but disabled specifically for release validation when we want a more comprehensive catchup. I'll raise a separate PR for that. ## Perf testing ### Locally running catchup on 1000 ledgers: ``` user/system/total time seconds *Baseline (no skipping):* 429 / 115 / 138s *Skip Failed:* 373 / 99 / 114s (1.14x / 1.16x / 1.21x speedup over baseline) *Skip Failed + verification:* 334 / 88 / 95s (1.28x / 1.30x / 1.45x speedup over baseline) ``` ### Supercluster PubnetParallelCatchup Completed in 14h 47min (vs ~24 hours with recent releases / master HEAD). https://buildmeister-v3.stellar-ops.com/job/Core/job/stellar-supercluster/1055/ <!--- Describe what this pull request does, which issue it's resolving (usually applicable for code changes). ---> # Checklist - [ ] Reviewed the [contributing](https://github.com/stellar/stellar-core/blob/master/CONTRIBUTING.md#submitting-changes) document - [ ] Rebased on top of master (no merge commits) - [ ] Ran `clang-format` v8.0.0 (via `make format` or the Visual Studio extension) - [ ] Compiles - [ ] Ran all tests - [ ] If change impacts performance, include supporting evidence per the [performance document](https://github.com/stellar/stellar-core/blob/master/performance-eval/performance-eval.md)
2 parents 7458426 + 47caaff commit b887bba

19 files changed

+476
-78
lines changed

src/catchup/ApplyCheckpointWork.cpp

Lines changed: 82 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,42 @@ ApplyCheckpointWork::openInputFiles()
9393
mTxIn.open(ti.localPath_nogz());
9494
mTxHistoryEntry = TransactionHistoryEntry();
9595
mHeaderHistoryEntry = LedgerHeaderHistoryEntry();
96+
#ifdef BUILD_TESTS
97+
if (mApp.getConfig().CATCHUP_SKIP_KNOWN_RESULTS_FOR_TESTING)
98+
{
99+
mTxResultIn = std::make_optional<XDRInputFileStream>();
100+
FileTransferInfo tri(mDownloadDir, FileType::HISTORY_FILE_TYPE_RESULTS,
101+
mCheckpoint);
102+
if (!tri.localPath_nogz().empty() &&
103+
std::filesystem::exists(tri.localPath_nogz()))
104+
{
105+
CLOG_DEBUG(History, "Replaying transaction results from {}",
106+
tri.localPath_nogz());
107+
108+
try
109+
{
110+
mTxResultIn->open(tri.localPath_nogz());
111+
}
112+
catch (std::exception const& e)
113+
{
114+
CLOG_DEBUG(History,
115+
"Failed to open transaction results file: {}. All "
116+
"transactions will be applied.",
117+
e.what());
118+
}
119+
mTxHistoryResultEntry =
120+
std::make_optional<TransactionHistoryResultEntry>();
121+
}
122+
else
123+
{
124+
CLOG_DEBUG(History,
125+
"Results file {} not found for checkpoint {} . All "
126+
"transactions will be applied for this checkpoint.",
127+
tri.localPath_nogz(), mCheckpoint);
128+
mTxHistoryResultEntry = std::nullopt;
129+
}
130+
}
131+
#endif
96132
mFilesOpen = true;
97133
}
98134

@@ -138,6 +174,43 @@ ApplyCheckpointWork::getCurrentTxSet()
138174
return TxSetXDRFrame::makeEmpty(lm.getLastClosedLedgerHeader());
139175
}
140176

177+
#ifdef BUILD_TESTS
178+
std::optional<TransactionResultSet>
179+
ApplyCheckpointWork::getCurrentTxResultSet()
180+
{
181+
ZoneScoped;
182+
auto& lm = mApp.getLedgerManager();
183+
auto seq = lm.getLastClosedLedgerNum() + 1;
184+
// Check mTxResultSet prior to loading next result set.
185+
// This order is important because it accounts for ledger "gaps"
186+
// in the history archives (which are caused by ledgers with empty tx
187+
// sets, as those are not uploaded).
188+
while (mTxResultIn && mTxResultIn->readOne(*mTxHistoryResultEntry))
189+
{
190+
if (mTxHistoryResultEntry)
191+
{
192+
if (mTxHistoryResultEntry->ledgerSeq < seq)
193+
{
194+
CLOG_DEBUG(History, "Advancing past txresultset for ledger {}",
195+
mTxHistoryResultEntry->ledgerSeq);
196+
}
197+
else if (mTxHistoryResultEntry->ledgerSeq > seq)
198+
{
199+
break;
200+
}
201+
else
202+
{
203+
releaseAssert(mTxHistoryResultEntry->ledgerSeq == seq);
204+
CLOG_DEBUG(History, "Loaded txresultset for ledger {}", seq);
205+
return std::make_optional(mTxHistoryResultEntry->txResultSet);
206+
}
207+
}
208+
}
209+
CLOG_DEBUG(History, "No txresultset for ledger {}", seq);
210+
return std::nullopt;
211+
}
212+
#endif // BUILD_TESTS
213+
141214
std::shared_ptr<LedgerCloseData>
142215
ApplyCheckpointWork::getNextLedgerCloseData()
143216
{
@@ -216,6 +289,14 @@ ApplyCheckpointWork::getNextLedgerCloseData()
216289
CLOG_DEBUG(History, "Ledger {} has {} transactions", header.ledgerSeq,
217290
txset->sizeTxTotal());
218291

292+
std::optional<TransactionResultSet> txres = std::nullopt;
293+
#ifdef BUILD_TESTS
294+
if (mApp.getConfig().CATCHUP_SKIP_KNOWN_RESULTS_FOR_TESTING)
295+
{
296+
txres = getCurrentTxResultSet();
297+
}
298+
#endif
299+
219300
// We've verified the ledgerHeader (in the "trusted part of history"
220301
// sense) in CATCHUP_VERIFY phase; we now need to check that the
221302
// txhash we're about to apply is the one denoted by that ledger
@@ -246,7 +327,7 @@ ApplyCheckpointWork::getNextLedgerCloseData()
246327

247328
return std::make_shared<LedgerCloseData>(
248329
header.ledgerSeq, txset, header.scpValue,
249-
std::make_optional<Hash>(mHeaderHistoryEntry.hash));
330+
std::make_optional<Hash>(mHeaderHistoryEntry.hash), txres);
250331
}
251332

252333
BasicWork::State

src/catchup/ApplyCheckpointWork.h

Lines changed: 21 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -21,20 +21,24 @@ class TmpDir;
2121
struct LedgerHeaderHistoryEntry;
2222

2323
/**
24-
* This class is responsible for applying transactions stored in files on
25-
* temporary directory (downloadDir) to local ledger. It requires two sets of
26-
* files - ledgers and transactions - int .xdr format. Transaction files are
27-
* used to read transactions that will be used and ledger files are used to
24+
* This class is responsible for applying transactions stored in files in the
25+
* temporary directory (downloadDir) to the local ledger. It requires two sets
26+
* of files - ledgers and transactions - in .xdr format. Transaction files are
27+
* used to read transactions that will be applied and ledger files are used to
2828
* check if ledger hashes are matching.
2929
*
30+
* It may also require a third set of files - transaction results - to use in
31+
* accelerated replay, where failed transactions are not applied and successful
32+
* transactions are applied without verifying their signatures.
33+
*
3034
* In each run it skips or applies transactions from one ledger. Skipping occurs
31-
* when ledger to be applied is older than LCL from local ledger. At LCL
32-
* boundary checks are made to confirm that ledgers from files knit up with
33-
* LCL. If everything is OK, an apply ledger operation is performed. Then
34-
* another check is made - if new local ledger matches corresponding ledger from
35-
* file.
35+
* when the ledger to be applied is older than the LCL of the local ledger. At
36+
* LCL, boundary checks are made to confirm that the ledgers from the files knit
37+
* up with LCL. If everything is OK, an apply ledger operation is performed.
38+
* Then another check is made - if the new local ledger matches the
39+
* corresponding ledger from file.
3640
*
37-
* Constructor of this class takes some important parameters:
41+
* The constructor of this class takes some important parameters:
3842
* * downloadDir - directory containing ledger and transaction files
3943
* * range - LedgerRange to apply, must be checkpoint-aligned,
4044
* and cover at most one checkpoint.
@@ -49,6 +53,10 @@ class ApplyCheckpointWork : public BasicWork
4953
XDRInputFileStream mHdrIn;
5054
XDRInputFileStream mTxIn;
5155
TransactionHistoryEntry mTxHistoryEntry;
56+
#ifdef BUILD_TESTS
57+
std::optional<XDRInputFileStream> mTxResultIn;
58+
std::optional<TransactionHistoryResultEntry> mTxHistoryResultEntry;
59+
#endif // BUILD_TESTS
5260
LedgerHeaderHistoryEntry mHeaderHistoryEntry;
5361
OnFailureCallback mOnFailure;
5462

@@ -57,6 +65,9 @@ class ApplyCheckpointWork : public BasicWork
5765
std::shared_ptr<ConditionalWork> mConditionalWork;
5866

5967
TxSetXDRFrameConstPtr getCurrentTxSet();
68+
#ifdef BUILD_TESTS
69+
std::optional<TransactionResultSet> getCurrentTxResultSet();
70+
#endif // BUILD_TESTS
6071
void openInputFiles();
6172

6273
std::shared_ptr<LedgerCloseData> getNextLedgerCloseData();

src/catchup/CatchupConfiguration.h

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
namespace stellar
1414
{
1515

16-
// Each catchup can be configured by two parameters destination ledger
16+
// Each catchup can be configured by two parameters: destination ledger
1717
// (and its hash, if known) and count of ledgers to apply.
1818
// Value of count can be adjusted in different ways during catchup. If applying
1919
// count ledgers would mean going before the last closed ledger - it is
@@ -31,12 +31,13 @@ namespace stellar
3131
// and catchup to that instead of destination ledger. This is useful when
3232
// doing offline commandline catchups with stellar-core catchup command.
3333
//
34-
// Catchup can be done in two modes - ONLINE nad OFFLINE. In ONLINE mode node
35-
// is connected to the network. If receives ledgers during catchup and applies
36-
// them after history is applied. Also additional closing ledger is required
37-
// to mark catchup as complete and node as synced. In OFFLINE mode node is not
38-
// connected to network, so new ledgers are not being externalized. Only
39-
// buckets and transactions from history archives are applied.
34+
// Catchup can be done in two modes - ONLINE and OFFLINE. In ONLINE mode, the
35+
// node is connected to the network. It receives ledgers during catchup and
36+
// applies them after history is applied. Also, an additional closing ledger is
37+
// required to mark catchup as complete and the node as synced. In OFFLINE mode,
38+
// the node is not connected to the network, so new ledgers are not being
39+
// externalized. Only buckets and transactions from history archives are
40+
// applied.
4041
class CatchupConfiguration
4142
{
4243
public:

src/catchup/CatchupWork.h

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -24,22 +24,22 @@ using WorkSeqPtr = std::shared_ptr<WorkSequence>;
2424

2525
// CatchupWork does all the necessary work to perform any type of catchup.
2626
// It accepts CatchupConfiguration structure to know from which ledger to which
27-
// one do the catchup and if it involves only applying ledgers or ledgers and
27+
// one to do the catchup and if it involves only applying ledgers or ledgers and
2828
// buckets.
2929
//
30-
// First thing it does is to get a history state which allows to calculate
31-
// proper destination ledger (in case CatchupConfiguration::CURRENT) was used
32-
// and to get list of buckets that should be in database on that ledger.
30+
// First, it gets a history state, which allows it to calculate a
31+
// proper destination ledger (in case CatchupConfiguration::CURRENT was used)
32+
// and get a list of buckets that should be in the database on that ledger.
3333
//
34-
// Next step is downloading and verifying ledgers (if verifyMode is set to
35-
// VERIFY_BUFFERED_LEDGERS it can also verify against ledgers currently
34+
// Next, it downloads and verifies ledgers (if verifyMode is set to
35+
// VERIFY_BUFFERED_LEDGERS, it can also verify against ledgers currently
3636
// buffered in LedgerManager).
3737
//
3838
// Then, depending on configuration, it can download, verify and apply buckets
3939
// (as in MINIMAL and RECENT catchups), and then download and apply
4040
// transactions (as in COMPLETE and RECENT catchups).
4141
//
42-
// After that, catchup is done and node can replay buffered ledgers and take
42+
// After that, catchup is done and the node can replay buffered ledgers and take
4343
// part in consensus protocol.
4444

4545
class CatchupWork : public Work

src/catchup/DownloadApplyTxsWork.cpp

Lines changed: 71 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,6 @@ DownloadApplyTxsWork::yieldMoreWork()
4343
{
4444
throw std::runtime_error("Work has no more children to iterate over!");
4545
}
46-
4746
CLOG_INFO(History,
4847
"Downloading, unzipping and applying {} for checkpoint {}",
4948
typeString(FileType::HISTORY_FILE_TYPE_TRANSACTIONS),
@@ -80,6 +79,53 @@ DownloadApplyTxsWork::yieldMoreWork()
8079
mApp, mDownloadDir, LedgerRange::inclusive(low, high), cb);
8180

8281
std::vector<std::shared_ptr<BasicWork>> seq{getAndUnzip};
82+
std::vector<FileTransferInfo> filesToTransfer{ft};
83+
std::vector<std::shared_ptr<BasicWork>> optionalDownloads;
84+
#ifdef BUILD_TESTS
85+
if (mApp.getConfig().CATCHUP_SKIP_KNOWN_RESULTS_FOR_TESTING)
86+
{
87+
CLOG_INFO(History,
88+
"Downloading, unzipping and applying {} for checkpoint {}",
89+
typeString(FileType::HISTORY_FILE_TYPE_RESULTS),
90+
mCheckpointToQueue);
91+
92+
FileTransferInfo resultsFile(mDownloadDir,
93+
FileType::HISTORY_FILE_TYPE_RESULTS,
94+
mCheckpointToQueue);
95+
auto getResultsWork = std::make_shared<GetAndUnzipRemoteFileWork>(
96+
mApp, resultsFile, mArchive, /*logErrorOnFailure=*/false);
97+
std::weak_ptr<GetAndUnzipRemoteFileWork> getResultsWorkWeak =
98+
getResultsWork;
99+
seq.emplace_back(getResultsWork);
100+
seq.emplace_back(std::make_shared<WorkWithCallback>(
101+
mApp, "get-results-" + std::to_string(mCheckpointToQueue),
102+
[apply, getResultsWorkWeak, checkpoint, &dir](Application& app) {
103+
auto getResults = getResultsWorkWeak.lock();
104+
if (getResults && getResults->getState() != State::WORK_SUCCESS)
105+
{
106+
auto archive = getResults->getArchive();
107+
if (archive)
108+
{
109+
FileTransferInfo ti(dir,
110+
FileType::HISTORY_FILE_TYPE_RESULTS,
111+
checkpoint);
112+
CLOG_WARNING(
113+
History,
114+
"Archive {} maybe contains corrupt results file "
115+
"{}. "
116+
"This is not fatal as long as the archive contains "
117+
"valid transaction history. Catchup will proceed "
118+
"but"
119+
"the node will not be able to skip known results.",
120+
archive->getName(), ti.remoteName());
121+
}
122+
}
123+
return true;
124+
}));
125+
126+
filesToTransfer.push_back(resultsFile);
127+
}
128+
#endif // BUILD_TESTS
83129

84130
auto maybeWaitForMerges = [](Application& app) {
85131
if (app.getConfig().CATCHUP_WAIT_MERGES_TX_APPLY_FOR_TESTING)
@@ -139,28 +185,34 @@ DownloadApplyTxsWork::yieldMoreWork()
139185
mApp, "wait-merges" + apply->getName(), maybeWaitForMerges, apply));
140186
}
141187

142-
seq.push_back(std::make_shared<WorkWithCallback>(
143-
mApp, "delete-transactions-" + std::to_string(mCheckpointToQueue),
144-
[ft](Application& app) {
145-
try
146-
{
147-
std::filesystem::remove(
148-
std::filesystem::path(ft.localPath_nogz()));
149-
CLOG_DEBUG(History, "Deleted transactions {}",
188+
for (auto const& ft : filesToTransfer)
189+
{
190+
auto deleteWorkName = "delete-" + ft.getTypeString() + "-" +
191+
std::to_string(mCheckpointToQueue);
192+
seq.push_back(std::make_shared<WorkWithCallback>(
193+
mApp, deleteWorkName, [ft](Application& app) {
194+
CLOG_DEBUG(History, "Deleting {} {}", ft.getTypeString(),
150195
ft.localPath_nogz());
196+
try
197+
{
198+
std::filesystem::remove(
199+
std::filesystem::path(ft.localPath_nogz()));
200+
CLOG_DEBUG(History, "Deleted {} {}", ft.getTypeString(),
201+
ft.localPath_nogz());
202+
}
203+
catch (std::filesystem::filesystem_error const& e)
204+
{
205+
CLOG_ERROR(History, "Could not delete {} {}: {}",
206+
ft.getTypeString(), ft.localPath_nogz(),
207+
e.what());
208+
return false;
209+
}
151210
return true;
152-
}
153-
catch (std::filesystem::filesystem_error const& e)
154-
{
155-
CLOG_ERROR(History, "Could not delete transactions {}: {}",
156-
ft.localPath_nogz(), e.what());
157-
return false;
158-
}
159-
}));
160-
211+
}));
212+
}
161213
auto nextWork = std::make_shared<WorkSequence>(
162214
mApp, "download-apply-" + std::to_string(mCheckpointToQueue), seq,
163-
BasicWork::RETRY_NEVER);
215+
BasicWork::RETRY_NEVER, true /*stop at first failure*/);
164216
mCheckpointToQueue += mApp.getHistoryManager().getCheckpointFrequency();
165217
mLastYieldedWork = nextWork;
166218
return nextWork;

src/herder/LedgerCloseData.cpp

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,21 @@ LedgerCloseData::LedgerCloseData(uint32_t ledgerSeq,
2626
releaseAssert(txSet->getContentsHash() == mValue.txSetHash);
2727
}
2828

29+
#ifdef BUILD_TESTS
30+
LedgerCloseData::LedgerCloseData(
31+
uint32_t ledgerSeq, TxSetXDRFrameConstPtr txSet, StellarValue const& v,
32+
std::optional<Hash> const& expectedLedgerHash,
33+
std::optional<TransactionResultSet> const& expectedResults)
34+
: mLedgerSeq(ledgerSeq)
35+
, mTxSet(txSet)
36+
, mValue(v)
37+
, mExpectedLedgerHash(expectedLedgerHash)
38+
, mExpectedResults(expectedResults)
39+
{
40+
releaseAssert(txSet->getContentsHash() == mValue.txSetHash);
41+
}
42+
#endif // BUILD_TESTS
43+
2944
std::string
3045
stellarValueToString(Config const& c, StellarValue const& sv)
3146
{

0 commit comments

Comments
 (0)