Skip to content

Commit e0ac218

Browse files
DamianWasilewiczMongoDB Bot
authored andcommitted
SERVER-99883 Implement buildBatchedInsertContexts (#31848)
GitOrigin-RevId: 6b4bb91
1 parent 22eecb9 commit e0ac218

File tree

6 files changed

+532
-1
lines changed

6 files changed

+532
-1
lines changed

src/mongo/db/timeseries/bucket_catalog/bucket_catalog.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,18 @@ BucketCatalog::BucketCatalog(size_t numberOfStripes, std::function<uint64_t()> m
175175
});
176176
}
177177

178+
/**
 * Builds a BatchedInsertContext from its constituent parts.
 *
 * Ownership notes for callers:
 *  - 'bucketKey' is taken by non-const reference and MOVED FROM; the caller's object must not be
 *    relied upon after this call.
 *  - 'options' is bound to a reference member, so the referenced TimeseriesOptions must outlive
 *    the constructed context.
 *  - 'stats' and 'measurementsTimesAndIndices' are copied into the context; the caller's objects
 *    are left untouched.
 */
BatchedInsertContext::BatchedInsertContext(
    BucketKey& bucketKey,
    StripeNumber stripeNumber,
    const TimeseriesOptions& options,
    ExecutionStatsController& stats,
    std::vector<BatchedInsertTuple>& measurementsTimesAndIndices)
    : key(std::move(bucketKey)),
      stripeNumber(stripeNumber),
      options(options),
      stats(stats),
      measurementsTimesAndIndices(measurementsTimesAndIndices) {}
189+
178190
BSONObj getMetadata(BucketCatalog& catalog, const BucketId& bucketId) {
179191
auto const& stripe = *catalog.stripes[internal::getStripeNumber(catalog, bucketId)];
180192
stdx::lock_guard stripeLock{stripe.mutex};

src/mongo/db/timeseries/bucket_catalog/bucket_catalog.h

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,11 +63,15 @@ namespace mongo::timeseries::bucket_catalog {
6363

6464
using StripeNumber = std::uint8_t;
6565
using ShouldClearFn = std::function<bool(const UUID&)>;
66+
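// Tuple that stores a measurement, the time value for that measurement, and the index of the
67+
// measurement from the original insert request. Binding these values together lets a batch be
// sorted on measurement time while still tracking each measurement's original index in the user
// batch for error reporting.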
68+
using BatchedInsertTuple = std::tuple<BSONObj, Date_t, size_t>;
6669

6770
/**
6871
* Bundle of information that gets passed down into 'insert' and functions below it that may create
6972
* a new bucket. It stores information that is used to decide which bucket to insert a measurement
70-
* into.
73+
* into. Binding these values together is used to sort on measurement timestamps and keep track of
74+
* the original index in the user batch for error reporting.
7175
*/
7276
struct InsertContext {
7377
BucketKey key;
@@ -80,6 +84,26 @@ struct InsertContext {
8084
};
8185
};
8286

87+
/**
88+
* Represents a set of measurements that should target one bucket. The measurements contained in
89+
* this struct are a best-effort guess at a grouping based on, but not intended to be a guarantee as
90+
* to, what will fit in a bucket. The measurements stored within the struct should be sorted on
91+
* time, and are guaranteed only to share a metaField value.
92+
*/
93+
struct BatchedInsertContext {
94+
BucketKey key;
95+
StripeNumber stripeNumber;
96+
const TimeseriesOptions& options;
97+
ExecutionStatsController stats;
98+
std::vector<BatchedInsertTuple> measurementsTimesAndIndices;
99+
100+
BatchedInsertContext(BucketKey&,
101+
StripeNumber,
102+
const TimeseriesOptions&,
103+
ExecutionStatsController&,
104+
std::vector<BatchedInsertTuple>&);
105+
};
106+
83107
/**
84108
* Return type indicating that a call to 'insert' or 'tryInsert' successfully staged the input
85109
* measurement for insertion. See 'insert' and 'tryInsert' for more information.

src/mongo/db/timeseries/write_ops/internal/BUILD.bazel

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,3 +31,17 @@ mongo_cc_library(
3131
"//src/mongo/db/transaction",
3232
],
3333
)
34+
35+
# Unit test target built from timeseries_write_ops_internal_test.cpp.
mongo_cc_unit_test(
36+
name = "db_timeseries_write_ops_internal_test",
37+
srcs = [
38+
"timeseries_write_ops_internal_test.cpp",
39+
],
40+
tags = ["mongo_unittest_fourth_group"],
41+
deps = [
42+
"//src/mongo/db/catalog:catalog_test_fixture",
43+
"//src/mongo/db/collection_crud",
44+
"//src/mongo/db/query/write_ops:write_ops_exec",
45+
"//src/mongo/db/timeseries/write_ops:timeseries_write_ops",
46+
],
47+
)

src/mongo/db/timeseries/write_ops/internal/timeseries_write_ops_internal.cpp

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@
3434
#include "mongo/db/query/write_ops/write_ops_exec_util.h"
3535
#include "mongo/db/shard_role.h"
3636
#include "mongo/db/storage/storage_parameters_gen.h"
37+
#include "mongo/db/timeseries/bucket_catalog/bucket_catalog_helpers.h"
38+
#include "mongo/db/timeseries/bucket_catalog/bucket_catalog_internal.h"
3739
#include "mongo/db/timeseries/bucket_catalog/global_bucket_catalog.h"
3840
#include "mongo/db/timeseries/bucket_compression.h"
3941
#include "mongo/db/timeseries/bucket_compression_failure.h"
@@ -921,6 +923,138 @@ size_t performOrderedTimeseriesWrites(OperationContext* opCtx,
921923
return request.getDocuments().size();
922924
}
923925

926+
/**
 * Builds the single BatchedInsertContext for a batch written to a collection without a metaField.
 *
 * Every measurement is paired with its extracted time value and its index in
 * 'userMeasurementsBatch' (kept for error reporting and retryability), and the resulting tuples
 * are sorted on time. Measurements with equal time values keep their relative order from the
 * user batch.
 *
 * Returns the error Status from 'extractTime' for the first measurement whose time cannot be
 * extracted. On success the returned vector always holds exactly one context, even when
 * 'userMeasurementsBatch' is empty.
 */
StatusWith<std::vector<bucket_catalog::BatchedInsertContext>> buildBatchedInsertContextsNoMetaField(
    const bucket_catalog::BucketCatalog& bucketCatalog,
    const UUID& collectionUUID,
    const TimeseriesOptions& timeseriesOptions,
    const std::vector<BSONObj>& userMeasurementsBatch,
    bucket_catalog::ExecutionStatsController& stats,
    tracking::Context& trackingContext) {

    std::vector<bucket_catalog::BatchedInsertTuple> batchedInsertTupleVector;
    batchedInsertTupleVector.reserve(userMeasurementsBatch.size());

    // As part of the BatchedInsertTuple we store the index of the measurement in the original
    // user batch for error reporting and retryability purposes.
    for (size_t i = 0; i < userMeasurementsBatch.size(); ++i) {
        const auto& measurement = userMeasurementsBatch[i];
        auto swTime = bucket_catalog::extractTime(measurement, timeseriesOptions.getTimeField());
        if (!swTime.isOK()) {
            return swTime.getStatus();
        }
        batchedInsertTupleVector.emplace_back(measurement, swTime.getValue(), i);
    }

    // Empty metadata.
    BSONElement metadata;
    auto bucketKey = bucket_catalog::BucketKey{
        collectionUUID, bucket_catalog::BucketMetadata{trackingContext, metadata, boost::none}};
    auto stripeNumber = bucket_catalog::internal::getStripeNumber(bucketCatalog, bucketKey);

    // Sort measurements on their timeField. A stable sort keeps the user's ordering for
    // measurements that share a timestamp, so the result is deterministic.
    std::stable_sort(batchedInsertTupleVector.begin(),
                     batchedInsertTupleVector.end(),
                     [](const auto& lhs, const auto& rhs) {
                         return std::get<Date_t>(lhs) < std::get<Date_t>(rhs);
                     });

    // Construct the context in place: a braced initializer list would copy the whole context
    // (and with it every measurement tuple) a second time.
    std::vector<bucket_catalog::BatchedInsertContext> batchedInsertContexts;
    batchedInsertContexts.emplace_back(
        bucketKey, stripeNumber, timeseriesOptions, stats, batchedInsertTupleVector);
    return batchedInsertContexts;
}
962+
963+
/**
 * Splits a batch of measurements into one BatchedInsertContext per distinct metaField value.
 *
 * Each measurement is paired with its extracted time value and its index in
 * 'userMeasurementsBatch' (kept for error reporting and retryability). Within a context the
 * tuples are sorted on time, with ties keeping their order from the user batch. Contexts are
 * returned in the order in which their metaField value first appears in the batch.
 *
 * Grouping is keyed on the raw bytes of the extracted metaField element rather than on
 * BSONElement::String(), which asserts the element is a string and therefore fails for object,
 * numeric or missing metaField values. Byte-wise grouping is best effort: values that differ in
 * their bytes always land in separate contexts here.
 * NOTE(review): if BucketMetadata normalizes such values to the same key, several contexts may
 * share a BucketKey - confirm that downstream consumers tolerate this.
 *
 * Returns the error Status from 'extractTimeAndMeta' for the first measurement it rejects.
 */
StatusWith<std::vector<bucket_catalog::BatchedInsertContext>>
buildBatchedInsertContextsWithMetaField(const bucket_catalog::BucketCatalog& bucketCatalog,
                                        const UUID& collectionUUID,
                                        const TimeseriesOptions& timeseriesOptions,
                                        const std::vector<BSONObj>& userMeasurementsBatch,
                                        StringData metaFieldName,
                                        bucket_catalog::ExecutionStatsController& stats,
                                        tracking::Context& trackingContext) {
    // All measurements sharing one metaField value, plus a representative element for that value.
    // The element points into the owning measurement in 'userMeasurementsBatch'.
    struct MetaGroup {
        BSONElement meta;
        std::vector<bucket_catalog::BatchedInsertTuple> tuples;
    };

    // Groups in first-seen order, and a lookup from the metaField element's bytes to the group's
    // position. Workaround for the fact that BSONElements are not hashable.
    std::vector<MetaGroup> groups;
    stdx::unordered_map<std::string, size_t> groupIndexByMetaBytes;

    for (size_t i = 0; i < userMeasurementsBatch.size(); ++i) {
        const auto& measurement = userMeasurementsBatch[i];
        auto swTimeAndMeta = bucket_catalog::extractTimeAndMeta(
            measurement, timeseriesOptions.getTimeField(), metaFieldName);
        if (!swTimeAndMeta.isOK()) {
            return swTimeAndMeta.getStatus();
        }
        const auto time = std::get<Date_t>(swTimeAndMeta.getValue());
        const auto meta = std::get<BSONElement>(swTimeAndMeta.getValue());

        // NOTE(review): relies on rawdata()/size() spanning the whole element (type byte, field
        // name and value) - confirm against the BSONElement header.
        std::string metaBytes(meta.rawdata(), static_cast<size_t>(meta.size()));

        // Single lookup: either finds the existing group or registers a new one at the end.
        auto [it, inserted] = groupIndexByMetaBytes.try_emplace(std::move(metaBytes), groups.size());
        if (inserted) {
            groups.push_back(MetaGroup{meta, {}});
        }
        groups[it->second].tuples.emplace_back(measurement, time, i);
    }

    std::vector<bucket_catalog::BatchedInsertContext> batchedInsertContexts;
    batchedInsertContexts.reserve(groups.size());

    // Go through all meta-unique batches, sort by time, and fill result.
    for (auto& group : groups) {
        // Sort measurements on their timeField; stable so equal timestamps keep batch order.
        std::stable_sort(group.tuples.begin(),
                         group.tuples.end(),
                         [](const auto& lhs, const auto& rhs) {
                             return std::get<Date_t>(lhs) < std::get<Date_t>(rhs);
                         });

        auto bucketKey = bucket_catalog::BucketKey{
            collectionUUID,
            bucket_catalog::BucketMetadata{trackingContext, group.meta, boost::none}};
        auto stripeNumber = bucket_catalog::internal::getStripeNumber(bucketCatalog, bucketKey);

        // Construct in place instead of building a temporary context and moving it.
        batchedInsertContexts.emplace_back(
            bucketKey, stripeNumber, timeseriesOptions, stats, group.tuples);
    }

    return batchedInsertContexts;
}
1023+
1024+
/**
 * Entry point for batching: dispatches to the metaField or no-metaField builder depending on
 * whether 'timeseriesOptions' has a metaField configured, and returns that builder's result
 * (either the vector of contexts or the error Status) unchanged.
 *
 * Looks up the tracking context for open buckets and gets (or initializes) the execution stats
 * for 'collectionUUID' before dispatching; both are forwarded to the chosen builder.
 */
StatusWith<std::vector<bucket_catalog::BatchedInsertContext>> buildBatchedInsertContexts(
    bucket_catalog::BucketCatalog& bucketCatalog,
    const UUID& collectionUUID,
    const TimeseriesOptions& timeseriesOptions,
    const std::vector<BSONObj>& userMeasurementsBatch) {

    auto& trackingContext = bucket_catalog::getTrackingContext(
        bucketCatalog.trackingContexts, bucket_catalog::TrackingScope::kOpenBucketsByKey);
    auto stats =
        bucket_catalog::internal::getOrInitializeExecutionStats(bucketCatalog, collectionUUID);

    // Return the helper's StatusWith directly. Unpacking it with getValue() and re-wrapping it
    // would deep-copy every context (and every measurement tuple) for no benefit.
    if (const auto metaFieldName = timeseriesOptions.getMetaField()) {
        return buildBatchedInsertContextsWithMetaField(bucketCatalog,
                                                       collectionUUID,
                                                       timeseriesOptions,
                                                       userMeasurementsBatch,
                                                       metaFieldName.get(),
                                                       stats,
                                                       trackingContext);
    }

    return buildBatchedInsertContextsNoMetaField(bucketCatalog,
                                                 collectionUUID,
                                                 timeseriesOptions,
                                                 userMeasurementsBatch,
                                                 stats,
                                                 trackingContext);
}
1057+
9241058
} // namespace internal
9251059

9261060
} // namespace mongo::timeseries::write_ops

src/mongo/db/timeseries/write_ops/internal/timeseries_write_ops_internal.h

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,4 +91,60 @@ bool commitTimeseriesBucket(OperationContext* opCtx,
9191
absl::flat_hash_map<int, int>& retryAttemptsForDup,
9292
const mongo::write_ops::InsertCommandRequest& request);
9393

94+
/**
95+
* Given a batch of user measurements for a collection that does not have a metaField value, returns
96+
* a BatchedInsertContext for all of the user measurements.
97+
*
98+
* Passes through the inputted measurements twice, once to record the index of the measurement in
99+
* the original user batch for error reporting, and then again to sort the measurements based on
100+
* their time field.
101+
*
102+
* This is slightly more efficient and requires fewer maps/data structures than the metaField
103+
* variant, because we do not need to split up the measurements into different batches according to
104+
* their metaField value.
105+
*/
106+
StatusWith<std::vector<bucket_catalog::BatchedInsertContext>> buildBatchedInsertContextsNoMetaField(
107+
const bucket_catalog::BucketCatalog& bucketCatalog,
108+
const UUID& collectionUUID,
109+
const TimeseriesOptions& timeseriesOptions,
110+
const std::vector<BSONObj>& userMeasurementsBatch,
111+
bucket_catalog::ExecutionStatsController& stats,
112+
tracking::Context& trackingContext);
113+
114+
115+
/**
116+
* Given a batch of user measurements for a collection that does have a metaField value, returns a
117+
* vector of BatchedInsertContexts with each BatchedInsertContext storing the measurements for a
118+
* particular metaField value.
119+
*
120+
* Passes through the inputted measurements twice, once to record the index of the measurement in
121+
* the original user batch for error reporting, and then again to sort the measurements based on
122+
* their time field.
123+
*/
124+
StatusWith<std::vector<bucket_catalog::BatchedInsertContext>>
125+
buildBatchedInsertContextsWithMetaField(const bucket_catalog::BucketCatalog& bucketCatalog,
126+
const UUID& collectionUUID,
127+
const TimeseriesOptions& timeseriesOptions,
128+
const std::vector<BSONObj>& userMeasurementsBatch,
129+
StringData metaFieldName,
130+
bucket_catalog::ExecutionStatsController& stats,
131+
tracking::Context& trackingContext);
132+
133+
/**
134+
* Given a set of measurements, splits up the measurements into batches based on the metaField.
135+
* Returns a vector of BatchedInsertContext where each BatchedInsertContext will contain the batch
136+
* of measurements for a particular metaField value, sorted on time, as well as other bucket-level
137+
* metadata.
138+
*
139+
* If the time-series collection has no metaField value, then all of the measurements will be
140+
* batched into one BatchedInsertContext.
141+
*
142+
* If any of the inserted measurements are malformed (i.e. missing the proper time field), returns a
143+
* Status with an error code.
144+
*/
145+
StatusWith<std::vector<bucket_catalog::BatchedInsertContext>> buildBatchedInsertContexts(
146+
bucket_catalog::BucketCatalog& bucketCatalog,
147+
const UUID& collectionUUID,
148+
const TimeseriesOptions& timeseriesOptions,
149+
const std::vector<BSONObj>& userMeasurementsBatch);
94150
} // namespace mongo::timeseries::write_ops::internal

0 commit comments

Comments
 (0)