Commit dc55962

fix(snapshot): Refactor the big_value_mu_ mutex
Fixes #4152

Signed-off-by: Stepan Bagritsevich <stefan@dragonflydb.io>
1 parent 966a1a4 commit dc55962

3 files changed: +80 -59 lines
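The theme of the change: instead of each caller locking big_value_mu_ directly (and holding it across unrelated work), every access to serializer_ is funneled through a single CallSerializerUnderLock helper. Below is a minimal sketch of the pattern, with std::mutex and a stub Serializer standing in for util::fb2::ThreadLocalMutex and RdbSerializer:

#include <iostream>
#include <memory>
#include <mutex>

// Stand-in for RdbSerializer; the real class lives in rdb_save.h.
struct Serializer {
  void SendFullSyncCut() { std::cout << "full sync cut\n"; }
};

class Snapshot {
 public:
  Snapshot() : serializer_(std::make_unique<Serializer>()) {}

  // The helper this commit introduces: every serializer_ access takes
  // big_value_mu_ first, so a fiber preempted mid-serialization cannot be
  // interleaved with another writer on the same thread.
  template <typename Callback> void CallSerializerUnderLock(Callback cb) {
    std::lock_guard guard(big_value_mu_);
    cb(serializer_.get());
  }

 private:
  std::mutex big_value_mu_;  // util::fb2::ThreadLocalMutex in the real code
  std::unique_ptr<Serializer> serializer_;
};

int main() {
  Snapshot snapshot;
  snapshot.CallSerializerUnderLock([](Serializer* s) { s->SendFullSyncCut(); });
  return 0;
}

Funneling all access through one lock point matters because big-value serialization can suspend the calling fiber mid-write, as the diffs below make explicit.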

src/server/rdb_save.h

Lines changed: 1 addition & 0 deletions

@@ -225,6 +225,7 @@ class RdbSerializer : public SerializerBase {
   // Must be called in the thread to which `it` belongs.
   // Returns the serialized rdb_type or the error.
   // expire_ms = 0 means no expiry.
+  // This function might preempt if flush_fun_ is used.
   io::Result<uint8_t> SaveEntry(const PrimeKey& pk, const PrimeValue& pv, uint64_t expire_ms,
                                 uint32_t mc_flags, DbIndex dbid);
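The new comment documents a subtle contract: SaveEntry can suspend the calling fiber when the serializer flushes through flush_fun_. A rough illustration of why, using a hypothetical simplified serializer (names and threshold invented for this sketch; the real RdbSerializer API differs):

#include <functional>
#include <string>
#include <utility>

// Hypothetical stand-in for RdbSerializer; only the flush behavior matters.
class Serializer {
 public:
  explicit Serializer(std::function<void(std::string)> flush_fun)
      : flush_fun_(std::move(flush_fun)) {}

  void SaveEntry(const std::string& blob) {
    buffer_ += blob;
    // Once the buffer grows past a threshold, hand it to flush_fun_. In
    // Dragonfly the flush can block on I/O, which yields (preempts) the
    // calling fiber, so callers must not assume SaveEntry is atomic.
    if (flush_fun_ && buffer_.size() >= kFlushThreshold) {
      flush_fun_(std::exchange(buffer_, {}));
    }
  }

 private:
  static constexpr size_t kFlushThreshold = 32 * 1024;  // invented for the sketch
  std::function<void(std::string)> flush_fun_;
  std::string buffer_;
};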

src/server/snapshot.cc

Lines changed: 63 additions & 48 deletions

@@ -38,8 +38,11 @@ constexpr size_t kMinBlobSize = 32_KB;
 
 SliceSnapshot::SliceSnapshot(CompressionMode compression_mode, DbSlice* slice,
                              SnapshotDataConsumerInterface* consumer, Context* cntx)
-    : db_slice_(slice), compression_mode_(compression_mode), consumer_(consumer), cntx_(cntx) {
-  db_array_ = slice->databases();
+    : db_slice_(slice),
+      db_array_(slice->databases()),
+      compression_mode_(compression_mode),
+      consumer_(consumer),
+      cntx_(cntx) {
   tl_slice_snapshots.insert(this);
 }
 

@@ -126,7 +129,9 @@ void SliceSnapshot::FinalizeJournalStream(bool cancel) {
 
   journal->UnregisterOnChange(cb_id);
   if (!cancel) {
-    serializer_->SendJournalOffset(journal->GetLsn());
+    CallSerializerUnderLock([&journal](RdbSerializer* serializer) {
+      serializer->SendJournalOffset(journal->GetLsn());
+    });
     PushSerialized(true);
   }
 }

@@ -163,16 +168,15 @@ void SliceSnapshot::IterateBucketsFb(bool send_full_sync_cut) {
 
     uint64_t last_yield = 0;
     PrimeTable* pt = &db_array_[db_indx]->prime;
-    current_db_ = db_indx;
 
     VLOG(1) << "Start traversing " << pt->size() << " items for index " << db_indx;
     do {
      if (cntx_->IsCancelled()) {
        return;
      }
 
-      PrimeTable::Cursor next =
-          pt->TraverseBuckets(cursor, [this](auto it) { return BucketSaveCb(it); });
+      PrimeTable::Cursor next = pt->TraverseBuckets(
+          cursor, [this, db_indx](auto it) { return BucketSaveCb(db_indx, it); });
      cursor = next;
      PushSerialized(false);
 

@@ -194,7 +198,8 @@ void SliceSnapshot::IterateBucketsFb(bool send_full_sync_cut) {
 
   CHECK(!serialize_bucket_running_);
   if (send_full_sync_cut) {
-    CHECK(!serializer_->SendFullSyncCut());
+    CallSerializerUnderLock(
+        [](RdbSerializer* serializer) { CHECK(!serializer->SendFullSyncCut()); });
     PushSerialized(true);
   }
 

@@ -212,7 +217,9 @@ void SliceSnapshot::SwitchIncrementalFb(LSN lsn) {
 
   // The replica sends the LSN of the next entry it wants to receive.
   while (!cntx_->IsCancelled() && journal->IsLSNInBuffer(lsn)) {
-    serializer_->WriteJournalEntry(journal->GetEntry(lsn));
+    CallSerializerUnderLock([entry = journal->GetEntry(lsn)](RdbSerializer* serializer) {
+      serializer->WriteJournalEntry(entry);
+    });
     PushSerialized(false);
     lsn++;
   }

@@ -229,10 +236,7 @@ void SliceSnapshot::SwitchIncrementalFb(LSN lsn) {
 
   // GetLsn() is always the next lsn that we expect to create.
   if (journal->GetLsn() == lsn) {
-    {
-      FiberAtomicGuard fg;
-      serializer_->SendFullSyncCut();
-    }
+    CallSerializerUnderLock([](RdbSerializer* serializer) { serializer->SendFullSyncCut(); });
     auto journal_cb = [this](const journal::JournalItem& item, bool await) {
       OnJournalEntry(item, await);
     };

@@ -248,9 +252,7 @@ void SliceSnapshot::SwitchIncrementalFb(LSN lsn) {
   }
 }
 
-bool SliceSnapshot::BucketSaveCb(PrimeTable::bucket_iterator it) {
-  std::lock_guard guard(big_value_mu_);
-
+bool SliceSnapshot::BucketSaveCb(DbIndex db_index, PrimeTable::bucket_iterator it) {
   ++stats_.savecb_calls;
 
   auto check = [&](auto v) {

@@ -267,7 +269,7 @@ bool SliceSnapshot::BucketSaveCb(PrimeTable::bucket_iterator it) {
     return false;
   }
 
-  db_slice_->FlushChangeToEarlierCallbacks(current_db_, DbSlice::Iterator::FromPrime(it),
+  db_slice_->FlushChangeToEarlierCallbacks(db_index, DbSlice::Iterator::FromPrime(it),
                                            snapshot_version_);
 
   auto* blocking_counter = db_slice_->BlockingCounter();

@@ -276,7 +278,7 @@ bool SliceSnapshot::BucketSaveCb(PrimeTable::bucket_iterator it) {
   // zero.
   std::lock_guard blocking_counter_guard(*blocking_counter);
 
-  stats_.loop_serialized += SerializeBucket(current_db_, it);
+  stats_.loop_serialized += SerializeBucket(db_index, it);
 
   return false;
 }

@@ -292,20 +294,19 @@ unsigned SliceSnapshot::SerializeBucket(DbIndex db_index, PrimeTable::bucket_iterator bucket_it) {
   while (!it.is_done()) {
     ++result;
     // might preempt due to big value serialization.
-    SerializeEntry(db_index, it->first, it->second, nullopt, serializer_.get());
+    SerializeEntry(db_index, it->first, it->second);
     ++it;
   }
   serialize_bucket_running_ = false;
   return result;
 }
 
-void SliceSnapshot::SerializeEntry(DbIndex db_indx, const PrimeKey& pk, const PrimeValue& pv,
-                                   optional<uint64_t> expire, RdbSerializer* serializer) {
+void SliceSnapshot::SerializeEntry(DbIndex db_indx, const PrimeKey& pk, const PrimeValue& pv) {
   if (pv.IsExternal() && pv.IsCool())
-    return SerializeEntry(db_indx, pk, pv.GetCool().record->value, expire, serializer);
+    return SerializeEntry(db_indx, pk, pv.GetCool().record->value);
 
-  time_t expire_time = expire.value_or(0);
-  if (!expire && pv.HasExpire()) {
+  time_t expire_time = 0;
+  if (pv.HasExpire()) {
     auto eit = db_array_[db_indx]->expire.Find(pk);
     expire_time = db_slice_->ExpireTime(eit);
   }

@@ -318,18 +319,26 @@ void SliceSnapshot::SerializeEntry(DbIndex db_indx, const PrimeKey& pk, const PrimeValue& pv) {
     EngineShard::tlocal()->tiered_storage()->Read(
         db_indx, pk.ToString(), pv,
         [future](const std::string& v) mutable { future.Resolve(PrimeValue(v)); });
-    delayed_entries_.push_back(
-        {db_indx, PrimeKey(pk.ToString()), std::move(future), expire_time, mc_flags});
+
+    auto key = PrimeKey(pk.ToString());
+
+    delayed_entries_.push_back({db_indx, std::move(key), std::move(future), expire_time, mc_flags});
     ++type_freq_map_[RDB_TYPE_STRING];
   } else {
-    io::Result<uint8_t> res = serializer->SaveEntry(pk, pv, expire_time, mc_flags, db_indx);
+    io::Result<uint8_t> res;
+    CallSerializerUnderLock([&](RdbSerializer* serializer) {
+      res = serializer->SaveEntry(pk, pv, expire_time, mc_flags, db_indx);
+    });
     CHECK(res);
     ++type_freq_map_[*res];
   }
 }
 
 size_t SliceSnapshot::FlushSerialized(SerializerBase::FlushState flush_state) {
   io::StringFile sfile;
+
+  // FlushSerialized is already called under lock,
+  // so no locking is needed here.
   serializer_->FlushToSink(&sfile, flush_state);
 
   size_t serialized = sfile.val.size();

@@ -339,53 +348,59 @@ size_t SliceSnapshot::FlushSerialized(SerializerBase::FlushState flush_state) {
   uint64_t id = rec_id_++;
   DVLOG(2) << "Pushing " << id;
 
-  fb2::NoOpLock lk;
-
-  // We create a critical section here that ensures that records are pushed in sequential order.
-  // As a result, it is not possible for two fiber producers to push concurrently.
-  // If A.id = 5, and then B.id = 6, and both are blocked here, it means that last_pushed_id_ < 4.
-  // Once last_pushed_id_ = 4, A will be unblocked, while B will wait until A finishes pushing and
-  // update last_pushed_id_ to 5.
-  seq_cond_.wait(lk, [&] { return id == this->last_pushed_id_ + 1; });
-
   // Blocking point.
   consumer_->ConsumeData(std::move(sfile.val), cntx_);
 
-  DCHECK_EQ(last_pushed_id_ + 1, id);
-  last_pushed_id_ = id;
-  seq_cond_.notify_all();
-
   VLOG(2) << "Pushed with Serialize() " << serialized;
 
   return serialized;
 }
 
+template <typename Callback> void SliceSnapshot::CallSerializerUnderLock(Callback cb) {
+  util::fb2::LockGuard guard(big_value_mu_);
+  cb(serializer_.get());
+}
+
 bool SliceSnapshot::PushSerialized(bool force) {
-  if (!force && serializer_->SerializedLen() < kMinBlobSize)
-    return false;
+  if (!force) {
+    size_t len = 0;
+    CallSerializerUnderLock(
+        [&len](RdbSerializer* serializer) { len = serializer->SerializedLen(); });
+    if (len < kMinBlobSize) {
+      return false;
+    }
+  }
 
   // Flush any of the leftovers to avoid interleavings
-  size_t serialized = FlushSerialized(FlushState::kFlushEndEntry);
+  size_t serialized = 0;
+  {
+    util::fb2::LockGuard guard(big_value_mu_);
+    serialized = FlushSerialized(FlushState::kFlushEndEntry);
+  }
 
   if (!delayed_entries_.empty()) {
     // Async bucket serialization might have accumulated some delayed values.
     // Because we can finally block in this function, we'll await and serialize them
     do {
+      /* We can preempt on SaveEntry, so we first need to update all the data
+       * before calling it. */
       auto& entry = delayed_entries_.back();
-      serializer_->SaveEntry(entry.key, entry.value.Get(), entry.expire, entry.dbid,
-                             entry.mc_flags);
       delayed_entries_.pop_back();
+
+      CallSerializerUnderLock([&entry](RdbSerializer* serializer) {
+        serializer->SaveEntry(entry.key, entry.value.Get(), entry.expire, entry.dbid,
+                              entry.mc_flags);
+      });
     } while (!delayed_entries_.empty());
 
     // blocking point.
+    util::fb2::LockGuard guard(big_value_mu_);
     serialized += FlushSerialized(FlushState::kFlushEndEntry);
   }
   return serialized > 0;
 }
 
 void SliceSnapshot::OnDbChange(DbIndex db_index, const DbSlice::ChangeReq& req) {
-  std::lock_guard guard(big_value_mu_);
-
   PrimeTable* table = db_slice_->GetTables(db_index).first;
   const PrimeTable::bucket_iterator* bit = req.update();
 

@@ -410,9 +425,9 @@ void SliceSnapshot::OnJournalEntry(const journal::JournalItem& item, bool await)
   // To enable journal flushing to sync after a non-auto journal command is executed, we call
   // TriggerJournalWriteToSink. This call uses the NOOP opcode with await=true. Since there is no
   // additional journal change to serialize, it simply invokes PushSerialized.
-  std::lock_guard guard(big_value_mu_);
   if (item.opcode != journal::Op::NOOP) {
-    serializer_->WriteJournalEntry(item.data);
+    CallSerializerUnderLock(
+        [&item](RdbSerializer* serializer) { CHECK(!serializer->WriteJournalEntry(item.data)); });
   }
 
   if (await) {
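
To see how the pieces fit, here is the new PushSerialized flow reduced to its skeleton: check the pending length under the lock, flush under the lock, then drain the delayed entries, detaching each entry before the save call that may suspend the fiber. This is a simplified sketch with plain std::mutex and stub types instead of the fiber primitives; copying the entry out before pop_back is a deliberate deviation from the diff, which avoids keeping a reference into the popped slot:

#include <mutex>
#include <string>
#include <utility>
#include <vector>

struct DelayedEntry {
  std::string key, value;
};

class SnapshotSketch {
 public:
  bool PushSerialized(bool force) {
    if (!force) {
      std::lock_guard guard(mu_);
      if (pending_.size() < kMinBlobSize) return false;
    }

    size_t serialized = 0;
    {
      // Flush any leftovers to avoid interleavings.
      std::lock_guard guard(mu_);
      serialized = Flush();
    }

    if (!delayed_.empty()) {
      do {
        // Detach the entry before the save call that may suspend the fiber,
        // so the vector is never observed mid-mutation.
        DelayedEntry entry = std::move(delayed_.back());
        delayed_.pop_back();
        std::lock_guard guard(mu_);
        Save(entry);
      } while (!delayed_.empty());

      std::lock_guard guard(mu_);  // blocking point in the real code
      serialized += Flush();
    }
    return serialized > 0;
  }

 private:
  static constexpr size_t kMinBlobSize = 32 * 1024;

  // Stands in for FlushToSink plus consumer_->ConsumeData.
  size_t Flush() {
    size_t n = pending_.size();
    pending_.clear();
    return n;
  }

  // Stands in for the preemptible RdbSerializer::SaveEntry.
  void Save(const DelayedEntry& e) { pending_ += e.key + e.value; }

  std::mutex mu_;
  std::string pending_;
  std::vector<DelayedEntry> delayed_;
};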

src/server/snapshot.h

Lines changed: 16 additions & 11 deletions

@@ -98,15 +98,14 @@ class SliceSnapshot {
   void SwitchIncrementalFb(LSN lsn);
 
   // Called on traversing cursor by IterateBucketsFb.
-  bool BucketSaveCb(PrimeTable::bucket_iterator it);
+  bool BucketSaveCb(DbIndex db_index, PrimeTable::bucket_iterator it);
 
   // Serialize single bucket.
   // Returns number of serialized entries, updates bucket version to snapshot version.
   unsigned SerializeBucket(DbIndex db_index, PrimeTable::bucket_iterator bucket_it);
 
   // Serialize entry into passed serializer.
-  void SerializeEntry(DbIndex db_index, const PrimeKey& pk, const PrimeValue& pv,
-                      std::optional<uint64_t> expire, RdbSerializer* serializer);
+  void SerializeEntry(DbIndex db_index, const PrimeKey& pk, const PrimeValue& pv);
 
   // DbChange listener
   void OnDbChange(DbIndex db_index, const DbSlice::ChangeReq& req);

@@ -119,11 +118,16 @@ class SliceSnapshot {
   // Return true if pushed. Can block. Is called from the snapshot thread.
   bool PushSerialized(bool force);
 
+  using FlushState = SerializerBase::FlushState;
+
   // Helper function that flushes the serialized items into the RecordStream.
   // Can block.
-  using FlushState = SerializerBase::FlushState;
+  // Lock big_value_mu_ before calling it.
   size_t FlushSerialized(FlushState flush_state);
 
+  // Calls the provided callback with the serializer under a lock.
+  template <typename Callback> void CallSerializerUnderLock(Callback cb);
+
  public:
   uint64_t snapshot_version() const {
     return snapshot_version_;

@@ -150,25 +154,26 @@ class SliceSnapshot {
   };
 
   DbSlice* db_slice_;
-  DbTableArray db_array_;
-
-  DbIndex current_db_;
+  const DbTableArray db_array_;
 
+  // Guarded by big_value_mu_
   std::unique_ptr<RdbSerializer> serializer_;
-  std::vector<DelayedEntry> delayed_entries_;  // collected during atomic bucket traversal
+
+  // collected during atomic bucket traversal
+  std::vector<DelayedEntry> delayed_entries_;
 
   // Used for sanity checks.
   bool serialize_bucket_running_ = false;
   util::fb2::Fiber snapshot_fb_;  // IterateEntriesFb
   util::fb2::CondVarAny seq_cond_;
-  CompressionMode compression_mode_;
+  const CompressionMode compression_mode_;
   RdbTypeFreqMap type_freq_map_;
 
   // version upper bound for entries that should be saved (not included).
   uint64_t snapshot_version_ = 0;
   uint32_t journal_cb_id_ = 0;
 
-  uint64_t rec_id_ = 1, last_pushed_id_ = 0;
+  uint64_t rec_id_ = 1;
 
   struct Stats {
     size_t loop_serialized = 0;

@@ -178,7 +183,7 @@ class SliceSnapshot {
     size_t keys_total = 0;
   } stats_;
 
-  ThreadLocalMutex big_value_mu_;
+  mutable ThreadLocalMutex big_value_mu_;
 
   SnapshotDataConsumerInterface* consumer_;
   Context* cntx_;
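
A side note on the header changes: db_array_ and compression_mode_ become const (set once in the initializer list, which is why the constructor was reordered), and big_value_mu_ becomes mutable. mutable is the standard C++ idiom for letting a mutex guard logically-const reads; a generic illustration, not Dragonfly code:

#include <mutex>

class Counter {
 public:
  int Get() const {
    std::lock_guard guard(mu_);  // compiles only because mu_ is mutable
    return value_;
  }

  void Add(int d) {
    std::lock_guard guard(mu_);
    value_ += d;
  }

 private:
  mutable std::mutex mu_;  // lockable even from const member functions
  int value_ = 0;
};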
