Skip to content

Commit e784dff

Browse files
fix(snapshot): Refactor the big_value_mu_ mutex
fixes #4152 Signed-off-by: Stepan Bagritsevich <stefan@dragonflydb.io>
1 parent 0fe5e86 commit e784dff

File tree

3 files changed

+49
-39
lines changed

3 files changed

+49
-39
lines changed

src/server/rdb_save.h

+1
Original file line numberDiff line numberDiff line change
@@ -225,6 +225,7 @@ class RdbSerializer : public SerializerBase {
225225
// Must be called in the thread to which `it` belongs.
226226
// Returns the serialized rdb_type or the error.
227227
// expire_ms = 0 means no expiry.
228+
// This function might preempt if flush_fun_ is used.
228229
io::Result<uint8_t> SaveEntry(const PrimeKey& pk, const PrimeValue& pv, uint64_t expire_ms,
229230
uint32_t mc_flags, DbIndex dbid);
230231

src/server/snapshot.cc

+35-27
Original file line numberDiff line numberDiff line change
@@ -41,10 +41,10 @@ SliceSnapshot::SliceSnapshot(DbSlice* slice, CompressionMode compression_mode,
4141
std::function<void(std::string)> on_push_record,
4242
std::function<void()> on_snapshot_finish)
4343
: db_slice_(slice),
44+
db_array_(slice->databases()),
4445
compression_mode_(compression_mode),
4546
on_push_(on_push_record),
4647
on_snapshot_finish_(on_snapshot_finish) {
47-
db_array_ = slice->databases();
4848
tl_slice_snapshots.insert(this);
4949
}
5050

@@ -127,8 +127,9 @@ void SliceSnapshot::FinalizeJournalStream(bool cancel) {
127127

128128
journal->UnregisterOnChange(cb_id);
129129
if (!cancel) {
130+
util::fb2::LockGuard guard(big_value_mu_);
130131
serializer_->SendJournalOffset(journal->GetLsn());
131-
PushSerialized(true);
132+
PushSerializedUnderLock(true);
132133
}
133134
}
134135

@@ -164,16 +165,15 @@ void SliceSnapshot::IterateBucketsFb(const Cancellation* cll, bool send_full_syn
164165

165166
uint64_t last_yield = 0;
166167
PrimeTable* pt = &db_array_[db_indx]->prime;
167-
current_db_ = db_indx;
168168

169169
VLOG(1) << "Start traversing " << pt->size() << " items for index " << db_indx;
170170
do {
171171
if (cll->IsCancelled()) {
172172
return;
173173
}
174174

175-
PrimeTable::Cursor next =
176-
pt->TraverseBuckets(cursor, absl::bind_front(&SliceSnapshot::BucketSaveCb, this));
175+
PrimeTable::Cursor next = pt->TraverseBuckets(
176+
cursor, absl::bind_front(&SliceSnapshot::BucketSaveCb, this, db_indx));
177177
cursor = next;
178178
PushSerialized(false);
179179

@@ -195,8 +195,9 @@ void SliceSnapshot::IterateBucketsFb(const Cancellation* cll, bool send_full_syn
195195

196196
CHECK(!serialize_bucket_running_);
197197
if (send_full_sync_cut) {
198+
util::fb2::LockGuard guard(big_value_mu_);
198199
CHECK(!serializer_->SendFullSyncCut());
199-
PushSerialized(true);
200+
PushSerializedUnderLock(true);
200201
}
201202

202203
// serialized + side_saved must be equal to the total saved.
@@ -213,8 +214,11 @@ void SliceSnapshot::SwitchIncrementalFb(Context* cntx, LSN lsn) {
213214

214215
// The replica sends the LSN of the next entry is wants to receive.
215216
while (!cntx->IsCancelled() && journal->IsLSNInBuffer(lsn)) {
216-
serializer_->WriteJournalEntry(journal->GetEntry(lsn));
217-
PushSerialized(false);
217+
{
218+
util::fb2::LockGuard guard(big_value_mu_);
219+
serializer_->WriteJournalEntry(journal->GetEntry(lsn));
220+
PushSerializedUnderLock(false);
221+
}
218222
lsn++;
219223
}
220224

@@ -231,7 +235,7 @@ void SliceSnapshot::SwitchIncrementalFb(Context* cntx, LSN lsn) {
231235
// GetLsn() is always the next lsn that we expect to create.
232236
if (journal->GetLsn() == lsn) {
233237
{
234-
FiberAtomicGuard fg;
238+
util::fb2::LockGuard guard(big_value_mu_);
235239
serializer_->SendFullSyncCut();
236240
}
237241
auto journal_cb = absl::bind_front(&SliceSnapshot::OnJournalEntry, this);
@@ -247,9 +251,7 @@ void SliceSnapshot::SwitchIncrementalFb(Context* cntx, LSN lsn) {
247251
}
248252
}
249253

250-
bool SliceSnapshot::BucketSaveCb(PrimeTable::bucket_iterator it) {
251-
std::lock_guard guard(big_value_mu_);
252-
254+
bool SliceSnapshot::BucketSaveCb(DbIndex db_index, PrimeTable::bucket_iterator it) {
253255
++stats_.savecb_calls;
254256

255257
auto check = [&](auto v) {
@@ -266,7 +268,7 @@ bool SliceSnapshot::BucketSaveCb(PrimeTable::bucket_iterator it) {
266268
return false;
267269
}
268270

269-
db_slice_->FlushChangeToEarlierCallbacks(current_db_, DbSlice::Iterator::FromPrime(it),
271+
db_slice_->FlushChangeToEarlierCallbacks(db_index, DbSlice::Iterator::FromPrime(it),
270272
snapshot_version_);
271273

272274
auto* blocking_counter = db_slice_->BlockingCounter();
@@ -275,7 +277,7 @@ bool SliceSnapshot::BucketSaveCb(PrimeTable::bucket_iterator it) {
275277
// zero.
276278
std::lock_guard blocking_counter_guard(*blocking_counter);
277279

278-
stats_.loop_serialized += SerializeBucket(current_db_, it);
280+
stats_.loop_serialized += SerializeBucket(db_index, it);
279281

280282
return false;
281283
}
@@ -291,20 +293,19 @@ unsigned SliceSnapshot::SerializeBucket(DbIndex db_index, PrimeTable::bucket_ite
291293
while (!it.is_done()) {
292294
++result;
293295
// might preempt due to big value serialization.
294-
SerializeEntry(db_index, it->first, it->second, nullopt, serializer_.get());
296+
SerializeEntry(db_index, it->first, it->second);
295297
++it;
296298
}
297299
serialize_bucket_running_ = false;
298300
return result;
299301
}
300302

301-
void SliceSnapshot::SerializeEntry(DbIndex db_indx, const PrimeKey& pk, const PrimeValue& pv,
302-
optional<uint64_t> expire, RdbSerializer* serializer) {
303+
void SliceSnapshot::SerializeEntry(DbIndex db_indx, const PrimeKey& pk, const PrimeValue& pv) {
303304
if (pv.IsExternal() && pv.IsCool())
304-
return SerializeEntry(db_indx, pk, pv.GetCool().record->value, expire, serializer);
305+
return SerializeEntry(db_indx, pk, pv.GetCool().record->value);
305306

306-
time_t expire_time = expire.value_or(0);
307-
if (!expire && pv.HasExpire()) {
307+
time_t expire_time = 0;
308+
if (pv.HasExpire()) {
308309
auto eit = db_array_[db_indx]->expire.Find(pk);
309310
expire_time = db_slice_->ExpireTime(eit);
310311
}
@@ -317,11 +318,15 @@ void SliceSnapshot::SerializeEntry(DbIndex db_indx, const PrimeKey& pk, const Pr
317318
EngineShard::tlocal()->tiered_storage()->Read(
318319
db_indx, pk.ToString(), pv,
319320
[future](const std::string& v) mutable { future.Resolve(PrimeValue(v)); });
320-
delayed_entries_.push_back(
321-
{db_indx, PrimeKey(pk.ToString()), std::move(future), expire_time, mc_flags});
321+
322+
auto key = PrimeKey(pk.ToString());
323+
324+
util::fb2::LockGuard guard(big_value_mu_);
325+
delayed_entries_.push_back({db_indx, std::move(key), std::move(future), expire_time, mc_flags});
322326
++type_freq_map_[RDB_TYPE_STRING];
323327
} else {
324-
io::Result<uint8_t> res = serializer->SaveEntry(pk, pv, expire_time, mc_flags, db_indx);
328+
util::fb2::LockGuard guard(big_value_mu_);
329+
io::Result<uint8_t> res = serializer_->SaveEntry(pk, pv, expire_time, mc_flags, db_indx);
325330
CHECK(res);
326331
++type_freq_map_[*res];
327332
}
@@ -360,6 +365,11 @@ size_t SliceSnapshot::FlushSerialized(SerializerBase::FlushState flush_state) {
360365
}
361366

362367
bool SliceSnapshot::PushSerialized(bool force) {
368+
util::fb2::LockGuard guard(big_value_mu_);
369+
return PushSerializedUnderLock(force);
370+
}
371+
372+
bool SliceSnapshot::PushSerializedUnderLock(bool force) {
363373
if (!force && serializer_->SerializedLen() < kMinBlobSize)
364374
return false;
365375

@@ -383,8 +393,6 @@ bool SliceSnapshot::PushSerialized(bool force) {
383393
}
384394

385395
void SliceSnapshot::OnDbChange(DbIndex db_index, const DbSlice::ChangeReq& req) {
386-
std::lock_guard guard(big_value_mu_);
387-
388396
PrimeTable* table = db_slice_->GetTables(db_index).first;
389397
const PrimeTable::bucket_iterator* bit = req.update();
390398

@@ -409,9 +417,9 @@ void SliceSnapshot::OnJournalEntry(const journal::JournalItem& item, bool await)
409417
// To enable journal flushing to sync after non auto journal command is executed we call
410418
// TriggerJournalWriteToSink. This call uses the NOOP opcode with await=true. Since there is no
411419
// additional journal change to serialize, it simply invokes PushSerialized.
412-
std::lock_guard guard(big_value_mu_);
413420
if (item.opcode != journal::Op::NOOP) {
414-
serializer_->WriteJournalEntry(item.data);
421+
util::fb2::LockGuard guard(big_value_mu_);
422+
CHECK(!serializer_->WriteJournalEntry(item.data));
415423
}
416424

417425
if (await) {

src/server/snapshot.h

+13-12
Original file line numberDiff line numberDiff line change
@@ -89,15 +89,14 @@ class SliceSnapshot {
8989
void SwitchIncrementalFb(Context* cntx, LSN lsn);
9090

9191
// Called on traversing cursor by IterateBucketsFb.
92-
bool BucketSaveCb(PrimeTable::bucket_iterator it);
92+
bool BucketSaveCb(DbIndex db_index, PrimeTable::bucket_iterator it);
9393

9494
// Serialize single bucket.
9595
// Returns number of serialized entries, updates bucket version to snapshot version.
9696
unsigned SerializeBucket(DbIndex db_index, PrimeTable::bucket_iterator bucket_it);
9797

9898
// Serialize entry into passed serializer.
99-
void SerializeEntry(DbIndex db_index, const PrimeKey& pk, const PrimeValue& pv,
100-
std::optional<uint64_t> expire, RdbSerializer* serializer);
99+
void SerializeEntry(DbIndex db_index, const PrimeKey& pk, const PrimeValue& pv);
101100

102101
// DbChange listener
103102
void OnDbChange(DbIndex db_index, const DbSlice::ChangeReq& req);
@@ -109,11 +108,12 @@ class SliceSnapshot {
109108
// Push regardless of buffer size if force is true.
110109
// Return true if pushed. Can block. Is called from the snapshot thread.
111110
bool PushSerialized(bool force);
111+
bool PushSerializedUnderLock(bool force) ABSL_EXCLUSIVE_LOCKS_REQUIRED(big_value_mu_);
112112

113113
// Helper function that flushes the serialized items into the RecordStream.
114114
// Can block.
115115
using FlushState = SerializerBase::FlushState;
116-
size_t FlushSerialized(FlushState flush_state);
116+
size_t FlushSerialized(FlushState flush_state); // ABSL_EXCLUSIVE_LOCKS_REQUIRED(big_value_mu_)
117117

118118
public:
119119
uint64_t snapshot_version() const {
@@ -141,24 +141,25 @@ class SliceSnapshot {
141141
};
142142

143143
DbSlice* db_slice_;
144-
DbTableArray db_array_;
144+
const DbTableArray db_array_;
145145

146-
DbIndex current_db_;
146+
std::unique_ptr<RdbSerializer> serializer_; // ABSL_GUARDED_BY(big_value_mu_);
147147

148-
std::unique_ptr<RdbSerializer> serializer_;
149-
std::vector<DelayedEntry> delayed_entries_; // collected during atomic bucket traversal
148+
// collected during atomic bucket traversal
149+
std::vector<DelayedEntry> delayed_entries_ ABSL_GUARDED_BY(big_value_mu_);
150150

151151
// Used for sanity checks.
152152
bool serialize_bucket_running_ = false;
153153
util::fb2::Fiber snapshot_fb_; // IterateEntriesFb
154154
util::fb2::CondVarAny seq_cond_;
155-
CompressionMode compression_mode_;
156-
RdbTypeFreqMap type_freq_map_;
155+
const CompressionMode compression_mode_;
156+
RdbTypeFreqMap type_freq_map_ ABSL_GUARDED_BY(big_value_mu_);
157157

158158
// version upper bound for entries that should be saved (not included).
159159
uint64_t snapshot_version_ = 0;
160160
uint32_t journal_cb_id_ = 0;
161161

162+
// TODO: remove or use mutex
162163
uint64_t rec_id_ = 1, last_pushed_id_ = 0;
163164

164165
struct Stats {
@@ -167,9 +168,9 @@ class SliceSnapshot {
167168
size_t side_saved = 0;
168169
size_t savecb_calls = 0;
169170
size_t keys_total = 0;
170-
} stats_;
171+
} stats_; // TODO: maybe need to lock
171172

172-
ThreadLocalMutex big_value_mu_;
173+
mutable ThreadLocalMutex big_value_mu_;
173174

174175
std::function<void(std::string)> on_push_;
175176
std::function<void()> on_snapshot_finish_;

0 commit comments

Comments
 (0)