
Commit 1cc2c59

fix(snapshot): Refactor the big_value_mu_ mutex
fixes #4152

Signed-off-by: Stepan Bagritsevich <stefan@dragonflydb.io>
1 parent 966a1a4 commit 1cc2c59

File tree

3 files changed: 44 additions & 38 deletions

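The core of the refactor is a lock-then-delegate split: the public PushSerialized() now acquires big_value_mu_ itself and forwards to a new PushSerializedUnderLock(), while call sites that already hold the mutex (because they just wrote to serializer_, which may preempt the fiber) call the UnderLock variant directly. Below is a minimal sketch of that shape, not Dragonfly code: std::mutex and std::lock_guard stand in for the project's ThreadLocalMutex and util::fb2::LockGuard, and the buffer/kMinBlobSize members are placeholders for the serializer.

#include <cstddef>
#include <mutex>
#include <vector>

class SnapshotSketch {
 public:
  // Public entry point: acquires the mutex, then delegates.
  bool PushSerialized(bool force) {
    std::lock_guard<std::mutex> guard(big_value_mu_);
    return PushSerializedUnderLock(force);
  }

  // Callers that already hold big_value_mu_ (because they just wrote to the
  // serializer under the same guard) use this variant and avoid a double lock.
  bool PushSerializedUnderLock(bool force) {
    if (!force && buffer_.size() < kMinBlobSize)
      return false;
    buffer_.clear();  // stands in for flushing the serializer to the consumer
    return true;
  }

 private:
  static constexpr std::size_t kMinBlobSize = 32 * 1024;
  std::mutex big_value_mu_;
  std::vector<char> buffer_;
};

The hunks in FinalizeJournalStream, IterateBucketsFb and SerializeEntry below follow exactly this split: take the guard once, write to the serializer, then flush via the UnderLock variant.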

src/server/rdb_save.h

Lines changed: 1 addition & 0 deletions
@@ -225,6 +225,7 @@ class RdbSerializer : public SerializerBase {
   // Must be called in the thread to which `it` belongs.
   // Returns the serialized rdb_type or the error.
   // expire_ms = 0 means no expiry.
+  // This function might preempt if flush_fun_ is used.
   io::Result<uint8_t> SaveEntry(const PrimeKey& pk, const PrimeValue& pv, uint64_t expire_ms,
                                 uint32_t mc_flags, DbIndex dbid);
 

src/server/snapshot.cc

Lines changed: 30 additions & 26 deletions
@@ -38,8 +38,7 @@ constexpr size_t kMinBlobSize = 32_KB;
 
 SliceSnapshot::SliceSnapshot(CompressionMode compression_mode, DbSlice* slice,
                              SnapshotDataConsumerInterface* consumer, Context* cntx)
-    : db_slice_(slice), compression_mode_(compression_mode), consumer_(consumer), cntx_(cntx) {
-  db_array_ = slice->databases();
+    : db_slice_(slice), db_array_(slice->databases()), compression_mode_(compression_mode), consumer_(consumer), cntx_(cntx) {
   tl_slice_snapshots.insert(this);
 }
 
@@ -126,8 +125,9 @@ void SliceSnapshot::FinalizeJournalStream(bool cancel) {
 
   journal->UnregisterOnChange(cb_id);
   if (!cancel) {
+    util::fb2::LockGuard guard(big_value_mu_);
     serializer_->SendJournalOffset(journal->GetLsn());
-    PushSerialized(true);
+    PushSerializedUnderLock(true);
   }
 }
 
@@ -163,7 +163,6 @@ void SliceSnapshot::IterateBucketsFb(bool send_full_sync_cut) {
 
     uint64_t last_yield = 0;
     PrimeTable* pt = &db_array_[db_indx]->prime;
-    current_db_ = db_indx;
 
     VLOG(1) << "Start traversing " << pt->size() << " items for index " << db_indx;
     do {
@@ -172,7 +171,7 @@ void SliceSnapshot::IterateBucketsFb(bool send_full_sync_cut) {
       }
 
       PrimeTable::Cursor next =
-          pt->TraverseBuckets(cursor, [this](auto it) { return BucketSaveCb(it); });
+          pt->TraverseBuckets(cursor, absl::bind_front(&SliceSnapshot::BucketSaveCb, this, db_indx));
       cursor = next;
       PushSerialized(false);
 
@@ -194,8 +193,9 @@ void SliceSnapshot::IterateBucketsFb(bool send_full_sync_cut) {
 
   CHECK(!serialize_bucket_running_);
   if (send_full_sync_cut) {
+    util::fb2::LockGuard guard(big_value_mu_);
     CHECK(!serializer_->SendFullSyncCut());
-    PushSerialized(true);
+    PushSerializedUnderLock(true);
   }
 
   // serialized + side_saved must be equal to the total saved.
@@ -211,7 +211,7 @@ void SliceSnapshot::SwitchIncrementalFb(LSN lsn) {
   VLOG(1) << "Starting incremental snapshot from lsn=" << lsn;
 
   // The replica sends the LSN of the next entry it wants to receive.
-  while (!cntx_->IsCancelled() && journal->IsLSNInBuffer(lsn)) {
+  while (!cntx->IsCancelled() && journal->IsLSNInBuffer(lsn)) {
     serializer_->WriteJournalEntry(journal->GetEntry(lsn));
     PushSerialized(false);
     lsn++;
@@ -230,7 +230,7 @@ void SliceSnapshot::SwitchIncrementalFb(LSN lsn) {
   // GetLsn() is always the next lsn that we expect to create.
   if (journal->GetLsn() == lsn) {
     {
-      FiberAtomicGuard fg;
+      util::fb2::LockGuard guard(big_value_mu_);
       serializer_->SendFullSyncCut();
     }
     auto journal_cb = [this](const journal::JournalItem& item, bool await) {
@@ -248,9 +248,7 @@ void SliceSnapshot::SwitchIncrementalFb(LSN lsn) {
   }
 }
 
-bool SliceSnapshot::BucketSaveCb(PrimeTable::bucket_iterator it) {
-  std::lock_guard guard(big_value_mu_);
-
+bool SliceSnapshot::BucketSaveCb(DbIndex db_index, PrimeTable::bucket_iterator it) {
   ++stats_.savecb_calls;
 
   auto check = [&](auto v) {
@@ -267,7 +265,7 @@ bool SliceSnapshot::BucketSaveCb(PrimeTable::bucket_iterator it) {
     return false;
   }
 
-  db_slice_->FlushChangeToEarlierCallbacks(current_db_, DbSlice::Iterator::FromPrime(it),
+  db_slice_->FlushChangeToEarlierCallbacks(db_index, DbSlice::Iterator::FromPrime(it),
                                            snapshot_version_);
 
   auto* blocking_counter = db_slice_->BlockingCounter();
@@ -276,7 +274,7 @@ bool SliceSnapshot::BucketSaveCb(PrimeTable::bucket_iterator it) {
   // zero.
   std::lock_guard blocking_counter_guard(*blocking_counter);
 
-  stats_.loop_serialized += SerializeBucket(current_db_, it);
+  stats_.loop_serialized += SerializeBucket(db_index, it);
 
   return false;
 }
@@ -292,20 +290,19 @@ unsigned SliceSnapshot::SerializeBucket(DbIndex db_index, PrimeTable::bucket_ite
   while (!it.is_done()) {
     ++result;
     // might preempt due to big value serialization.
-    SerializeEntry(db_index, it->first, it->second, nullopt, serializer_.get());
+    SerializeEntry(db_index, it->first, it->second);
    ++it;
   }
  serialize_bucket_running_ = false;
   return result;
 }
 
-void SliceSnapshot::SerializeEntry(DbIndex db_indx, const PrimeKey& pk, const PrimeValue& pv,
-                                   optional<uint64_t> expire, RdbSerializer* serializer) {
+void SliceSnapshot::SerializeEntry(DbIndex db_indx, const PrimeKey& pk, const PrimeValue& pv) {
   if (pv.IsExternal() && pv.IsCool())
-    return SerializeEntry(db_indx, pk, pv.GetCool().record->value, expire, serializer);
+    return SerializeEntry(db_indx, pk, pv.GetCool().record->value);
 
-  time_t expire_time = expire.value_or(0);
-  if (!expire && pv.HasExpire()) {
+  time_t expire_time = 0;
+  if (pv.HasExpire()) {
     auto eit = db_array_[db_indx]->expire.Find(pk);
     expire_time = db_slice_->ExpireTime(eit);
   }
@@ -318,11 +315,15 @@ void SliceSnapshot::SerializeEntry(DbIndex db_indx, const PrimeKey& pk, const Pr
     EngineShard::tlocal()->tiered_storage()->Read(
         db_indx, pk.ToString(), pv,
         [future](const std::string& v) mutable { future.Resolve(PrimeValue(v)); });
-    delayed_entries_.push_back(
-        {db_indx, PrimeKey(pk.ToString()), std::move(future), expire_time, mc_flags});
+
+    auto key = PrimeKey(pk.ToString());
+
+    util::fb2::LockGuard guard(big_value_mu_);
+    delayed_entries_.push_back({db_indx, std::move(key), std::move(future), expire_time, mc_flags});
     ++type_freq_map_[RDB_TYPE_STRING];
   } else {
-    io::Result<uint8_t> res = serializer->SaveEntry(pk, pv, expire_time, mc_flags, db_indx);
+    util::fb2::LockGuard guard(big_value_mu_);
+    io::Result<uint8_t> res = serializer_->SaveEntry(pk, pv, expire_time, mc_flags, db_indx);
     CHECK(res);
     ++type_freq_map_[*res];
   }
@@ -361,6 +362,11 @@ size_t SliceSnapshot::FlushSerialized(SerializerBase::FlushState flush_state) {
 }
 
 bool SliceSnapshot::PushSerialized(bool force) {
+  util::fb2::LockGuard guard(big_value_mu_);
+  return PushSerializedUnderLock(force);
+}
+
+bool SliceSnapshot::PushSerializedUnderLock(bool force) {
   if (!force && serializer_->SerializedLen() < kMinBlobSize)
     return false;
 
@@ -384,8 +390,6 @@ bool SliceSnapshot::PushSerialized(bool force) {
 }
 
 void SliceSnapshot::OnDbChange(DbIndex db_index, const DbSlice::ChangeReq& req) {
-  std::lock_guard guard(big_value_mu_);
-
   PrimeTable* table = db_slice_->GetTables(db_index).first;
   const PrimeTable::bucket_iterator* bit = req.update();
 
@@ -410,9 +414,9 @@ void SliceSnapshot::OnJournalEntry(const journal::JournalItem& item, bool await)
   // To enable journal flushing to sync after non auto journal command is executed we call
   // TriggerJournalWriteToSink. This call uses the NOOP opcode with await=true. Since there is no
   // additional journal change to serialize, it simply invokes PushSerialized.
-  std::lock_guard guard(big_value_mu_);
   if (item.opcode != journal::Op::NOOP) {
-    serializer_->WriteJournalEntry(item.data);
+    util::fb2::LockGuard guard(big_value_mu_);
+    CHECK(!serializer_->WriteJournalEntry(item.data));
   }
 
   if (await) {
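With current_db_ removed, BucketSaveCb now receives the database index as a parameter, and the traversal site binds it up front with absl::bind_front instead of a capturing lambda. A small self-contained sketch of that mechanism (Snapshot and TraverseBuckets here are invented placeholders, not the dash-table API):

#include <cstdint>
#include <iostream>

#include "absl/functional/bind_front.h"

struct Snapshot {
  bool BucketSaveCb(uint16_t db_index, int bucket_it) {
    std::cout << "db=" << db_index << " bucket=" << bucket_it << '\n';
    return false;  // the real callback also returns false
  }
};

// Placeholder for PrimeTable::TraverseBuckets: it invokes the callback with
// the bucket only, so everything else must be bound in advance.
template <typename Cb> void TraverseBuckets(Cb cb) {
  for (int bucket = 0; bucket < 3; ++bucket)
    cb(bucket);
}

int main() {
  Snapshot snap;
  uint16_t db_index = 1;
  // Same effect as the lambda [&](auto it) { return snap.BucketSaveCb(db_index, it); }
  TraverseBuckets(absl::bind_front(&Snapshot::BucketSaveCb, &snap, db_index));
}

absl::bind_front copies this and db_index into the returned callable, so the traversal only has to supply the bucket iterator; that is what lets the commit drop the current_db_ member.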

src/server/snapshot.h

Lines changed: 13 additions & 12 deletions
@@ -98,15 +98,14 @@ class SliceSnapshot {
   void SwitchIncrementalFb(LSN lsn);
 
   // Called on traversing cursor by IterateBucketsFb.
-  bool BucketSaveCb(PrimeTable::bucket_iterator it);
+  bool BucketSaveCb(DbIndex db_index, PrimeTable::bucket_iterator it);
 
   // Serialize single bucket.
   // Returns number of serialized entries, updates bucket version to snapshot version.
   unsigned SerializeBucket(DbIndex db_index, PrimeTable::bucket_iterator bucket_it);
 
   // Serialize entry into passed serializer.
-  void SerializeEntry(DbIndex db_index, const PrimeKey& pk, const PrimeValue& pv,
-                      std::optional<uint64_t> expire, RdbSerializer* serializer);
+  void SerializeEntry(DbIndex db_index, const PrimeKey& pk, const PrimeValue& pv);
 
   // DbChange listener
   void OnDbChange(DbIndex db_index, const DbSlice::ChangeReq& req);
@@ -118,11 +117,12 @@ class SliceSnapshot {
   // Push regardless of buffer size if force is true.
   // Return true if pushed. Can block. Is called from the snapshot thread.
   bool PushSerialized(bool force);
+  bool PushSerializedUnderLock(bool force) ABSL_EXCLUSIVE_LOCKS_REQUIRED(big_value_mu_);
 
   // Helper function that flushes the serialized items into the RecordStream.
   // Can block.
   using FlushState = SerializerBase::FlushState;
-  size_t FlushSerialized(FlushState flush_state);
+  size_t FlushSerialized(FlushState flush_state);  // ABSL_EXCLUSIVE_LOCKS_REQUIRED(big_value_mu_)
 
  public:
   uint64_t snapshot_version() const {
@@ -150,24 +150,25 @@ class SliceSnapshot {
   };
 
   DbSlice* db_slice_;
-  DbTableArray db_array_;
+  const DbTableArray db_array_;
 
-  DbIndex current_db_;
+  std::unique_ptr<RdbSerializer> serializer_;  // ABSL_GUARDED_BY(big_value_mu_);
 
-  std::unique_ptr<RdbSerializer> serializer_;
-  std::vector<DelayedEntry> delayed_entries_;  // collected during atomic bucket traversal
+  // collected during atomic bucket traversal
+  std::vector<DelayedEntry> delayed_entries_ ABSL_GUARDED_BY(big_value_mu_);
 
   // Used for sanity checks.
   bool serialize_bucket_running_ = false;
   util::fb2::Fiber snapshot_fb_;  // IterateEntriesFb
   util::fb2::CondVarAny seq_cond_;
-  CompressionMode compression_mode_;
-  RdbTypeFreqMap type_freq_map_;
+  const CompressionMode compression_mode_;
+  RdbTypeFreqMap type_freq_map_ ABSL_GUARDED_BY(big_value_mu_);
 
   // version upper bound for entries that should be saved (not included).
   uint64_t snapshot_version_ = 0;
   uint32_t journal_cb_id_ = 0;
 
+  // TODO: remove or use mutex
   uint64_t rec_id_ = 1, last_pushed_id_ = 0;
 
   struct Stats {
@@ -176,9 +177,9 @@ class SliceSnapshot {
     size_t side_saved = 0;
     size_t savecb_calls = 0;
     size_t keys_total = 0;
-  } stats_;
+  } stats_;  // TODO: maybe need to lock
 
-  ThreadLocalMutex big_value_mu_;
+  mutable ThreadLocalMutex big_value_mu_;
 
   SnapshotDataConsumerInterface* consumer_;
   Context* cntx_;
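The ABSL_GUARDED_BY / ABSL_EXCLUSIVE_LOCKS_REQUIRED annotations added above are enforced by clang's -Wthread-safety analysis; the annotations left in comments (on serializer_ and FlushSerialized) suggest the custom ThreadLocalMutex is not fully visible to that analysis yet. A minimal sketch of what the macros check, with absl::Mutex standing in for ThreadLocalMutex:

#include <vector>

#include "absl/base/thread_annotations.h"
#include "absl/synchronization/mutex.h"

class AnnotatedSketch {
 public:
  void Push() {
    absl::MutexLock lock(&big_value_mu_);
    PushUnderLock();  // ok: the analysis sees big_value_mu_ held here
  }

  void Broken() {
    // PushUnderLock();  // uncommenting this triggers a -Wthread-safety warning
  }

 private:
  void PushUnderLock() ABSL_EXCLUSIVE_LOCKS_REQUIRED(big_value_mu_) {
    delayed_entries_.push_back(42);  // ok: member is annotated as guarded
  }

  absl::Mutex big_value_mu_;
  std::vector<int> delayed_entries_ ABSL_GUARDED_BY(big_value_mu_);
};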
