Commit 1f7b8b2

chore: Pipelining fixes
Fixes #4998.

1. Reduces aggressive yielding when reading multiple requests, since it hampers pipeline efficiency. We now yield consistently based on the CPU time spent since the last resume point (controlled by a flag with sane defaults).
2. Increases the socket read buffer size, effectively allowing more requests to be processed in bulk.

`./dragonfly --cluster_mode=emulated` latencies (usec) for pipeline sizes 80-199:
p50: 1887, p75: 2367, p90: 2897, p99: 6266

`./dragonfly --cluster_mode=emulated --experimental_cluster_shard_by_slot` latencies (usec) for pipeline sizes 80-199:
p50: 813, p75: 976, p90: 1216, p99: 3528

Signed-off-by: Roman Gershman <roman@dragonflydb.io>
1 parent d5c3752 commit 1f7b8b2
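The yielding change in (1) boils down to a CPU-time budget for the connection's read/parse loop: the `max_busy_read_usec` flag is converted to clock cycles once per read, and the fiber yields only after it has actually been busy for that long since it was last resumed. Below is a minimal standalone sketch of the idea, not the Dragonfly implementation (which uses `base::CycleClock`, `ThisFiber::GetRunningTimeCycles()` and `ThisFiber::Yield()`, as in the diff further down); `BusyReadLoop` and the `parse_one` callback are hypothetical names used only for illustration.

```cpp
// Sketch only: models the busy-read budget with std::chrono and a plain
// function instead of cycle counters and fibers.
#include <chrono>
#include <cstdint>
#include <thread>

using Clock = std::chrono::steady_clock;

// Parse requests until the input buffer is drained, but give up the CPU
// whenever we have been busy longer than max_busy_usec since the last resume.
template <typename ParseOneFn>
void BusyReadLoop(ParseOneFn parse_one, int64_t max_busy_usec) {
  auto resume_point = Clock::now();
  while (parse_one()) {  // hypothetical callback: returns false when the buffer is empty
    int64_t busy_usec = std::chrono::duration_cast<std::chrono::microseconds>(
                            Clock::now() - resume_point)
                            .count();
    if (busy_usec > max_busy_usec) {
      std::this_thread::yield();    // stand-in for ThisFiber::Yield()
      resume_point = Clock::now();  // the budget restarts after every yield
    }
  }
}
```

Compared to the old behavior (yield after every 10 asynchronously queued requests), the budget scales with how expensive parsing actually is, so many cheap pipelined requests can be batched before the connection fiber gives up the CPU.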

File tree

6 files changed: +268 -122 lines changed


src/facade/dragonfly_connection.cc

+44 -45
```diff
@@ -13,6 +13,7 @@
 #include <numeric>
 #include <variant>
 
+#include "base/cycle_clock.h"
 #include "base/flags.h"
 #include "base/histogram.h"
 #include "base/io_buf.h"
@@ -23,7 +24,9 @@
 #include "facade/memcache_parser.h"
 #include "facade/redis_parser.h"
 #include "facade/service_interface.h"
+#include "glog/logging.h"
 #include "io/file.h"
+#include "util/fibers/fibers.h"
 #include "util/fibers/proactor_base.h"
 
 #ifdef DFLY_USE_SSL
@@ -93,6 +96,10 @@ ABSL_FLAG(bool, migrate_connections, true,
           "they operate. Currently this is only supported for Lua script invocations, and can "
           "happen at most once per connection.");
 
+ABSL_FLAG(uint32_t, max_busy_read_usec, 100,
+          "Maximum time we read and parse from "
+          "a socket without yielding. In microseconds.");
+
 using namespace util;
 using namespace std;
 using absl::GetFlag;
@@ -146,7 +153,7 @@ struct TrafficLogger {
 
 void TrafficLogger::ResetLocked() {
   if (log_file) {
-    log_file->Close();
+    std::ignore = log_file->Close();
     log_file.reset();
   }
 }
@@ -196,7 +203,7 @@ void OpenTrafficLogger(string_view base_path) {
 
   // Write version, incremental numbering :)
   uint8_t version[1] = {2};
-  tl_traffic_logger.log_file->Write(version);
+  std::ignore = tl_traffic_logger.log_file->Write(version);
 }
 
 void LogTraffic(uint32_t id, bool has_more, absl::Span<RespExpr> resp,
@@ -876,6 +883,7 @@ pair<string, string> Connection::GetClientInfoBeforeAfterTid() const {
   absl::StrAppend(&after, " irqmatch=", int(cpu == my_cpu_id));
   if (dispatch_q_.size()) {
     absl::StrAppend(&after, " pipeline=", dispatch_q_.size());
+    absl::StrAppend(&after, " pbuf=", pending_pipeline_bytes_);
   }
   absl::StrAppend(&after, " age=", now - creation_time_, " idle=", now - last_interaction_);
   string_view phase_name = PHASE_NAMES[phase_];
@@ -1028,7 +1036,7 @@ void Connection::ConnectionFlow() {
   if (io_buf_.InputLen() > 0) {
     phase_ = PROCESS;
     if (redis_parser_) {
-      parse_status = ParseRedis();
+      parse_status = ParseRedis(10000);
     } else {
       DCHECK(memcache_parser_);
      parse_status = ParseMemcache();
@@ -1136,19 +1144,6 @@ void Connection::DispatchSingle(bool has_more, absl::FunctionRef<void()> invoke_
   // Dispatch async if we're handling a pipeline or if we can't dispatch sync.
   if (optimize_for_async || !can_dispatch_sync) {
     SendAsync(cmd_msg_cb());
-
-    auto epoch = fb2::FiberSwitchEpoch();
-
-    if (async_fiber_epoch_ == epoch) {
-      // If we pushed too many items without context switching - yield
-      if (++async_streak_len_ >= 10 && !cc_->async_dispatch) {
-        async_streak_len_ = 0;
-        ThisFiber::Yield();
-      }
-    } else {
-      async_streak_len_ = 0;
-      async_fiber_epoch_ = epoch;
-    }
   } else {
     ShrinkPipelinePool();  // Gradually release pipeline request pool.
     {
@@ -1164,20 +1159,17 @@ void Connection::DispatchSingle(bool has_more, absl::FunctionRef<void()> invoke_
   }
 }
 
-Connection::ParserStatus Connection::ParseRedis() {
+Connection::ParserStatus Connection::ParseRedis(unsigned max_busy_cycles) {
   uint32_t consumed = 0;
   RedisParser::Result result = RedisParser::OK;
 
-  // Re-use connection local resources to reduce allocations
-  RespVec& parse_args = tmp_parse_args_;
-  CmdArgVec& cmd_vec = tmp_cmd_vec_;
-
-  auto dispatch_sync = [this, &parse_args, &cmd_vec] {
-    RespExpr::VecToArgList(parse_args, &cmd_vec);
-    service_->DispatchCommand(absl::MakeSpan(cmd_vec), reply_builder_.get(), cc_.get());
+  auto dispatch_sync = [this] {
+    RespExpr::VecToArgList(tmp_parse_args_, &tmp_cmd_vec_);
+    service_->DispatchCommand(absl::MakeSpan(tmp_cmd_vec_), reply_builder_.get(), cc_.get());
   };
-  auto dispatch_async = [this, &parse_args, tlh = mi_heap_get_backing()]() -> MessageHandle {
-    return {FromArgs(std::move(parse_args), tlh)};
+
+  auto dispatch_async = [this, tlh = mi_heap_get_backing()]() -> MessageHandle {
+    return {FromArgs(std::move(tmp_parse_args_), tlh)};
   };
 
   ReadBuffer read_buffer = GetReadBuffer();
@@ -1186,10 +1178,10 @@ Connection::ParserStatus Connection::ParseRedis() {
     if (read_buffer.ShouldAdvance()) {  // can happen only with io_uring/bundles
       read_buffer.slice = NextBundleBuffer(read_buffer.available_bytes);
     }
-    result = redis_parser_->Parse(read_buffer.slice, &consumed, &parse_args);
+    result = redis_parser_->Parse(read_buffer.slice, &consumed, &tmp_parse_args_);
     request_consumed_bytes_ += consumed;
-    if (result == RedisParser::OK && !parse_args.empty()) {
-      if (RespExpr& first = parse_args.front(); first.type == RespExpr::STRING)
+    if (result == RedisParser::OK && !tmp_parse_args_.empty()) {
+      if (RespExpr& first = tmp_parse_args_.front(); first.type == RespExpr::STRING)
         DVLOG(2) << "Got Args with first token " << ToSV(first.GetBuf());
 
       if (io_req_size_hist)
@@ -1198,12 +1190,20 @@ Connection::ParserStatus Connection::ParseRedis() {
       bool has_more = consumed < read_buffer.available_bytes;
 
       if (tl_traffic_logger.log_file && IsMain() /* log only on the main interface */) {
-        LogTraffic(id_, has_more, absl::MakeSpan(parse_args), service_->GetContextInfo(cc_.get()));
+        LogTraffic(id_, has_more, absl::MakeSpan(tmp_parse_args_),
+                   service_->GetContextInfo(cc_.get()));
       }
 
       DispatchSingle(has_more, dispatch_sync, dispatch_async);
     }
     read_buffer.Consume(consumed);
+
+    // We must yield from time to time to allow other fibers to run.
+    // Specifically, if a client sends a huge chunk of data resulting in a very long pipeline,
+    // we want to yield to allow AsyncFiber to actually execute on the pending pipeline.
+    if (ThisFiber::GetRunningTimeCycles() > max_busy_cycles) {
+      ThisFiber::Yield();
+    }
   } while (RedisParser::OK == result && read_buffer.available_bytes > 0 &&
            !reply_builder_->GetError());
 
@@ -1390,6 +1390,9 @@ auto Connection::IoLoop() -> variant<error_code, ParserStatus> {
   ParserStatus parse_status = OK;
 
   size_t max_iobfuf_len = GetFlag(FLAGS_max_client_iobuf_len);
+  unsigned max_busy_read_cycles =
+      (base::CycleClock::Frequency() * GetFlag(FLAGS_max_busy_read_usec)) / 1000000U;
+
   auto* peer = socket_.get();
   recv_buf_.res_len = 0;
 
@@ -1404,12 +1407,16 @@ auto Connection::IoLoop() -> variant<error_code, ParserStatus> {
     bool is_iobuf_full = io_buf_.AppendLen() == 0;
 
     if (redis_parser_) {
-      parse_status = ParseRedis();
+      parse_status = ParseRedis(max_busy_read_cycles);
     } else {
       DCHECK(memcache_parser_);
      parse_status = ParseMemcache();
     }
 
+    if (reply_builder_->GetError()) {
+      return reply_builder_->GetError();
+    }
+
     if (parse_status == NEED_MORE) {
       parse_status = OK;
 
@@ -1429,33 +1436,21 @@ auto Connection::IoLoop() -> variant<error_code, ParserStatus> {
                               [&]() { io_buf_.Reserve(std::min(max_iobfuf_len, parser_hint)); });
         }
 
-        // If we got a partial request and we couldn't parse the length, just
-        // double the capacity.
         // If we got a partial request because iobuf was full, grow it up to
         // a reasonable limit to save on Recv() calls.
-        if (io_buf_.AppendLen() < 64u || (is_iobuf_full && capacity < 4096)) {
+        if (is_iobuf_full && capacity < max_iobfuf_len / 2) {
           // Last io used most of the io_buf to the end.
           UpdateIoBufCapacity(io_buf_, stats_, [&]() {
             io_buf_.Reserve(capacity * 2);  // Valid growth range.
           });
         }
 
         DCHECK_GT(io_buf_.AppendLen(), 0U);
-      } else if (io_buf_.AppendLen() == 0) {
-        // We have a full buffer and we can not progress with parsing.
-        // This means that we have request too large.
-        LOG(ERROR) << "Request is too large, closing connection";
-        parse_status = ERROR;
-        break;
       }
     } else if (parse_status != OK) {
       break;
     }
-    ec = reply_builder_->GetError();
-  } while (peer->IsOpen() && !ec);
-
-  if (ec)
-    return ec;
+  } while (peer->IsOpen());
 
   return parse_status;
 }
@@ -1833,6 +1828,7 @@ void Connection::SendAsync(MessageHandle msg) {
   // Squashing is only applied to redis commands
   if (std::holds_alternative<PipelineMessagePtr>(msg.handle)) {
     pending_pipeline_cmd_cnt_++;
+    pending_pipeline_bytes_ += used_mem;
   }
 
   if (msg.IsControl()) {
@@ -1869,7 +1865,10 @@ void Connection::RecycleMessage(MessageHandle msg) {
 
   // Retain pipeline message in pool.
   if (auto* pipe = get_if<PipelineMessagePtr>(&msg.handle); pipe) {
+    DCHECK_GE(pending_pipeline_bytes_, used_mem);
+    DCHECK_GE(pending_pipeline_cmd_cnt_, 1u);
     pending_pipeline_cmd_cnt_--;
+    pending_pipeline_bytes_ -= used_mem;
     if (stats_->pipeline_cmd_cache_bytes < qbp.pipeline_cache_limit) {
       stats_->pipeline_cmd_cache_bytes += (*pipe)->StorageCapacity();
       pipeline_req_pool_.push_back(std::move(*pipe));
```
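For change (2), the `IoLoop` hunk above replaces the old growth heuristic (grow when fewer than 64 bytes of append space remain, or while capacity is under 4096 bytes) with doubling whenever a read filled the buffer and capacity is still below half of `max_client_iobuf_len`. A rough standalone sketch of just that decision, with a hypothetical `NextIoBufCapacity` helper standing in for the `UpdateIoBufCapacity`/`Reserve` calls:

```cpp
// Sketch only: the new growth rule for the connection's input buffer.
#include <cstddef>

size_t NextIoBufCapacity(size_t capacity, bool is_iobuf_full, size_t max_iobuf_len) {
  // Grow only when the last read used the buffer to the end and we are still
  // far from the configured maximum; doubling keeps reallocations logarithmic
  // while letting more pipelined requests land in a single Recv() call.
  if (is_iobuf_full && capacity < max_iobuf_len / 2) {
    return capacity * 2;
  }
  return capacity;  // otherwise keep the current buffer
}
```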

src/facade/dragonfly_connection.h

+2 -5
```diff
@@ -367,7 +367,7 @@ class Connection : public util::Connection {
   // Create new pipeline request, re-use from pool when possible.
   PipelineMessagePtr FromArgs(RespVec args, mi_heap_t* heap);
 
-  ParserStatus ParseRedis();
+  ParserStatus ParseRedis(unsigned max_busy_cycles);
   ParserStatus ParseMemcache();
 
   void OnBreakCb(int32_t mask);
@@ -427,6 +427,7 @@
   util::fb2::Fiber async_fb_;  // async fiber (if started)
 
   uint64_t pending_pipeline_cmd_cnt_ = 0;  // how many queued Redis async commands in dispatch_q
+  size_t pending_pipeline_bytes_ = 0;      // how many bytes of the queued Redis async commands
 
   // how many bytes of the current request have been consumed
   size_t request_consumed_bytes_ = 0;
@@ -455,10 +456,6 @@
 
   unsigned parser_error_ = 0;
 
-  // amount of times we enqued requests asynchronously during the same async_fiber_epoch_.
-  unsigned async_streak_len_ = 0;
-  uint64_t async_fiber_epoch_ = 0;
-
   BreakerCb breaker_cb_;
 
   // Used by redis parser to avoid allocations
```

src/server/main_service.cc

+2 -2
```diff
@@ -116,8 +116,8 @@ ABSL_FLAG(size_t, serialization_max_chunk_size, 64_KB,
           "Maximum size of a value that may be serialized at once during snapshotting or full "
           "sync. Values bigger than this threshold will be serialized using streaming "
           "serialization. 0 - to disable streaming mode");
-ABSL_FLAG(uint32_t, max_squashed_cmd_num, 32,
-          "Max number of commands squashed in command squash optimizaiton");
+ABSL_FLAG(uint32_t, max_squashed_cmd_num, 100,
+          "Max number of commands squashed in a single shard during squash optimizaiton");
 
 namespace dfly {
 
```

src/server/server_family.cc

+14 -4
```diff
@@ -1314,6 +1314,20 @@ void PrintPrometheusMetrics(uint64_t uptime, const Metrics& m, DflyCmd* dfly_cmd
   AppendMetricWithoutLabels("pipeline_commands_duration_seconds", "",
                             conn_stats.pipelined_cmd_latency * 1e-6, MetricType::COUNTER,
                             &resp->body());
+
+  AppendMetricWithoutLabels("cmd_squash_hop_total", "", m.coordinator_stats.multi_squash_executions,
+                            MetricType::COUNTER, &resp->body());
+
+  AppendMetricWithoutLabels("cmd_squash_commands_total", "", m.coordinator_stats.squashed_commands,
+                            MetricType::COUNTER, &resp->body());
+
+  AppendMetricWithoutLabels("cmd_squash_hop_duration_seconds", "",
+                            m.coordinator_stats.multi_squash_exec_hop_usec * 1e-6,
+                            MetricType::COUNTER, &resp->body());
+  AppendMetricWithoutLabels("cmd_squash_hop_reply_seconds", "",
+                            m.coordinator_stats.multi_squash_exec_reply_usec * 1e-6,
+                            MetricType::COUNTER, &resp->body());
+
   AppendMetricWithoutLabels("commands_squashing_replies_bytes", "",
                             MultiCommandSquasher::GetRepliesMemSize(), MetricType::GAUGE,
                             &resp->body());
@@ -2486,7 +2500,6 @@ string ServerFamily::FormatInfoMetrics(const Metrics& m, std::string_view sectio
     append("total_commands_processed", conn_stats.command_cnt_main + conn_stats.command_cnt_other);
     append("instantaneous_ops_per_sec", m.qps);
     append("total_pipelined_commands", conn_stats.pipelined_cmd_cnt);
-    append("total_pipelined_squashed_commands", m.coordinator_stats.squashed_commands);
     append("pipeline_throttle_total", conn_stats.pipeline_throttle_count);
     append("pipelined_latency_usec", conn_stats.pipelined_cmd_latency);
     append("total_net_input_bytes", conn_stats.io_read_bytes);
@@ -2628,9 +2641,6 @@ string ServerFamily::FormatInfoMetrics(const Metrics& m, std::string_view sectio
     append("eval_shardlocal_coordination_total",
            m.coordinator_stats.eval_shardlocal_coordination_cnt);
     append("eval_squashed_flushes", m.coordinator_stats.eval_squashed_flushes);
-    append("multi_squash_execution_total", m.coordinator_stats.multi_squash_executions);
-    append("multi_squash_execution_hop_usec", m.coordinator_stats.multi_squash_exec_hop_usec);
-    append("multi_squash_execution_reply_usec", m.coordinator_stats.multi_squash_exec_reply_usec);
   };
 
   auto add_repl_info = [&] {
```

tests/dragonfly/connection_test.py

+8 -7
```diff
@@ -1118,14 +1118,14 @@ async def wait_for_stuck_on_send():
 
 
 # Test that the cache pipeline does not grow or shrink under constant pipeline load.
-@dfly_args({"proactor_threads": 1, "pipeline_squash": 9})
+@dfly_args({"proactor_threads": 1, "pipeline_squash": 9, "max_busy_read_usec": 1000})
 async def test_pipeline_cache_only_async_squashed_dispatches(df_factory):
     server = df_factory.create()
     server.start()
 
     client = server.client()
 
-    async def push_pipeline(size=1):
+    async def push_pipeline(size):
         p = client.pipeline(transaction=True)
         for i in range(size):
             p.info()
@@ -1136,14 +1136,15 @@ async def push_pipeline(size=1):
     # should be zero because:
     # We always dispatch the items that will be squashed, so when `INFO` gets called
     # the cache is empty because the pipeline consumed it throughout its execution
-    for i in range(0, 30):
+    # high max_busy_read_usec ensures that the connection fiber has enough time to push
+    # all the commands to reach the squashing limit.
+    for i in range(0, 10):
         # it's actually 11 commands. 8 INFO + 2 from the MULTI/EXEC block that is injected
-        # by the client. Connection fiber yields to dispatch/async fiber when
-        # ++async_streak_len_ >= 10. The minimum to squash is 9 so it will squash the pipeline
+        # by the client. The minimum to squash is 9 so it will squash the pipeline
         # and INFO ALL should return zero for all the squashed commands in the pipeline
         res = await push_pipeline(8)
-        for i in range(1):
-            assert res[i]["pipeline_cache_bytes"] == 0
+        for r in res:
+            assert r["pipeline_cache_bytes"] == 0
 
     # Non zero because we reclaimed/recycled the messages back to the cache
     info = await client.info()
```
