Commit 1f3c657

Merge pull request MaterializeInc#3392 from nacrooks/reducemetadata
Optimise Kafka metadata requests

This PR:
1) adds jitter to metadata requests so that workers do not all call fetch_metadata simultaneously;
2) ensures that fetch_watermarks is called only by the worker responsible for each partition;
3) fixes a bug in which the mutex protecting partition_count was held while performing the metadata fetch.
2 parents e3e0822 + 2edc9a3 commit 1f3c657
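
For reference, the partition-ownership rule from point 2) is a simple modulo over the worker count. The sketch below is a standalone illustration: the free-function form and its arguments are hypothetical, while in the PR the check is the method DataPlaneInfo::has_partition plus an inline condition in metadata_fetch (see the diff below).

// Minimal sketch of the ownership check this PR relies on: a worker owns a
// partition when the partition id modulo the worker count equals its own id.
fn has_partition(worker_id: i32, worker_count: i32, partition_id: i32) -> bool {
    (partition_id % worker_count) == worker_id
}

// Example: with two workers, worker 0 owns partitions 0, 2, 4, ... and
// worker 1 owns partitions 1, 3, 5, ...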

File tree

3 files changed (+38 -23 lines)

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default.

src/dataflow/Cargo.toml

Lines changed: 1 addition & 0 deletions
@@ -29,6 +29,7 @@ ore = { path = "../ore" }
 pdqselect = "0.1.0"
 prometheus = { git = "https://github.com/MaterializeInc/rust-prometheus.git", default-features = false }
 prometheus-static-metric = { git = "https://github.com/MaterializeInc/rust-prometheus.git" }
+rand = "0.7.3"
 rdkafka = { git = "https://github.com/fede1024/rust-rdkafka.git", features = ["cmake-build", "ssl-vendored", "gssapi-vendored", "libz-static"] }
 regex = "1.3.9"
 repr = { path = "../repr" }

src/dataflow/src/source/kafka.rs

Lines changed: 36 additions & 23 deletions
@@ -7,6 +7,7 @@
 // the Business Source License, use of this software will be governed
 // by the Apache License, Version 2.0.

+use rand::Rng;
 use std::collections::{HashMap, HashSet, VecDeque};
 use std::convert::TryInto;
 use std::sync::atomic::{AtomicBool, Ordering};
@@ -369,13 +370,17 @@ impl DataPlaneInfo {
         let refresh = self.refresh_metadata_info.clone();
         let id = self.source_id.clone();
         let topic = self.topic_name.clone();
+        let worker_id = self.worker_id;
+        let worker_count = self.worker_count;
         thread::spawn(move || {
             metadata_fetch(
                 timestamping_stopped,
                 consumer,
                 refresh,
                 &id,
                 &topic,
+                worker_id,
+                worker_count,
                 metadata_refresh_frequency,
             )
         });
@@ -421,8 +426,6 @@ impl DataPlaneInfo {
     }

     /// Returns true if this worker is responsible for this partition
-    /// If multi-worker reading is not enabled, this worker is *always* responsible for the
-    /// partition
     /// Ex: if pid=0 and worker_id = 0, then true
     ///     if pid=1 and worker_id = 0, then false
     fn has_partition(&self, partition_id: i32) -> bool {
@@ -760,12 +763,15 @@ fn activate_source_timestamping<G>(

 /// This function is responsible for refreshing the number of known partitions. It marks the source
 /// has needing to be refreshed if new partitions are detected.
+#[allow(clippy::too_many_arguments)]
 fn metadata_fetch(
     timestamping_stopped: Arc<AtomicBool>,
     consumer: Arc<BaseConsumer<GlueConsumerContext>>,
     partition_count: Arc<Mutex<Option<i32>>>,
     id: &str,
     topic: &str,
+    worker_id: i32,
+    worker_count: i32,
     wait: Duration,
 ) {
     debug!(
@@ -783,6 +789,7 @@ fn metadata_fetch(
     }

     let mut partition_kafka_metadata: HashMap<i32, IntGauge> = HashMap::new();
+    let mut rng = rand::thread_rng();

     while !timestamping_stopped.load(Ordering::SeqCst) {
         let metadata = consumer.fetch_metadata(Some(&topic), Duration::from_secs(30));
@@ -806,37 +813,40 @@ fn metadata_fetch(
                     continue;
                 }
                 new_partition_count = metadata_topic.partitions().len();
-                let mut refresh_data = partition_count.lock().expect("lock poisoned");

                 // Upgrade partition metrics
                 for p in 0..new_partition_count {
                     let pid = p.try_into().unwrap();
-                    match consumer.fetch_watermarks(&topic, pid, Duration::from_secs(1)) {
-                        Ok((_, high)) => {
-                            if let Some(max_available_offset) =
-                                partition_kafka_metadata.get_mut(&pid)
-                            {
-                                max_available_offset.set(high)
-                            } else {
-                                let max_offset = MAX_AVAILABLE_OFFSET.with_label_values(&[
-                                    topic,
-                                    &id,
-                                    &pid.to_string(),
-                                ]);
-                                max_offset.set(high);
-                                partition_kafka_metadata.insert(pid, max_offset);
+                    // Only check metadata updates for partitions that the worker owns
+                    if (pid % worker_count) == worker_id {
+                        match consumer.fetch_watermarks(&topic, pid, Duration::from_secs(1)) {
+                            Ok((_, high)) => {
+                                if let Some(max_available_offset) =
+                                    partition_kafka_metadata.get_mut(&pid)
+                                {
+                                    max_available_offset.set(high)
+                                } else {
+                                    let max_offset = MAX_AVAILABLE_OFFSET.with_label_values(&[
+                                        topic,
+                                        &id,
+                                        &pid.to_string(),
+                                    ]);
+                                    max_offset.set(high);
+                                    partition_kafka_metadata.insert(pid, max_offset);
+                                }
                             }
+                            Err(e) => warn!(
+                                "error loading watermarks topic={} partition={} error={}",
+                                topic, p, e
+                            ),
                         }
-                        Err(e) => warn!(
-                            "error loading watermarks topic={} partition={} error={}",
-                            topic, p, e
-                        ),
                     }
                 }

                 // Kafka partition are i32, and Kafka consequently cannot support more than i32
                 // partitions
-                *refresh_data = Some(new_partition_count.try_into().unwrap());
+                *partition_count.lock().expect("lock poisoned") =
+                    Some(new_partition_count.try_into().unwrap());
             }
             Err(e) => {
                 new_partition_count = 0;
@@ -845,7 +855,10 @@ fn metadata_fetch(
         }

         if new_partition_count > 0 {
-            thread::sleep(wait);
+            // Add jitter to spread-out metadata requests from workers. Brokers can get overloaded
+            // if all workers make simultaneous metadata request calls.
+            let sleep_jitter = rng.gen_range(Duration::from_secs(0), Duration::from_secs(15));
+            thread::sleep(wait + sleep_jitter);
         } else {
             // If no partitions have been detected yet, sleep for a second rather than
             // the specified "wait" period of time, as we know that there should at least be one

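Taken together, changes 1) and 3) boil down to publishing the refreshed partition count under a briefly held lock and then sleeping with random jitter. Below is a minimal sketch assuming rand 0.7 (as pinned in Cargo.toml above); the function name publish_and_wait is illustrative, since in the PR this logic is inlined in metadata_fetch.

use rand::Rng;
use std::sync::Mutex;
use std::thread;
use std::time::Duration;

// Sketch of the pattern adopted by this PR: take the lock only for the
// assignment itself (never across Kafka metadata calls), then sleep for the
// base refresh interval plus up to 15 seconds of random jitter so workers
// do not all refresh metadata at the same instant.
fn publish_and_wait(partition_count: &Mutex<Option<i32>>, new_count: i32, wait: Duration) {
    *partition_count.lock().expect("lock poisoned") = Some(new_count);

    let mut rng = rand::thread_rng();
    // rand 0.7 takes (low, high) arguments here; rand 0.8+ would take a range.
    let jitter = rng.gen_range(Duration::from_secs(0), Duration::from_secs(15));
    thread::sleep(wait + jitter);
}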