Skip to content

Commit bd20f80

Browse files
chore: update docs for the telemetry crate (#25431)
* chore: update docs for the telemetry crate * chore: Update influxdb3_telemetry/src/stats.rs Co-authored-by: Michael Gattozzi <mgattozzi@influxdata.com> * chore: Update influxdb3_telemetry/src/sender.rs Co-authored-by: Michael Gattozzi <mgattozzi@influxdata.com> --------- Co-authored-by: Michael Gattozzi <mgattozzi@influxdata.com>
1 parent 42672e0 commit bd20f80

File tree

5 files changed

+64
-6
lines changed

5 files changed

+64
-6
lines changed

influxdb3_telemetry/src/bucket.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,10 @@ use observability_deps::tracing::debug;
22

33
use crate::stats::Stats;
44

5+
/// This bucket holds all the event metrics like reads/writes. As
6+
/// a new read or write comes in, we update the stats for them. Then once
7+
/// a minute when a sample is taken these metrics are reset to collect
8+
/// the events again until the next sample is taken.
59
#[derive(Debug, Default)]
610
pub(crate) struct EventsBucket {
711
pub writes: PerMinuteWrites,

influxdb3_telemetry/src/sampler.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@ impl CpuAndMemorySampler {
1515
Self { system }
1616
}
1717

18+
/// This method reads the memory and CPU usage for this process using the
19+
/// pid.
1820
pub fn get_cpu_and_mem_used(&mut self) -> Result<(f32, u64)> {
1921
let pid = sysinfo::get_current_pid().map_err(TelemetryError::CannotGetPid)?;
2022
self.system.refresh_pids_specifics(

influxdb3_telemetry/src/sender.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ impl TelemetrySender {
3030
}
3131
}
3232

33+
/// This is the actual payload that is sent to the telemetry
34+
/// server
3335
#[derive(Serialize, Debug)]
3436
pub(crate) struct TelemetryPayload {
3537
pub os: Arc<str>,
@@ -68,6 +70,8 @@ pub(crate) struct TelemetryPayload {
6870
pub parquet_row_count: u64,
6971
}
7072

73+
/// This function runs in the background; if any call fails,
74+
/// there is no retry mechanism and it is OK to lose a few samples.
7175
pub(crate) async fn send_telemetry_in_background(
7276
store: Arc<TelemetryStore>,
7377
duration_secs: Duration,

influxdb3_telemetry/src/stats.rs

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,17 @@
11
use num::{Num, NumCast};
22

3+
/// This type is responsible for calculating stats in a rolling fashion.
4+
/// By rolling, we mean that there are already some stats calculated
5+
/// which need to be further aggregated. This is commonly the case when
6+
/// the sampling is done at a higher precision interval (say 1 minute) and
7+
/// then further aggregated (say 1 hour).
8+
///
9+
/// For example the number of lines written per hour is collected as new
10+
/// write requests come in. However, the bucket [`crate::bucket::EventsBucket`]
11+
/// holds `lines` as [`crate::stats::Stats<u64>`], to hold min/max/avg lines
12+
/// written per minute. Then when taking samples per minute to calculate
13+
/// hourly aggregates, [`RollingStats<T>`] is used. To see how it is calculated
14+
/// see the [`RollingStats<T>::update`] method
315
#[derive(Debug, Default)]
416
pub(crate) struct RollingStats<T> {
517
pub min: T,
@@ -13,6 +25,12 @@ impl<T: Default + Num + Copy + NumCast + PartialOrd> RollingStats<T> {
1325
RollingStats::default()
1426
}
1527

28+
/// Update the rolling stats [`Self::min`]/[`Self::max`]/[`Self::avg`] using
29+
/// reference to a higher precision stats that is passed in. This is usually a
30+
/// per minute interval stats. One thing to note here is the [`Self::num_samples`]
31+
/// is updated locally here to calculate the rolling average for usually
32+
/// an hour for a metric. Refer to [`crate::metrics::Writes`] or
33+
/// [`crate::metrics::Queries`] to see how it is used
1634
pub fn update(&mut self, higher_precision_stats: &Stats<T>) -> Option<()> {
1735
if self.num_samples == 0 {
1836
self.min = higher_precision_stats.min;
@@ -41,6 +59,8 @@ impl<T: Default + Num + Copy + NumCast + PartialOrd> RollingStats<T> {
4159
}
4260
}
4361

62+
/// These are basic stats to keep tabs on min/max/avg for a specific
63+
/// metric.
4464
#[derive(Debug, Default)]
4565
pub(crate) struct Stats<T> {
4666
pub min: T,
@@ -54,6 +74,8 @@ impl<T: Default + Num + Copy + NumCast + PartialOrd> Stats<T> {
5474
Stats::default()
5575
}
5676

77+
/// Update the [`Self::min`]/[`Self::max`]/[`Self::avg`] from a
78+
/// new value that is sampled.
5779
pub fn update(&mut self, new_val: T) -> Option<()> {
5880
if self.num_samples == 0 {
5981
self.min = new_val;
@@ -75,6 +97,22 @@ impl<T: Default + Num + Copy + NumCast + PartialOrd> Stats<T> {
7597
}
7698
}
7799

100+
/// Generic function to calculate min/max/avg from another set of stats.
101+
/// This function works for all types of numbers (unsigned/signed/floats).
102+
/// It calculates min/max/avg by using already calculated min/max/avg for
103+
/// possibly a higher resolution.
104+
///
105+
/// For example:
106+
///
107+
/// Let's say we're looking at the stats for number of lines written.
108+
/// Say the 1st sample's minimum was 20 and the 3rd sample's
109+
/// minimum was 10. This means in the 1st sample for a whole minute
110+
/// 20 was the minimum number of lines written in a single request and in
111+
/// the 3rd sample (3rd minute) 10 is the minimum number of lines written
112+
/// in a single request. These are already stats at per minute interval, when we
113+
/// calculate the minimum number of lines for the whole hour we compare the samples
114+
/// taken at per minute interval for whole hour. In this case 10 will be the new
115+
/// minimum for the whole hour.
78116
fn rollup_stats<T: Num + Copy + NumCast + PartialOrd>(
79117
current_min: T,
80118
current_max: T,
@@ -91,6 +129,10 @@ fn rollup_stats<T: Num + Copy + NumCast + PartialOrd>(
91129
Some((min, max, avg))
92130
}
93131

132+
/// Generic function to calculate min/max/avg from a new sampled value.
133+
/// This function works for all types of numbers (unsigned/signed/floats).
134+
/// One thing to note here is the average function, it is an incremental average
135+
/// to avoid holding all the samples in memory.
94136
fn stats<T: Num + Copy + NumCast + PartialOrd>(
95137
current_min: T,
96138
current_max: T,

influxdb3_telemetry/src/store.rs

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,18 @@ use crate::{
1111
sender::{send_telemetry_in_background, TelemetryPayload},
1212
};
1313

14-
/// This store is responsible for holding all the stats which
15-
/// will be sent in the background to the server.
14+
/// This store is responsible for holding all the stats which will be sent in the background
15+
/// to the server. There are primarily 4 different types of data held in the store:
16+
/// - static info (like instance ids, OS etc): These are passed in to create telemetry store.
17+
/// - hourly samples (like parquet file metrics): These are sampled at the point of creating
18+
/// payload before sending to the server.
19+
/// - rates (cpu/mem): These are sampled every minute but these are regular time
20+
/// series data. These metrics are backed by [`crate::stats::Stats<T>`] type.
21+
/// - events (reads/writes): These are just raw events; in order to convert them into a
22+
/// time series, they are collected in a bucket first and then sampled at a per-minute interval.
23+
/// These metrics are usually backed by [`crate::stats::RollingStats<T>`] type.
24+
/// There are a couple of metrics, like the number of writes/reads, that are backed by just the
25+
/// [`crate::stats::Stats<T>`] type, as they are just per-minute counters.
1626
#[derive(Debug)]
1727
pub struct TelemetryStore {
1828
inner: parking_lot::Mutex<TelemetryStoreInner>,
@@ -151,14 +161,10 @@ impl TelemetryStoreInner {
151161
influx_version,
152162
storage_type,
153163
cores,
154-
// cpu
155164
cpu: Cpu::default(),
156-
// mem
157165
memory: Memory::default(),
158166
per_minute_events_bucket: EventsBucket::new(),
159-
// writes
160167
writes: Writes::default(),
161-
// reads
162168
reads: Queries::default(),
163169
}
164170
}

0 commit comments

Comments
 (0)