Update metrics generation configuration. #311
Merged
merged 2 commits into from
Jun 23, 2025
120 changes: 109 additions & 11 deletions alloy/config.alloy
@@ -6,6 +6,11 @@ local.file "endpoints" {
filename = "/etc/alloy/endpoints.json"
}

// Set `enabled` to `true` in this block to carry out live debugging when configuring Alloy.
livedebugging {
enabled = false
}
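// (A note, assuming the default Alloy UI setup: when enabled, live debugging output is available from the
// component detail pages of the Alloy UI, which the Docker Compose files in this stack map to a host port.)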

///////////////////////////////////////////////////////////////////////////////
// Metrics scraping

@@ -179,15 +184,17 @@ otelcol.receiver.otlp "otlp_receiver" {
// named 'default'.
output {
traces = [
// Uncomment the next line to generate service graph metrics from Alloy. By default these are generated
// by the Tempo component, so be sure to remove the relevant configuration in the `tempo/tempo.yaml` file.
//otelcol.connector.servicegraph.tracemetrics.input,
// Uncomment the next line to generate span metrics from Alloy. By default these are generated
// by the Tempo component, so be sure to remove the relevant configuration in the `tempo/tempo.yaml` file.
//otelcol.connector.spanmetrics.tracemetrics.input,
// Uncomment the next line to generate service graph and span metrics from Alloy. By default these are
// generated by the Tempo component, so be sure to remove the relevant configuration in the
// `tempo/tempo.yaml` file.
//otelcol.processor.transform.spanmetrics.input,
// The following would be used to tail sample only those traces containing errors.
// Uncomment the following line, then comment out the line below it (the batch processor) to use
// tail sampling.
// NOTE: In this configuration, if metrics are also being generated (see above), then the order in which
// these components are defined is not important. However, in a situation where a serial pipeline is
// defined, metrics generation *must* occur before tail sampling to ensure a true view of all trace
// requests is captured before traces are dropped by the sampler.
//otelcol.processor.tail_sampling.errors.input,
otelcol.processor.batch.default.input,
otelcol.connector.spanlogs.autologging.input,
@@ -357,11 +364,51 @@ otelcol.processor.tail_sampling "errors" {
}
}

// This processor strips out all resource attributes except for the service.name attribute. This is required because
// otherwise the span metrics connector will attempt to generate separate metrics on a per-resource basis, rather than
// a per-span-attribute basis (i.e. multiple metrics for the same service). This would lead to clashes where
// Mimir cannot correctly distinguish between the metrics and therefore drops them as duplicates.
otelcol.processor.transform "spanmetrics" {
// Ignore any errors that occur when transforming the trace data.
error_mode = "ignore"

// Operate only on trace data.
trace_statements {
// Only operate on resource attributes.
context = "resource"

// Only the service.name and ip resource attributes are required by the span metrics connector for metrics
// generation per-service, so we strip all other resource attributes.
statements = [
`keep_keys(attributes, ["service.name", "ip"])`,
]
}

// Output to the span metrics and service graph connectors.
output {
traces = [
otelcol.connector.spanmetrics.tracemetrics.input,
otelcol.connector.servicegraph.tracemetrics.input,
]
}
}
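
// For illustration only (hypothetical attribute set, not part of this config): a span resource carrying
// { service.name="mythical-server", ip="1.2.3.4", host.name="myhost", process.pid="42" } leaves this
// processor with just { service.name="mythical-server", ip="1.2.3.4" }, since keep_keys drops every key
// not named in its list.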

// The Spanmetrics Connector will generate RED metrics based on the incoming trace span data.
otelcol.connector.spanmetrics "tracemetrics" {
// The namespace explicitly adds a prefix to all the generated span metric names.
// In this case, we'll ensure they match as closely as possible those generated by Tempo.
namespace = "traces.spanmetrics"
// The namespace to prefix span metrics with.
// This changes depending on whether we're sending data to Grafana Cloud or not. For local usage, we want to mimic
// the naming convention of the span metrics generator in Tempo, which looks like:
// traces_spanmetrics_<x>
// However, for Grafana Cloud, there are certain products that expect metrics generated by Alloy to be of the form:
// traces_span_metrics_<units>_<x>
// where <units> is in seconds (not the default of milliseconds).
// Because of this, we use the local.file endpoints to add this information depending on target.
namespace = json_path(local.file.endpoints.content, ".spanmetrics_namespace")[0]
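// As a sketch of how this resolves (values taken from the endpoints files in this change):
// endpoints.json supplies "spanmetrics_namespace": "traces.spanmetrics" for local use, while
// endpoints-cloud.json supplies "traces.span.metrics" for Grafana Cloud, so the json_path expression
// above evaluates to one of those two strings depending on which file is mounted.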

// Two DPM (a 30-second flush interval yields two data points per minute per series). Prior versions of the
// OTel connector would flush whenever receiving data, which could cause extremely large DPM values. Be aware
// that when used with Grafana Cloud, this will essentially *double* the active series count for the
// `traces.spanmetrics` metric namespace.
metrics_flush_interval = "30s"

// Each extra dimension (metrics label) to be added to the generated metrics from matching span attributes. These
// need to be defined with a name and optionally a default value (in the following cases, we do not want a default
@@ -382,6 +429,7 @@ otelcol.connector.spanmetrics "tracemetrics" {
// A histogram block must be present, either explicitly defining bucket values or via an exponential block.
// We do the former here; an empty explicit block uses the default buckets.
histogram {
unit = "s"
explicit {
}
}
@@ -391,15 +439,23 @@ otelcol.connector.spanmetrics "tracemetrics" {
enabled = true
}

// Whilst this connector does correctly generate RED metrics, the naming convention is subtly different
// from Tempo's, even with the namespace prefix.

// Generated metrics data is in OTLP format. We send this data (via the metric renaming processor below) to the
// OpenTelemetry Prometheus exporter to ensure it gets transformed into Prometheus format data.
output {
metrics = [otelcol.exporter.prometheus.tracemetrics.input]
//metrics = [otelcol.exporter.prometheus.tracemetrics.input]
metrics = [otelcol.processor.transform.metric_rename.input]
}
}

// The Servicegraph Connector will generate service graph metrics (edges and nodes) based on incoming trace spans.
otelcol.connector.servicegraph "tracemetrics" {
// One DPM (a 60-second flush interval yields one data point per minute per series). Prior versions of the
// OTel connector would flush whenever receiving data, which could cause extremely large DPM values.
metrics_flush_interval = "60s"

// Extra dimensions (metrics labels) to be added to the generated metrics from matching span attributes.
// For this component, this is defined as an array. There are no default values and the labels will not be generated
// for missing span attributes.
@@ -417,8 +473,49 @@ otelcol.connector.servicegraph "tracemetrics" {
}
}

// This processor renames the span metrics names to match those that are generated by Tempo.
otelcol.processor.transform "metric_rename" {
// Ignore any errors that occur when transforming the metric data.
error_mode = "ignore"

// Operate only on metric statements.
metric_statements {
// Use the metric context to operate on the metric data.
context = "metric"
// Tempo generates the `traces_spanmetrics_latency_[bucket/sum/count]` metrics, but the OTel connector
// generates the `traces.spanmetrics.duration.[bucket/sum/count]` metrics. We need to rename the metrics
// to match the Tempo naming convention.
// We also need to rename the `traces.spanmetrics.calls` metric to `traces.spanmetrics.calls.total`, because
// the counter metric requires a `total` suffix, but we need to strip the unit suffix from the
// traces_spanmetrics_latency histogram so that our dashboards/Grafana Cloud will still operate correctly.
statements = [
`set(metric.name, "traces.spanmetrics.latency") where metric.name == "traces.spanmetrics.duration"`,
`set(metric.name, "traces.spanmetrics.calls.total") where metric.name == "traces.spanmetrics.calls"`,
]
}


// Forward to the Prometheus exporter
output {
metrics = [otelcol.exporter.prometheus.tracemetrics.input]
}
}
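
// For illustration (assumed sample names, not emitted verbatim by this config): once the Prometheus
// exporter below converts dots to underscores, the renamed histogram arrives in Mimir as
// `traces_spanmetrics_latency_bucket`, `traces_spanmetrics_latency_sum` and
// `traces_spanmetrics_latency_count`, and the renamed counter as `traces_spanmetrics_calls_total`,
// which is the name the dashboard query in `grafana/definitions/traces-in-dashboards.json` expects.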

// The OpenTelemetry Prometheus exporter will transform incoming OTLP metrics data into Prometheus format data.
// Technically, we could send this straight to the Prometheus remote writer, but this shows an example of an
// intermediary step to transform the metrics (here, controlling whether suffixes are applied), although the
// remote writer will also do this if configured to do so.
otelcol.exporter.prometheus "tracemetrics" {
// Whether or not to add unit suffixes to the metrics.
// This changes depending on whether we're sending data to Grafana Cloud or not. For local usage, we want to mimic
// the naming convention of the span metrics generator in Tempo, which looks like:
// traces_spanmetrics_<x>
// However, for Grafana Cloud, there are certain products that expect metrics generated by Alloy to be of the form:
// traces_span_metrics_<units>_<x>
// where <units> is in seconds (not the default of milliseconds).
// Because of this, we use the local.file endpoints to add this information depending on target.
add_metric_suffixes = json_path(local.file.endpoints.content, ".add_spanmetric_suffixes")[0]

// Forward to our local Prometheus remote writer which will send the metrics to Mimir.
forward_to = [prometheus.remote_write.mimir.receiver]
}
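
// A sketch of the effect (assumed example names): locally, with suffixes disabled, the renamed histogram
// is written as `traces_spanmetrics_latency_bucket`; for Grafana Cloud, with suffixes enabled and the
// histogram unit set to seconds above, the exporter appends unit suffixes, producing the
// `traces_span_metrics_..._seconds_...` form described in the comment above.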
@@ -438,9 +535,9 @@ pyroscope.receive_http "mythical" {
forward_to = [pyroscope.write.mythical.receiver]
}


// Scrape the Mythical application services for profiling data.
// This is deprecated in later versions of Grafana Pyroscope, but kept here for reference.
/*
pyroscope.scrape "mythical" {
// Denotes the targets to be scraped, in this case the mythical server, requester and recorder.
targets = [
@@ -477,6 +574,7 @@ pyroscope.scrape "mythical" {
// Forward all scraped data to the Pyroscope exporter.
forward_to = [pyroscope.write.mythical.receiver]
}
*/

// The Pyroscope exporter writes data with any additional information to the local Pyroscope instance.
pyroscope.write "mythical" {
4 changes: 3 additions & 1 deletion alloy/endpoints-cloud.json
@@ -27,5 +27,7 @@
"username": "<profilesUsername>",
"password": "<profilesPassword>"
}
}
},
"add_spanmetric_suffixes" : true,
"spanmetrics_namespace": "traces.span.metrics"
}
4 changes: 3 additions & 1 deletion alloy/endpoints.json
@@ -27,5 +27,7 @@
"username": "",
"password": ""
}
}
},
"add_spanmetric_suffixes" : false,
"spanmetrics_namespace": "traces.spanmetrics"
}
8 changes: 7 additions & 1 deletion docker-compose-cloud.yml
@@ -7,7 +7,7 @@ services:
# auto-logs from those traces.
# Includes Metrics, Logs, Traces and Profiles.
alloy:
image: grafana/alloy:v1.6.1
image: grafana/alloy:v1.8.1
ports:
- "12347:12345"
- "12348:12348"
@@ -70,6 +70,8 @@ services:
- LOGS_TARGET=http://alloy:3100/loki/api/v1/push
- TRACING_COLLECTOR_HOST=alloy
- TRACING_COLLECTOR_PORT=4317
- PROFILE_COLLECTOR_HOST=alloy
- PROFILE_COLLECTOR_PORT=4040
- OTEL_EXPORTER_OTLP_TRACES_INSECURE=true
- OTEL_RESOURCE_ATTRIBUTES=ip=1.2.3.4

@@ -94,6 +96,8 @@ services:
- LOGS_TARGET=http://alloy:3100/loki/api/v1/push
- TRACING_COLLECTOR_HOST=alloy
- TRACING_COLLECTOR_PORT=4317
- PROFILE_COLLECTOR_HOST=alloy
- PROFILE_COLLECTOR_PORT=4040
- OTEL_EXPORTER_OTLP_TRACES_INSECURE=true
- OTEL_RESOURCE_ATTRIBUTES=ip=1.2.3.5

@@ -116,5 +120,7 @@ services:
- LOGS_TARGET=http://alloy:3100/loki/api/v1/push
- TRACING_COLLECTOR_HOST=alloy
- TRACING_COLLECTOR_PORT=4317
- PROFILE_COLLECTOR_HOST=alloy
- PROFILE_COLLECTOR_PORT=4040
- OTEL_EXPORTER_OTLP_TRACES_INSECURE=true
- OTEL_RESOURCE_ATTRIBUTES=ip=1.2.3.5
2 changes: 1 addition & 1 deletion grafana/definitions/traces-in-dashboards.json
@@ -108,7 +108,7 @@
"uid": "mimir"
},
"editorMode": "code",
"expr": "sum by (http_method)(rate(traces_spanmetrics_calls_total{service=\"mythical-server\",http_method=~\"${httpMethod}\"}[1m]))",
"expr": "sum by (http_method)(rate(traces_spanmetrics_calls_total{service_name=\"mythical-server\",http_method=~\"${httpMethod}\"}[1m]))",
"legendFormat": "",
"range": true,
"refId": "A"
38 changes: 29 additions & 9 deletions otel/otel.yml
@@ -87,7 +87,6 @@ processors:
# Use the default values for it.
batch:


# The tail sampler processor will only keep traces where spans match the defined policies.
tail_sampling:
decision_wait: 30s # The time to wait for a decision to be made.
@@ -109,12 +108,29 @@ processors:
},
]

# The transform processor is used to rename the span metrics to match the Tempo naming convention.
transform:
# Only operate on metric statements.
metric_statements:
# Operate on the metric data.
- context: metric
statements:
# Rename the `traces.spanmetrics.duration` metric to `traces.spanmetrics.latency`.
- set(metric.name, "traces.spanmetrics.latency") where metric.name == "traces.spanmetrics.duration"
# Rename the `traces.spanmetrics.calls` metric to `traces.spanmetrics.calls.total`, so the name already carries the `total` suffix.
- set(metric.name, "traces.spanmetrics.calls.total") where metric.name == "traces.spanmetrics.calls"


# Define connectors, which join pipelines by consuming data in one pipeline and emitting it into another.
# See https://opentelemetry.io/docs/collector/configuration/#connectors
connectors:
# The spanmetrics connector is used to output span metrics based on received trace spans.
spanmetrics:
namespace: traces.spanmetrics # Prefix all metrics with `traces.spanmetrics` (this becomes `traces_spanmetrics`).
# Explicitly flush metrics every 30 seconds. Note, this will double the active series count for the
# `traces.spanmetrics` metric namespace.
metrics_flush_interval: 30s

# Determine the type of histogram to use for span metrics.
histogram:
explicit: # Explicit histograms have pre-defined bucket sizes (use default here).
@@ -130,6 +146,9 @@ connectors:

# The servicegraph connector is used to output service node metrics based on received trace spans.
servicegraph:
# Explicitly flush metrics every 60 seconds. Note, this will double the active series count for the
# `traces.servicegraph` metric namespace.
metrics_flush_interval: 60s
# Defines which exporter the processor will write metrics to.
metrics_exporter: prometheusremotewrite
# Defines additional label dimensions of the metrics from trace span attributes present.
@@ -159,6 +178,10 @@ exporters:

# Exporter for sending Prometheus data to Mimir.
prometheusremotewrite:
# Don't add suffixes to the metrics. We've already renamed the `traces.spanmetrics.calls` metric to
# `traces.spanmetrics.calls.total`, and we don't want to add the `_milliseconds` suffix to the
# `traces.spanmetrics.latency` metric.
add_metric_suffixes: false
# Send to the locally running Mimir service.
endpoint: http://mimir:9009/api/v1/push
# TLS is not enabled for the instance.
@@ -177,23 +200,20 @@ service:
processors: [batch]
# Comment out other `processor` definitions and uncomment the line below to use tail sampling.
#processors: [tail_sampling, batch]
# Comment out other `processor` definitions and uncomment the line below to generate service graph metrics
# from within the OpenTelemetry Collector.
#processors: [servicegraph, batch]
# Export to the `otlp/grafana` exporter.
exporters: [otlp/grafana]
#exporters: [otlp/grafana]
# Comment out other `exporters` definitions and uncomment the line below to generate span metrics
# from within the OpenTelemetry Collector as well as exporting traces to Tempo.
#exporters: [otlp/grafana, spanmetrics]
exporters: [otlp/grafana, spanmetrics, servicegraph]

# Define the metrics pipeline.
metrics:
# Receive metrics from the `prometheus` receiver.
receivers: [otlp, prometheus]
#receivers: [otlp, prometheus]
# Comment out other `receivers` definitions and uncomment the line below to import spanmetrics as well
# as prometheus metrics.
#receivers: [otlp, prometheus, spanmetrics]
receivers: [otlp, prometheus, spanmetrics, servicegraph]
# Use the `batch` processor to process received metrics.
processors: [batch]
processors: [transform, batch]
# Export to the `prometheusremotewrite` exporter.
exporters: [prometheusremotewrite]