diff --git a/alloy/config.alloy b/alloy/config.alloy
index 3592806..28c60d4 100644
--- a/alloy/config.alloy
+++ b/alloy/config.alloy
@@ -6,6 +6,11 @@ local.file "endpoints" {
     filename = "/etc/alloy/endpoints.json"
 }
 
+// Enable this section to carry out live debugging when configuring Alloy.
+livedebugging {
+    enabled = false
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 // Metrics scraping
 
@@ -179,15 +184,17 @@ otelcol.receiver.otlp "otlp_receiver" {
     // named 'default'.
     output {
         traces = [
-            // Uncomment the next line to generate service graph metrics from the Alloy. By default this is generated
-            // by the Tempo component, so be sure to remove the relevant configuration in the `tempo/tempo.yaml` file.
-            //otelcol.connector.servicegraph.tracemetrics.input,
-            // Uncomment the next line to generate span metrics from the Alloy. By default this is generated
-            // by the Tempo component, so be sure to remove the relevant configuration in the `tempo/tempo.yaml` file.
-            //otelcol.connector.spanmetrics.tracemetrics.input,
+            // Uncomment the next line to generate service graph and span metrics from Alloy. By default these are
+            // generated by the Tempo component, so be sure to remove the relevant configuration in the
+            // `tempo/tempo.yaml` file.
+            //otelcol.processor.transform.spanmetrics.input,
             // The following would be used for tail sampling only traces containing errors.
             // Uncomment the following line, then comment out the line below it (the batch processor) to use
             // tail sampling.
+            // NOTE: In this configuration, if metrics are also being generated (see above), then the order in which
+            // these components are defined is not important. However, in a situation where a serial pipeline is
+            // defined, metrics generation *must* occur before tail sampling to ensure a true view of all trace
+            // requests is captured before traces are dropped by the sampler.
             //otelcol.processor.tail_sampling.errors.input,
             otelcol.processor.batch.default.input,
             otelcol.connector.spanlogs.autologging.input,
@@ -357,11 +364,51 @@ otelcol.processor.tail_sampling "errors" {
     }
 }
 
+// This processor strips out all resource attributes except for the service.name and ip attributes. This is required
+// because otherwise the span metrics connector will attempt to generate separate metrics on a per-resource basis,
+// rather than a per-span attribute basis (i.e. multiple metrics for the same service). This will lead to clashes where
+// Mimir will not be able to correctly distinguish between the metrics and therefore drop them as duplicates.
+otelcol.processor.transform "spanmetrics" {
+    // Ignore any errors that occur when transforming the trace data.
+    error_mode = "ignore"
+
+    // Operate only on trace data.
+    trace_statements {
+        // Only operate on resource attributes.
+        context = "resource"
+
+        // Only the service.name resource attribute is required by the span metrics connector for metrics generation
+        // per-service, so we keep it (and the ip attribute) and strip all other resource attributes.
+        statements = [
+            `keep_keys(attributes, ["service.name", "ip"])`,
+        ]
+    }
+
+    // Output to the span metrics and service graph connectors.
+    output {
+        traces = [
+            otelcol.connector.spanmetrics.tracemetrics.input,
+            otelcol.connector.servicegraph.tracemetrics.input,
+        ]
+    }
+}
+
 // The Spanmetrics Connector will generate RED metrics based on the incoming trace span data.
 otelcol.connector.spanmetrics "tracemetrics" {
-    // The namespace explicit adds a prefix to all the generated span metrics names.
-    // In this case, we'll ensure they match as closely as possible those generated by Tempo.
-    namespace = "traces.spanmetrics"
+    // The namespace to prefix span metrics with.
+    // This changes depending on whether we're sending data to Grafana Cloud or not. For local usage, we want to mimic
+    // the naming convention of the span metrics generator in Tempo, which looks like:
+    //   traces_spanmetrics_<metric>
+    // However, for Grafana Cloud, there are certain products that expect metrics generated by Alloy to be of the form:
+    //   traces_span_metrics_<metric>_<unit>
+    // where <unit> is in seconds (not the default of milliseconds).
+    // Because of this, we use the local.file endpoints to add this information depending on target.
+    namespace = json_path(local.file.endpoints.content, ".spanmetrics_namespace")[0]
+
+    // Two DPM. Prior versions of the OTel connector would flush whenever receiving data, which could cause extremely
+    // large DPM values. Be aware that when used with Grafana Cloud, this will essentially *double* the active series
+    // count for the `traces.spanmetrics` metric namespace.
+    metrics_flush_interval = "30s"
 
     // Each extra dimension (metrics label) to be added to the generated metrics from matching span attributes. These
     // need to be defined with a name and optionally a default value (in the following cases, we do not want a default
@@ -382,6 +429,7 @@ otelcol.connector.spanmetrics "tracemetrics" {
     // A histogram block must be present, either explicitly defining bucket values or via an exponential block.
     // We do the latter here.
     histogram {
+        unit = "s"
         explicit { }
     }
 
@@ -391,15 +439,23 @@
         enabled = true
     }
 
+    // Whilst this connector does correctly generate RED metrics, the naming convention is subtly different to that of
+    // Tempo, even with the prefixed namespace, so we pass the generated metrics through the `metric_rename` transform
+    // processor defined further below.
+
     // Generated metrics data is in OTLP format. We send this data to the OpenTelemetry Prometheus exporter to ensure
     // it gets transformed into Prometheus format data.
     output {
-        metrics = [otelcol.exporter.prometheus.tracemetrics.input]
+        //metrics = [otelcol.exporter.prometheus.tracemetrics.input]
+        metrics = [otelcol.processor.transform.metric_rename.input]
     }
 }
 
 // The Servicegraph Connector will generate service graph metrics (edges and nodes) based on incoming trace spans.
 otelcol.connector.servicegraph "tracemetrics" {
+    // One DPM. Prior versions of the OTel connector would flush whenever receiving data, which could cause extremely
+    // large DPM values.
+    metrics_flush_interval = "60s"
+
     // Extra dimensions (metrics labels) to be added to the generated metrics from matching span attributes.
     // For this component, this is defined as an array. There are no default values and the labels will not be generated
     // for missing span attributes.
@@ -417,8 +473,49 @@
     }
 }
 
+// This processor renames the span metrics names to match those that are generated by Tempo.
+otelcol.processor.transform "metric_rename" {
+    // Ignore any errors that occur when transforming the metric data.
+    error_mode = "ignore"
+
+    // Operate only on metric statements.
+    metric_statements {
+        // Use the metric context to operate on the metric data.
+        context = "metric"
+        // Tempo generates the `traces_spanmetrics_latency_[bucket/sum/count]` metrics, but the OTel connector
+        // generates the `traces.spanmetrics.duration.[bucket/sum/count]` metrics. We need to rename the metrics
+        // to match the Tempo naming convention.
+        // We also need to rename the `traces.spanmetrics.calls` metric to `traces.spanmetrics.calls.total`, because
+        // the counter metric requires a `_total` suffix, but we need to strip the unit suffix from the
+        // traces_spanmetrics_latency histogram so that our dashboards/Grafana Cloud will still operate correctly.
+        statements = [
+            `set(metric.name, "traces.spanmetrics.latency") where metric.name == "traces.spanmetrics.duration"`,
+            `set(metric.name, "traces.spanmetrics.calls.total") where metric.name == "traces.spanmetrics.calls"`,
+        ]
+    }
+
+    // Forward to the Prometheus exporter.
+    output {
+        metrics = [otelcol.exporter.prometheus.tracemetrics.input]
+    }
+}
+
 // The OpenTelemetry Prometheus exporter will transform incoming OTLP metrics data into Prometheus format data.
+// Technically, we could send this straight to the Prometheus remote writer, but this shows an example of an
+// intermediary step to transform the metrics (not applying unit suffixes), although the remote writer will also do
+// this if configured to do so.
 otelcol.exporter.prometheus "tracemetrics" {
+    // Whether or not to add unit suffixes to the metrics.
+    // This changes depending on whether we're sending data to Grafana Cloud or not. For local usage, we want to mimic
+    // the naming convention of the span metrics generator in Tempo, which looks like:
+    //   traces_spanmetrics_<metric>
+    // However, for Grafana Cloud, there are certain products that expect metrics generated by Alloy to be of the form:
+    //   traces_span_metrics_<metric>_<unit>
+    // where <unit> is in seconds (not the default of milliseconds).
+    // Because of this, we use the local.file endpoints to add this information depending on target.
+    add_metric_suffixes = json_path(local.file.endpoints.content, ".add_spanmetric_suffixes")[0]
+
     // Forward to our local Prometheus remote writer which will send the metrics to Mimir.
     forward_to = [prometheus.remote_write.mimir.receiver]
 }
@@ -438,9 +535,9 @@ pyroscope.receive_http "mythical" {
     forward_to = [pyroscope.write.mythical.receiver]
 }
 
-
 // Scrape the Mythical application services for profiling data.
 // This is deprecated for later versions of Grafana Pyroscope, but kept here for reference.
+/*
 pyroscope.scrape "mythical" {
     // Denotes the targets to be scraped, in this case the mythical server, requester and recorder.
     targets = [
@@ -477,6 +574,7 @@ pyroscope.scrape "mythical" {
     // Forward all scraped data to the Pyroscope exporter.
     forward_to = [pyroscope.write.mythical.receiver]
 }
+*/
 
 // The Pyroscope exporter writes data with any additional information to the local Pyroscope instance.
 pyroscope.write "mythical" {
diff --git a/alloy/endpoints-cloud.json b/alloy/endpoints-cloud.json
index ee098eb..d4563c1 100644
--- a/alloy/endpoints-cloud.json
+++ b/alloy/endpoints-cloud.json
@@ -27,5 +27,7 @@
             "username": "",
             "password": ""
         }
-    }
+    },
+    "add_spanmetric_suffixes": true,
+    "spanmetrics_namespace": "traces.span.metrics"
 }
diff --git a/alloy/endpoints.json b/alloy/endpoints.json
index 9bc9f1a..e41aa9f 100644
--- a/alloy/endpoints.json
+++ b/alloy/endpoints.json
@@ -27,5 +27,7 @@
             "username": "",
             "password": ""
         }
-    }
+    },
+    "add_spanmetric_suffixes": false,
+    "spanmetrics_namespace": "traces.spanmetrics"
 }
diff --git a/docker-compose-cloud.yml b/docker-compose-cloud.yml
index 8229656..8d322e5 100644
--- a/docker-compose-cloud.yml
+++ b/docker-compose-cloud.yml
@@ -7,7 +7,7 @@ services:
   # auto-logs from those traces.
   # Includes Metrics, Logs, Traces and Profiles.
   alloy:
-    image: grafana/alloy:v1.6.1
+    image: grafana/alloy:v1.9.1
     ports:
       - "12347:12345"
       - "12348:12348"
@@ -70,6 +70,8 @@ services:
       - LOGS_TARGET=http://alloy:3100/loki/api/v1/push
       - TRACING_COLLECTOR_HOST=alloy
       - TRACING_COLLECTOR_PORT=4317
+      - PROFILE_COLLECTOR_HOST=alloy
+      - PROFILE_COLLECTOR_PORT=4040
       - OTEL_EXPORTER_OTLP_TRACES_INSECURE=true
       - OTEL_RESOURCE_ATTRIBUTES=ip=1.2.3.4
@@ -94,6 +96,8 @@ services:
       - LOGS_TARGET=http://alloy:3100/loki/api/v1/push
       - TRACING_COLLECTOR_HOST=alloy
      - TRACING_COLLECTOR_PORT=4317
+      - PROFILE_COLLECTOR_HOST=alloy
+      - PROFILE_COLLECTOR_PORT=4040
       - OTEL_EXPORTER_OTLP_TRACES_INSECURE=true
       - OTEL_RESOURCE_ATTRIBUTES=ip=1.2.3.5
@@ -116,5 +120,7 @@ services:
       - LOGS_TARGET=http://alloy:3100/loki/api/v1/push
       - TRACING_COLLECTOR_HOST=alloy
       - TRACING_COLLECTOR_PORT=4317
+      - PROFILE_COLLECTOR_HOST=alloy
+      - PROFILE_COLLECTOR_PORT=4040
       - OTEL_EXPORTER_OTLP_TRACES_INSECURE=true
       - OTEL_RESOURCE_ATTRIBUTES=ip=1.2.3.5
diff --git a/docker-compose-otel.yml b/docker-compose-otel.yml
index a451862..bf07d7d 100644
--- a/docker-compose-otel.yml
+++ b/docker-compose-otel.yml
@@ -19,7 +19,7 @@ services:
   # The Grafana dashboarding server.
   grafana:
-    image: grafana/grafana:11.6.0
+    image: grafana/grafana:12.0.2
     volumes:
       - "./grafana/definitions:/var/lib/grafana/dashboards"
       - "./grafana/provisioning:/etc/grafana/provisioning"
@@ -135,7 +135,7 @@ services:
   # The Tempo service stores traces send to it by Grafana opentelemetry-collector, and takes
   # queries from Grafana to visualise those traces.
   tempo:
-    image: grafana/tempo:2.7.2
+    image: grafana/tempo:2.8.1
     ports:
       - "3200:3200"
       - "55680:55680"
@@ -148,7 +148,7 @@ services:
   # The Loki service stores logs sent to it, and takes queries from Grafana
   # to visualise those logs.
   loki:
-    image: grafana/loki:3.4.3
+    image: grafana/loki:3.5.1
     command: ["--pattern-ingester.enabled=true", "-config.file=/etc/loki/loki.yaml"]
     ports:
       - "3100:3100"
@@ -164,7 +164,7 @@ services:
       - "./mimir/mimir.yaml:/etc/mimir.yaml"
 
   pyroscope:
-    image: grafana/pyroscope:1.13.1
+    image: grafana/pyroscope:1.13.5
     ports:
       - "4040:4040"
     command: ["server"]
diff --git a/docker-compose.yml b/docker-compose.yml
index 8af8b23..f97a812 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -7,7 +7,7 @@ services:
   # auto-logs from those traces.
   # Includes Metrics, Logs, Traces and Profiles.
   alloy:
-    image: grafana/alloy:v1.8.1
+    image: grafana/alloy:v1.9.1
     ports:
       - "12347:12345"
       - "12348:12348"
@@ -27,7 +27,7 @@ services:
   # The Grafana dashboarding server.
   grafana:
-    image: grafana/grafana:11.6.0
+    image: grafana/grafana:12.0.2
     volumes:
       - "./grafana/definitions:/var/lib/grafana/dashboards"
       - "./grafana/provisioning:/etc/grafana/provisioning"
@@ -146,7 +146,7 @@ services:
   # The Tempo service stores traces send to it by Grafana Alloy, and takes
   # queries from Grafana to visualise those traces.
   tempo:
-    image: grafana/tempo:2.7.2
+    image: grafana/tempo:2.8.1
     ports:
       - "3200:3200"
       - "9411:9411"
@@ -160,7 +160,7 @@ services:
   # The Loki service stores logs sent to it, and takes queries from Grafana
   # to visualise those logs.
   loki:
-    image: grafana/loki:3.4.3
+    image: grafana/loki:3.5.1
     command: ["--pattern-ingester.enabled=true", "-config.file=/etc/loki/loki.yaml"]
     ports:
       - "3100:3100"
@@ -188,7 +188,7 @@ services:
     command: ["run", "-o", "experimental-prometheus-rw", "/scripts/mythical-loadtest.js"]
 
   pyroscope:
-    image: grafana/pyroscope:1.13.1
+    image: grafana/pyroscope:1.13.5
     ports:
       - "4040:4040"
     command: ["server"]
diff --git a/grafana/definitions/traces-in-dashboards.json b/grafana/definitions/traces-in-dashboards.json
index 1fe5a67..a9174ab 100644
--- a/grafana/definitions/traces-in-dashboards.json
+++ b/grafana/definitions/traces-in-dashboards.json
@@ -108,7 +108,7 @@
             "uid": "mimir"
           },
           "editorMode": "code",
-          "expr": "sum by (http_method)(rate(traces_spanmetrics_calls_total{service=\"mythical-server\",http_method=~\"${httpMethod}\"}[1m]))",
+          "expr": "sum by (http_method)(rate(traces_spanmetrics_calls_total{service_name=\"mythical-server\",http_method=~\"${httpMethod}\"}[1m]))",
           "legendFormat": "",
           "range": true,
           "refId": "A"
diff --git a/otel/otel.yml b/otel/otel.yml
index 3da8c6c..868433c 100644
--- a/otel/otel.yml
+++ b/otel/otel.yml
@@ -87,7 +87,6 @@ processors:
   # Use the default values for it.
   batch:
-
   # The tail sampler processor will only keep traces where spans match the defined policies.
   tail_sampling:
     decision_wait: 30s # The time to wait for a decision to be made.
@@ -109,12 +108,29 @@ processors:
        },
      ]
 
+  # The transform processor is used to rename the span metrics to match the Tempo naming convention.
+  transform:
+    # Only operate on metric statements.
+    metric_statements:
+      # Operate on the metric data.
+      - context: metric
+        statements:
+          # Rename the `traces.spanmetrics.duration` metric to `traces.spanmetrics.latency`.
+          - set(metric.name, "traces.spanmetrics.latency") where metric.name == "traces.spanmetrics.duration"
+          # Rename the `traces.spanmetrics.calls` metric to `traces.spanmetrics.calls.total` so the `_total` suffix is already part of the name.
+          - set(metric.name, "traces.spanmetrics.calls.total") where metric.name == "traces.spanmetrics.calls"
+
 # Define processors to process received data.
 # See https://opentelemetry.io/docs/collector/configuration/#connectors
 connectors:
   # The spanmetrics connector is used to output span metrics based on received trace spans.
   spanmetrics:
     namespace: traces.spanmetrics # Prefix all metrics with `traces.spanmetrics` (this becomes `traces_spanmetrics`).
+    # Explicitly flush metrics every 30 seconds. Note that this will double the active series count for the
+    # `traces.spanmetrics` metric namespace.
+    metrics_flush_interval: 30s
+
     # Determine the type of histogram to use for span metrics.
     histogram:
       explicit: # Explicit histograms have pre-defined bucket sizes (use default here).
@@ -130,6 +146,9 @@ connectors:
   # The servicegraph connector is used to output service node metrics based on received trace spans.
   servicegraph:
+    # Explicitly flush metrics every 60 seconds. Note that this will double the active series count for the
+    # `traces.servicegraph` metric namespace.
+    metrics_flush_interval: 60s
     # Defines which exporter the processor will write metrics to.
     metrics_exporter: prometheusremotewrite
     # Defines additional label dimensions of the metrics from trace span attributes present.
@@ -159,6 +178,10 @@ exporters:
   # Exporter for sending Prometheus data to Mimir.
   prometheusremotewrite:
+    # Don't add suffixes to the metrics. We've already renamed the `traces.spanmetrics.calls` metric to
+    # `traces.spanmetrics.calls.total`, and we don't want to add the `_milliseconds` suffix to the
+    # `traces.spanmetrics.latency` metric.
+    add_metric_suffixes: false
     # Send to the locally running Mimir service.
     endpoint: http://mimir:9009/api/v1/push
     # TLS is not enabled for the instance.
@@ -177,14 +200,11 @@ service:
       processors: [batch]
       # Comment out other `processor` definitions and uncomment the line below to use tail sampling.
       #processors: [tail_sampling, batch]
-      # Comment out other `processor` definitions and uncomment the line below to generate service graph metrics
-      # from within the OpenTelemetry Collector.
-      #processors: [servicegraph, batch]
       # Export to the `otlp/grafana` exporter.
       exporters: [otlp/grafana]
       # Comment out other `exporters` definitions and uncomment the line below to generate span metrics
       # from within the OpenTelemetry Collector as well as exporting traces to Tempo.
-      #exporters: [otlp/grafana, spanmetrics]
+      #exporters: [otlp/grafana, spanmetrics, servicegraph]
 
     # Define the metrics pipeline.
     metrics:
@@ -192,8 +212,9 @@ service:
       receivers: [otlp, prometheus]
       # Comment out other `receivers` definitions and uncomment the line below to import spanmetrics as well
       # as prometheus metrics.
-      #receivers: [otlp, prometheus, spanmetrics]
-      # Use the `batch` processor to process received metrics.
-      processors: [batch]
+      #receivers: [otlp, prometheus, spanmetrics, servicegraph]
+      # Use the `batch` processor to process received metrics and the `transform` processor to ensure that the span
+      # metrics are in the correct format for Grafana Cloud (this doesn't take effect unless the receivers above are used).
+      processors: [transform, batch]
       # Export to the `prometheusremtotewrite` exporter.
       exporters: [prometheusremotewrite]
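
A quick way to sanity-check the renaming pipeline above once the local stack is running (a suggested verification step, not part of the change itself) is to query Mimir for the Tempo-style metric names that the dashboard panel above relies on. For example, in PromQL:

# Request rate per HTTP method for the mythical-server service (the same metric and labels used by the dashboard panel above).
sum by (http_method) (rate(traces_spanmetrics_calls_total{service_name="mythical-server"}[1m]))

# Approximate 95th percentile span latency, assuming the renamed histogram is exposed locally as
# traces_spanmetrics_latency_bucket with the same service_name label.
histogram_quantile(0.95, sum by (le) (rate(traces_spanmetrics_latency_bucket{service_name="mythical-server"}[5m])))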