Update metrics generation configuration. #311
Merged
merged 2 commits into from
Jun 23, 2025
120 changes: 109 additions & 11 deletions alloy/config.alloy
@@ -6,6 +6,11 @@ local.file "endpoints" {
filename = "/etc/alloy/endpoints.json"
}

// Set `enabled` to `true` in this block to carry out live debugging when configuring Alloy.
livedebugging {
enabled = false
}
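// (A note, assuming the default Alloy UI setup: when enabled, live debugging output is available from the
// component detail pages of the Alloy UI, which the Docker Compose files in this stack map to a host port.)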

///////////////////////////////////////////////////////////////////////////////
// Metrics scraping

@@ -179,15 +184,17 @@ otelcol.receiver.otlp "otlp_receiver" {
// named 'default'.
output {
traces = [
// Uncomment the next line to generate service graph metrics from Alloy. By default these are generated
// by the Tempo component, so be sure to remove the relevant configuration in the `tempo/tempo.yaml` file.
//otelcol.connector.servicegraph.tracemetrics.input,
// Uncomment the next line to generate span metrics from Alloy. By default these are generated
// by the Tempo component, so be sure to remove the relevant configuration in the `tempo/tempo.yaml` file.
//otelcol.connector.spanmetrics.tracemetrics.input,
// Uncomment the next line to generate service graph and span metrics from Alloy. By default these are
// generated by the Tempo component, so be sure to remove the relevant configuration in the
// `tempo/tempo.yaml` file.
//otelcol.processor.transform.spanmetrics.input,
// The following would be used to tail sample only those traces containing errors.
// Uncomment the following line, then comment out the line below it (the batch processor) to use
// tail sampling.
// NOTE: In this configuration, if metrics are also being generated (see above), then the order in which
// these components are defined is not important. However, in a situation where a serial pipeline is
// defined, metrics generation *must* occur before tail sampling to ensure a true view of all trace
// requests is captured before traces are dropped by the sampler.
//otelcol.processor.tail_sampling.errors.input,
otelcol.processor.batch.default.input,
otelcol.connector.spanlogs.autologging.input,
@@ -357,11 +364,51 @@ otelcol.processor.tail_sampling "errors" {
}
}

// This processor strips out all resource attributes except for the service.name attribute. This is required because
// otherwise the span metrics connector will attempt to generate separate metrics on a per-resource basis, rather than
// a per-span-attribute basis (i.e. multiple metrics for the same service). This would lead to clashes where
// Mimir cannot correctly distinguish between the metrics and therefore drops them as duplicates.
otelcol.processor.transform "spanmetrics" {
// Ignore any errors that occur when transforming the trace data.
error_mode = "ignore"

// Operate only on trace data.
trace_statements {
// Only operate on resource attributes.
context = "resource"

// Only the service.name and ip resource attributes are required by the span metrics connector for metrics
// generation per-service, so we strip all other resource attributes.
statements = [
`keep_keys(attributes, ["service.name", "ip"])`,
]
}

// Output to the span metrics and service graph connectors.
output {
traces = [
otelcol.connector.spanmetrics.tracemetrics.input,
otelcol.connector.servicegraph.tracemetrics.input,
]
}
}
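
// For illustration only (hypothetical attribute set, not part of this config): a span resource carrying
// { service.name="mythical-server", ip="1.2.3.4", host.name="myhost", process.pid="42" } leaves this
// processor with just { service.name="mythical-server", ip="1.2.3.4" }, since keep_keys drops every key
// not named in its list.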

// The Spanmetrics Connector will generate RED metrics based on the incoming trace span data.
otelcol.connector.spanmetrics "tracemetrics" {
// The namespace explicitly adds a prefix to all the generated span metric names.
// In this case, we'll ensure they match as closely as possible those generated by Tempo.
namespace = "traces.spanmetrics"
// The namespace to prefix span metrics with.
// This changes depending on whether we're sending data to Grafana Cloud or not. For local usage, we want to mimic
// the naming convention of the span metrics generator in Tempo, which looks like:
// traces_spanmetrics_<x>
// However, for Grafana Cloud, there are certain products that expect metrics generated by Alloy to be of the form:
// traces_span_metrics_<units>_<x>
// where <units> is in seconds (not the default of milliseconds).
// Because of this, we use the local.file endpoints to add this information depending on target.
namespace = json_path(local.file.endpoints.content, ".spanmetrics_namespace")[0]
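// As a sketch of how this resolves (values taken from the endpoints files in this change):
// endpoints.json supplies "spanmetrics_namespace": "traces.spanmetrics" for local use, while
// endpoints-cloud.json supplies "traces.span.metrics" for Grafana Cloud, so the json_path expression
// above evaluates to one of those two strings depending on which file is mounted.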

// Two DPM (a 30-second flush interval yields two data points per minute per series). Prior versions of the
// OTel connector would flush whenever receiving data, which could cause extremely large DPM values. Be aware
// that when used with Grafana Cloud, this will essentially *double* the active series count for the
// `traces.spanmetrics` metric namespace.
metrics_flush_interval = "30s"

// Each extra dimension (metrics label) to be added to the generated metrics from matching span attributes. These
// need to be defined with a name and optionally a default value (in the following cases, we do not want a default
@@ -382,6 +429,7 @@ otelcol.connector.spanmetrics "tracemetrics" {
// A histogram block must be present, either explicitly defining bucket values or via an exponential block.
// We do the former here; an empty explicit block uses the default buckets.
histogram {
unit = "s"
explicit {
}
}
@@ -391,15 +439,23 @@ otelcol.connector.spanmetrics "tracemetrics" {
enabled = true
}

// Whilst this connector does correctly generate RED metrics, the naming convention is subtly different
// from Tempo's, even with the namespace prefix.

// Generated metrics data is in OTLP format. We send this data (via the metric renaming processor below) to the
// OpenTelemetry Prometheus exporter to ensure it gets transformed into Prometheus format data.
output {
metrics = [otelcol.exporter.prometheus.tracemetrics.input]
//metrics = [otelcol.exporter.prometheus.tracemetrics.input]
metrics = [otelcol.processor.transform.metric_rename.input]
}
}

// The Servicegraph Connector will generate service graph metrics (edges and nodes) based on incoming trace spans.
otelcol.connector.servicegraph "tracemetrics" {
// One DPM (a 60-second flush interval yields one data point per minute per series). Prior versions of the
// OTel connector would flush whenever receiving data, which could cause extremely large DPM values.
metrics_flush_interval = "60s"

// Extra dimensions (metrics labels) to be added to the generated metrics from matching span attributes.
// For this component, this is defined as an array. There are no default values and the labels will not be generated
// for missing span attributes.
@@ -417,8 +473,49 @@ otelcol.connector.servicegraph "tracemetrics" {
}
}

// This processor renames the span metrics names to match those that are generated by Tempo.
otelcol.processor.transform "metric_rename" {
// Ignore any errors that occur when transforming the metric data.
error_mode = "ignore"

// Operate only on metric statements.
metric_statements {
// Use the metric context to operate on the metric data.
context = "metric"
// Tempo generates the `traces_spanmetrics_latency_[bucket/sum/count]` metrics, but the OTel connector
// generates the `traces.spanmetrics.duration.[bucket/sum/count]` metrics. We need to rename the metrics
// to match the Tempo naming convention.
// We also need to rename the `traces.spanmetrics.calls` metric to `traces.spanmetrics.calls.total`, because
// the counter metric requires a `total` suffix, but we need to strip the unit suffix from the
// traces_spanmetrics_latency histogram so that our dashboards/Grafana Cloud will still operate correctly.
statements = [
`set(metric.name, "traces.spanmetrics.latency") where metric.name == "traces.spanmetrics.duration"`,
`set(metric.name, "traces.spanmetrics.calls.total") where metric.name == "traces.spanmetrics.calls"`,
]
}


// Forward to the Prometheus exporter
output {
metrics = [otelcol.exporter.prometheus.tracemetrics.input]
}
}
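
// For illustration (assumed sample names, not emitted verbatim by this config): once the Prometheus
// exporter below converts dots to underscores, the renamed histogram arrives in Mimir as
// `traces_spanmetrics_latency_bucket`, `traces_spanmetrics_latency_sum` and
// `traces_spanmetrics_latency_count`, and the renamed counter as `traces_spanmetrics_calls_total`,
// which is the name the dashboard query in `grafana/definitions/traces-in-dashboards.json` expects.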

// The OpenTelemetry Prometheus exporter will transform incoming OTLP metrics data into Prometheus format data.
// Technically, we could send this straight to the Prometheus remote writer, but this shows an example of an
// intermediary step to transform the metrics (here, controlling whether suffixes are applied), although the
// remote writer will also do this if configured to do so.
otelcol.exporter.prometheus "tracemetrics" {
// Whether or not to add unit suffixes to the metrics.
// This changes depending on whether we're sending data to Grafana Cloud or not. For local usage, we want to mimic
// the naming convention of the span metrics generator in Tempo, which looks like:
// traces_spanmetrics_<x>
// However, for Grafana Cloud, there are certain products that expect metrics generated by Alloy to be of the form:
// traces_span_metrics_<units>_<x>
// where <units> is in seconds (not the default of milliseconds).
// Because of this, we use the local.file endpoints to add this information depending on target.
add_metric_suffixes = json_path(local.file.endpoints.content, ".add_spanmetric_suffixes")[0]

// Forward to our local Prometheus remote writer which will send the metrics to Mimir.
forward_to = [prometheus.remote_write.mimir.receiver]
}
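
// A sketch of the effect (assumed example names): locally, with suffixes disabled, the renamed histogram
// is written as `traces_spanmetrics_latency_bucket`; for Grafana Cloud, with suffixes enabled and the
// histogram unit set to seconds above, the exporter appends unit suffixes, producing the
// `traces_span_metrics_..._seconds_...` form described in the comment above.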
@@ -438,9 +535,9 @@ pyroscope.receive_http "mythical" {
forward_to = [pyroscope.write.mythical.receiver]
}


// Scrape the Mythical application services for profiling data.
// This is deprecated in later versions of Grafana Pyroscope, but kept here for reference.
/*
pyroscope.scrape "mythical" {
// Denotes the targets to be scraped, in this case the mythical server, requester and recorder.
targets = [
@@ -477,6 +574,7 @@ pyroscope.scrape "mythical" {
// Forward all scraped data to the Pyroscope exporter.
forward_to = [pyroscope.write.mythical.receiver]
}
*/

// The Pyroscope exporter writes data with any additional information to the local Pyroscope instance.
pyroscope.write "mythical" {
4 changes: 3 additions & 1 deletion alloy/endpoints-cloud.json
@@ -27,5 +27,7 @@
"username": "<profilesUsername>",
"password": "<profilesPassword>"
}
}
},
"add_spanmetric_suffixes" : true,
"spanmetrics_namespace": "traces.span.metrics"
}
4 changes: 3 additions & 1 deletion alloy/endpoints.json
@@ -27,5 +27,7 @@
"username": "",
"password": ""
}
}
},
"add_spanmetric_suffixes" : false,
"spanmetrics_namespace": "traces.spanmetrics"
}
8 changes: 7 additions & 1 deletion docker-compose-cloud.yml
@@ -7,7 +7,7 @@ services:
# auto-logs from those traces.
# Includes Metrics, Logs, Traces and Profiles.
alloy:
image: grafana/alloy:v1.6.1
image: grafana/alloy:v1.8.1
ports:
- "12347:12345"
- "12348:12348"
@@ -70,6 +70,8 @@ services:
- LOGS_TARGET=http://alloy:3100/loki/api/v1/push
- TRACING_COLLECTOR_HOST=alloy
- TRACING_COLLECTOR_PORT=4317
- PROFILE_COLLECTOR_HOST=alloy
- PROFILE_COLLECTOR_PORT=4040
- OTEL_EXPORTER_OTLP_TRACES_INSECURE=true
- OTEL_RESOURCE_ATTRIBUTES=ip=1.2.3.4

@@ -94,6 +96,8 @@ services:
- LOGS_TARGET=http://alloy:3100/loki/api/v1/push
- TRACING_COLLECTOR_HOST=alloy
- TRACING_COLLECTOR_PORT=4317
- PROFILE_COLLECTOR_HOST=alloy
- PROFILE_COLLECTOR_PORT=4040
- OTEL_EXPORTER_OTLP_TRACES_INSECURE=true
- OTEL_RESOURCE_ATTRIBUTES=ip=1.2.3.5

@@ -116,5 +120,7 @@ services:
- LOGS_TARGET=http://alloy:3100/loki/api/v1/push
- TRACING_COLLECTOR_HOST=alloy
- TRACING_COLLECTOR_PORT=4317
- PROFILE_COLLECTOR_HOST=alloy
- PROFILE_COLLECTOR_PORT=4040
- OTEL_EXPORTER_OTLP_TRACES_INSECURE=true
- OTEL_RESOURCE_ATTRIBUTES=ip=1.2.3.5
2 changes: 1 addition & 1 deletion grafana/definitions/traces-in-dashboards.json
@@ -108,7 +108,7 @@
"uid": "mimir"
},
"editorMode": "code",
"expr": "sum by (http_method)(rate(traces_spanmetrics_calls_total{service=\"mythical-server\",http_method=~\"${httpMethod}\"}[1m]))",
"expr": "sum by (http_method)(rate(traces_spanmetrics_calls_total{service_name=\"mythical-server\",http_method=~\"${httpMethod}\"}[1m]))",
"legendFormat": "",
"range": true,
"refId": "A"
38 changes: 29 additions & 9 deletions otel/otel.yml
@@ -87,7 +87,6 @@ processors:
# Use the default values for it.
batch:


# The tail sampler processor will only keep traces where spans match the defined policies.
tail_sampling:
decision_wait: 30s # The time to wait for a decision to be made.
@@ -109,12 +108,29 @@ processors:
},
]

# The transform processor is used to rename the span metrics to match the Tempo naming convention.
transform:
# Only operate on metric statements.
metric_statements:
# Operate on the metric data.
- context: metric
statements:
# Rename the `traces.spanmetrics.duration` metric to `traces.spanmetrics.latency`.
- set(metric.name, "traces.spanmetrics.latency") where metric.name == "traces.spanmetrics.duration"
# Rename the `traces.spanmetrics.calls` metric to `traces.spanmetrics.calls.total`, so the name already carries the `total` suffix.
- set(metric.name, "traces.spanmetrics.calls.total") where metric.name == "traces.spanmetrics.calls"


# Define connectors, which join pipelines by consuming data in one pipeline and emitting it into another.
# See https://opentelemetry.io/docs/collector/configuration/#connectors
connectors:
# The spanmetrics connector is used to output span metrics based on received trace spans.
spanmetrics:
namespace: traces.spanmetrics # Prefix all metrics with `traces.spanmetrics` (this becomes `traces_spanmetrics`).
# Explicitly flush metrics every 30 seconds. Note, this will double the active series count for the
# `traces.spanmetrics` metric namespace.
metrics_flush_interval: 30s

# Determine the type of histogram to use for span metrics.
histogram:
explicit: # Explicit histograms have pre-defined bucket sizes (use default here).
@@ -130,6 +146,9 @@ connectors:

# The servicegraph connector is used to output service node metrics based on received trace spans.
servicegraph:
# Explicitly flush metrics every 60 seconds. Note, this will double the active series count for the
# `traces.servicegraph` metric namespace.
metrics_flush_interval: 60s
# Defines which exporter the processor will write metrics to.
metrics_exporter: prometheusremotewrite
# Defines additional label dimensions of the metrics from trace span attributes present.
@@ -159,6 +178,10 @@ exporters:

# Exporter for sending Prometheus data to Mimir.
prometheusremotewrite:
# Don't add suffixes to the metrics. We've already renamed the `traces.spanmetrics.calls` metric to
# `traces.spanmetrics.calls.total`, and we don't want to add the `_milliseconds` suffix to the
# `traces.spanmetrics.latency` metric.
add_metric_suffixes: false
# Send to the locally running Mimir service.
endpoint: http://mimir:9009/api/v1/push
# TLS is not enabled for the instance.
@@ -177,23 +200,20 @@ service:
processors: [batch]
# Comment out other `processor` definitions and uncomment the line below to use tail sampling.
#processors: [tail_sampling, batch]
# Comment out other `processor` definitions and uncomment the line below to generate service graph metrics
# from within the OpenTelemetry Collector.
#processors: [servicegraph, batch]
# Export to the `otlp/grafana` exporter.
exporters: [otlp/grafana]
#exporters: [otlp/grafana]
# Comment out other `exporters` definitions and uncomment the line below to generate span metrics
# from within the OpenTelemetry Collector as well as exporting traces to Tempo.
#exporters: [otlp/grafana, spanmetrics]
exporters: [otlp/grafana, spanmetrics, servicegraph]

# Define the metrics pipeline.
metrics:
# Receive metrics from the `prometheus` receiver.
receivers: [otlp, prometheus]
#receivers: [otlp, prometheus]
# Comment out other `receivers` definitions and uncomment the line below to import spanmetrics as well
# as prometheus metrics.
#receivers: [otlp, prometheus, spanmetrics]
receivers: [otlp, prometheus, spanmetrics, servicegraph]
# Use the `batch` processor to process received metrics.
processors: [batch]
processors: [transform, batch]
# Export to the `prometheusremotewrite` exporter.
exporters: [prometheusremotewrite]