[prometheusremotewritereceiver] concurrency bug #43383

Changes from all commits: 0d28a7c, f610e7a, a3c5cb4, ea22049, 72d3386
New changelog entry (`@@ -0,0 +1,27 @@`):

```yaml
# Use this changelog template to create an entry for release notes.

# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
change_type: bug_fix

# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver)
component: receiver/prometheusremotewrite

# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
note: Fixed a concurrency bug in the Prometheus remote write receiver where concurrent requests with identical job/instance labels would return empty responses after the first successful request.

# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists.
issues: [42159]

# (Optional) One or more lines of additional information to render under the primary note.
# These lines will be padded with 2 spaces and then inserted directly into the document.
# Use pipe (|) for multiline entries.
subtext:

# If your change doesn't affect end users or the exported elements of any package,
# you should instead start your pull request title with [chore] or use the "Skip Changelog" label.
# Optional: The change log or logs in which this entry should be included.
# e.g. '[user]' or '[user, api]'
# Include 'user' if the change is relevant to end users.
# Include 'api' if there is a change to a library API.
# Default: '[user]'
change_logs: [user]
```
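The failure mode the note describes is a classic check-then-act race: an existence check on a shared cache and the subsequent insert happen in separate critical sections, so one request's entry (and its data points) can be silently clobbered by another. The sketch below is a minimal standalone illustration of that shape and the straightforward fix, not the receiver's actual code; `resourceCache`, `resource`, and the method names are hypothetical, though the real receiver does key an LRU cache of resource metrics by the job/instance pair.

```go
package main

import (
	"fmt"
	"sync"
)

// resourceCache is a hypothetical stand-in for the receiver's LRU of
// resource metrics, keyed by the job/instance pair.
type resourceCache struct {
	mu    sync.Mutex
	items map[string]*resource
}

type resource struct{ attrs map[string]string }

// getOrCreateRacy shows the buggy shape: the existence check and the insert
// are two separate critical sections, so two concurrent requests with the
// same job/instance can both miss, and the later insert silently overwrites
// the earlier request's entry.
func (c *resourceCache) getOrCreateRacy(key string) *resource {
	c.mu.Lock()
	r, ok := c.items[key]
	c.mu.Unlock()
	if ok {
		return r
	}
	r = &resource{attrs: map[string]string{}}
	c.mu.Lock()
	c.items[key] = r // may clobber a concurrently inserted entry
	c.mu.Unlock()
	return r
}

// getOrCreate holds the lock across both the check and the insert, so
// exactly one resource is ever created per key.
func (c *resourceCache) getOrCreate(key string) *resource {
	c.mu.Lock()
	defer c.mu.Unlock()
	if r, ok := c.items[key]; ok {
		return r
	}
	r := &resource{attrs: map[string]string{}}
	c.items[key] = r
	return r
}

func main() {
	c := &resourceCache{items: map[string]*resource{}}
	var wg sync.WaitGroup
	for i := 0; i < 5; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			c.getOrCreate("test_job/test_instance")
		}()
	}
	wg.Wait()
	fmt.Println(len(c.items)) // 1: a single cache entry for identical labels
}
```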
Changes to the receiver's Go test file:

```diff
@@ -11,6 +11,7 @@ import (
 	"math"
 	"net/http"
 	"net/http/httptest"
+	"strconv"
 	"sync"
 	"testing"
 	"time"
@@ -1495,44 +1496,6 @@ func TestTargetInfoWithMultipleRequests(t *testing.T) {
 				},
 			},
 		},
-		{
-			name: "normal metric first, target_info second",
-			requests: []*writev2.Request{
-				{
-					Symbols: []string{
-						"",
-						"job", "production/service_a", // 1, 2
-						"instance", "host1", // 3, 4
-						"__name__", "normal_metric", // 5, 6
-						"foo", "bar", // 7, 8
-					},
-					Timeseries: []writev2.TimeSeries{
-						{
-							Metadata:   writev2.Metadata{Type: writev2.Metadata_METRIC_TYPE_GAUGE},
-							LabelsRefs: []uint32{5, 6, 1, 2, 3, 4, 7, 8},
-							Samples:    []writev2.Sample{{Value: 2, Timestamp: 2}},
-						},
-					},
-				},
-				{
-					Symbols: []string{
-						"",
-						"job", "production/service_a", // 1, 2
-						"instance", "host1", // 3, 4
-						"machine_type", "n1-standard-1", // 5, 6
-						"cloud_provider", "gcp", // 7, 8
-						"region", "us-central1", // 9, 10
-						"__name__", "target_info", // 11, 12
-					},
-					Timeseries: []writev2.TimeSeries{
-						{
-							Metadata:   writev2.Metadata{Type: writev2.Metadata_METRIC_TYPE_GAUGE},
-							LabelsRefs: []uint32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12},
-						},
-					},
-				},
-			},
-		},
 	}

 	// Using the same expected metrics for both tests, because we are just checking if the order of the requests changes the result.
@@ -1592,7 +1555,7 @@ func TestTargetInfoWithMultipleRequests(t *testing.T) {
 			assert.Equal(t, http.StatusNoContent, resp.StatusCode, string(body))
 		}

 		assert.NoError(t, pmetrictest.CompareMetrics(expectedMetrics, mockConsumer.metrics[0]))
```
Review comment on the `mockConsumer.metrics[0]` assertion: Using the above comment as a reference, the current change is now in line with the expected behavior. This is because the expected final metric is the junction of the target_info and the normal_metric, following the name of the test:
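To make that "junction" concrete, here is a minimal, illustrative sketch of the merged resource the test expects, built with the pdata API from the values in the removed test case above. The resource attribute keys follow the Prometheus-to-OTel mapping (job → `service.namespace`/`service.name`, instance → `service.instance.id`); treat the exact keys as an assumption, not the receiver's guaranteed output.

```go
package main

import (
	"fmt"

	"go.opentelemetry.io/collector/pdata/pmetric"
)

// expectedMerged builds the single merged resource: the normal_metric
// sample plus the target_info labels promoted to resource attributes.
func expectedMerged() pmetric.Metrics {
	m := pmetric.NewMetrics()
	rm := m.ResourceMetrics().AppendEmpty()
	attrs := rm.Resource().Attributes()
	// From the job/instance labels shared by both requests.
	attrs.PutStr("service.namespace", "production")
	attrs.PutStr("service.name", "service_a")
	attrs.PutStr("service.instance.id", "host1")
	// Promoted from the target_info series of the second request.
	attrs.PutStr("machine_type", "n1-standard-1")
	attrs.PutStr("cloud_provider", "gcp")
	attrs.PutStr("region", "us-central1")

	sm := rm.ScopeMetrics().AppendEmpty()
	metric := sm.Metrics().AppendEmpty()
	metric.SetName("normal_metric")
	dp := metric.SetEmptyGauge().DataPoints().AppendEmpty()
	dp.SetDoubleValue(2) // the sample carried by the normal_metric request
	return m
}

func main() {
	fmt.Println(expectedMerged().DataPointCount()) // 1
}
```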
The hunk continues, followed by updated assertions in the LRU cache test:

```diff
 		assert.NoError(t, pmetrictest.CompareMetrics(expectedMetrics, mockConsumer.metrics[1]))
 		})
 	}
 }

@@ -1788,7 +1751,7 @@ func TestLRUCacheResourceMetrics(t *testing.T) {
 	// As target_info and metric1 have the same job/instance, they generate the same end metric: mockConsumer.metrics[0].
 	assert.NoError(t, pmetrictest.CompareMetrics(expectedMetrics1, mockConsumer.metrics[0]))
 	assert.NoError(t, pmetrictest.CompareMetrics(expectedMetrics1, mockConsumer.metrics[1]))
 	// As metric2 has a different job/instance, it generates a different end metric: mockConsumer.metrics[2]. At this point the cache is full, so it should evict the target_info entry to store metric2.
 	assert.NoError(t, pmetrictest.CompareMetrics(expectedMetrics2, mockConsumer.metrics[2]))
 	// As the cache has only 1 slot and the entry for metric1 was evicted, this metric1_1 should generate a new resource metric, even though it has the same job/instance as metric1.
```
A new regression test is appended after `buildMetaDataMapByID` (`@@ -1819,3 +1782,104 @@`):

```go
// TestConcurrentRequestsforSameResourceAttributes reproduces the concurrency bug where subsequent
// requests fail to reach the next consumer after the first successful request.
func TestConcurrentRequestsforSameResourceAttributes(t *testing.T) {
	mockConsumer := &mockConsumer{}
	prwReceiver := setupMetricsReceiver(t)
	prwReceiver.nextConsumer = mockConsumer

	ts := httptest.NewServer(http.HandlerFunc(prwReceiver.handlePRW))
	defer ts.Close()

	// Create multiple requests with the same job/instance labels (triggering a cache key collision).
	createRequest := func(metricName string, value float64, timestamp int64) *writev2.Request {
		return &writev2.Request{
			Symbols: []string{
				"", // 0
				"__name__", metricName, // 1, 2
				"job", "test_job", // 3, 4
				"instance", "test_instance", // 5, 6
			},
			Timeseries: []writev2.TimeSeries{
				{
					Metadata:   writev2.Metadata{Type: writev2.Metadata_METRIC_TYPE_GAUGE},
					LabelsRefs: []uint32{1, 2, 3, 4, 5, 6},
					Samples:    []writev2.Sample{{Value: value, Timestamp: timestamp}},
				},
			},
		}
	}

	requests := []*writev2.Request{}
	for i := 0; i < 5; i++ {
		requests = append(requests, createRequest("metric_"+strconv.Itoa(i+1), float64(i+1)*10, int64(i+1)*1000))
	}

	var wg sync.WaitGroup
	var httpResults []int
	var mu sync.Mutex

	for i, req := range requests {
		wg.Add(1)
		go func(_ int, request *writev2.Request) {
			defer wg.Done()

			pBuf := proto.NewBuffer(nil)
			err := pBuf.Marshal(request)
			assert.NoError(t, err)

			resp, err := http.Post(
				ts.URL,
				fmt.Sprintf("application/x-protobuf;proto=%s", promconfig.RemoteWriteProtoMsgV2),
				bytes.NewBuffer(pBuf.Bytes()),
			)
			assert.NoError(t, err)
			defer resp.Body.Close()

			mu.Lock()
			httpResults = append(httpResults, resp.StatusCode)
			mu.Unlock()
		}(i, req)
	}
	wg.Wait()

	// Give some time for async processing.
	time.Sleep(100 * time.Millisecond)

	mockConsumer.mu.Lock()
	totalDataPoints := mockConsumer.dataPoints
	mockConsumer.mu.Unlock()

	// Verify all HTTP requests succeeded.
	for i, status := range httpResults {
		assert.Equal(t, http.StatusNoContent, status, "Request %d should return 204", i+1)
	}

	// The expected behavior is:
	// - All HTTP requests return 204 (success).
	// - All 5 data points are present (no data loss).
	// - All 5 requests map to the same resource attributes, since they share the same job/instance labels.
	// - The cache therefore holds a single resource entry.
	assert.Equal(t, 5, totalDataPoints)
	assert.Equal(t, 1, prwReceiver.rmCache.Len())

	// Verify thread safety: check that metrics are properly consolidated without corruption.
	for i, metrics := range mockConsumer.metrics {
		if metrics.DataPointCount() > 0 {
			resourceMetrics := metrics.ResourceMetrics()
			for j := 0; j < resourceMetrics.Len(); j++ {
				rm := resourceMetrics.At(j)
				scopeMetrics := rm.ScopeMetrics()
				for k := 0; k < scopeMetrics.Len(); k++ {
					scope := scopeMetrics.At(k)
					metricsCount := scope.Metrics().Len()
					if metricsCount != 1 {
						t.Errorf("Batch %d: found %d metrics in scope when it should be 1", i+1, metricsCount)
					}
				}
			}
		}
	}
}
```
Review comment: This test doesn't make sense. Looking back at how we implemented and tested the code, it's clear there's a logical problem here. We are assuming that by sending the metrics in the following order, `mockConsumer.metrics[0]` would represent the combination of these two separate inputs (sent in two separate HTTP requests). This assumption is illogical: how could the zero index contain the aggregation of two separate requests? As recently discussed in this thread (link), each request should only return data related to the information it processed. In other words:
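To make the reviewer's expectation concrete, here is a hedged sketch of what per-request assertions would look like in the context of the tests above; `expectedNormalMetricOnly` and `expectedTargetInfoOnly` are hypothetical fixtures, not part of the PR.

```go
// Each consumed batch should contain only the data carried by its own request:
// batch 0 holds only the normal_metric sample, batch 1 only the target_info
// resource. The two expected fixtures below are hypothetical.
assert.NoError(t, pmetrictest.CompareMetrics(expectedNormalMetricOnly, mockConsumer.metrics[0]))
assert.NoError(t, pmetrictest.CompareMetrics(expectedTargetInfoOnly, mockConsumer.metrics[1]))
```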