fix(trace): Handle too many errors (#95004)

wmak · web-flow · commit 07f27ea3c0c4 · 2025-07-11T14:54:37.000-04:00
- When there's over 10k errors and the id we want isn't in that 10k we
aren't rendering it. This biasess the error query so that we'll return
the requested eventId when requested
diff --git a/src/sentry/api/endpoints/organization_trace.py b/src/sentry/api/endpoints/organization_trace.py
@@ -5,6 +5,7 @@
 
 import sentry_sdk
 from django.http import HttpRequest, HttpResponse
+from rest_framework.exceptions import ParseError
 from rest_framework.request import Request
 from rest_framework.response import Response
 from snuba_sdk import Column, Function
@@ -27,9 +28,12 @@
 from sentry.snuba.referrer import Referrer
 from sentry.snuba.spans_rpc import run_trace_query
 from sentry.utils.numbers import base32_encode
+from sentry.utils.validators import is_event_id
 
 # 1 worker each for spans, errors, performance issues
 _query_thread_pool = ThreadPoolExecutor(max_workers=3)
+# Mostly here for testing
+ERROR_LIMIT = 10_000
 
 
 class SerializedEvent(TypedDict):
@@ -204,30 +208,39 @@ def serialize_rpc_event(
         else:
             return self.serialize_rpc_issue(event, group_cache)
 
-    def errors_query(self, snuba_params: SnubaParams, trace_id: str) -> DiscoverQueryBuilder:
+    def errors_query(
+        self, snuba_params: SnubaParams, trace_id: str, error_id: str | None
+    ) -> DiscoverQueryBuilder:
         """Run an error query, getting all the errors for a given trace id"""
         # TODO: replace this with EAP calls, this query is copied from the old trace view
+        columns = [
+            "id",
+            "project.name",
+            "project.id",
+            "timestamp",
+            "timestamp_ms",
+            "trace.span",
+            "transaction",
+            "issue",
+            "title",
+            "message",
+            "tags[level]",
+        ]
+        orderby = ["id"]
+        # If there's an error_id included in the request, bias the orderby to try to return that error_id over others so
+        # that we can render it in the trace view, even if we hit the error_limit
+        if error_id is not None:
+            columns.append(f'to_other(id, "{error_id}", 0, 1) AS target')
+            orderby.insert(0, "-target")
         return DiscoverQueryBuilder(
             Dataset.Events,
             params={},
             snuba_params=snuba_params,
             query=f"trace:{trace_id}",
-            selected_columns=[
-                "id",
-                "project.name",
-                "project.id",
-                "timestamp",
-                "timestamp_ms",
-                "trace.span",
-                "transaction",
-                "issue",
-                "title",
-                "message",
-                "tags[level]",
-            ],
+            selected_columns=columns,
             # Don't add timestamp to this orderby as snuba will have to split the time range up and make multiple queries
-            orderby=["id"],
-            limit=10_000,
+            orderby=orderby,
+            limit=ERROR_LIMIT,
             config=QueryBuilderConfig(
                 auto_fields=True,
             ),
@@ -292,7 +305,9 @@ def run_perf_issues_query(self, occurrence_query: DiscoverQueryBuilder):
         return result
 
     @sentry_sdk.tracing.trace
-    def query_trace_data(self, snuba_params: SnubaParams, trace_id: str) -> list[SerializedEvent]:
+    def query_trace_data(
+        self, snuba_params: SnubaParams, trace_id: str, error_id: str | None = None
+    ) -> list[SerializedEvent]:
         """Queries span/error data for a given trace"""
         # This is a hack, long term EAP will store both errors and performance_issues eventually but is not ready
         # currently. But we want to move performance data off the old tables immediately. To keep the code simpler I'm
@@ -302,7 +317,7 @@ def query_trace_data(self, snuba_params: SnubaParams, trace_id: str) -> list[Ser
         # the thread pool, database connections can hang around as the threads are not cleaned
         # up. Because of that, tests can fail during tear down as there are active connections
         # to the database preventing a DROP.
-        errors_query = self.errors_query(snuba_params, trace_id)
+        errors_query = self.errors_query(snuba_params, trace_id, error_id)
         occurrence_query = self.perf_issues_query(snuba_params, trace_id)
 
         spans_future = _query_thread_pool.submit(
@@ -380,10 +395,14 @@ def get(self, request: Request, organization: Organization, trace_id: str) -> Ht
 
         update_snuba_params_with_timestamp(request, snuba_params)
 
+        error_id = request.GET.get("errorId")
+        if error_id is not None and not is_event_id(error_id):
+            raise ParseError(f"eventId: {error_id} needs to be a valid uuid")
+
         def data_fn(offset: int, limit: int) -> list[SerializedEvent]:
             """offset and limit don't mean anything on this endpoint currently"""
             with handle_query_errors():
-                spans = self.query_trace_data(snuba_params, trace_id)
+                spans = self.query_trace_data(snuba_params, trace_id, error_id)
             return spans
 
         return self.paginate(
diff --git a/tests/snuba/api/endpoints/test_organization_trace.py b/tests/snuba/api/endpoints/test_organization_trace.py
@@ -1,4 +1,5 @@
 import logging
+from unittest import mock
 from uuid import uuid4
 
 from django.urls import reverse
@@ -232,3 +233,37 @@ def test_with_only_errors(self):
         data = response.data
         assert len(data) == 1
         assert data[0]["event_id"] == error.event_id
+
+    def test_with_target_error(self):
+        start, _ = self.get_start_end_from_day_ago(1000)
+        error_data = load_data(
+            "javascript",
+            timestamp=start,
+        )
+        error_data["contexts"]["trace"] = {
+            "type": "trace",
+            "trace_id": self.trace_id,
+            "span_id": "a" * 16,
+        }
+        error_data["tags"] = [["transaction", "/transaction/gen1-0"]]
+        error = self.store_event(error_data, project_id=self.project.id)
+        for _ in range(5):
+            self.store_event(error_data, project_id=self.project.id)
+
+        with mock.patch("sentry.api.endpoints.organization_trace.ERROR_LIMIT", 1):
+            with self.feature(self.FEATURES):
+                response = self.client_get(
+                    data={"timestamp": self.day_ago, "errorId": error.event_id},
+                )
+        assert response.status_code == 200, response.content
+        data = response.data
+        assert len(data) == 1
+        assert data[0]["event_id"] == error.event_id
+
+    def test_with_invalid_error_id(self):
+        with self.feature(self.FEATURES):
+            response = self.client_get(
+                data={"timestamp": self.day_ago, "errorId": ",blah blah,"},
+            )
+
+        assert response.status_code == 400, response.content