chore(issues): Clean up unique id experiment (#94206)

mrduncan · web-flow · commit 377e7d8bd378 · 2025-06-24T12:59:31.000-07:00
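The unique id parameterization experiment was disabled in getsentry/sentry-options-automator#4283; this commit removes the now-unused experiment code and its tests.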
diff --git a/src/sentry/grouping/parameterization.py b/src/sentry/grouping/parameterization.py
@@ -2,17 +2,11 @@
 import re
 from collections import defaultdict
 from collections.abc import Callable, Sequence
-from functools import lru_cache
-
-import tiktoken
 
 __all__ = [
     "ParameterizationCallable",
-    "ParameterizationCallableExperiment",
-    "ParameterizationExperiment",
     "ParameterizationRegex",
     "Parameterizer",
-    "UniqueIdExperiment",
 ]
 
 
@@ -213,87 +207,12 @@ class ParameterizationCallable:
     counter: int = 0
 
 
-@dataclasses.dataclass
-class ParameterizationCallableExperiment(ParameterizationCallable):
-    def run(self, content: str, callback: Callable[[str, int], None]) -> str:
-        content, count = self.apply(content)
-        if count:
-            callback(self.name, count)
-        return content
-
-
-class _UniqueId:
-    # just a namespace for the uniq_id logic, no need to instantiate
-
-    NAME = "uniq_id"
-
-    @staticmethod
-    @lru_cache(maxsize=1)
-    def tiktoken_encoding() -> tiktoken.Encoding:
-        return tiktoken.get_encoding("cl100k_base")
-
-    @staticmethod
-    def num_tokens_from_string(token_str: str) -> int:
-        """Returns the number of tokens in a text string."""
-        num_tokens = len(_UniqueId.tiktoken_encoding().encode(token_str))
-        return num_tokens
-
-    # These are all somewhat arbitrary based on examples.
-    TOKEN_LENGTH_MINIMUM = (
-        4  # Tokens smaller than this are unlikely to be unique ids regardless of other attributes
-    )
-    TOKEN_LENGTH_RATIO_DEFAULT = 0.5
-    TOKEN_LENGTH_LONG = 10
-    TOKEN_LENGTH_RATIO_LONG = 0.4
-
-    @staticmethod
-    def is_probably_uniq_id(token_str: str) -> bool:
-        token_str = token_str.strip("\"'[]{}():;")
-        if len(token_str) < _UniqueId.TOKEN_LENGTH_MINIMUM:
-            return False
-        if (
-            token_str[0] == "<" and token_str[-1] == ">"
-        ):  # Don't replace already-parameterized tokens
-            return False
-        token_length_ratio = _UniqueId.num_tokens_from_string(token_str) / len(token_str)
-        if (
-            len(token_str) > _UniqueId.TOKEN_LENGTH_LONG
-            and token_length_ratio > _UniqueId.TOKEN_LENGTH_RATIO_LONG
-        ):
-            return True
-        return token_length_ratio > _UniqueId.TOKEN_LENGTH_RATIO_DEFAULT
-
-    @staticmethod
-    def replace_uniq_ids_in_str(string: str) -> tuple[str, int]:
-        """
-        Return result and count of replacements
-        """
-        strings = string.split(" ")
-        count = 0
-        for i, s in enumerate(strings):
-            if _UniqueId.is_probably_uniq_id(s):
-                strings[i] = "<uniq_id>"
-                count += 1
-        return (" ".join(strings), count)
-
-
-UniqueIdExperiment = ParameterizationCallableExperiment(
-    name=_UniqueId.NAME, apply=_UniqueId.replace_uniq_ids_in_str
-)
-
-
-ParameterizationExperiment = ParameterizationCallableExperiment
-
-
 class Parameterizer:
     def __init__(
         self,
         regex_pattern_keys: Sequence[str],
-        experiments: Sequence[ParameterizationExperiment] = (),
     ):
         self._parameterization_regex = self._make_regex_from_patterns(regex_pattern_keys)
-        self._experiments = experiments
-
         self.matches_counter: defaultdict[str, int] = defaultdict(int)
 
     def _make_regex_from_patterns(self, pattern_keys: Sequence[str]) -> re.Pattern[str]:
@@ -335,41 +254,5 @@ def _handle_regex_match(match: re.Match[str]) -> str:
 
         return self._parameterization_regex.sub(_handle_regex_match, content)
 
-    def parametrize_w_experiments(
-        self, content: str, should_run: Callable[[str], bool] = lambda _: True
-    ) -> str:
-        """
-        Apply all experiments to the content.
-
-        @param content: The string to apply experiments to.
-        @returns: The content with all experiments applied.
-        """
-
-        def _incr_counter(key: str, count: int) -> None:
-            self.matches_counter[key] += count
-
-        def _handle_regex_match(match: re.Match[str]) -> str:
-            # Find the first (should be only) non-None match entry, and sub in the placeholder. For
-            # example, given the groupdict item `('hex', '0x40000015')`, this returns '<hex>' as a
-            # replacement for the original value in the string.
-            for key, value in match.groupdict().items():
-                if value is not None:
-                    self.matches_counter[key] += 1
-                    return f"<{key}>"
-            return ""
-
-        for experiment in self._experiments:
-            if not should_run(experiment.name):
-                continue
-
-            content = experiment.run(content, _incr_counter)
-
-        return content
-
-    def get_successful_experiments(self) -> Sequence[ParameterizationExperiment]:
-        return [e for e in self._experiments if self.matches_counter[e.name] > 0]
-
-    def parameterize_all(
-        self, content: str, should_run: Callable[[str], bool] = lambda _: True
-    ) -> str:
-        return self.parametrize_w_experiments(self.parametrize_w_regex(content), should_run)
+    def parameterize_all(self, content: str) -> str:
+        return self.parametrize_w_regex(content)
diff --git a/src/sentry/grouping/strategies/message.py b/src/sentry/grouping/strategies/message.py
@@ -3,19 +3,16 @@
 from itertools import islice
 from typing import TYPE_CHECKING, Any
 
-from sentry import analytics
 from sentry.grouping.component import MessageGroupingComponent
-from sentry.grouping.parameterization import Parameterizer, UniqueIdExperiment
+from sentry.grouping.parameterization import Parameterizer
 from sentry.grouping.strategies.base import (
     GroupingContext,
     ReturnedVariants,
     produces_variants,
     strategy,
 )
 from sentry.interfaces.message import Message
-from sentry.options.rollout import in_rollout_group
 from sentry.utils import metrics
-from sentry.utils.settings import is_self_hosted
 
 if TYPE_CHECKING:
     from sentry.eventstore.models import Event
@@ -38,27 +35,9 @@
     "bool",
 )
 
-EXPERIMENT_PROJECTS = [  # Active internal Sentry projects
-    1,
-    11276,
-    54785,
-    155735,
-    162676,
-    221969,
-    300688,
-    1267915,
-    1269704,
-    1492057,
-    6424467,
-    6690737,
-    4503972821204992,
-    4505469596663808,
-    4506400311934976,
-]
-
 
 @metrics.wraps("grouping.normalize_message_for_grouping")
-def normalize_message_for_grouping(message: str, event: Event, share_analytics: bool = True) -> str:
+def normalize_message_for_grouping(message: str, event: Event) -> str:
     """Replace values from a group's message with placeholders (to hide P.I.I. and
     improve grouping when no stacktrace is available) and trim to at most 2 lines.
     """
@@ -72,34 +51,9 @@ def normalize_message_for_grouping(message: str, event: Event, share_analytics:
     if trimmed != message:
         trimmed += "..."
 
-    def _should_run_experiment(experiment_name: str) -> bool:
-        return bool(
-            not is_self_hosted()
-            and event.project_id
-            and (
-                in_rollout_group(
-                    f"grouping.experiments.parameterization.{experiment_name}", event.project_id
-                )
-                or event.project_id in EXPERIMENT_PROJECTS
-            )
-        )
+    parameterizer = Parameterizer(regex_pattern_keys=REGEX_PATTERN_KEYS)
 
-    parameterizer = Parameterizer(
-        regex_pattern_keys=REGEX_PATTERN_KEYS,
-        experiments=(UniqueIdExperiment,),
-    )
-
-    normalized = parameterizer.parameterize_all(trimmed, _should_run_experiment)
-
-    for experiment in parameterizer.get_successful_experiments():
-        if share_analytics and experiment.counter < 100:
-            experiment.counter += 1
-            analytics.record(
-                "grouping.experiments.parameterization",
-                experiment_name=experiment.name,
-                project_id=event.project_id,
-                event_id=event.event_id,
-            )
+    normalized = parameterizer.parameterize_all(trimmed)
 
     for key, value in parameterizer.matches_counter.items():
         # `key` can only be one of the keys from `_parameterization_regex`, thus, not a large
diff --git a/src/sentry/grouping/strategies/newstyle.py b/src/sentry/grouping/strategies/newstyle.py
@@ -569,12 +569,7 @@ def single_exception(
 
             raw = exception.value
             if raw is not None:
-                favors_other_component = stacktrace_component.contributes or (
-                    ns_error_component is not None and ns_error_component.contributes
-                )
-                normalized = normalize_message_for_grouping(
-                    raw, event, share_analytics=(not favors_other_component)
-                )
+                normalized = normalize_message_for_grouping(raw, event)
                 hint = "stripped event-specific values" if raw != normalized else None
                 if normalized:
                     value_component.update(values=[normalized], hint=hint)
diff --git a/tests/sentry/grouping/test_parameterization.py b/tests/sentry/grouping/test_parameterization.py
@@ -1,15 +1,12 @@
 import pytest
 
-from sentry.grouping.parameterization import Parameterizer, UniqueIdExperiment
+from sentry.grouping.parameterization import Parameterizer
 from sentry.grouping.strategies.message import REGEX_PATTERN_KEYS
 
 
 @pytest.fixture
 def parameterizer():
-    return Parameterizer(
-        regex_pattern_keys=REGEX_PATTERN_KEYS,
-        experiments=(UniqueIdExperiment,),
-    )
+    return Parameterizer(regex_pattern_keys=REGEX_PATTERN_KEYS)
 
 
 @pytest.mark.parametrize(
@@ -165,74 +162,6 @@ def test_parameterize_standard(name, input, expected, parameterizer):
     assert expected == parameterizer.parameterize_all(input), f"Case {name} Failed"
 
 
-@pytest.mark.parametrize(
-    ("name", "input", "expected"),
-    [
-        (
-            "Uniq ID - sql savepoint",
-            '''SQL: RELEASE SAVEPOINT "s140177518376768_x2"''',
-            """SQL: RELEASE SAVEPOINT <uniq_id>""",
-        ),
-        (
-            "Uniq ID - api gateway",
-            """API gateway VdLchF7iDo8sVkg= blah""",
-            """API gateway <uniq_id> blah""",
-        ),
-        (
-            "Uniq ID - fb trace",
-            """fbtrace_id Aba64NMEPMmBwi_cPLaGeeK AugPfq0jxGbto4u3kxn8u6p blah""",
-            """fbtrace_id <uniq_id> <uniq_id> blah""",
-        ),
-        (
-            "Uniq ID - word with numerical pre/suffix",
-            """1password python3 abc123 123abc""",
-            """1password python3 abc123 123abc""",
-        ),
-        (
-            "Uniq ID - cloudflare trace",
-            """cloudflare trace 230b030023ae2822-SJC 819cc532aex26akb-SNP blah""",
-            """cloudflare trace <uniq_id> <uniq_id> blah""",
-        ),
-        (
-            "Uniq ID - JWT",
-            """blah eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6IkpvaG4gRG9lIiwiaWF0IjoxNTE2MjM5MDIyfQ.SflKxwRJSMeKKF2QT4fwpMeJf36POk6yJV_adQssw5c""",
-            """blah <uniq_id>""",
-        ),
-        (
-            "Uniq ID - Nothing to replace",
-            """I am the test words 1password python3 abc123 123abc""",
-            """I am the test words 1password python3 abc123 123abc""",
-        ),
-        (
-            "Uniq ID - react element",
-            """Permission denied to access property "__reactFiber$b6c78e70asw" """,
-            """Permission denied to access property <uniq_id> """,
-        ),
-        (
-            "Uniq ID - no change variable name",
-            """TypeError: Cannot read property 'startRTM' of undefined""",
-            """TypeError: Cannot read property 'startRTM' of undefined""",
-        ),
-        (
-            "Uniq ID - json ignored properly",
-            """[401,""]""",
-            """[<int>,""]""",
-        ),
-        (
-            "Uniq ID - no change",
-            """Blocked 'script' from 'wasm-eval:'""",
-            """Blocked 'script' from 'wasm-eval:'""",
-        ),
-    ],
-)
-def test_parameterize_experiment(name, input, expected, parameterizer):
-    assert expected == parameterizer.parameterize_all(input), f"Case {name} Failed"
-    if "<uniq_id>" in expected:
-        experiments = parameterizer.get_successful_experiments()
-        assert len(experiments) == 1
-        assert experiments[0] == UniqueIdExperiment
-
-
 # These are test cases that we should fix
 @pytest.mark.xfail()
 @pytest.mark.parametrize(
@@ -249,11 +178,6 @@ def test_parameterize_experiment(name, input, expected, parameterizer):
             """Tb.Worker {"msg" => "(#239323) Received ...""",
             """Tb.Worker {"msg" => "(#<int>) Received ...""",
         ),
-        (
-            "Uniq ID - Snuba query",
-            """Error running query: SELECT (divide(plus(sumMergeIf((value AS _snuba_value), equals((arrayElement(tags.raw_value, indexOf(tags.key, 9223372036854776026)) AS `_snuba_tags_raw[9223372036854776026]`), 'satisfactory') AND equals((metric_id AS _snuba_metric_id), 9223372036854775936)), divide(sumMergeIf(_snuba_value, equals(`_snuba_tags_raw[9223372036854776026]`, 'tolerable') AND equals(_snuba_metric_id, 9223372036854775936)), 2)), sumMergeIf(_snuba_value, equals(_snuba_metric_id, 9223372036854775936))) AS `_snuba_c:transactions/on_demand@none`) FROM generic_metric_counters_aggregated_dist WHERE equals(granularity, 1) AND equals((org_id AS _snuba_org_id), 1383997) AND in((project_id AS _snuba_project_id), [6726638]) AND greaterOrEquals((timestamp AS _snuba_timestamp), toDateTime('2024-03-18T22:52:00', 'Universal')) AND less(_snuba_timestamp, toDateTime('2024-03-18T23:22:00', 'Universal')) AND equals((arrayElement(tags.raw_value, indexOf(tags.key, 9223372036854776069)) AS `_snuba_tags_raw[9223372036854776069]`), '2d896d92') AND in(_s...}""",
-            """Error running query: SELECT (divide(plus(sumMergeIf((value AS _snuba_value), equals((arrayElement(tags.raw_value, indexOf(tags.key, <int>)) AS `_snuba_tags_raw[<int>]`), 'satisfactory') AND equals((metric_id AS _snuba_metric_id), <int>)), divide(sumMergeIf(_snuba_value, equals(`_snuba_tags_raw[<int>]`, 'tolerable') AND equals(_snuba_metric_id, <int>)), 2)), sumMergeIf(_snuba_value, equals(_snuba_metric_id, <int>))) AS `_snuba_c:transactions/on_demand@none`) FROM generic_metric_counters_aggregated_dist WHERE equals(granularity, 1) AND equals((org_id AS _snuba_org_id), <int>) AND in((project_id AS _snuba_project_id), [<int>]) AND greaterOrEquals((timestamp AS _snuba_timestamp), toDateTime('2024-03-18T22:52:00', 'Universal')) AND less(_snuba_timestamp, toDateTime('<date>', 'Universal')) AND equals((arrayElement(tags.raw_value, indexOf(tags.key, <int>)) AS `_snuba_tags_raw[<int>]`), '<uniq_id>') AND in(_s...}""",
-        ),
     ],
 )
 def test_fail_parameterize(name, input, expected, parameterizer):
@@ -266,7 +190,6 @@ def test_fail_parameterize(name, input, expected, parameterizer):
     ("name", "input", "expected"),
     [
         ("Not an Int", "Encoding: utf-8", "Encoding: utf-8"),  # produces "Encoding: utf<int>"
-        ("Not a Uniq ID", "X-Amz-Apigw-Id", "X-Amz-Apigw-Id"),  # produces "<uniq_id>"
     ],
 )
 def test_too_aggressive_parameterize(name, input, expected, parameterizer):