Skip to content

Commit 377e7d8

Browse files
authored
chore(issues): Clean up unique id experiment (#94206)
Disabled in getsentry/sentry-options-automator#4283
1 parent 8ae9cd3 commit 377e7d8

File tree

4 files changed: 9 additions and 254 deletions.

src/sentry/grouping/parameterization.py

Lines changed: 2 additions & 119 deletions
Original file line numberDiff line numberDiff line change
@@ -2,17 +2,11 @@
22
import re
33
from collections import defaultdict
44
from collections.abc import Callable, Sequence
5-
from functools import lru_cache
6-
7-
import tiktoken
85

96
__all__ = [
107
"ParameterizationCallable",
11-
"ParameterizationCallableExperiment",
12-
"ParameterizationExperiment",
138
"ParameterizationRegex",
149
"Parameterizer",
15-
"UniqueIdExperiment",
1610
]
1711

1812

@@ -213,87 +207,12 @@ class ParameterizationCallable:
213207
counter: int = 0
214208

215209

216-
@dataclasses.dataclass
217-
class ParameterizationCallableExperiment(ParameterizationCallable):
218-
def run(self, content: str, callback: Callable[[str, int], None]) -> str:
219-
content, count = self.apply(content)
220-
if count:
221-
callback(self.name, count)
222-
return content
223-
224-
225-
class _UniqueId:
226-
# just a namespace for the uniq_id logic, no need to instantiate
227-
228-
NAME = "uniq_id"
229-
230-
@staticmethod
231-
@lru_cache(maxsize=1)
232-
def tiktoken_encoding() -> tiktoken.Encoding:
233-
return tiktoken.get_encoding("cl100k_base")
234-
235-
@staticmethod
236-
def num_tokens_from_string(token_str: str) -> int:
237-
"""Returns the number of tokens in a text string."""
238-
num_tokens = len(_UniqueId.tiktoken_encoding().encode(token_str))
239-
return num_tokens
240-
241-
# These are all somewhat arbitrary based on examples.
242-
TOKEN_LENGTH_MINIMUM = (
243-
4 # Tokens smaller than this are unlikely to be unique ids regardless of other attributes
244-
)
245-
TOKEN_LENGTH_RATIO_DEFAULT = 0.5
246-
TOKEN_LENGTH_LONG = 10
247-
TOKEN_LENGTH_RATIO_LONG = 0.4
248-
249-
@staticmethod
250-
def is_probably_uniq_id(token_str: str) -> bool:
251-
token_str = token_str.strip("\"'[]{}():;")
252-
if len(token_str) < _UniqueId.TOKEN_LENGTH_MINIMUM:
253-
return False
254-
if (
255-
token_str[0] == "<" and token_str[-1] == ">"
256-
): # Don't replace already-parameterized tokens
257-
return False
258-
token_length_ratio = _UniqueId.num_tokens_from_string(token_str) / len(token_str)
259-
if (
260-
len(token_str) > _UniqueId.TOKEN_LENGTH_LONG
261-
and token_length_ratio > _UniqueId.TOKEN_LENGTH_RATIO_LONG
262-
):
263-
return True
264-
return token_length_ratio > _UniqueId.TOKEN_LENGTH_RATIO_DEFAULT
265-
266-
@staticmethod
267-
def replace_uniq_ids_in_str(string: str) -> tuple[str, int]:
268-
"""
269-
Return result and count of replacements
270-
"""
271-
strings = string.split(" ")
272-
count = 0
273-
for i, s in enumerate(strings):
274-
if _UniqueId.is_probably_uniq_id(s):
275-
strings[i] = "<uniq_id>"
276-
count += 1
277-
return (" ".join(strings), count)
278-
279-
280-
UniqueIdExperiment = ParameterizationCallableExperiment(
281-
name=_UniqueId.NAME, apply=_UniqueId.replace_uniq_ids_in_str
282-
)
283-
284-
285-
ParameterizationExperiment = ParameterizationCallableExperiment
286-
287-
288210
class Parameterizer:
289211
def __init__(
290212
self,
291213
regex_pattern_keys: Sequence[str],
292-
experiments: Sequence[ParameterizationExperiment] = (),
293214
):
294215
self._parameterization_regex = self._make_regex_from_patterns(regex_pattern_keys)
295-
self._experiments = experiments
296-
297216
self.matches_counter: defaultdict[str, int] = defaultdict(int)
298217

299218
def _make_regex_from_patterns(self, pattern_keys: Sequence[str]) -> re.Pattern[str]:
@@ -335,41 +254,5 @@ def _handle_regex_match(match: re.Match[str]) -> str:
335254

336255
return self._parameterization_regex.sub(_handle_regex_match, content)
337256

338-
def parametrize_w_experiments(
339-
self, content: str, should_run: Callable[[str], bool] = lambda _: True
340-
) -> str:
341-
"""
342-
Apply all experiments to the content.
343-
344-
@param content: The string to apply experiments to.
345-
@returns: The content with all experiments applied.
346-
"""
347-
348-
def _incr_counter(key: str, count: int) -> None:
349-
self.matches_counter[key] += count
350-
351-
def _handle_regex_match(match: re.Match[str]) -> str:
352-
# Find the first (should be only) non-None match entry, and sub in the placeholder. For
353-
# example, given the groupdict item `('hex', '0x40000015')`, this returns '<hex>' as a
354-
# replacement for the original value in the string.
355-
for key, value in match.groupdict().items():
356-
if value is not None:
357-
self.matches_counter[key] += 1
358-
return f"<{key}>"
359-
return ""
360-
361-
for experiment in self._experiments:
362-
if not should_run(experiment.name):
363-
continue
364-
365-
content = experiment.run(content, _incr_counter)
366-
367-
return content
368-
369-
def get_successful_experiments(self) -> Sequence[ParameterizationExperiment]:
370-
return [e for e in self._experiments if self.matches_counter[e.name] > 0]
371-
372-
def parameterize_all(
373-
self, content: str, should_run: Callable[[str], bool] = lambda _: True
374-
) -> str:
375-
return self.parametrize_w_experiments(self.parametrize_w_regex(content), should_run)
257+
def parameterize_all(self, content: str) -> str:
258+
return self.parametrize_w_regex(content)

src/sentry/grouping/strategies/message.py

Lines changed: 4 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -3,19 +3,16 @@
33
from itertools import islice
44
from typing import TYPE_CHECKING, Any
55

6-
from sentry import analytics
76
from sentry.grouping.component import MessageGroupingComponent
8-
from sentry.grouping.parameterization import Parameterizer, UniqueIdExperiment
7+
from sentry.grouping.parameterization import Parameterizer
98
from sentry.grouping.strategies.base import (
109
GroupingContext,
1110
ReturnedVariants,
1211
produces_variants,
1312
strategy,
1413
)
1514
from sentry.interfaces.message import Message
16-
from sentry.options.rollout import in_rollout_group
1715
from sentry.utils import metrics
18-
from sentry.utils.settings import is_self_hosted
1916

2017
if TYPE_CHECKING:
2118
from sentry.eventstore.models import Event
@@ -38,27 +35,9 @@
3835
"bool",
3936
)
4037

41-
EXPERIMENT_PROJECTS = [ # Active internal Sentry projects
42-
1,
43-
11276,
44-
54785,
45-
155735,
46-
162676,
47-
221969,
48-
300688,
49-
1267915,
50-
1269704,
51-
1492057,
52-
6424467,
53-
6690737,
54-
4503972821204992,
55-
4505469596663808,
56-
4506400311934976,
57-
]
58-
5938

6039
@metrics.wraps("grouping.normalize_message_for_grouping")
61-
def normalize_message_for_grouping(message: str, event: Event, share_analytics: bool = True) -> str:
40+
def normalize_message_for_grouping(message: str, event: Event) -> str:
6241
"""Replace values from a group's message with placeholders (to hide P.I.I. and
6342
improve grouping when no stacktrace is available) and trim to at most 2 lines.
6443
"""
@@ -72,34 +51,9 @@ def normalize_message_for_grouping(message: str, event: Event, share_analytics:
7251
if trimmed != message:
7352
trimmed += "..."
7453

75-
def _should_run_experiment(experiment_name: str) -> bool:
76-
return bool(
77-
not is_self_hosted()
78-
and event.project_id
79-
and (
80-
in_rollout_group(
81-
f"grouping.experiments.parameterization.{experiment_name}", event.project_id
82-
)
83-
or event.project_id in EXPERIMENT_PROJECTS
84-
)
85-
)
54+
parameterizer = Parameterizer(regex_pattern_keys=REGEX_PATTERN_KEYS)
8655

87-
parameterizer = Parameterizer(
88-
regex_pattern_keys=REGEX_PATTERN_KEYS,
89-
experiments=(UniqueIdExperiment,),
90-
)
91-
92-
normalized = parameterizer.parameterize_all(trimmed, _should_run_experiment)
93-
94-
for experiment in parameterizer.get_successful_experiments():
95-
if share_analytics and experiment.counter < 100:
96-
experiment.counter += 1
97-
analytics.record(
98-
"grouping.experiments.parameterization",
99-
experiment_name=experiment.name,
100-
project_id=event.project_id,
101-
event_id=event.event_id,
102-
)
56+
normalized = parameterizer.parameterize_all(trimmed)
10357

10458
for key, value in parameterizer.matches_counter.items():
10559
# `key` can only be one of the keys from `_parameterization_regex`, thus, not a large

src/sentry/grouping/strategies/newstyle.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -569,12 +569,7 @@ def single_exception(
569569

570570
raw = exception.value
571571
if raw is not None:
572-
favors_other_component = stacktrace_component.contributes or (
573-
ns_error_component is not None and ns_error_component.contributes
574-
)
575-
normalized = normalize_message_for_grouping(
576-
raw, event, share_analytics=(not favors_other_component)
577-
)
572+
normalized = normalize_message_for_grouping(raw, event)
578573
hint = "stripped event-specific values" if raw != normalized else None
579574
if normalized:
580575
value_component.update(values=[normalized], hint=hint)

tests/sentry/grouping/test_parameterization.py

Lines changed: 2 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,12 @@
11
import pytest
22

3-
from sentry.grouping.parameterization import Parameterizer, UniqueIdExperiment
3+
from sentry.grouping.parameterization import Parameterizer
44
from sentry.grouping.strategies.message import REGEX_PATTERN_KEYS
55

66

77
@pytest.fixture
88
def parameterizer():
9-
return Parameterizer(
10-
regex_pattern_keys=REGEX_PATTERN_KEYS,
11-
experiments=(UniqueIdExperiment,),
12-
)
9+
return Parameterizer(regex_pattern_keys=REGEX_PATTERN_KEYS)
1310

1411

1512
@pytest.mark.parametrize(
@@ -165,74 +162,6 @@ def test_parameterize_standard(name, input, expected, parameterizer):
165162
assert expected == parameterizer.parameterize_all(input), f"Case {name} Failed"
166163

167164

168-
@pytest.mark.parametrize(
169-
("name", "input", "expected"),
170-
[
171-
(
172-
"Uniq ID - sql savepoint",
173-
'''SQL: RELEASE SAVEPOINT "s140177518376768_x2"''',
174-
"""SQL: RELEASE SAVEPOINT <uniq_id>""",
175-
),
176-
(
177-
"Uniq ID - api gateway",
178-
"""API gateway VdLchF7iDo8sVkg= blah""",
179-
"""API gateway <uniq_id> blah""",
180-
),
181-
(
182-
"Uniq ID - fb trace",
183-
"""fbtrace_id Aba64NMEPMmBwi_cPLaGeeK AugPfq0jxGbto4u3kxn8u6p blah""",
184-
"""fbtrace_id <uniq_id> <uniq_id> blah""",
185-
),
186-
(
187-
"Uniq ID - word with numerical pre/suffix",
188-
"""1password python3 abc123 123abc""",
189-
"""1password python3 abc123 123abc""",
190-
),
191-
(
192-
"Uniq ID - cloudflare trace",
193-
"""cloudflare trace 230b030023ae2822-SJC 819cc532aex26akb-SNP blah""",
194-
"""cloudflare trace <uniq_id> <uniq_id> blah""",
195-
),
196-
(
197-
"Uniq ID - JWT",
198-
"""blah eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6IkpvaG4gRG9lIiwiaWF0IjoxNTE2MjM5MDIyfQ.SflKxwRJSMeKKF2QT4fwpMeJf36POk6yJV_adQssw5c""",
199-
"""blah <uniq_id>""",
200-
),
201-
(
202-
"Uniq ID - Nothing to replace",
203-
"""I am the test words 1password python3 abc123 123abc""",
204-
"""I am the test words 1password python3 abc123 123abc""",
205-
),
206-
(
207-
"Uniq ID - react element",
208-
"""Permission denied to access property "__reactFiber$b6c78e70asw" """,
209-
"""Permission denied to access property <uniq_id> """,
210-
),
211-
(
212-
"Uniq ID - no change variable name",
213-
"""TypeError: Cannot read property 'startRTM' of undefined""",
214-
"""TypeError: Cannot read property 'startRTM' of undefined""",
215-
),
216-
(
217-
"Uniq ID - json ignored properly",
218-
"""[401,""]""",
219-
"""[<int>,""]""",
220-
),
221-
(
222-
"Uniq ID - no change",
223-
"""Blocked 'script' from 'wasm-eval:'""",
224-
"""Blocked 'script' from 'wasm-eval:'""",
225-
),
226-
],
227-
)
228-
def test_parameterize_experiment(name, input, expected, parameterizer):
229-
assert expected == parameterizer.parameterize_all(input), f"Case {name} Failed"
230-
if "<uniq_id>" in expected:
231-
experiments = parameterizer.get_successful_experiments()
232-
assert len(experiments) == 1
233-
assert experiments[0] == UniqueIdExperiment
234-
235-
236165
# These are test cases that we should fix
237166
@pytest.mark.xfail()
238167
@pytest.mark.parametrize(
@@ -249,11 +178,6 @@ def test_parameterize_experiment(name, input, expected, parameterizer):
249178
"""Tb.Worker {"msg" => "(#239323) Received ...""",
250179
"""Tb.Worker {"msg" => "(#<int>) Received ...""",
251180
),
252-
(
253-
"Uniq ID - Snuba query",
254-
"""Error running query: SELECT (divide(plus(sumMergeIf((value AS _snuba_value), equals((arrayElement(tags.raw_value, indexOf(tags.key, 9223372036854776026)) AS `_snuba_tags_raw[9223372036854776026]`), 'satisfactory') AND equals((metric_id AS _snuba_metric_id), 9223372036854775936)), divide(sumMergeIf(_snuba_value, equals(`_snuba_tags_raw[9223372036854776026]`, 'tolerable') AND equals(_snuba_metric_id, 9223372036854775936)), 2)), sumMergeIf(_snuba_value, equals(_snuba_metric_id, 9223372036854775936))) AS `_snuba_c:transactions/on_demand@none`) FROM generic_metric_counters_aggregated_dist WHERE equals(granularity, 1) AND equals((org_id AS _snuba_org_id), 1383997) AND in((project_id AS _snuba_project_id), [6726638]) AND greaterOrEquals((timestamp AS _snuba_timestamp), toDateTime('2024-03-18T22:52:00', 'Universal')) AND less(_snuba_timestamp, toDateTime('2024-03-18T23:22:00', 'Universal')) AND equals((arrayElement(tags.raw_value, indexOf(tags.key, 9223372036854776069)) AS `_snuba_tags_raw[9223372036854776069]`), '2d896d92') AND in(_s...}""",
255-
"""Error running query: SELECT (divide(plus(sumMergeIf((value AS _snuba_value), equals((arrayElement(tags.raw_value, indexOf(tags.key, <int>)) AS `_snuba_tags_raw[<int>]`), 'satisfactory') AND equals((metric_id AS _snuba_metric_id), <int>)), divide(sumMergeIf(_snuba_value, equals(`_snuba_tags_raw[<int>]`, 'tolerable') AND equals(_snuba_metric_id, <int>)), 2)), sumMergeIf(_snuba_value, equals(_snuba_metric_id, <int>))) AS `_snuba_c:transactions/on_demand@none`) FROM generic_metric_counters_aggregated_dist WHERE equals(granularity, 1) AND equals((org_id AS _snuba_org_id), <int>) AND in((project_id AS _snuba_project_id), [<int>]) AND greaterOrEquals((timestamp AS _snuba_timestamp), toDateTime('2024-03-18T22:52:00', 'Universal')) AND less(_snuba_timestamp, toDateTime('<date>', 'Universal')) AND equals((arrayElement(tags.raw_value, indexOf(tags.key, <int>)) AS `_snuba_tags_raw[<int>]`), '<uniq_id>') AND in(_s...}""",
256-
),
257181
],
258182
)
259183
def test_fail_parameterize(name, input, expected, parameterizer):
@@ -266,7 +190,6 @@ def test_fail_parameterize(name, input, expected, parameterizer):
266190
("name", "input", "expected"),
267191
[
268192
("Not an Int", "Encoding: utf-8", "Encoding: utf-8"), # produces "Encoding: utf<int>"
269-
("Not a Uniq ID", "X-Amz-Apigw-Id", "X-Amz-Apigw-Id"), # produces "<uniq_id>"
270193
],
271194
)
272195
def test_too_aggressive_parameterize(name, input, expected, parameterizer):

Comments (0)