Skip to content

Commit fdd385e

Browse files
authored
feat(issues): Add hex parameterization experiment (#93970)
Our existing hex parameterization supports hex values of any length; however, they must begin with either `0x` or `0X`. We frequently see hex values in messages which do not have this prefix (e.g. trace ids or span ids). This adds support for hex values which do not have the `0x` prefix. To ensure it does not over-parameterize, a value without the prefix must meet the following requirements:
- Must be exactly 4 or 8 bytes (8 or 16 characters)
- Must contain both a number and a letter
- Must use lowercase letters (anecdotally I've never seen examples where uppercase would be beneficial)

The addition of `EXPERIMENT_PARAMETERIZATION_REGEXES_MAP` and other parameters will all be removed once the rollout of this is completed.
1 parent b53da03 commit fdd385e

File tree

4 files changed

+61
-6
lines changed

4 files changed

+61
-6
lines changed

src/sentry/grouping/parameterization.py

Lines changed: 40 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -180,8 +180,37 @@ def compiled_pattern(self) -> re.Pattern[str]:
180180
),
181181
]
182182

183+
# Experimental variant of DEFAULT_PARAMETERIZATION_REGEXES: every entry is
# reused as-is except "hex", which is swapped for a pattern that also matches
# un-prefixed hex values (e.g. trace ids or span ids).
EXPERIMENT_PARAMETERIZATION_REGEXES = [
    (
        ParameterizationRegex(
            name="hex",
            raw_pattern=r"""
                # Hex value with 0x or 0X prefix
                (\b0[xX][0-9a-fA-F]+\b) |

                # Hex value without 0x or 0X prefix, exactly 4 or 8 bytes long.
                #
                # Both lookaheads only scan hex characters, so they can only be
                # satisfied from inside the candidate token itself, never by
                # text that appears later in the message. Requiring a letter
                # explicitly also means all-digit values are never treated as
                # hex, regardless of where "int" sits in the alternation.
                #
                # (?=[a-f]*[0-9]):  At least one 0-9 is in the match.
                # (?=[0-9]*[a-f]):  At least one a-f is in the match.
                # [0-9a-f]{8/16}:   Exactly 8 or 16 hex characters (0-9, a-f).
                (\b(?=[a-f]*[0-9])(?=[0-9]*[a-f])[0-9a-f]{8}\b) |
                (\b(?=[a-f]*[0-9])(?=[0-9]*[a-f])[0-9a-f]{16}\b)
            """,
        )
        if r.name == "hex"
        else r
    )
    # Iterating never mutates the source list, so no defensive copy is needed.
    for r in DEFAULT_PARAMETERIZATION_REGEXES
]
208+
183209

184210
DEFAULT_PARAMETERIZATION_REGEXES_MAP = {r.name: r.pattern for r in DEFAULT_PARAMETERIZATION_REGEXES}
211+
EXPERIMENT_PARAMETERIZATION_REGEXES_MAP = {
212+
r.name: r.pattern for r in EXPERIMENT_PARAMETERIZATION_REGEXES
213+
}
185214

186215

187216
@dataclasses.dataclass
@@ -273,14 +302,15 @@ def __init__(
273302
self,
274303
regex_pattern_keys: Sequence[str],
275304
experiments: Sequence[ParameterizationExperiment] = (),
305+
enable_regex_experiments: bool = False,
276306
):
307+
self._enable_regex_experiments = enable_regex_experiments
277308
self._parameterization_regex = self._make_regex_from_patterns(regex_pattern_keys)
278309
self._experiments = experiments
279310

280311
self.matches_counter: defaultdict[str, int] = defaultdict(int)
281312

282-
@staticmethod
283-
def _make_regex_from_patterns(pattern_keys: Sequence[str]) -> re.Pattern[str]:
313+
def _make_regex_from_patterns(self, pattern_keys: Sequence[str]) -> re.Pattern[str]:
284314
"""
285315
Takes list of pattern keys and returns a compiled regex pattern that matches any of them.
286316
@@ -292,9 +322,14 @@ def _make_regex_from_patterns(pattern_keys: Sequence[str]) -> re.Pattern[str]:
292322
so we can use newlines and indentation for better legibility in patterns above.
293323
"""
294324

295-
return re.compile(
296-
rf"(?x){'|'.join(DEFAULT_PARAMETERIZATION_REGEXES_MAP[k] for k in pattern_keys)}"
297-
)
325+
if self._enable_regex_experiments:
326+
return re.compile(
327+
rf"(?x){'|'.join(EXPERIMENT_PARAMETERIZATION_REGEXES_MAP[k] for k in pattern_keys)}"
328+
)
329+
else:
330+
return re.compile(
331+
rf"(?x){'|'.join(DEFAULT_PARAMETERIZATION_REGEXES_MAP[k] for k in pattern_keys)}"
332+
)
298333

299334
def parametrize_w_regex(self, content: str) -> str:
300335
"""

src/sentry/grouping/strategies/message.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,9 @@ def _should_run_experiment(experiment_name: str) -> bool:
8585
)
8686

8787
parameterizer = Parameterizer(
88-
regex_pattern_keys=REGEX_PATTERN_KEYS, experiments=(UniqueIdExperiment,)
88+
regex_pattern_keys=REGEX_PATTERN_KEYS,
89+
experiments=(UniqueIdExperiment,),
90+
enable_regex_experiments=_should_run_experiment("regex"),
8991
)
9092

9193
normalized = parameterizer.parameterize_all(trimmed, _should_run_experiment)

src/sentry/options/defaults.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2624,6 +2624,11 @@
26242624
default=0.0,
26252625
flags=FLAG_ADMIN_MODIFIABLE | FLAG_AUTOMATOR_MODIFIABLE | FLAG_RATE,
26262626
)
2627+
register(
2628+
"grouping.experiments.parameterization.regex",
2629+
default=0.0,
2630+
flags=FLAG_ADMIN_MODIFIABLE | FLAG_AUTOMATOR_MODIFIABLE | FLAG_RATE,
2631+
)
26272632

26282633
# TODO: For now, only a small number of projects are going through a grouping config transition at
26292634
# any given time, so we're sampling at 100% in order to be able to get good signal. Once we've fully

tests/sentry/grouping/test_parameterization.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ def parameterizer():
99
return Parameterizer(
1010
regex_pattern_keys=REGEX_PATTERN_KEYS,
1111
experiments=(UniqueIdExperiment,),
12+
enable_regex_experiments=True,
1213
)
1314

1415

@@ -108,6 +109,18 @@ def parameterizer():
108109
"""blah <date> had a problem""",
109110
),
110111
("hex", """blah 0x9af8c3b had a problem""", """blah <hex> had a problem"""),
112+
("hex", """blah 9af8c3b0 had a problem""", """blah <hex> had a problem"""),
113+
("hex", """blah 9af8c3b09af8c3b0 had a problem""", """blah <hex> had a problem"""),
114+
(
115+
"hex - missing numbers",
116+
"""blah aaffccbb had a problem""",
117+
"""blah aaffccbb had a problem""",
118+
),
119+
(
120+
"hex - not 4 or 8 bytes",
121+
"""blah 4aaa 9aaaaaaaa 10aaaaaaaa 15aaaaaaaaaaaaa 17aaaaaaaaaaaaaaa had a problem""",
122+
"""blah 4aaa 9aaaaaaaa 10aaaaaaaa 15aaaaaaaaaaaaa 17aaaaaaaaaaaaaaa had a problem""",
123+
),
111124
("float", """blah 0.23 had a problem""", """blah <float> had a problem"""),
112125
("int", """blah 23 had a problem""", """blah <int> had a problem"""),
113126
(

0 commit comments

Comments
 (0)