
Commit 05ca85a

ref(grouping): Clarify Seer grouping deletion code (#95152)

This will make my next pull request easier to review. Ref [inc-1236](https://sentry.slack.com/archives/C095HFVE80Y/p1752007736483209)

1 parent: 9f24114

4 files changed: +35 additions, −24 deletions

src/sentry/seer/similarity/grouping_records.py

Lines changed: 9 additions & 3 deletions
@@ -21,6 +21,7 @@
 logger = logging.getLogger(__name__)
 
 POST_BULK_GROUPING_RECORDS_TIMEOUT = 10000
+DELETE_HASH_METRIC = "grouping.similarity.delete_records_by_hash"
 
 
 class CreateGroupingRecordData(TypedDict):
@@ -125,7 +126,7 @@ def delete_project_grouping_records(
         return False
 
 
-def delete_grouping_records_by_hash(project_id: int, hashes: Sequence[str]) -> bool:
+def call_seer_to_delete_these_hashes(project_id: int, hashes: Sequence[str]) -> bool:
     extra = {"project_id": project_id, "hashes": hashes}
     try:
         body = {"project_id": project_id, "hash_list": hashes}
@@ -142,6 +143,11 @@ def delete_grouping_records_by_hash(project_id: int, hashes: Sequence[str]) -> bool:
             "seer.delete_grouping_records.hashes.timeout",
             extra=extra,
         )
+        metrics.incr(
+            DELETE_HASH_METRIC,
+            sample_rate=options.get("seer.similarity.metrics_sample_rate"),
+            tags={"success": False, "reason": "ReadTimeoutError"},
+        )
         return False
 
     if response.status >= 200 and response.status < 300:
@@ -150,15 +156,15 @@ def delete_grouping_records_by_hash(project_id: int, hashes: Sequence[str]) -> bool:
             extra=extra,
         )
         metrics.incr(
-            "grouping.similarity.delete_records_by_hash",
+            DELETE_HASH_METRIC,
             sample_rate=options.get("seer.similarity.metrics_sample_rate"),
             tags={"success": True},
         )
         return True
     else:
         logger.error("seer.delete_grouping_records.hashes.failure", extra=extra)
         metrics.incr(
-            "grouping.similarity.delete_records_by_hash",
+            DELETE_HASH_METRIC,
             sample_rate=options.get("seer.similarity.metrics_sample_rate"),
             tags={"success": False},
         )
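The net effect in this file: the metric name moves into a DELETE_HASH_METRIC constant so the three metrics.incr call sites cannot drift apart, and the timeout path now emits the metric too (previously only the success and non-2xx paths did). Below is a minimal self-contained sketch of that one-counter-per-exit-path shape; the toy incr stands in for Sentry's metrics module (the real code also passes sample_rate), and _seer_delete_request is a hypothetical stand-in for the HTTP call.

import logging
from collections import Counter
from collections.abc import Sequence

logger = logging.getLogger(__name__)

# Toy stand-in for sentry.utils.metrics: counts (name, tags) pairs in memory.
_counters: Counter = Counter()

DELETE_HASH_METRIC = "grouping.similarity.delete_records_by_hash"


def incr(name: str, tags: dict) -> None:
    _counters[(name, tuple(sorted(tags.items())))] += 1


def _seer_delete_request(project_id: int, hashes: Sequence[str]) -> int:
    # Hypothetical HTTP call: returns a status code or raises TimeoutError.
    return 200


def call_seer_to_delete_these_hashes(project_id: int, hashes: Sequence[str]) -> bool:
    extra = {"project_id": project_id, "hashes": hashes}
    try:
        status = _seer_delete_request(project_id, hashes)
    except TimeoutError:
        # New in this commit: the timeout path is counted too, with a reason tag.
        logger.exception("seer.delete_grouping_records.hashes.timeout", extra=extra)
        incr(DELETE_HASH_METRIC, tags={"success": False, "reason": "ReadTimeoutError"})
        return False
    if 200 <= status < 300:
        incr(DELETE_HASH_METRIC, tags={"success": True})
        return True
    logger.error("seer.delete_grouping_records.hashes.failure", extra=extra)
    incr(DELETE_HASH_METRIC, tags={"success": False})
    return False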

src/sentry/tasks/delete_seer_grouping_records.py

Lines changed: 9 additions & 10 deletions
@@ -6,7 +6,7 @@
 from sentry.models.group import Group
 from sentry.models.grouphash import GroupHash
 from sentry.seer.similarity.grouping_records import (
-    delete_grouping_records_by_hash,
+    call_seer_to_delete_these_hashes,
     delete_project_grouping_records,
 )
 from sentry.seer.similarity.utils import ReferrerOptions, killswitch_enabled
@@ -16,15 +16,13 @@
 from sentry.taskworker.namespaces import seer_tasks
 from sentry.utils.query import RangeQuerySetWrapper
 
-BATCH_SIZE = 1000
-
 logger = logging.getLogger(__name__)
 
 
 @instrumented_task(
     name="sentry.tasks.delete_seer_grouping_records_by_hash",
     queue="delete_seer_grouping_records_by_hash",
-    max_retries=0,
+    max_retries=0,  # XXX: Why do we not retry?
     silo_mode=SiloMode.REGION,
     soft_time_limit=60 * 15,
     time_limit=60 * (15 + 5),
@@ -42,17 +40,17 @@ def delete_seer_grouping_records_by_hash(
 ) -> None:
     """
     Task to delete seer grouping records by hash list.
-    Calls the seer delete by hash endpoint with batches of hashes of size `BATCH_SIZE`.
+    Calls the seer delete by hash endpoint with batches of hashes of size `batch_size`.
     """
     if killswitch_enabled(project_id, ReferrerOptions.DELETION) or options.get(
         "seer.similarity-embeddings-delete-by-hash-killswitch.enabled"
     ):
         return
 
-    batch_size = options.get("embeddings-grouping.seer.delete-record-batch-size")
+    batch_size = options.get("embeddings-grouping.seer.delete-record-batch-size") or 100
     len_hashes = len(hashes)
     end_index = min(last_deleted_index + batch_size, len_hashes)
-    delete_grouping_records_by_hash(project_id, hashes[last_deleted_index:end_index])
+    call_seer_to_delete_these_hashes(project_id, hashes[last_deleted_index:end_index])
     if end_index < len_hashes:
         delete_seer_grouping_records_by_hash.apply_async(args=[project_id, hashes, end_index])
 
@@ -71,15 +69,16 @@ def call_delete_seer_grouping_records_by_hash(
         and not options.get("seer.similarity-embeddings-delete-by-hash-killswitch.enabled")
     ):
         group_hashes = []
+        batch_size = options.get("embeddings-grouping.seer.delete-record-batch-size") or 100
 
         for group_hash in RangeQuerySetWrapper(
            GroupHash.objects.filter(project_id=project.id, group__id__in=group_ids),
-            step=BATCH_SIZE,
+            step=batch_size,
         ):
             group_hashes.append(group_hash.hash)
 
-            # Schedule task when we reach BATCH_SIZE
-            if len(group_hashes) >= BATCH_SIZE:
+            # Schedule task when we reach batch_size
+            if len(group_hashes) >= batch_size:
                 delete_seer_grouping_records_by_hash.apply_async(args=[project.id, group_hashes, 0])
                 group_hashes = []
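Worth noting how the task pages through the hash list: each run handles only hashes[last_deleted_index:end_index] and, if anything remains, re-enqueues itself with end_index as the new starting point, so no single run ever sends more than one batch to Seer. A Celery-free sketch of that index arithmetic, where send_batch is a hypothetical stand-in for the Seer call and plain recursion stands in for apply_async:

from collections.abc import Callable, Sequence


def process_in_batches(
    hashes: Sequence[str],
    send_batch: Callable[[Sequence[str]], bool],  # hypothetical Seer call
    last_deleted_index: int = 0,
    batch_size: int = 100,
) -> None:
    # Mirrors delete_seer_grouping_records_by_hash: one batch per "task run".
    end_index = min(last_deleted_index + batch_size, len(hashes))
    send_batch(hashes[last_deleted_index:end_index])
    if end_index < len(hashes):
        # The real task does apply_async(args=[project_id, hashes, end_index]);
        # here we simply recurse with the new start index.
        process_in_batches(hashes, send_batch, end_index, batch_size)


sent: list[int] = []
process_in_batches([str(i) for i in range(101)], lambda b: sent.append(len(b)) or True)
assert sent == [100, 1]  # 101 hashes -> one full batch plus one leftover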

tests/sentry/seer/similarity/test_grouping_records.py

Lines changed: 5 additions & 4 deletions
@@ -11,7 +11,7 @@
 from sentry.seer.similarity.grouping_records import (
     POST_BULK_GROUPING_RECORDS_TIMEOUT,
     CreateGroupingRecordsRequest,
-    delete_grouping_records_by_hash,
+    call_seer_to_delete_these_hashes,
     post_bulk_grouping_records,
 )
 from sentry.testutils.pytest.fixtures import django_db_all
@@ -178,7 +178,7 @@ def test_delete_grouping_records_by_hash_success(
     )
 
     project_id, hashes = 1, ["1", "2"]
-    response = delete_grouping_records_by_hash(project_id, hashes)
+    response = call_seer_to_delete_these_hashes(project_id, hashes)
     assert response is True
     mock_logger.info.assert_called_with(
         "seer.delete_grouping_records.hashes.success",
@@ -189,6 +189,7 @@
     )
 
 
+@django_db_all
 @mock.patch("sentry.seer.similarity.grouping_records.logger")
 @mock.patch("sentry.seer.similarity.grouping_records.seer_grouping_connection_pool.urlopen")
 def test_delete_grouping_records_by_hash_timeout(
@@ -198,7 +199,7 @@ def test_delete_grouping_records_by_hash_timeout(
         DUMMY_POOL, SEER_HASH_GROUPING_RECORDS_DELETE_URL, "read timed out"
     )
     project_id, hashes = 1, ["1", "2"]
-    response = delete_grouping_records_by_hash(project_id, hashes)
+    response = call_seer_to_delete_these_hashes(project_id, hashes)
     assert response is False
     mock_logger.exception.assert_called_with(
         "seer.delete_grouping_records.hashes.timeout",
@@ -223,7 +224,7 @@ def test_delete_grouping_records_by_hash_failure(
         status=500,
     )
     project_id, hashes = 1, ["1", "2"]
-    response = delete_grouping_records_by_hash(project_id, hashes)
+    response = call_seer_to_delete_these_hashes(project_id, hashes)
     assert response is False
     mock_logger.error.assert_called_with(
         "seer.delete_grouping_records.hashes.failure",

tests/sentry/tasks/test_delete_seer_grouping_records.py

Lines changed: 12 additions & 7 deletions
@@ -1,6 +1,7 @@
 from time import time
 from unittest.mock import MagicMock, patch
 
+from sentry import options
 from sentry.models.grouphash import GroupHash
 from sentry.tasks.delete_seer_grouping_records import (
     call_delete_seer_grouping_records_by_hash,
@@ -12,21 +13,23 @@
 
 @django_db_all
 class TestDeleteSeerGroupingRecordsByHash(TestCase):
-    @patch("sentry.tasks.delete_seer_grouping_records.delete_grouping_records_by_hash")
+    @patch("sentry.tasks.delete_seer_grouping_records.call_seer_to_delete_these_hashes")
     @patch(
         "sentry.tasks.delete_seer_grouping_records.delete_seer_grouping_records_by_hash.apply_async"
     )
     def test_delete_seer_grouping_records_by_hash_batches(
         self,
         mock_delete_seer_grouping_records_by_hash_apply_async: MagicMock,
-        mock_delete_grouping_records_by_hash: MagicMock,
+        mock_call_seer_to_delete_these_hashes: MagicMock,
     ) -> None:
         """
-        Test that when delete_seer_grouping_records_by_hash is called with over 20 hashes, it spawns
+        Test that when delete_seer_grouping_records_by_hash is called with more hashes than the batch size, it spawns
         another task with the end index of the previous batch.
         """
-        mock_delete_grouping_records_by_hash.return_value = True
-        project_id, hashes = 1, [str(i) for i in range(101)]
+        batch_size = options.get("embeddings-grouping.seer.delete-record-batch-size") or 100
+        mock_call_seer_to_delete_these_hashes.return_value = True
+        project_id, hashes = 1, [str(i) for i in range(batch_size + 1)]
+        # We call it as a function and will schedule a task for the extra hash
         delete_seer_grouping_records_by_hash(project_id, hashes, 0)
         assert mock_delete_seer_grouping_records_by_hash_apply_async.call_args[1] == {
             "args": [project_id, hashes, 100]
@@ -61,7 +64,7 @@ def test_call_delete_seer_grouping_records_by_hash_simple(
     def test_call_delete_seer_grouping_records_by_hash_chunked(self) -> None:
         """
         Test that call_delete_seer_grouping_records_by_hash chunks large numbers of hashes
-        into separate tasks with a maximum of BATCH_SIZE hashes per task.
+        into separate tasks with a maximum of batch_size hashes per task.
         """
         self.project.update_option("sentry:similarity_backfill_completed", int(time()))
 
@@ -70,7 +73,7 @@ def test_call_delete_seer_grouping_records_by_hash_chunked(self) -> None:
             patch(
                 "sentry.tasks.delete_seer_grouping_records.delete_seer_grouping_records_by_hash.apply_async"
             ) as mock_apply_async,
-            patch("sentry.tasks.delete_seer_grouping_records.BATCH_SIZE", batch_size),
+            self.options({"embeddings-grouping.seer.delete-record-batch-size": batch_size}),
         ):
             # Create 15 group hashes to test chunking (10 + 5 with batch size of 10)
             group_ids, expected_hashes = [], []
@@ -91,12 +94,14 @@ def test_call_delete_seer_grouping_records_by_hash_chunked(self) -> None:
             first_call_args = mock_apply_async.call_args_list[0][1]["args"]
             assert len(first_call_args[1]) == batch_size
             assert first_call_args[0] == self.project.id
+            assert first_call_args[1] == expected_hashes[0:batch_size]
             assert first_call_args[2] == 0
 
             # Verify the second chunk has 5 hashes (remainder)
             second_call_args = mock_apply_async.call_args_list[1][1]["args"]
             assert len(second_call_args[1]) == 5
             assert second_call_args[0] == self.project.id
+            assert second_call_args[1] == expected_hashes[batch_size:]
             assert second_call_args[2] == 0
 
     @patch(
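The last change here swaps patch(...BATCH_SIZE, batch_size) for self.options({...}), so the test flows through the same options lookup production uses instead of monkeypatching a module constant that no longer exists. The idea in miniature, with a toy options store (this override_options helper is illustrative, not Sentry's actual test helper):

from collections.abc import Iterator
from contextlib import contextmanager

# Toy options store standing in for sentry.options.
_OPTIONS: dict[str, object] = {}


def get_option(key: str) -> object:
    return _OPTIONS.get(key)


@contextmanager
def override_options(overrides: dict[str, object]) -> Iterator[None]:
    """Temporarily override options, restoring the previous values on exit."""
    previous = {key: _OPTIONS.get(key) for key in overrides}
    _OPTIONS.update(overrides)
    try:
        yield
    finally:
        # Restore prior values (None for keys that were previously unset).
        _OPTIONS.update(previous)


with override_options({"embeddings-grouping.seer.delete-record-batch-size": 10}):
    batch_size = get_option("embeddings-grouping.seer.delete-record-batch-size") or 100
    assert batch_size == 10  # production code path sees the overridden value
assert get_option("embeddings-grouping.seer.delete-record-batch-size") is None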
