Commit 130345e

fix(grouping): Schedule seer deletion tasks with fewer hashes (#95156)
The original code always passed the full list of hashes to every task it spawned, so we could end up with massive task payloads that caused trouble for taskbroker. We hit such a situation in the last few days, when deleting a project led to hundreds of thousands of hashes being passed to tasks (179k+ hashes -> 6MB+ task payloads). With this change, a task takes all of its hashes, splits them into chunks, and spawns a new task per chunk, each carrying only a small number of hashes. This moves us from sequential scheduling of tasks to parallelized scheduling, which could have an impact on the Seer service if a massive number of hashes is requested for deletion. Ref inc-1236
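To make the scheduling change concrete, here is a minimal, self-contained sketch of the chunk-and-spawn (fan-out) pattern the commit adopts. This is not the Sentry code itself: delete_hashes_task, call_seer, and spawn_task are hypothetical stand-ins for the real task, the Seer HTTP request, and the Celery-style apply_async call shown in the diff below.

    BATCH_SIZE = 100  # mirrors the delete-record-batch-size default in the diff

    def call_seer(project_id: int, hashes: list[str]) -> None:
        # Stand-in for the HTTP request asking Seer to delete these hashes.
        print(f"deleting {len(hashes)} hashes for project {project_id}")

    def spawn_task(project_id: int, chunk: list[str]) -> None:
        # Stand-in for task.apply_async(args=[project_id, chunk, 0]); here we
        # recurse synchronously so the sketch runs on its own.
        delete_hashes_task(project_id, chunk, 0)

    def delete_hashes_task(project_id: int, hashes: list[str], last_deleted_index: int = 0) -> None:
        if len(hashes) <= BATCH_SIZE:
            # Base case: the payload is small enough to send to Seer directly.
            call_seer(project_id, hashes)
            return
        # Fan out: one small task per chunk, so no child payload ever carries
        # the full hash list (previously every hop re-sent all hashes).
        for i in range(last_deleted_index, len(hashes), BATCH_SIZE):
            spawn_task(project_id, hashes[i : i + BATCH_SIZE])

    delete_hashes_task(1, [f"hash-{n}" for n in range(250)])  # spawns chunks of 100, 100, 50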
Parent: a77143a

2 files changed (+20, -5 lines)

src/sentry/tasks/delete_seer_grouping_records.py

Lines changed: 18 additions & 4 deletions
@@ -14,6 +14,7 @@
 from sentry.tasks.base import instrumented_task
 from sentry.taskworker.config import TaskworkerConfig
 from sentry.taskworker.namespaces import seer_tasks
+from sentry.utils import metrics
 from sentry.utils.query import RangeQuerySetWrapper


 logger = logging.getLogger(__name__)
@@ -49,10 +50,23 @@ def delete_seer_grouping_records_by_hash(
 
     batch_size = options.get("embeddings-grouping.seer.delete-record-batch-size") or 100
     len_hashes = len(hashes)
-    end_index = min(last_deleted_index + batch_size, len_hashes)
-    call_seer_to_delete_these_hashes(project_id, hashes[last_deleted_index:end_index])
-    if end_index < len_hashes:
-        delete_seer_grouping_records_by_hash.apply_async(args=[project_id, hashes, end_index])
+    if len_hashes <= batch_size:  # Base case
+        call_seer_to_delete_these_hashes(project_id, hashes)
+    else:
+        if last_deleted_index != 0:
+            # This tracks which tasks are still being scheduled with the whole list of hashes
+            metrics.incr(
+                "grouping.similarity.delete_seer_grouping_records_by_hash.batch_size_exceeded",
+                sample_rate=options.get("seer.similarity.metrics_sample_rate"),
+            )
+
+        # Iterate through hashes in chunks and schedule a task for each chunk
+        # There are tasks passing last_deleted_index, thus, we need to start from that index
+        # Eventually all tasks will pass 0
+        for i in range(last_deleted_index, len_hashes, batch_size):
+            # Slice operations are safe and will not raise IndexError
+            chunked_hashes = hashes[i : i + batch_size]
+            delete_seer_grouping_records_by_hash.apply_async(args=[project_id, chunked_hashes, 0])
 
 
 def call_delete_seer_grouping_records_by_hash(
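For example, with 250 hashes and a batch size of 100, the initial task takes the else branch and schedules three child tasks carrying 100, 100, and 50 hashes; each child is at or under the batch size, hits the base case, and calls Seer directly. No payload after the first ever carries the full list, unlike the old code, which re-sent all hashes on every hop.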

tests/sentry/tasks/test_delete_seer_grouping_records.py

Lines changed: 2 additions & 1 deletion
@@ -32,7 +32,8 @@ def test_delete_seer_grouping_records_by_hash_batches(
     # We call it as a function and will schedule a task for the extra hash
     delete_seer_grouping_records_by_hash(project_id, hashes, 0)
     assert mock_delete_seer_grouping_records_by_hash_apply_async.call_args[1] == {
-        "args": [project_id, hashes, 100]
+        # We do not schedule the task with all the hashes, but only the extra ones
+        "args": [project_id, hashes[batch_size:], 0]
     }
 
 @patch(
