@@ -14,6 +14,7 @@
 from sentry.tasks.base import instrumented_task
 from sentry.taskworker.config import TaskworkerConfig
 from sentry.taskworker.namespaces import seer_tasks
+from sentry.utils import metrics
 from sentry.utils.query import RangeQuerySetWrapper
 
 logger = logging.getLogger(__name__)
@@ -49,10 +50,23 @@ def delete_seer_grouping_records_by_hash(
 
     batch_size = options.get("embeddings-grouping.seer.delete-record-batch-size") or 100
     len_hashes = len(hashes)
-    end_index = min(last_deleted_index + batch_size, len_hashes)
-    call_seer_to_delete_these_hashes(project_id, hashes[last_deleted_index:end_index])
-    if end_index < len_hashes:
-        delete_seer_grouping_records_by_hash.apply_async(args=[project_id, hashes, end_index])
+    if len_hashes <= batch_size:  # Base case
+        call_seer_to_delete_these_hashes(project_id, hashes)
+    else:
+        if last_deleted_index != 0:
+            # Tracks tasks that are still being scheduled with the whole list of hashes
+            metrics.incr(
+                "grouping.similarity.delete_seer_grouping_records_by_hash.batch_size_exceeded",
+                sample_rate=options.get("seer.similarity.metrics_sample_rate"),
+            )
+
+        # Iterate through the hashes in chunks and schedule a task for each chunk.
+        # In-flight tasks may still pass a non-zero last_deleted_index, so start
+        # from that index; eventually every task will pass 0.
+        for i in range(last_deleted_index, len_hashes, batch_size):
+            # Slicing past the end of the list is safe and cannot raise IndexError
+            chunked_hashes = hashes[i : i + batch_size]
+            delete_seer_grouping_records_by_hash.apply_async(args=[project_id, chunked_hashes, 0])
 
 
 def call_delete_seer_grouping_records_by_hash(
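To illustrate the chunking step in isolation, here is a minimal sketch of the slicing loop above, stripped of the task machinery. `chunk_hashes` is a hypothetical helper named for this example only, not part of this PR or the Sentry codebase:

```python
def chunk_hashes(hashes: list[str], batch_size: int, start: int = 0) -> list[list[str]]:
    # Hypothetical helper mirroring the loop in the diff above.
    # Slicing past the end of the list is safe in Python: the final chunk
    # is simply shorter, so no IndexError can occur.
    return [hashes[i : i + batch_size] for i in range(start, len(hashes), batch_size)]


# With five hashes and a batch size of two, the last chunk holds the remainder.
assert chunk_hashes(["h1", "h2", "h3", "h4", "h5"], batch_size=2) == [
    ["h1", "h2"],
    ["h3", "h4"],
    ["h5"],
]
```

Each chunk is at most `batch_size` long, so every rescheduled task takes the base-case path and calls Seer directly rather than fanning out again.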