Skip to content

Commit 4b4891a

Browse files
committed
Handle duplicates in splice_ai and improve memory consumption when printing error message
1 parent d44f879 commit 4b4891a

File tree

2 files changed

+3
-2
lines changed

2 files changed

+3
-2
lines changed

v03_pipeline/lib/misc/validation.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -53,7 +53,7 @@ def validate_no_duplicate_variants(
5353
ht = ht.select()
5454
if ht.count() > 0:
5555
variant_format = dataset_type.table_key_format_fn(reference_genome)
56-
msg = f'Variants are present multiple times in the callset: {[variant_format(v) for v in ht.collect()][:10]}'
56+
msg = f'Variants are present multiple times in the callset: {[variant_format(v) for v in ht.take(10)]}'
5757
raise SeqrValidationError(msg)
5858

5959

v03_pipeline/lib/reference_datasets/splice_ai.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -39,7 +39,7 @@ def get_ht(
3939
.map(hl.float32),
4040
)
4141
ht = ht.annotate(delta_score=hl.max(ht.delta_scores))
42-
return ht.annotate(
42+
ht = ht.annotate(
4343
splice_consequence_id=hl.if_else(
4444
ht.delta_score > 0,
4545
# Splice Consequence enum ID is the index of the max score
@@ -48,3 +48,4 @@ def get_ht(
4848
num_delta_scores,
4949
),
5050
).drop('delta_scores')
51+
return ht.group_by(*ht.key).aggregate(splice_consequence_id=hl.agg.min(ht.splice_consequence_id))

0 commit comments

Comments
 (0)