Skip to content

Commit 2b2ab92

Browse files
authored
Resolve bugs and increment versions. (#994)
* hmtvar float32 * Also fix topmed * Bump versions * Fix exac * Eigen * Splice ai * put this back * No change to `mitimpact` * Remove duplicates based on transcript * update mock tables * Remove from here * Back to main * Back to main * Fix unit tests * Update splice_ai.py
1 parent 92d0d0e commit 2b2ab92

File tree

82 files changed

+34
-10
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

82 files changed

+34
-10
lines changed

v03_pipeline/lib/reference_datasets/reference_dataset.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -263,15 +263,15 @@ def get_ht(
263263
},
264264
ReferenceGenome.GRCh37: {
265265
DATASET_TYPES: frozenset([DatasetType.SNV_INDEL]),
266-
VERSION: '1.0',
266+
VERSION: '1.1',
267267
PATH: [
268268
'gs://seqr-reference-data/GRCh37/spliceai/spliceai_scores.masked.snv.hg19.vcf.gz',
269269
'gs://seqr-reference-data/GRCh37/spliceai/spliceai_scores.masked.indel.hg19.vcf.gz',
270270
],
271271
},
272272
ReferenceGenome.GRCh38: {
273273
DATASET_TYPES: frozenset([DatasetType.SNV_INDEL]),
274-
VERSION: '1.0',
274+
VERSION: '1.1',
275275
# NB: SpliceAI data is only available to download for authenticated Illumina users, so we will host the data
276276
PATH: [
277277
'gs://seqr-reference-data/GRCh38/spliceai/spliceai_scores.masked.snv.hg38.vcf.gz',

v03_pipeline/lib/reference_datasets/splice_ai.py

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,26 @@
99
from v03_pipeline.lib.reference_datasets.misc import vcf_to_ht
1010

1111

12+
def remove_duplicate_scores(ht: hl.Table):
    """Filter SpliceAI rows whose key appears more than once in ``ht``.

    SpliceAI has duplicate rows of the ilk:

    1:861264 | ["C","A"] | NA | -1.00e+01 | NA | ["A|AL645608.1|0.00|0.00|0.00|0.00|2|27|12|1"] |
    1:861264 | ["C","A"] | NA | -1.00e+01 | NA | ["A|SAMD11|0.02|0.01|0.00|0.00|14|38|14|38"]

    Rows whose key occurs exactly once are passed through untouched. Rows whose
    key occurs more than once are kept only when the second pipe-delimited
    field of their first ``info.SpliceAI`` entry (``SAMD11`` / ``AL645608.1``
    in the example above) contains no dot.
    """
    # Tally rows per key, then keep only the keys seen more than once.
    rows_per_key_ht = ht.group_by(*ht.key).aggregate(n=hl.agg.count())
    repeated_keys_ht = rows_per_key_ht.filter(rows_per_key_ht.n > 1)

    # Partition the input: rows sharing a repeated key vs. all remaining rows.
    repeated_rows_ht = ht.semi_join(repeated_keys_ht)
    unique_rows_ht = ht.anti_join(repeated_rows_ht)

    # Among the repeated-key rows, discard those whose second '|'-separated
    # field contains a dot.
    second_field = repeated_rows_ht.info.SpliceAI[0].split(delim='\\|')[1]
    dotless_rows_ht = repeated_rows_ht.filter(~second_field.contains('.'))

    return unique_rows_ht.union(dotless_rows_ht)
30+
31+
1232
def get_ht(
1333
paths: list[str],
1434
reference_genome: ReferenceGenome,
@@ -26,6 +46,7 @@ def get_ht(
2646
# of partitions.
2747
)
2848
ht, _ = checkpoint(ht)
49+
ht = remove_duplicate_scores(ht)
2950

3051
# SpliceAI INFO field description from the VCF header: SpliceAIv1.3 variant annotation. These include
3152
# delta scores (DS) and delta positions (DP) for acceptor gain (AG), acceptor loss (AL), donor gain (DG), and
@@ -39,7 +60,7 @@ def get_ht(
3960
.map(hl.float32),
4061
)
4162
ht = ht.annotate(delta_score=hl.max(ht.delta_scores))
42-
ht = ht.annotate(
63+
return ht.annotate(
4364
splice_consequence_id=hl.if_else(
4465
ht.delta_score > 0,
4566
# Splice Consequence enum ID is the index of the max score
@@ -48,6 +69,3 @@ def get_ht(
4869
num_delta_scores,
4970
),
5071
).drop('delta_scores')
51-
return ht.group_by(*ht.key).aggregate(
52-
splice_consequence_id=hl.agg.min(ht.splice_consequence_id),
53-
)

v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,7 @@ def test_update_vat_snv_indel_38(
146146
eigen='1.1',
147147
clinvar='2024-11-11',
148148
exac='1.1',
149-
splice_ai='1.0',
149+
splice_ai='1.1',
150150
topmed='1.1',
151151
hgmd='1.0',
152152
gnomad_exomes='1.0',
@@ -425,7 +425,7 @@ def test_update_vat_snv_indel_37(
425425
eigen='1.1',
426426
clinvar='2024-11-11',
427427
exac='1.1',
428-
splice_ai='1.0',
428+
splice_ai='1.1',
429429
topmed='1.1',
430430
hgmd='1.0',
431431
gnomad_exomes='1.0',

v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -501,7 +501,7 @@ def test_multiple_update_vat(
501501
exac='1.1',
502502
gnomad_exomes='1.0',
503503
gnomad_genomes='1.0',
504-
splice_ai='1.0',
504+
splice_ai='1.1',
505505
topmed='1.1',
506506
gnomad_non_coding_constraint='1.0',
507507
screen='1.0',
@@ -762,7 +762,7 @@ def test_update_vat_without_accessing_private_datasets(
762762
exac='1.1',
763763
gnomad_exomes='1.0',
764764
gnomad_genomes='1.0',
765-
splice_ai='1.0',
765+
splice_ai='1.1',
766766
topmed='1.1',
767767
gnomad_non_coding_constraint='1.0',
768768
screen='1.0',

0 commit comments

Comments
 (0)