Skip to content

Commit 6432899

Browse files
jklugherzbpblanken
andauthored
Dev (#755)
* delete old crdq code * add some logging (#741) * clinvar submitters and conditions (#739) * still developing * more hail stuff * 2 tables * finish custom_select for clinver * oop * remove * comment * mock_join_clinvar_to_submission_summary * crdq test * list instead of set * partitions * a few changes * test tables * fix failing test and add test for no empty submitters * fix failing test and add test for no empty submitters * remove todo * Revert "delete old crdq code" (#743) * try moving clinvar submitter tmp file to gcs before importing (#742) * try moving tmp file to gcs first * Oops * make old crdq runnable (#744) * make old crdq code runnable * make old crdq code runnable * ruff * Fix missing hgmd variants (#747) * Fix missing recoding * Fix hgmd parsing * lint * Update tables * Update versions * Fix incorrect luigi arg type (#748) * Fix missing hgmd variants (#747) * Fix missing recoding * Fix hgmd parsing * lint * Update tables * Update versions * incorrect luigi * change clinvar submitters table partitions (#749) make them equal to the clinvar vcf min partitions to hopefully reduce flakiness of the join. this is mostly a guess yesterday's test [run](https://console.cloud.google.com/dataproc/jobs/UpdateVariantAnnotationsTableWithUpdatedReferenceDataset_20240328_9a449403/monitoring?region=us-central1&project=seqr-project) succeeded but after multiple attempts * change source path of gnomad_qc buckets (#752) * Increase relatedness threshold (#750) * Increase relatedness threshold * add value --------- Co-authored-by: Benjamin Blankenmeister <b.p.blankenmeister@gmail.com> Co-authored-by: Benjamin Blankenmeister <bblanken@broadinstitute.org>
1 parent e4b6bdb commit 6432899

File tree

58 files changed

+348
-22
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

58 files changed

+348
-22
lines changed

v03_pipeline/bin/write_cached_reference_dataset_query_ht.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,13 @@ def get_ht(
2626
query: CachedReferenceDatasetQuery,
2727
) -> hl.Table:
2828
# If the query is defined over an uncombined reference dataset, use the combiner config.
29-
if query.reference_dataset:
30-
config = CONFIG[query.reference_dataset][reference_genome.v02_value]
31-
return import_ht_from_config_path(config, reference_genome)
29+
if query.query_raw_dataset:
30+
config = CONFIG[query.dataset(dataset_type)][reference_genome.v02_value]
31+
return import_ht_from_config_path(
32+
config,
33+
query.dataset(dataset_type),
34+
reference_genome,
35+
)
3236
return hl.read_table(
3337
valid_reference_dataset_collection_path(
3438
reference_genome,

v03_pipeline/lib/misc/family_loading_failures.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
from v03_pipeline.lib.misc.pedigree import Family, Relation, Sample
77
from v03_pipeline.lib.model import Sex
88

9+
RELATEDNESS_TOLERANCE = 0.2
10+
911

1012
def passes_relatedness_check(
1113
relatedness_check_lookup: dict[tuple[str, str], list],
@@ -22,7 +24,7 @@ def passes_relatedness_check(
2224
if not coefficients or not np.allclose(
2325
coefficients,
2426
relation.coefficients,
25-
atol=0.1,
27+
atol=RELATEDNESS_TOLERANCE,
2628
):
2729
return (
2830
False,

v03_pipeline/lib/reference_data/clinvar.py

Lines changed: 48 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,10 @@
3838
'practice_guideline': 4,
3939
},
4040
)
41-
41+
CLINVAR_SUBMISSION_SUMMARY_URL = (
42+
'ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/submission_summary.txt.gz'
43+
)
44+
MIN_HT_PARTITIONS = 2000
4245
logger = get_logger(__name__)
4346

4447

@@ -125,11 +128,11 @@ def download_and_import_latest_clinvar_vcf(
125128
drop_samples=True,
126129
skip_invalid_loci=True,
127130
contig_recoding=reference_genome.contig_recoding(include_mt=True),
128-
min_partitions=2000,
131+
min_partitions=MIN_HT_PARTITIONS,
129132
force_bgz=True,
130133
)
131134
mt = mt.annotate_globals(version=_parse_clinvar_release_date(tmp_file.name))
132-
return mt.rows()
135+
return join_to_submission_summary_ht(mt.rows())
133136

134137

135138
def _parse_clinvar_release_date(local_vcf_path: str) -> str:
@@ -150,3 +153,45 @@ def _parse_clinvar_release_date(local_vcf_path: str) -> str:
150153
return None
151154

152155
return None
156+
157+
158+
def join_to_submission_summary_ht(vcf_ht: hl.Table) -> hl.Table:
159+
# https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/README - submission_summary.txt
160+
logger.info('Getting clinvar submission summary')
161+
ht = download_and_import_clinvar_submission_summary()
162+
ht = ht.rename({'#VariationID': 'VariationID'})
163+
ht = ht.select('VariationID', 'Submitter', 'ReportedPhenotypeInfo')
164+
ht = ht.group_by('VariationID').aggregate(
165+
Submitters=hl.agg.collect(ht.Submitter),
166+
Conditions=hl.agg.collect(ht.ReportedPhenotypeInfo),
167+
)
168+
ht = ht.key_by('VariationID')
169+
return vcf_ht.annotate(
170+
submitters=ht[vcf_ht.rsid].Submitters,
171+
conditions=ht[vcf_ht.rsid].Conditions,
172+
)
173+
174+
175+
def download_and_import_clinvar_submission_summary() -> hl.Table:
176+
with tempfile.NamedTemporaryFile(
177+
suffix='.txt.gz',
178+
delete=False,
179+
) as tmp_file:
180+
urllib.request.urlretrieve(CLINVAR_SUBMISSION_SUMMARY_URL, tmp_file.name) # noqa: S310
181+
gcs_tmp_file_name = os.path.join(
182+
Env.HAIL_TMPDIR,
183+
os.path.basename(tmp_file.name),
184+
)
185+
safely_move_to_gcs(tmp_file.name, gcs_tmp_file_name)
186+
return hl.import_table(
187+
gcs_tmp_file_name,
188+
force=True,
189+
filter='^(#[^:]*:|^##).*$', # removes all comments except for the header line
190+
types={
191+
'#VariationID': hl.tstr,
192+
'Submitter': hl.tstr,
193+
'ReportedPhenotypeInfo': hl.tstr,
194+
},
195+
missing='-',
196+
min_partitions=MIN_HT_PARTITIONS,
197+
)

v03_pipeline/lib/reference_data/clinvar_test.py

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
import unittest
2+
from unittest import mock
23

34
import hail as hl
45

56
from v03_pipeline.lib.reference_data.clinvar import (
7+
join_to_submission_summary_ht,
68
parsed_and_mapped_clnsigconf,
79
parsed_clnsig,
810
)
@@ -83,3 +85,130 @@ def test_parsed_and_mapped_clnsigconf(self):
8385
],
8486
],
8587
)
88+
89+
@mock.patch(
90+
'v03_pipeline.lib.reference_data.clinvar.download_and_import_clinvar_submission_summary',
91+
)
92+
def test_join_to_submission_summary_ht(self, mock_download):
93+
clinvar_enums_struct = hl.Struct(
94+
CLNSIG=[
95+
'Pathogenic/Likely_pathogenic/Pathogenic',
96+
'_low_penetrance',
97+
],
98+
CLNSIGCONF=[
99+
'Pathogenic(8)|Likely_pathogenic(2)|Pathogenic',
100+
'_low_penetrance(1)|Uncertain_significance(1)',
101+
],
102+
CLNREVSTAT=['no_classifications_from_unflagged_records'],
103+
)
104+
vcf_ht = hl.Table.parallelize(
105+
[
106+
{
107+
'locus': hl.Locus(
108+
contig='chr1',
109+
position=871269,
110+
reference_genome='GRCh38',
111+
),
112+
'alleles': ['A', 'C'],
113+
'rsid': '5',
114+
'info': hl.Struct(ALLELEID=1, **clinvar_enums_struct),
115+
},
116+
{
117+
'locus': hl.Locus(
118+
contig='chr1',
119+
position=871269,
120+
reference_genome='GRCh38',
121+
),
122+
'alleles': ['A', 'AC'],
123+
'rsid': '7',
124+
'info': hl.Struct(ALLELEID=1, **clinvar_enums_struct),
125+
},
126+
],
127+
hl.tstruct(
128+
locus=hl.tlocus('GRCh38'),
129+
alleles=hl.tarray(hl.tstr),
130+
rsid=hl.tstr,
131+
info=hl.tstruct(
132+
ALLELEID=hl.tint32,
133+
CLNSIG=hl.tarray(hl.tstr),
134+
CLNSIGCONF=hl.tarray(hl.tstr),
135+
CLNREVSTAT=hl.tarray(hl.tstr),
136+
),
137+
),
138+
)
139+
mock_download.return_value = hl.Table.parallelize(
140+
[
141+
{
142+
'#VariationID': '5',
143+
'Submitter': 'OMIM',
144+
'ReportedPhenotypeInfo': 'C3661900:not provided',
145+
},
146+
{
147+
'#VariationID': '5',
148+
'Submitter': 'Broad Institute Rare Disease Group, Broad Institute',
149+
'ReportedPhenotypeInfo': 'C0023264:Leigh syndrome',
150+
},
151+
{
152+
'#VariationID': '5',
153+
'Submitter': 'PreventionGenetics, part of Exact Sciences',
154+
'ReportedPhenotypeInfo': 'na:FOXRED1-related condition',
155+
},
156+
{
157+
'#VariationID': '5',
158+
'Submitter': 'Invitae',
159+
'ReportedPhenotypeInfo': 'C4748791:Mitochondrial complex 1 deficiency, nuclear type 19',
160+
},
161+
{
162+
'#VariationID': '6',
163+
'Submitter': 'A',
164+
'ReportedPhenotypeInfo': 'na:B',
165+
},
166+
],
167+
hl.tstruct(
168+
**{
169+
'#VariationID': hl.tstr,
170+
'Submitter': hl.tstr,
171+
'ReportedPhenotypeInfo': hl.tstr,
172+
},
173+
),
174+
)
175+
ht = join_to_submission_summary_ht(vcf_ht)
176+
self.assertEqual(
177+
ht.collect(),
178+
[
179+
hl.Struct(
180+
locus=hl.Locus(
181+
contig='chr1',
182+
position=871269,
183+
reference_genome='GRCh38',
184+
),
185+
alleles=['A', 'C'],
186+
rsid='5',
187+
info=hl.Struct(ALLELEID=1, **clinvar_enums_struct),
188+
submitters=[
189+
'OMIM',
190+
'Broad Institute Rare Disease Group, Broad Institute',
191+
'PreventionGenetics, part of Exact Sciences',
192+
'Invitae',
193+
],
194+
conditions=[
195+
'C3661900:not provided',
196+
'C0023264:Leigh syndrome',
197+
'na:FOXRED1-related condition',
198+
'C4748791:Mitochondrial complex 1 deficiency, nuclear type 19',
199+
],
200+
),
201+
hl.Struct(
202+
locus=hl.Locus(
203+
contig='chr1',
204+
position=871269,
205+
reference_genome='GRCh38',
206+
),
207+
alleles=['A', 'AC'],
208+
rsid='7',
209+
info=hl.Struct(ALLELEID=1, **clinvar_enums_struct),
210+
submitters=None,
211+
conditions=None,
212+
),
213+
],
214+
)

v03_pipeline/lib/reference_data/config.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,11 @@ def clinvar_custom_select(ht):
5353
# so there's a hidden enum-mapping inside this clinvar function.
5454
selects['conflictingPathogenicities'] = parsed_and_mapped_clnsigconf(ht)
5555
selects['goldStars'] = CLINVAR_GOLD_STARS_LOOKUP.get(hl.delimit(ht.info.CLNREVSTAT))
56+
selects['submitters'] = ht.submitters
57+
selects['conditions'] = hl.map(
58+
lambda p: p.split(r':')[1],
59+
ht.conditions,
60+
) # assumes the format 'MedGen#:condition', e.g.'C0023264:Leigh syndrome'
5661
return selects
5762

5863

@@ -376,12 +381,14 @@ def custom_mpc_select(ht):
376381
'37': {
377382
'version': 'v2',
378383
'custom_import': import_matrix_table,
379-
'source_path': 'gs://gnomad/sample_qc/mt/gnomad.joint.high_callrate_common_biallelic_snps.pruned.mt',
384+
# Note: copied from 'gs://gnomad/sample_qc/mt/gnomad.joint.high_callrate_common_biallelic_snps.pruned.mt'
385+
'source_path': 'gs://seqr-reference-data/gnomad_qc/GRCh37/gnomad.joint.high_callrate_common_biallelic_snps.pruned.mt',
380386
},
381387
'38': {
382388
'version': 'v3.1',
383389
'custom_import': import_matrix_table,
384-
'source_path': 'gs://gnomad/sample_qc/mt/genomes_v3.1/gnomad_v3.1_qc_mt_v2_sites_dense.mt',
390+
# Note: copied from 'gs://gnomad/sample_qc/mt/genomes_v3.1/gnomad_v3.1_qc_mt_v2_sites_dense.mt'
391+
'source_path': 'gs://seqr-reference-data/gnomad_qc/GRCh38/gnomad_v3.1_qc_mt_v2_sites_dense.mt',
385392
},
386393
},
387394
'exac': {

v03_pipeline/lib/tasks/base/base_hail_table_task.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ def output(self) -> luigi.Target:
1717
raise NotImplementedError
1818

1919
def complete(self) -> bool:
20+
logger.info(f'BaseHailTableTask: checking if {self.output().path} exists')
2021
return GCSorLocalFolderTarget(self.output().path).exists()
2122

2223
def init_hail(self):
@@ -33,7 +34,7 @@ def init_hail(self):
3334

3435
@luigi.Task.event_handler(luigi.Event.DEPENDENCY_DISCOVERED)
3536
def dependency_discovered(task, dependency):
36-
logger.info(f'{task} dependency_discovered {dependency}')
37+
logger.info(f'{task} dependency_discovered {dependency} at {task.output()}')
3738

3839

3940
@luigi.Task.event_handler(luigi.Event.DEPENDENCY_MISSING)
@@ -43,7 +44,7 @@ def dependency_missing(task):
4344

4445
@luigi.Task.event_handler(luigi.Event.DEPENDENCY_PRESENT)
4546
def dependency_present(task):
46-
logger.info(f'{task} dependency_present')
47+
logger.info(f'{task} dependency_present at {task.output()}')
4748

4849

4950
@luigi.Task.event_handler(luigi.Event.START)

v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py

Lines changed: 32 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
TEST_COMBINED_37 = 'v03_pipeline/var/test/reference_data/test_combined_37.ht'
3636
TEST_HGMD_37 = 'v03_pipeline/var/test/reference_data/test_hgmd_37.ht'
3737

38+
3839
MOCK_CADD_CONFIG = {
3940
'version': 'v1.6',
4041
'select': ['PHRED'],
@@ -66,6 +67,8 @@
6667
CLNSIGCONF=hl.tarray(hl.tstr),
6768
CLNREVSTAT=hl.tarray(hl.tstr),
6869
),
70+
submitters=hl.tarray(hl.tstr),
71+
conditions=hl.tarray(hl.tstr),
6972
),
7073
key=['locus', 'alleles'],
7174
globals=hl.Struct(
@@ -456,6 +459,8 @@
456459
CLNSIGCONF=hl.tarray(hl.tstr),
457460
CLNREVSTAT=hl.tarray(hl.tstr),
458461
),
462+
submitters=hl.tarray(hl.tstr),
463+
conditions=hl.tarray(hl.tstr),
459464
),
460465
key=['locus', 'alleles'],
461466
globals=hl.Struct(
@@ -712,7 +717,15 @@ def test_update_vat_with_updated_rdc_snv_indel_38(
712717
),
713718
alleles=['A', 'C'],
714719
cadd=hl.Struct(PHRED=2),
715-
clinvar=None,
720+
clinvar=hl.Struct(
721+
alleleId=None,
722+
conflictingPathogenicities=None,
723+
goldStars=None,
724+
pathogenicity_id=None,
725+
assertion_ids=None,
726+
submitters=None,
727+
conditions=None,
728+
),
716729
dbnsfp=hl.Struct(
717730
REVEL_score=0.043,
718731
SIFT_score=None,
@@ -949,7 +962,15 @@ def test_update_vat_with_updated_rdc_mito_38(
949962
reference_genome='GRCh38',
950963
),
951964
alleles=['A', 'C'],
952-
clinvar_mito=None,
965+
clinvar_mito=hl.Struct(
966+
alleleId=None,
967+
conflictingPathogenicities=None,
968+
goldStars=None,
969+
pathogenicity_id=None,
970+
assertion_ids=None,
971+
submitters=None,
972+
conditions=None,
973+
),
953974
dbnsfp_mito=hl.Struct(
954975
SIFT_score=None,
955976
MutationTaster_pred_id=2,
@@ -1093,7 +1114,6 @@ def test_update_vat_with_updated_rdc_snv_indel_37(
10931114
),
10941115
],
10951116
)
1096-
10971117
self.assertCountEqual(
10981118
ht.collect(),
10991119
[
@@ -1105,7 +1125,15 @@ def test_update_vat_with_updated_rdc_snv_indel_37(
11051125
),
11061126
alleles=['A', 'C'],
11071127
cadd=hl.Struct(PHRED=9.699999809265137),
1108-
clinvar=None,
1128+
clinvar=hl.Struct(
1129+
alleleId=None,
1130+
conflictingPathogenicities=None,
1131+
goldStars=None,
1132+
pathogenicity_id=None,
1133+
assertion_ids=None,
1134+
submitters=None,
1135+
conditions=None,
1136+
),
11091137
dbnsfp=hl.Struct(
11101138
REVEL_score=0.043,
11111139
SIFT_score=None,

0 commit comments

Comments
 (0)