broadinstitute
diff --git a/v03_pipeline/bin/write_cached_reference_dataset_query_ht.py b/v03_pipeline/bin/write_cached_reference_dataset_query_ht.py
Lines changed: 7 additions & 3 deletions
diff --git a/v03_pipeline/lib/misc/family_loading_failures.py b/v03_pipeline/lib/misc/family_loading_failures.py
Lines changed: 3 additions & 1 deletion
diff --git a/v03_pipeline/lib/reference_data/clinvar.py b/v03_pipeline/lib/reference_data/clinvar.py
Lines changed: 48 additions & 3 deletions
diff --git a/v03_pipeline/lib/reference_data/clinvar_test.py b/v03_pipeline/lib/reference_data/clinvar_test.py
Lines changed: 129 additions & 0 deletions
diff --git a/v03_pipeline/lib/reference_data/config.py b/v03_pipeline/lib/reference_data/config.py
Lines changed: 9 additions & 2 deletions
diff --git a/v03_pipeline/lib/tasks/base/base_hail_table_task.py b/v03_pipeline/lib/tasks/base/base_hail_table_task.py
Lines changed: 3 additions & 2 deletions
diff --git a/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py b/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py
Lines changed: 32 additions & 4 deletions
@@ -26,9 +26,13 @@ def get_ht(
     query: CachedReferenceDatasetQuery,
 ) -> hl.Table:
     # If the query is defined over an uncombined reference dataset, use the combiner config.
-    if query.reference_dataset:
-        config = CONFIG[query.reference_dataset][reference_genome.v02_value]
-        return import_ht_from_config_path(config, reference_genome)
+    if query.query_raw_dataset:
+        config = CONFIG[query.dataset(dataset_type)][reference_genome.v02_value]
+        return import_ht_from_config_path(
+            config,
+            query.dataset(dataset_type),
+            reference_genome,
+        )
     return hl.read_table(
         valid_reference_dataset_collection_path(
             reference_genome,
 
@@ -6,6 +6,8 @@
 from v03_pipeline.lib.misc.pedigree import Family, Relation, Sample
 from v03_pipeline.lib.model import Sex
 
+# Absolute tolerance (the `atol` given to np.allclose) allowed between the observed
+# relatedness coefficients and the coefficients expected for a relation.
+RELATEDNESS_TOLERANCE = 0.2
+
 
 def passes_relatedness_check(
     relatedness_check_lookup: dict[tuple[str, str], list],
@@ -22,7 +24,7 @@ def passes_relatedness_check(
     if not coefficients or not np.allclose(
         coefficients,
         relation.coefficients,
-        atol=0.1,
+        atol=RELATEDNESS_TOLERANCE,
     ):
         return (
             False,
 
@@ -38,7 +38,10 @@
         'practice_guideline': 4,
     },
 )
-
+# URL of ClinVar's gzipped submission summary file (from the tab_delimited directory).
+CLINVAR_SUBMISSION_SUMMARY_URL = (
+    'ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/submission_summary.txt.gz'
+)
+# Value passed as `min_partitions` when importing the ClinVar VCF and the submission summary.
+MIN_HT_PARTITIONS = 2000
 logger = get_logger(__name__)
 
 
@@ -125,11 +128,11 @@ def download_and_import_latest_clinvar_vcf(
             drop_samples=True,
             skip_invalid_loci=True,
             contig_recoding=reference_genome.contig_recoding(include_mt=True),
-            min_partitions=2000,
+            min_partitions=MIN_HT_PARTITIONS,
             force_bgz=True,
         )
         mt = mt.annotate_globals(version=_parse_clinvar_release_date(tmp_file.name))
-        return mt.rows()
+        return join_to_submission_summary_ht(mt.rows())
 
 
 def _parse_clinvar_release_date(local_vcf_path: str) -> str:
@@ -150,3 +153,45 @@ def _parse_clinvar_release_date(local_vcf_path: str) -> str:
                 return None
 
     return None
+
+
+def join_to_submission_summary_ht(vcf_ht: hl.Table) -> hl.Table:
+    """Annotate ClinVar VCF rows with the submitters and conditions reported for them.
+
+    Rows of the submission summary are grouped by VariationID, and each group is
+    looked up with the VCF row's `rsid` field (which this function treats as the
+    ClinVar VariationID).
+
+    Args:
+        vcf_ht: table of ClinVar VCF rows; must have an `rsid` field.
+
+    Returns:
+        `vcf_ht` with two added array fields, `submitters` and `conditions`.
+        Both are missing for rows whose `rsid` has no submission summary entry.
+    """
+    # https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/README - submission_summary.txt
+    logger.info('Getting clinvar submission summary')
+    ht = download_and_import_clinvar_submission_summary()
+    ht = ht.rename({'#VariationID': 'VariationID'})
+    ht = ht.select('VariationID', 'Submitter', 'ReportedPhenotypeInfo')
+    ht = ht.group_by('VariationID').aggregate(
+        Submitters=hl.agg.collect(ht.Submitter),
+        Conditions=hl.agg.collect(ht.ReportedPhenotypeInfo),
+    )
+    # Key explicitly by VariationID so the table can be indexed by the VCF ID below.
+    ht = ht.key_by('VariationID')
+    # Index the grouped table once and reuse the resulting row for both
+    # annotations, rather than building a separate lookup per field.
+    submission = ht[vcf_ht.rsid]
+    return vcf_ht.annotate(
+        submitters=submission.Submitters,
+        conditions=submission.Conditions,
+    )
+
+
+def download_and_import_clinvar_submission_summary() -> hl.Table:
+    """Download ClinVar's submission summary and import it as a Hail table.
+
+    The file is fetched into a local temporary file, handed to
+    safely_move_to_gcs with a destination under Env.HAIL_TMPDIR, and imported
+    from that destination.
+    """
+    column_types = {
+        '#VariationID': hl.tstr,
+        'Submitter': hl.tstr,
+        'ReportedPhenotypeInfo': hl.tstr,
+    }
+    with tempfile.NamedTemporaryFile(suffix='.txt.gz', delete=False) as tmp_file:
+        local_path = tmp_file.name
+        urllib.request.urlretrieve(CLINVAR_SUBMISSION_SUMMARY_URL, local_path)  # noqa: S310
+        remote_path = os.path.join(Env.HAIL_TMPDIR, os.path.basename(local_path))
+        safely_move_to_gcs(local_path, remote_path)
+        return hl.import_table(
+            remote_path,
+            types=column_types,
+            missing='-',
+            # removes all comments except for the header line
+            filter='^(#[^:]*:|^##).*$',
+            force=True,
+            min_partitions=MIN_HT_PARTITIONS,
+        )
@@ -1,8 +1,10 @@
 import unittest
+from unittest import mock
 
 import hail as hl
 
 from v03_pipeline.lib.reference_data.clinvar import (
+    join_to_submission_summary_ht,
     parsed_and_mapped_clnsigconf,
     parsed_clnsig,
 )
@@ -83,3 +85,130 @@ def test_parsed_and_mapped_clnsigconf(self):
                 ],
             ],
         )
+
+    @mock.patch(
+        'v03_pipeline.lib.reference_data.clinvar.download_and_import_clinvar_submission_summary',
+    )
+    def test_join_to_submission_summary_ht(self, mock_download):
+        # join_to_submission_summary_ht should attach the submitters/conditions grouped
+        # under a VariationID to the VCF row whose rsid equals that VariationID.
+        # The download is mocked, so no network access happens in this test.
+        clinvar_enums_struct = hl.Struct(
+            CLNSIG=[
+                'Pathogenic/Likely_pathogenic/Pathogenic',
+                '_low_penetrance',
+            ],
+            CLNSIGCONF=[
+                'Pathogenic(8)|Likely_pathogenic(2)|Pathogenic',
+                '_low_penetrance(1)|Uncertain_significance(1)',
+            ],
+            CLNREVSTAT=['no_classifications_from_unflagged_records'],
+        )
+        # Two VCF rows: rsid '5' has submissions in the mocked summary, rsid '7' has none.
+        vcf_ht = hl.Table.parallelize(
+            [
+                {
+                    'locus': hl.Locus(
+                        contig='chr1',
+                        position=871269,
+                        reference_genome='GRCh38',
+                    ),
+                    'alleles': ['A', 'C'],
+                    'rsid': '5',
+                    'info': hl.Struct(ALLELEID=1, **clinvar_enums_struct),
+                },
+                {
+                    'locus': hl.Locus(
+                        contig='chr1',
+                        position=871269,
+                        reference_genome='GRCh38',
+                    ),
+                    'alleles': ['A', 'AC'],
+                    'rsid': '7',
+                    'info': hl.Struct(ALLELEID=1, **clinvar_enums_struct),
+                },
+            ],
+            hl.tstruct(
+                locus=hl.tlocus('GRCh38'),
+                alleles=hl.tarray(hl.tstr),
+                rsid=hl.tstr,
+                info=hl.tstruct(
+                    ALLELEID=hl.tint32,
+                    CLNSIG=hl.tarray(hl.tstr),
+                    CLNSIGCONF=hl.tarray(hl.tstr),
+                    CLNREVSTAT=hl.tarray(hl.tstr),
+                ),
+            ),
+        )
+        # Mocked submission summary: four rows for VariationID '5' and one row for
+        # VariationID '6', which matches no VCF row.
+        mock_download.return_value = hl.Table.parallelize(
+            [
+                {
+                    '#VariationID': '5',
+                    'Submitter': 'OMIM',
+                    'ReportedPhenotypeInfo': 'C3661900:not provided',
+                },
+                {
+                    '#VariationID': '5',
+                    'Submitter': 'Broad Institute Rare Disease Group, Broad Institute',
+                    'ReportedPhenotypeInfo': 'C0023264:Leigh syndrome',
+                },
+                {
+                    '#VariationID': '5',
+                    'Submitter': 'PreventionGenetics, part of Exact Sciences',
+                    'ReportedPhenotypeInfo': 'na:FOXRED1-related condition',
+                },
+                {
+                    '#VariationID': '5',
+                    'Submitter': 'Invitae',
+                    'ReportedPhenotypeInfo': 'C4748791:Mitochondrial complex 1 deficiency, nuclear type 19',
+                },
+                {
+                    '#VariationID': '6',
+                    'Submitter': 'A',
+                    'ReportedPhenotypeInfo': 'na:B',
+                },
+            ],
+            hl.tstruct(
+                **{
+                    '#VariationID': hl.tstr,
+                    'Submitter': hl.tstr,
+                    'ReportedPhenotypeInfo': hl.tstr,
+                },
+            ),
+        )
+        ht = join_to_submission_summary_ht(vcf_ht)
+        # Expect the rsid '5' row to carry all four submissions and the rsid '7' row to
+        # carry missing values.
+        # NOTE(review): the expected arrays are in mocked-row order, so this relies on
+        # hl.agg.collect keeping that order here; confirm that is deterministic.
+        self.assertEqual(
+            ht.collect(),
+            [
+                hl.Struct(
+                    locus=hl.Locus(
+                        contig='chr1',
+                        position=871269,
+                        reference_genome='GRCh38',
+                    ),
+                    alleles=['A', 'C'],
+                    rsid='5',
+                    info=hl.Struct(ALLELEID=1, **clinvar_enums_struct),
+                    submitters=[
+                        'OMIM',
+                        'Broad Institute Rare Disease Group, Broad Institute',
+                        'PreventionGenetics, part of Exact Sciences',
+                        'Invitae',
+                    ],
+                    conditions=[
+                        'C3661900:not provided',
+                        'C0023264:Leigh syndrome',
+                        'na:FOXRED1-related condition',
+                        'C4748791:Mitochondrial complex 1 deficiency, nuclear type 19',
+                    ],
+                ),
+                hl.Struct(
+                    locus=hl.Locus(
+                        contig='chr1',
+                        position=871269,
+                        reference_genome='GRCh38',
+                    ),
+                    alleles=['A', 'AC'],
+                    rsid='7',
+                    info=hl.Struct(ALLELEID=1, **clinvar_enums_struct),
+                    submitters=None,
+                    conditions=None,
+                ),
+            ],
+        )
@@ -53,6 +53,11 @@ def clinvar_custom_select(ht):
     # so there's a hidden enum-mapping inside this clinvar function.
     selects['conflictingPathogenicities'] = parsed_and_mapped_clnsigconf(ht)
     selects['goldStars'] = CLINVAR_GOLD_STARS_LOOKUP.get(hl.delimit(ht.info.CLNREVSTAT))
+    selects['submitters'] = ht.submitters
+    selects['conditions'] = hl.map(
+        lambda p: p.split(r':')[1],
+        ht.conditions,
+    )  # assumes the format 'MedGen#:condition', e.g.'C0023264:Leigh syndrome'
     return selects
 
 
@@ -376,12 +381,14 @@ def custom_mpc_select(ht):
         '37': {
             'version': 'v2',
             'custom_import': import_matrix_table,
-            'source_path': 'gs://gnomad/sample_qc/mt/gnomad.joint.high_callrate_common_biallelic_snps.pruned.mt',
+            # Note: copied from 'gs://gnomad/sample_qc/mt/gnomad.joint.high_callrate_common_biallelic_snps.pruned.mt'
+            'source_path': 'gs://seqr-reference-data/gnomad_qc/GRCh37/gnomad.joint.high_callrate_common_biallelic_snps.pruned.mt',
         },
         '38': {
             'version': 'v3.1',
             'custom_import': import_matrix_table,
-            'source_path': 'gs://gnomad/sample_qc/mt/genomes_v3.1/gnomad_v3.1_qc_mt_v2_sites_dense.mt',
+            # Note: copied from 'gs://gnomad/sample_qc/mt/genomes_v3.1/gnomad_v3.1_qc_mt_v2_sites_dense.mt'
+            'source_path': 'gs://seqr-reference-data/gnomad_qc/GRCh38/gnomad_v3.1_qc_mt_v2_sites_dense.mt',
         },
     },
     'exac': {
 
@@ -17,6 +17,7 @@ def output(self) -> luigi.Target:
         raise NotImplementedError
 
     def complete(self) -> bool:
+        # The task is complete when GCSorLocalFolderTarget reports that its output
+        # path exists; the path is logged first so the check can be traced.
+        logger.info(f'BaseHailTableTask: checking if {self.output().path} exists')
         return GCSorLocalFolderTarget(self.output().path).exists()
 
     def init_hail(self):
@@ -33,7 +34,7 @@ def init_hail(self):
 
 @luigi.Task.event_handler(luigi.Event.DEPENDENCY_DISCOVERED)
 def dependency_discovered(task, dependency):
-    logger.info(f'{task} dependency_discovered {dependency}')
+    # NOTE(review): the logged location is task.output(), not dependency.output();
+    # confirm that the discovering task's output is the one meant here.
+    logger.info(f'{task} dependency_discovered {dependency} at {task.output()}')
 
 
 @luigi.Task.event_handler(luigi.Event.DEPENDENCY_MISSING)
@@ -43,7 +44,7 @@ def dependency_missing(task):
 
 @luigi.Task.event_handler(luigi.Event.DEPENDENCY_PRESENT)
 def dependency_present(task):
-    logger.info(f'{task} dependency_present')
+    # Log the task together with its output target.
+    logger.info(f'{task} dependency_present at {task.output()}')
 
 
 @luigi.Task.event_handler(luigi.Event.START)
 
@@ -35,6 +35,7 @@
 TEST_COMBINED_37 = 'v03_pipeline/var/test/reference_data/test_combined_37.ht'
 TEST_HGMD_37 = 'v03_pipeline/var/test/reference_data/test_hgmd_37.ht'
 
+
 MOCK_CADD_CONFIG = {
     'version': 'v1.6',
     'select': ['PHRED'],
@@ -66,6 +67,8 @@
                 CLNSIGCONF=hl.tarray(hl.tstr),
                 CLNREVSTAT=hl.tarray(hl.tstr),
             ),
+            submitters=hl.tarray(hl.tstr),
+            conditions=hl.tarray(hl.tstr),
         ),
         key=['locus', 'alleles'],
         globals=hl.Struct(
@@ -456,6 +459,8 @@
                         CLNSIGCONF=hl.tarray(hl.tstr),
                         CLNREVSTAT=hl.tarray(hl.tstr),
                     ),
+                    submitters=hl.tarray(hl.tstr),
+                    conditions=hl.tarray(hl.tstr),
                 ),
                 key=['locus', 'alleles'],
                 globals=hl.Struct(
@@ -712,7 +717,15 @@ def test_update_vat_with_updated_rdc_snv_indel_38(
                     ),
                     alleles=['A', 'C'],
                     cadd=hl.Struct(PHRED=2),
-                    clinvar=None,
+                    clinvar=hl.Struct(
+                        alleleId=None,
+                        conflictingPathogenicities=None,
+                        goldStars=None,
+                        pathogenicity_id=None,
+                        assertion_ids=None,
+                        submitters=None,
+                        conditions=None,
+                    ),
                     dbnsfp=hl.Struct(
                         REVEL_score=0.043,
                         SIFT_score=None,
@@ -949,7 +962,15 @@ def test_update_vat_with_updated_rdc_mito_38(
                         reference_genome='GRCh38',
                     ),
                     alleles=['A', 'C'],
-                    clinvar_mito=None,
+                    clinvar_mito=hl.Struct(
+                        alleleId=None,
+                        conflictingPathogenicities=None,
+                        goldStars=None,
+                        pathogenicity_id=None,
+                        assertion_ids=None,
+                        submitters=None,
+                        conditions=None,
+                    ),
                     dbnsfp_mito=hl.Struct(
                         SIFT_score=None,
                         MutationTaster_pred_id=2,
@@ -1093,7 +1114,6 @@ def test_update_vat_with_updated_rdc_snv_indel_37(
                 ),
             ],
         )
-
         self.assertCountEqual(
             ht.collect(),
             [
@@ -1105,7 +1125,15 @@ def test_update_vat_with_updated_rdc_snv_indel_37(
                     ),
                     alleles=['A', 'C'],
                     cadd=hl.Struct(PHRED=9.699999809265137),
-                    clinvar=None,
+                    clinvar=hl.Struct(
+                        alleleId=None,
+                        conflictingPathogenicities=None,
+                        goldStars=None,
+                        pathogenicity_id=None,
+                        assertion_ids=None,
+                        submitters=None,
+                        conditions=None,
+                    ),
                     dbnsfp=hl.Struct(
                         REVEL_score=0.043,
                         SIFT_score=None,