Merge branch 'dev' of github.com:broadinstitute/seqr-loading-pipelines

bpblanken · bpblanken · commit d86baba4f6a9 · 2025-03-18T12:22:05.000-04:00
diff --git a/v03_pipeline/lib/methods/sample_qc.py b/v03_pipeline/lib/methods/sample_qc.py
@@ -0,0 +1,66 @@
+import hail as hl
+from gnomad.sample_qc.pipeline import filter_rows_for_qc
+
+from v03_pipeline.lib.model import SampleType
+
+GNOMAD_FILTER_MIN_AF = 0.001
+GNOMAD_FILTER_MIN_CALLRATE = 0.99
+
+CALLRATE_LOW_THRESHOLD = 0.85
+CONTAMINATION_UPPER_THRESHOLD = 5
+WES_COVERAGE_LOW_THRESHOLD = 85
+WGS_CALLRATE_LOW_THRESHOLD = 30
+
+
+def call_sample_qc(
+    mt: hl.MatrixTable,
+    tdr_metrics_ht: hl.Table,
+    sample_type: SampleType,
+):
+    mt = mt.annotate_entries(
+        GT=hl.case()
+        .when(mt.GT.is_diploid(), hl.call(mt.GT[0], mt.GT[1], phased=False))
+        .when(mt.GT.is_haploid(), hl.call(mt.GT[0], phased=False))
+        .default(hl.missing(hl.tcall)),
+    )
+    mt = annotate_filtered_callrate(mt)
+    return annotate_filter_flags(mt, tdr_metrics_ht, sample_type)
+
+
+def annotate_filtered_callrate(mt: hl.MatrixTable) -> hl.MatrixTable:
+    filtered_mt = filter_rows_for_qc(
+        mt,
+        min_af=GNOMAD_FILTER_MIN_AF,
+        min_callrate=GNOMAD_FILTER_MIN_CALLRATE,
+        bi_allelic_only=True,
+        snv_only=True,
+        apply_hard_filters=False,
+        min_inbreeding_coeff_threshold=None,
+        min_hardy_weinberg_threshold=None,
+    )
+    callrate_ht = filtered_mt.select_cols(
+        filtered_callrate=hl.agg.fraction(hl.is_defined(filtered_mt.GT)),
+    ).cols()
+    return mt.annotate_cols(**callrate_ht[mt.col_key])
+
+
+def annotate_filter_flags(
+    mt: hl.MatrixTable,
+    tdr_metrics_ht: hl.Table,
+    sample_type: SampleType,
+) -> hl.MatrixTable:
+    mt = mt.annotate_cols(**tdr_metrics_ht[mt.col_key])
+    flags = {
+        'callrate': mt.filtered_callrate < CALLRATE_LOW_THRESHOLD,
+        'contamination': mt.contamination_rate > CONTAMINATION_UPPER_THRESHOLD,
+    }
+    if sample_type == SampleType.WES:
+        flags['coverage'] = mt.percent_bases_at_20x < WES_COVERAGE_LOW_THRESHOLD
+    else:
+        flags['coverage'] = mt.mean_coverage < WGS_CALLRATE_LOW_THRESHOLD
+
+    return mt.annotate_cols(
+        filter_flags=hl.array(
+            [hl.or_missing(filter_cond, name) for name, filter_cond in flags.items()],
+        ).filter(hl.is_defined),
+    )
diff --git a/v03_pipeline/lib/misc/io.py b/v03_pipeline/lib/misc/io.py
@@ -244,6 +244,24 @@ def import_imputed_sex(imputed_sex_path: str) -> hl.Table:
     return ht.key_by(ht.s)
 
 
+def import_tdr_qc_metrics(file_path: str) -> hl.Table:
+    ht = hl.import_table(
+        file_path,
+        types={
+            'contamination_rate': hl.tfloat32,
+            'percent_bases_at_20x': hl.tfloat32,
+            'mean_coverage': hl.tfloat32,
+        },
+    )
+    ht = ht.select(
+        s=ht.collaborator_sample_id,
+        contamination_rate=ht.contamination_rate,
+        percent_bases_at_20x=ht.percent_bases_at_20x,
+        mean_coverage=ht.mean_coverage,
+    )
+    return ht.key_by(ht.s)
+
+
 def import_remap(remap_path: str) -> hl.Table:
     ht = hl.import_table(remap_path)
     ht = ht.select(
diff --git a/v03_pipeline/lib/misc/validation.py b/v03_pipeline/lib/misc/validation.py
@@ -48,6 +48,26 @@ def validate_allele_type(
         raise SeqrValidationError(msg)
 
 
+def validate_allele_depth_length(
+    mt: hl.MatrixTable,
+    reference_genome: ReferenceGenome,
+    dataset_type: DatasetType,
+    **_: Any,
+) -> None:
+    ht = mt.select_rows(
+        found_ad_lengths=hl.agg.collect_as_set(hl.len(mt.AD)).remove(
+            hl.missing(hl.tint32),
+        ),
+    ).rows()
+    ht = ht.filter(
+        hl.len(ht.found_ad_lengths) > 1,
+    )
+    if ht.count() > 0:
+        variant_format = dataset_type.table_key_format_fn(reference_genome)
+        msg = f'Found variants with unequal Allele Depth array lengths over samples (first 10, if applicable): {({variant_format(v): v.found_ad_lengths for v in ht.take(10)})}'
+        raise SeqrValidationError(msg)
+
+
 def validate_no_duplicate_variants(
     t: hl.Table | hl.MatrixTable,
     reference_genome: ReferenceGenome,
diff --git a/v03_pipeline/lib/misc/validation_test.py b/v03_pipeline/lib/misc/validation_test.py
@@ -4,6 +4,7 @@
 
 from v03_pipeline.lib.misc.validation import (
     SeqrValidationError,
+    validate_allele_depth_length,
     validate_allele_type,
     validate_expected_contig_frequency,
     validate_imported_field_types,
@@ -122,6 +123,63 @@ def test_validate_allele_type(self) -> None:
             DatasetType.SNV_INDEL,
         )
 
+    def test_validate_allele_depth_length(self) -> None:
+        mt = (
+            hl.MatrixTable.from_parts(
+                rows={
+                    'locus': [
+                        hl.Locus(
+                            contig='chr1',
+                            position=1,
+                            reference_genome='GRCh38',
+                        ),
+                        hl.Locus(
+                            contig='chr1',
+                            position=2,
+                            reference_genome='GRCh38',
+                        ),
+                        hl.Locus(
+                            contig='chr1',
+                            position=3,
+                            reference_genome='GRCh38',
+                        ),
+                        hl.Locus(
+                            contig='chr1',
+                            position=4,
+                            reference_genome='GRCh38',
+                        ),
+                    ],
+                    'alleles': [
+                        ['A', 'T'],
+                        # NB: star alleles should pass through this validation just fine,
+                        # but are eventually filtered out upstream.
+                        ['A', 'TC', 'TG'],
+                        ['A', 'TTT'],
+                        ['A', 'CCC'],
+                    ],
+                },
+                cols={'s': ['sample_1', 'sample_2']},
+                entries={
+                    'AD': [
+                        [[1, 0], [1, 0]],
+                        [[1], [1, 0, 1]],
+                        [[1, 0], [1]],
+                        [[1, 0], [1, 0]],
+                    ],
+                },
+            )
+            .key_rows_by('locus', 'alleles')
+            .key_cols_by('s')
+        )
+        self.assertRaisesRegex(
+            SeqrValidationError,
+            "Found variants with unequal Allele Depth array lengths over samples \\(first 10, if applicable\\): \\{'1-2-A-TC-TG': \\{1, 3\\}, '1-3-A-TTT': \\{1, 2\\}\\}",
+            validate_allele_depth_length,
+            mt,
+            ReferenceGenome.GRCh38,
+            DatasetType.SNV_INDEL,
+        )
+
     def test_validate_imputed_sex_ploidy(self) -> None:
         female_sample = 'HG00731_1'
         male_sample_1 = 'HG00732_1'
diff --git a/v03_pipeline/lib/model/dataset_type.py b/v03_pipeline/lib/model/dataset_type.py
@@ -36,7 +36,7 @@ def table_key_format_fn(
         if self in {DatasetType.GCNV, DatasetType.SV}:
             return lambda s: s.variant_id
         return (
-            lambda s: f'{s.locus.contig if reference_genome == ReferenceGenome.GRCh37 else s.locus.contig.replace("chr", "")}-{s.locus.position}-{s.alleles[0]}-{s.alleles[1]}'
+            lambda s: f'{s.locus.contig if reference_genome == ReferenceGenome.GRCh37 else s.locus.contig.replace("chr", "")}-{s.locus.position}-{"-".join(s.alleles)}'
         )
 
     @property
diff --git a/v03_pipeline/lib/paths.py b/v03_pipeline/lib/paths.py
@@ -219,6 +219,22 @@ def relatedness_check_tsv_path(
     )
 
 
+def sample_qc_json_path(
+    reference_genome: ReferenceGenome,
+    dataset_type: DatasetType,
+    callset_path: str,
+) -> str:
+    return os.path.join(
+        pipeline_prefix(
+            Env.LOADING_DATASETS_DIR,
+            reference_genome,
+            dataset_type,
+        ),
+        'sample_qc',
+        f'{_callset_path_hash(callset_path)}.json',
+    )
+
+
 def remapped_and_subsetted_callset_path(
     reference_genome: ReferenceGenome,
     dataset_type: DatasetType,
diff --git a/v03_pipeline/lib/tasks/validate_callset.py b/v03_pipeline/lib/tasks/validate_callset.py
@@ -4,6 +4,7 @@
 
 from v03_pipeline.lib.misc.validation import (
     SeqrValidationError,
+    validate_allele_depth_length,
     validate_allele_type,
     validate_expected_contig_frequency,
     validate_imputed_sex_ploidy,
@@ -123,6 +124,7 @@ def update_table(self, mt: hl.MatrixTable) -> hl.MatrixTable:
             )
         validation_dependencies = self.get_validation_dependencies()
         for validation_f in [
+            validate_allele_depth_length,
             validate_allele_type,
             validate_imputed_sex_ploidy,
             validate_no_duplicate_variants,
diff --git a/v03_pipeline/lib/tasks/write_metadata_for_run.py b/v03_pipeline/lib/tasks/write_metadata_for_run.py
@@ -4,9 +4,11 @@
 import luigi
 import luigi.util
 
+from v03_pipeline.lib.model import FeatureFlag
 from v03_pipeline.lib.paths import (
     metadata_for_run_path,
     relatedness_check_tsv_path,
+    sample_qc_json_path,
 )
 from v03_pipeline.lib.tasks.base.base_loading_run_params import (
     BaseLoadingRunParams,
@@ -54,6 +56,7 @@ def run(self) -> None:
                 self.dataset_type,
                 self.callset_path,
             ),
+            'sample_qc': {},
         }
         for remapped_and_subsetted_callset in self.input():
             callset_mt = hl.read_matrix_table(remapped_and_subsetted_callset.path)
@@ -67,6 +70,20 @@ def run(self) -> None:
                     **collected_globals['failed_family_samples'][key],
                     **metadata_json['failed_family_samples'][key],
                 }
-
+        if (
+            FeatureFlag.EXPECT_TDR_METRICS
+            and not self.skip_expect_tdr_metrics
+            and self.dataset_type.expect_tdr_metrics(
+                self.reference_genome,
+            )
+        ):
+            with open(
+                sample_qc_json_path(
+                    self.reference_genome,
+                    self.dataset_type,
+                    self.callset_path,
+                ),
+            ) as f:
+                metadata_json['sample_qc'] = json.load(f)
         with self.output().open('w') as f:
             json.dump(metadata_json, f)
diff --git a/v03_pipeline/lib/tasks/write_metadata_for_run_test.py b/v03_pipeline/lib/tasks/write_metadata_for_run_test.py
@@ -1,20 +1,38 @@
 import json
+from unittest import mock
+from unittest.mock import Mock
 
 import luigi.worker
 
 from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType
 from v03_pipeline.lib.paths import relatedness_check_tsv_path
 from v03_pipeline.lib.tasks.write_metadata_for_run import WriteMetadataForRunTask
+from v03_pipeline.lib.test.mock_complete_task import MockCompleteTask
 from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase
 
 TEST_VCF = 'v03_pipeline/var/test/callsets/1kg_30variants.vcf'
 TEST_REMAP_2 = 'v03_pipeline/var/test/remaps/test_remap_2.tsv'
 TEST_PEDIGREE_3 = 'v03_pipeline/var/test/pedigrees/test_pedigree_3.tsv'
 TEST_PEDIGREE_4 = 'v03_pipeline/var/test/pedigrees/test_pedigree_4.tsv'
+TEST_SAMPLE_QC_JSON = 'v03_pipeline/var/test/sample_qc_1.json'
 
 
 class WriteMetadataForRunTaskTest(MockedDatarootTestCase):
-    def test_write_metadata_for_run_task(self) -> None:
+    @mock.patch(
+        'v03_pipeline.lib.tasks.write_metadata_for_run.sample_qc_json_path',
+        lambda *_: TEST_SAMPLE_QC_JSON,
+    )
+    @mock.patch('v03_pipeline.lib.tasks.write_metadata_for_run.FeatureFlag')
+    @mock.patch(
+        'v03_pipeline.lib.tasks.write_imported_callset.WriteTDRMetricsFilesTask',
+    )
+    def test_write_metadata_for_run_task(
+        self,
+        write_tdr_metrics_task: Mock,
+        mock_ff: Mock,
+    ) -> None:
+        mock_ff.EXPECT_TDR_METRICS = True
+        write_tdr_metrics_task.return_value = MockCompleteTask()
         worker = luigi.worker.Worker()
         write_metadata_for_run_task = WriteMetadataForRunTask(
             reference_genome=ReferenceGenome.GRCh38,
@@ -77,5 +95,11 @@ def test_write_metadata_for_run_task(self) -> None:
                         DatasetType.SNV_INDEL,
                         TEST_VCF,
                     ),
+                    'sample_qc': {
+                        'HG00731': {'filter_flags': ['coverage', 'contamination']},
+                        'HG00732': {'filter_flags': ['coverage']},
+                        'HG00733': {'filter_flags': ['contamination']},
+                        'NA19675': {'filter_flags': []},
+                    },
                 },
             )
diff --git a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py
@@ -29,6 +29,7 @@
 from v03_pipeline.lib.tasks.write_relatedness_check_tsv import (
     WriteRelatednessCheckTsvTask,
 )
+from v03_pipeline.lib.tasks.write_sample_qc_json import WriteSampleQCJsonTask
 from v03_pipeline.lib.tasks.write_sex_check_table import WriteSexCheckTableTask
 from v03_pipeline.lib.tasks.write_validation_errors_for_run import (
     with_persisted_validation_errors,
@@ -83,6 +84,17 @@ def requires(self) -> list[luigi.Task]:
                 self.clone(WriteRelatednessCheckTsvTask),
                 self.clone(WriteSexCheckTableTask),
             ]
+        if (
+            FeatureFlag.EXPECT_TDR_METRICS
+            and not self.skip_expect_tdr_metrics
+            and self.dataset_type.expect_tdr_metrics(
+                self.reference_genome,
+            )
+        ):
+            requirements = [
+                *requirements,
+                self.clone(WriteSampleQCJsonTask),
+            ]
         return requirements
 
     @with_persisted_validation_errors
diff --git a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py
@@ -94,6 +94,7 @@ def test_write_remapped_and_subsetted_callset_task(
             project_pedigree_paths=[TEST_PEDIGREE_3],
             project_i=0,
             skip_validation=True,
+            skip_expect_tdr_metrics=True,
         )
         worker.add(wrsc_task)
         worker.run()
@@ -138,6 +139,7 @@ def test_write_remapped_and_subsetted_callset_task_failed_sex_check_family(
             project_pedigree_paths=[TEST_PEDIGREE_4],
             project_i=0,
             skip_validation=True,
+            skip_expect_tdr_metrics=True,
         )
         worker.add(wrsc_task)
         worker.run()
@@ -203,6 +205,7 @@ def test_write_remapped_and_subsetted_callset_task_all_families_failed(
             project_pedigree_paths=[TEST_PEDIGREE_7],
             project_i=0,
             skip_validation=True,
+            skip_expect_tdr_metrics=True,
         )
         worker.add(wrsc_task)
         worker.run()
diff --git a/v03_pipeline/lib/tasks/write_sample_qc_json.py b/v03_pipeline/lib/tasks/write_sample_qc_json.py
diff --git a/v03_pipeline/lib/tasks/write_sample_qc_json_test.py b/v03_pipeline/lib/tasks/write_sample_qc_json_test.py
diff --git a/v03_pipeline/lib/tasks/write_variant_annotations_vcf_test.py b/v03_pipeline/lib/tasks/write_variant_annotations_vcf_test.py
diff --git a/v03_pipeline/var/test/sample_qc_1.json b/v03_pipeline/var/test/sample_qc_1.json
diff --git a/v03_pipeline/var/test/tdr_metrics.tsv b/v03_pipeline/var/test/tdr_metrics.tsv

Original file line number	Diff line number	Diff line change
`@@ -36,7 +36,7 @@ def table_key_format_fn(`
`36`	`36`	`if self in {DatasetType.GCNV, DatasetType.SV}:`
`37`	`37`	`return lambda s: s.variant_id`
`38`	`38`	`return (`
`39`		`- lambda s: f'{s.locus.contig if reference_genome == ReferenceGenome.GRCh37 else s.locus.contig.replace("chr", "")}-{s.locus.position}-{s.alleles[0]}-{s.alleles[1]}'`
	`39`	`+ lambda s: f'{s.locus.contig if reference_genome == ReferenceGenome.GRCh37 else s.locus.contig.replace("chr", "")}-{s.locus.position}-{"-".join(s.alleles)}'`
`40`	`40`	`)`
`41`	`41`
`42`	`42`	`@property`