vets filters (#774)

bpblanken · matren395 · web-flow · commit b97cf6ff2016 · 2024-05-01T16:37:48.000-04:00
* vets filters

* move to separate module

* remove PASS

* fix linters

* Update v03_pipeline/lib/misc/vets_test.py

Co-authored-by: Daniel Marten <78616802+matren395@users.noreply.github.com>

* fix tests

* PR review comments

* Update vets.py

---------

Co-authored-by: Daniel Marten <78616802+matren395@users.noreply.github.com>
diff --git a/v03_pipeline/lib/misc/io_test.py b/v03_pipeline/lib/misc/io_test.py
@@ -1,6 +1,9 @@
 import unittest
 
-from v03_pipeline.lib.misc.io import compute_hail_n_partitions, file_size_bytes
+from v03_pipeline.lib.misc.io import (
+    compute_hail_n_partitions,
+    file_size_bytes,
+)
 
 TEST_MITO_MT = 'v03_pipeline/var/test/callsets/mito_1.mt'
 TEST_SV_VCF = 'v03_pipeline/var/test/callsets/sv_1.vcf'
diff --git a/v03_pipeline/lib/misc/vets.py b/v03_pipeline/lib/misc/vets.py
@@ -0,0 +1,39 @@
+import hail as hl
+
+# Sensitivity thresholds: annotate_vets flags a variant when its parsed
+# info.CALIBRATION_SENSITIVITY value is strictly greater than the cutoff
+# (the SNP cutoff for SNPs, the indel cutoff for every other variant).
+VETS_SNP_CUTOFF = 0.997
+VETS_INDEL_CUTOFF = 0.99
+# Names added to the row-level `filters` set when the matching cutoff is
+# exceeded.
+VETS_SNP_FILTER = 'high_CALIBRATION_SENSITIVITY_SNP'
+VETS_INDEL_FILTER = 'high_CALIBRATION_SENSITIVITY_INDEL'
+
+
+def annotate_vets(mt: hl.MatrixTable) -> hl.MatrixTable:
+    """Add VETS calibration-sensitivity filters to the row-level ``filters``.
+
+    If ``mt`` has no ``info.CALIBRATION_SENSITIVITY`` field it is returned
+    unchanged. Otherwise the entry at index ``a_index - 1`` is parsed as a
+    float and compared against the SNP or indel cutoff (chosen via
+    ``hl.is_snp`` on the row's two alleles). When the value is strictly
+    greater than the cutoff, the matching filter name is added to
+    ``filters``; a missing ``filters`` set is treated as empty in that case.
+
+    Rows whose sensitivity is missing (``hl.parse_float`` yields a missing
+    value for an unparsable string) keep their existing ``filters`` value.
+    """
+    if not hasattr(mt, 'info') or not hasattr(mt.info, 'CALIBRATION_SENSITIVITY'):
+        return mt
+
+    def with_filter(name):
+        # Start from a fresh single-element set when the row has no filters.
+        return hl.if_else(
+            hl.is_defined(mt.filters),
+            mt.filters.add(name),
+            hl.set([name]),
+        )
+
+    return mt.annotate_rows(
+        filters=hl.bind(
+            lambda is_snp, split_cs: (
+                # missing_false=True: a missing predicate (missing or
+                # unparsable sensitivity) falls through to the default
+                # instead of turning the whole expression -- and with it the
+                # row's existing filters -- into a missing value.
+                hl.case(missing_false=True)
+                .when(
+                    is_snp & (split_cs > VETS_SNP_CUTOFF),
+                    with_filter(VETS_SNP_FILTER),
+                )
+                .when(
+                    ~is_snp & (split_cs > VETS_INDEL_CUTOFF),
+                    with_filter(VETS_INDEL_FILTER),
+                )
+                .default(
+                    mt.filters,
+                )
+            ),
+            hl.is_snp(mt.alleles[0], mt.alleles[1]),
+            hl.parse_float(mt.info.CALIBRATION_SENSITIVITY[mt.a_index - 1]),
+        ),
+    )
diff --git a/v03_pipeline/lib/misc/vets_test.py b/v03_pipeline/lib/misc/vets_test.py
@@ -0,0 +1,104 @@
+import unittest
+
+import hail as hl
+
+from v03_pipeline.lib.misc.io import split_multi_hts
+from v03_pipeline.lib.misc.vets import (
+    annotate_vets,
+)
+
+
+class VetsTest(unittest.TestCase):
+    """Tests for annotate_vets on callsets with and without VETS annotations."""
+
+    def test_annotate_vets(self) -> None:
+        """Check the pass-through path and the SNP/indel filter annotation."""
+        # A callset without an `info` field: annotate_vets must leave it as is.
+        gatk_mt = hl.MatrixTable.from_parts(
+            rows={
+                'locus': [
+                    hl.Locus(
+                        contig='chr1',
+                        position=1,
+                        reference_genome='GRCh38',
+                    ),
+                ],
+                'filters': [
+                    hl.set(['NO_HQ_GENOTYPES']),
+                ],
+            },
+            cols={'s': ['sample_1']},
+            entries={'HL': [[0.0]]},
+        ).key_rows_by('locus')
+        gatk_mt = annotate_vets(gatk_mt)
+        # Previously the result was computed but never checked.
+        self.assertListEqual(
+            gatk_mt.filters.collect(),
+            [{'NO_HQ_GENOTYPES'}],
+        )
+        # A callset carrying info.CALIBRATION_SENSITIVITY for each variant.
+        dragen_mt = hl.MatrixTable.from_parts(
+            rows={
+                'locus': [
+                    hl.Locus(
+                        contig='chr1',
+                        position=position,
+                        reference_genome='GRCh38',
+                    )
+                    for position in range(1, 7)
+                ],
+                'alleles': [
+                    ['A', 'T'],
+                    ['A', 'T'],
+                    ['A', 'T'],
+                    ['AC', 'T'],
+                    ['AT', 'ATC'],
+                    ['AG', 'ATG'],
+                ],
+                'filters': [
+                    hl.set(['NO_HQ_GENOTYPES']),
+                    hl.empty_set(hl.tstr),
+                    hl.missing(hl.tset(hl.tstr)),
+                    hl.set(['NO_HQ_GENOTYPES']),
+                    hl.empty_set(hl.tstr),
+                    hl.set(['NO_HQ_GENOTYPES']),
+                ],
+                'info': [
+                    hl.Struct(CALIBRATION_SENSITIVITY=['0.999']),
+                    hl.Struct(CALIBRATION_SENSITIVITY=['0.995']),
+                    hl.Struct(CALIBRATION_SENSITIVITY=['0.999']),
+                    hl.Struct(CALIBRATION_SENSITIVITY=['0.98']),
+                    hl.Struct(CALIBRATION_SENSITIVITY=['0.99']),
+                    hl.Struct(CALIBRATION_SENSITIVITY=['0.991']),
+                ],
+            },
+            cols={'s': ['sample_1']},
+            entries={'HL': [[0.0], [0.0], [0.0], [0.0], [0.0], [0.0]]},
+        ).key_rows_by('locus', 'alleles')
+        dragen_mt = split_multi_hts(dragen_mt)
+        dragen_mt = annotate_vets(dragen_mt)
+        self.assertListEqual(
+            dragen_mt.filters.collect(),
+            [
+                # SNP, 0.999 > 0.997: SNP filter joins the existing one.
+                {'NO_HQ_GENOTYPES', 'high_CALIBRATION_SENSITIVITY_SNP'},
+                # SNP, 0.995 is below the SNP cutoff: unchanged.
+                set(),
+                # SNP with missing filters, 0.999 > 0.997: new set created.
+                {'high_CALIBRATION_SENSITIVITY_SNP'},
+                # Non-SNP, 0.98 is below the indel cutoff: unchanged.
+                {'NO_HQ_GENOTYPES'},
+                # Non-SNP, 0.99 is not strictly above the cutoff: unchanged.
+                set(),
+                # Non-SNP, 0.991 > 0.99: indel filter joins the existing one.
+                {'NO_HQ_GENOTYPES', 'high_CALIBRATION_SENSITIVITY_INDEL'},
+            ],
+        )
diff --git a/v03_pipeline/lib/tasks/write_imported_callset.py b/v03_pipeline/lib/tasks/write_imported_callset.py
@@ -11,6 +11,7 @@
     validate_no_duplicate_variants,
     validate_sample_type,
 )
+from v03_pipeline.lib.misc.vets import annotate_vets
 from v03_pipeline.lib.model import CachedReferenceDatasetQuery
 from v03_pipeline.lib.paths import (
     imported_callset_path,
@@ -85,6 +86,10 @@ def create_table(self) -> hl.MatrixTable:
         mt = select_relevant_fields(mt, self.dataset_type)
         if self.dataset_type.has_multi_allelic_variants:
             mt = split_multi_hts(mt)
+        # Special handling of variant-level filter annotation for VETs filters.
+        # The annotations are present on the sample-level FT field but are
+        # expected upstream on "filters".
+        mt = annotate_vets(mt)
         if self.dataset_type.can_run_validation:
             # Rather than throwing an error, we silently remove invalid contigs.
             # This happens fairly often for AnVIL requests.