Skip to content

Commit 189a1cc

Browse files
committed
Merge branch 'main' of github.com:broadinstitute/seqr-loading-pipelines into dev
2 parents 82baf0c + 56fc218 commit 189a1cc

38 files changed

+379
-53
lines changed

v03_pipeline/lib/annotations/fields_test.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ def test_get_formatting_fields(self, mock_vep: Mock) -> None:
131131
reference_genome,
132132
DatasetType.SNV_INDEL,
133133
)
134-
if reference_dataset.is_keyed_by_interval
134+
if reference_dataset.formatting_annotation
135135
},
136136
**(
137137
{

v03_pipeline/lib/annotations/sv.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -175,11 +175,14 @@ def end_locus(ht: hl.Table, **_: Any) -> hl.StructExpression:
175175
)
176176

177177

178-
def gnomad_svs(ht: hl.Table, **_: Any) -> hl.Expression:
179-
return hl.or_missing(
180-
hl.is_defined(ht['info.gnomAD_V2_AF']),
181-
hl.struct(AF=hl.float32(ht['info.gnomAD_V2_AF']), ID=ht['info.gnomAD_V2_SVID']),
182-
)
178+
def gnomad_svs(
179+
ht: hl.Table,
180+
gnomad_svs_ht: hl.Table,
181+
**_: Any,
182+
) -> hl.Expression:
183+
return gnomad_svs_ht.annotate(
184+
ID=gnomad_svs_ht.KEY,
185+
)[ht['info.GNOMAD_V4.1_TRUTH_VID']]
183186

184187

185188
def gt_stats(ht: hl.Table, **_: Any) -> hl.Expression:

v03_pipeline/lib/misc/io_test.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,15 +23,13 @@
2323
TEST_PEDIGREE_3 = 'v03_pipeline/var/test/pedigrees/test_pedigree_3.tsv'
2424
TEST_MITO_MT = 'v03_pipeline/var/test/callsets/mito_1.mt'
2525
TEST_REMAP = 'v03_pipeline/var/test/remaps/test_remap_1.tsv'
26-
TEST_SV_VCF = 'v03_pipeline/var/test/callsets/sv_1.vcf'
2726

2827

2928
class IOTest(unittest.TestCase):
3029
def test_file_size_mb(self) -> None:
3130
# find v03_pipeline/var/test/callsets/mito_1.mt -type f | grep -v 'crc' | xargs ls -alt {} | awk '{sum += $5; print sum}'
3231
# 191310
3332
self.assertEqual(file_size_bytes(TEST_MITO_MT), 191310)
34-
self.assertEqual(file_size_bytes(TEST_SV_VCF), 20040)
3533

3634
def test_compute_hail_n_partitions(self) -> None:
3735
self.assertEqual(compute_hail_n_partitions(23), 1)

v03_pipeline/lib/misc/sv.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
import hail as hl
2+
3+
from v03_pipeline.lib.annotations import sv
4+
from v03_pipeline.lib.misc.pedigree import Family
5+
from v03_pipeline.lib.model import ReferenceGenome, Sex
6+
7+
8+
def overwrite_male_non_par_calls(
9+
mt: hl.MatrixTable,
10+
families: set[Family],
11+
) -> hl.MatrixTable:
12+
male_sample_ids = {
13+
s.sample_id for f in families for s in f.samples.values() if s.sex == Sex.MALE
14+
}
15+
male_sample_ids = (
16+
hl.set(male_sample_ids) if male_sample_ids else hl.empty_set(hl.str)
17+
)
18+
par_intervals = hl.array(
19+
[
20+
i
21+
for i in hl.get_reference(ReferenceGenome.GRCh38).par
22+
if i.start.contig == ReferenceGenome.GRCh38.x_contig
23+
],
24+
)
25+
non_par_interval = hl.interval(
26+
par_intervals[0].end,
27+
par_intervals[1].start,
28+
)
29+
# NB: making use of existing formatting_annotation_fns.
30+
# We choose to annotate & drop here as the sample level
31+
# fields are dropped by the time we format variants.
32+
mt = mt.annotate_rows(
33+
start_locus=sv.start_locus(mt),
34+
end_locus=sv.end_locus(mt),
35+
)
36+
mt = mt.annotate_entries(
37+
GT=hl.if_else(
38+
(
39+
male_sample_ids.contains(mt.s)
40+
& non_par_interval.overlaps(
41+
hl.interval(
42+
mt.start_locus,
43+
mt.end_locus,
44+
),
45+
)
46+
& mt.GT.is_het()
47+
),
48+
hl.Call([1], phased=False),
49+
mt.GT,
50+
),
51+
)
52+
return mt.drop('start_locus', 'end_locus')

v03_pipeline/lib/misc/sv_test.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
import unittest
2+
3+
import hail as hl
4+
5+
from v03_pipeline.lib.misc.io import import_callset, select_relevant_fields
6+
from v03_pipeline.lib.misc.pedigree import Family, Sample
7+
from v03_pipeline.lib.misc.sample_ids import subset_samples
8+
from v03_pipeline.lib.misc.sv import overwrite_male_non_par_calls
9+
from v03_pipeline.lib.model import DatasetType, ReferenceGenome, Sex
10+
11+
TEST_SV_VCF = 'v03_pipeline/var/test/callsets/sv_1.vcf'
12+
13+
14+
class SVTest(unittest.TestCase):
15+
def test_overwrite_male_non_par_calls(self) -> None:
16+
mt = import_callset(TEST_SV_VCF, ReferenceGenome.GRCh38, DatasetType.SV)
17+
mt = select_relevant_fields(
18+
mt,
19+
DatasetType.SV,
20+
)
21+
mt = subset_samples(
22+
mt,
23+
hl.Table.parallelize(
24+
[{'s': sample_id} for sample_id in ['RGP_164_1', 'RGP_164_2']],
25+
hl.tstruct(s=hl.dtype('str')),
26+
key='s',
27+
),
28+
)
29+
mt = overwrite_male_non_par_calls(
30+
mt,
31+
{
32+
Family(
33+
family_guid='family_1',
34+
samples={
35+
'RGP_164_1': Sample(sample_id='RGP_164_1', sex=Sex.FEMALE),
36+
'RGP_164_2': Sample(sample_id='RGP_164_2', sex=Sex.MALE),
37+
},
38+
),
39+
},
40+
)
41+
mt = mt.filter_rows(mt.locus.contig == 'chrX')
42+
self.assertEqual(
43+
[
44+
hl.Locus(contig='chrX', position=3, reference_genome='GRCh38'),
45+
hl.Locus(contig='chrX', position=2781700, reference_genome='GRCh38'),
46+
],
47+
mt.locus.collect(),
48+
)
49+
self.assertEqual(
50+
[
51+
hl.Call(alleles=[0, 0], phased=False),
52+
# END of this variant < start of the non-par region.
53+
hl.Call(alleles=[0, 1], phased=False),
54+
hl.Call(alleles=[0, 0], phased=False),
55+
hl.Call(alleles=[1], phased=False),
56+
],
57+
mt.GT.collect(),
58+
)
59+
self.assertFalse(
60+
hasattr(mt, 'start_locus'),
61+
)

v03_pipeline/lib/model/dataset_type.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -121,10 +121,9 @@ def row_fields(
121121
'info.CPX_TYPE': hl.tstr,
122122
'info.END': hl.tint32,
123123
'info.END2': hl.tint32,
124-
'info.gnomAD_V2_AF': hl.tfloat64,
125-
'info.gnomAD_V2_SVID': hl.tstr,
126124
'info.N_HET': hl.tint32,
127125
'info.N_HOMALT': hl.tint32,
126+
'info.GNOMAD_V4.1_TRUTH_VID': hl.tstr,
128127
'info.StrVCTVRE': hl.tstr,
129128
'info.SVLEN': hl.tint32,
130129
**sv.CONSEQ_PREDICTED_GENE_COLS,
@@ -387,3 +386,7 @@ def export_vcf_annotation_fns(self) -> list[Callable[..., hl.Expression]]:
387386
sv.info,
388387
],
389388
}[self]
389+
390+
@property
391+
def overwrite_male_non_par_calls(self) -> None:
392+
return self == DatasetType.SV

v03_pipeline/lib/model/definitions.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,10 @@ def mito_contig(self) -> str:
7070
ReferenceGenome.GRCh38: 'chrM',
7171
}[self]
7272

73+
@property
74+
def x_contig(self) -> str:
75+
return 'X' if self == ReferenceGenome.GRCh37 else 'chrX'
76+
7377
def contig_recoding(self, include_mt: bool = False) -> dict[str, str]:
7478
recode = {
7579
ReferenceGenome.GRCh37: {
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
import hail as hl
2+
3+
from v03_pipeline.lib.model import ReferenceGenome
4+
from v03_pipeline.lib.reference_datasets.misc import vcf_to_ht
5+
6+
7+
def get_ht(path: str, reference_genome: ReferenceGenome) -> hl.Table:
8+
ht = vcf_to_ht(path, reference_genome)
9+
ht = ht.select(
10+
KEY=ht.rsid,
11+
AF=ht.info.AF[0],
12+
AC=ht.info.AC[0],
13+
AN=ht.info.AN,
14+
N_HET=ht.info.N_HET,
15+
N_HOMREF=ht.info.N_HOMREF,
16+
)
17+
ht = ht.key_by('KEY')
18+
return ht.drop('locus', 'alleles')

v03_pipeline/lib/reference_datasets/misc.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -83,9 +83,12 @@ def filter_contigs(ht, reference_genome: ReferenceGenome):
8383
ht.interval.start.contig,
8484
),
8585
)
86-
return ht.filter(
87-
hl.set(reference_genome.standard_contigs).contains(ht.locus.contig),
88-
)
86+
# SV reference datasets are not keyed by locus.
87+
if hasattr(ht, 'locus'):
88+
return ht.filter(
89+
hl.set(reference_genome.standard_contigs).contains(ht.locus.contig),
90+
)
91+
return ht
8992

9093

9194
def vcf_to_ht(

v03_pipeline/lib/reference_datasets/reference_dataset.py

Lines changed: 33 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
import hail as hl
88

9+
from v03_pipeline.lib.annotations import snv_indel, sv
910
from v03_pipeline.lib.misc.validation import (
1011
validate_allele_type,
1112
validate_no_duplicate_variants,
@@ -27,8 +28,9 @@
2728
DATASET_TYPES = 'dataset_types'
2829
ENUMS = 'enums'
2930
EXCLUDE_FROM_ANNOTATIONS = 'exclude_from_annotations'
31+
EXCLUDE_FROM_ANNOTATIONS_UPDATES = 'exclude_from_annotations_updates'
32+
FORMATTING_ANNOTATION = 'formatting_annotation'
3033
FILTER = 'filter'
31-
IS_INTERVAL = 'is_interval'
3234
SELECT = 'select'
3335
VERSION = 'version'
3436
PATH = 'path'
@@ -69,9 +71,24 @@ def for_reference_genome_dataset_type_annotations(
6971
if not CONFIG[dataset].get(EXCLUDE_FROM_ANNOTATIONS, False)
7072
}
7173

74+
@classmethod
75+
def for_reference_genome_dataset_type_annotations_updates(
76+
cls,
77+
reference_genome: ReferenceGenome,
78+
dataset_type: DatasetType,
79+
) -> set['ReferenceDataset']:
80+
return {
81+
dataset
82+
for dataset in cls.for_reference_genome_dataset_type_annotations(
83+
reference_genome,
84+
dataset_type,
85+
)
86+
if not CONFIG[dataset].get(EXCLUDE_FROM_ANNOTATIONS_UPDATES, False)
87+
}
88+
7289
@property
73-
def is_keyed_by_interval(self) -> bool:
74-
return CONFIG[self].get(IS_INTERVAL, False)
90+
def formatting_annotation(self) -> Callable | None:
91+
return CONFIG[self].get(FORMATTING_ANNOTATION)
7592

7693
@property
7794
def access_control(self) -> AccessControl:
@@ -154,9 +171,10 @@ class ReferenceDataset(BaseReferenceDataset, StrEnum):
154171
gnomad_coding_and_noncoding = 'gnomad_coding_and_noncoding'
155172
gnomad_exomes = 'gnomad_exomes'
156173
gnomad_genomes = 'gnomad_genomes'
157-
gnomad_qc = 'gnomad_qc'
158174
gnomad_mito = 'gnomad_mito'
159175
gnomad_non_coding_constraint = 'gnomad_non_coding_constraint'
176+
gnomad_qc = 'gnomad_qc'
177+
gnomad_svs = 'gnomad_svs'
160178
screen = 'screen'
161179
local_constraint_mito = 'local_constraint_mito'
162180
mitomap = 'mitomap'
@@ -394,7 +412,7 @@ def get_ht(
394412
},
395413
},
396414
ReferenceDataset.gnomad_non_coding_constraint: {
397-
IS_INTERVAL: True,
415+
FORMATTING_ANNOTATION: snv_indel.gnomad_non_coding_constraint,
398416
ReferenceGenome.GRCh38: {
399417
DATASET_TYPES: frozenset([DatasetType.SNV_INDEL]),
400418
VERSION: '1.0',
@@ -414,7 +432,7 @@ def get_ht(
414432
'low-DNase',
415433
],
416434
},
417-
IS_INTERVAL: True,
435+
FORMATTING_ANNOTATION: snv_indel.screen,
418436
ReferenceGenome.GRCh38: {
419437
DATASET_TYPES: frozenset([DatasetType.SNV_INDEL]),
420438
VERSION: '1.0',
@@ -428,6 +446,15 @@ def get_ht(
428446
PATH: 'https://www.biorxiv.org/content/biorxiv/early/2023/01/27/2022.12.16.520778/DC3/embed/media-3.zip',
429447
},
430448
},
449+
ReferenceDataset.gnomad_svs: {
450+
EXCLUDE_FROM_ANNOTATIONS_UPDATES: True,
451+
FORMATTING_ANNOTATION: sv.gnomad_svs,
452+
ReferenceGenome.GRCh38: {
453+
DATASET_TYPES: frozenset([DatasetType.SV]),
454+
VERSION: '1.0',
455+
PATH: 'gs://gcp-public-data--gnomad/release/4.1/genome_sv/gnomad.v4.1.sv.sites.vcf.gz',
456+
},
457+
},
431458
}
432459
CONFIG[ReferenceDatasetQuery.clinvar_path_variants] = {
433460
EXCLUDE_FROM_ANNOTATIONS: True,

v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
)
99
from v03_pipeline.lib.reference_datasets.reference_dataset import (
1010
BaseReferenceDataset,
11+
ReferenceDataset,
1112
ReferenceDatasetQuery,
1213
)
1314
from v03_pipeline.lib.tasks.base.base_update import BaseUpdateTask
@@ -77,17 +78,13 @@ def update_table(self, ht: hl.Table) -> hl.Table:
7778
def annotate_globals(
7879
self,
7980
ht: hl.Table,
81+
reference_datasets: set[ReferenceDataset],
8082
) -> hl.Table:
8183
ht = ht.annotate_globals(
8284
versions=hl.Struct(),
8385
enums=hl.Struct(),
8486
)
85-
for (
86-
reference_dataset
87-
) in BaseReferenceDataset.for_reference_genome_dataset_type_annotations(
88-
self.reference_genome,
89-
self.dataset_type,
90-
):
87+
for reference_dataset in reference_datasets:
9188
rd_ht = hl.read_table(
9289
valid_reference_dataset_path(self.reference_genome, reference_dataset),
9390
)

v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset.py

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ def __init__(self, *args, **kwargs):
2929
def complete(self) -> bool:
3030
reference_dataset_names = {
3131
rd.name
32-
for rd in BaseReferenceDataset.for_reference_genome_dataset_type_annotations(
32+
for rd in BaseReferenceDataset.for_reference_genome_dataset_type_annotations_updates(
3333
self.reference_genome,
3434
self.dataset_type,
3535
)
@@ -65,18 +65,11 @@ def update_table(self, ht: hl.Table) -> hl.Table:
6565
reference_dataset_ht = hl.read_table(
6666
valid_reference_dataset_path(self.reference_genome, reference_dataset),
6767
)
68-
if reference_dataset.is_keyed_by_interval:
69-
formatting_fn = next(
70-
x
71-
for x in self.dataset_type.formatting_annotation_fns(
72-
self.reference_genome,
73-
)
74-
if x.__name__ == reference_dataset.name
75-
)
68+
if reference_dataset.formatting_annotation:
7669
ht = ht.annotate(
7770
**get_fields(
7871
ht,
79-
[formatting_fn],
72+
[reference_dataset.formatting_annotation],
8073
**{f'{reference_dataset.name}_ht': reference_dataset_ht},
8174
**self.param_kwargs,
8275
),
@@ -103,4 +96,10 @@ def update_table(self, ht: hl.Table) -> hl.Table:
10396
)
10497
ht = ht.join(reference_dataset_ht, 'left')
10598

106-
return self.annotate_globals(ht)
99+
return self.annotate_globals(
100+
ht,
101+
BaseReferenceDataset.for_reference_genome_dataset_type_annotations_updates(
102+
self.reference_genome,
103+
self.dataset_type,
104+
),
105+
)

0 commit comments

Comments
 (0)