Skip to content

Commit 56fc218

Browse files
bpblankenjklugherz
andauthored
feat: add gnomad_svs as reference dataset. (#1041)
* bump hail version (#1027) * First pass * Finish test * do it correctly :/ * ruff * correct feature flag * bump * fix test * fix svs * ruff * Benb/bugfix allow deleting project to succeed if no project (#1038) * First pass * add comment * ruff * gnomad svs reference dataset * tweaks * tdr updates for sample qc * dead code (#1045) * add global to table * ruff * use the function * add missing enums struct * sprawl :/ * test finished! * ruff * change field name * delete * fixes * need loadable families * ruff --------- Co-authored-by: Julia Klugherz <juliaklugherz@gmail.com>
1 parent 4ea8740 commit 56fc218

32 files changed

+108
-49
lines changed

v03_pipeline/lib/annotations/fields_test.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ def test_get_formatting_fields(self, mock_vep: Mock) -> None:
131131
reference_genome,
132132
DatasetType.SNV_INDEL,
133133
)
134-
if reference_dataset.is_keyed_by_interval
134+
if reference_dataset.formatting_annotation
135135
},
136136
**(
137137
{

v03_pipeline/lib/annotations/sv.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -175,11 +175,14 @@ def end_locus(ht: hl.Table, **_: Any) -> hl.StructExpression:
175175
)
176176

177177

178-
def gnomad_svs(ht: hl.Table, **_: Any) -> hl.Expression:
179-
return hl.or_missing(
180-
hl.is_defined(ht['info.gnomAD_V2_AF']),
181-
hl.struct(AF=hl.float32(ht['info.gnomAD_V2_AF']), ID=ht['info.gnomAD_V2_SVID']),
182-
)
178+
def gnomad_svs(
179+
ht: hl.Table,
180+
gnomad_svs_ht: hl.Table,
181+
**_: Any,
182+
) -> hl.Expression:
183+
return gnomad_svs_ht.annotate(
184+
ID=gnomad_svs_ht.KEY,
185+
)[ht['info.GNOMAD_V4.1_TRUTH_VID']]
183186

184187

185188
def gt_stats(ht: hl.Table, **_: Any) -> hl.Expression:

v03_pipeline/lib/misc/io_test.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@
2323
TEST_PEDIGREE_3 = 'v03_pipeline/var/test/pedigrees/test_pedigree_3.tsv'
2424
TEST_MITO_MT = 'v03_pipeline/var/test/callsets/mito_1.mt'
2525
TEST_REMAP = 'v03_pipeline/var/test/remaps/test_remap_1.tsv'
26-
TEST_SV_VCF = 'v03_pipeline/var/test/callsets/sv_1.vcf'
2726

2827

2928
class IOTest(unittest.TestCase):

v03_pipeline/lib/model/dataset_type.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -121,10 +121,9 @@ def row_fields(
121121
'info.CPX_TYPE': hl.tstr,
122122
'info.END': hl.tint32,
123123
'info.END2': hl.tint32,
124-
'info.gnomAD_V2_AF': hl.tfloat64,
125-
'info.gnomAD_V2_SVID': hl.tstr,
126124
'info.N_HET': hl.tint32,
127125
'info.N_HOMALT': hl.tint32,
126+
'info.GNOMAD_V4.1_TRUTH_VID': hl.tstr,
128127
'info.StrVCTVRE': hl.tstr,
129128
'info.SVLEN': hl.tint32,
130129
**sv.CONSEQ_PREDICTED_GENE_COLS,
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
import hail as hl
2+
3+
from v03_pipeline.lib.model import ReferenceGenome
4+
from v03_pipeline.lib.reference_datasets.misc import vcf_to_ht
5+
6+
7+
def get_ht(path: str, reference_genome: ReferenceGenome) -> hl.Table:
8+
ht = vcf_to_ht(path, reference_genome)
9+
ht = ht.select(
10+
KEY=ht.rsid,
11+
AF=ht.info.AF[0],
12+
AC=ht.info.AC[0],
13+
AN=ht.info.AN,
14+
N_HET=ht.info.N_HET,
15+
N_HOMREF=ht.info.N_HOMREF,
16+
)
17+
ht = ht.key_by('KEY')
18+
return ht.drop('locus', 'alleles')

v03_pipeline/lib/reference_datasets/misc.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -83,9 +83,12 @@ def filter_contigs(ht, reference_genome: ReferenceGenome):
8383
ht.interval.start.contig,
8484
),
8585
)
86-
return ht.filter(
87-
hl.set(reference_genome.standard_contigs).contains(ht.locus.contig),
88-
)
86+
# SV reference datasets are not keyed by locus.
87+
if hasattr(ht, 'locus'):
88+
return ht.filter(
89+
hl.set(reference_genome.standard_contigs).contains(ht.locus.contig),
90+
)
91+
return ht
8992

9093

9194
def vcf_to_ht(

v03_pipeline/lib/reference_datasets/reference_dataset.py

Lines changed: 33 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
import hail as hl
88

9+
from v03_pipeline.lib.annotations import snv_indel, sv
910
from v03_pipeline.lib.misc.validation import (
1011
validate_allele_type,
1112
validate_no_duplicate_variants,
@@ -27,8 +28,9 @@
2728
DATASET_TYPES = 'dataset_types'
2829
ENUMS = 'enums'
2930
EXCLUDE_FROM_ANNOTATIONS = 'exclude_from_annotations'
31+
EXCLUDE_FROM_ANNOTATIONS_UPDATES = 'exclude_from_annotations_updates'
32+
FORMATTING_ANNOTATION = 'formatting_annotation'
3033
FILTER = 'filter'
31-
IS_INTERVAL = 'is_interval'
3234
SELECT = 'select'
3335
VERSION = 'version'
3436
PATH = 'path'
@@ -69,9 +71,24 @@ def for_reference_genome_dataset_type_annotations(
6971
if not CONFIG[dataset].get(EXCLUDE_FROM_ANNOTATIONS, False)
7072
}
7173

74+
@classmethod
75+
def for_reference_genome_dataset_type_annotations_updates(
76+
cls,
77+
reference_genome: ReferenceGenome,
78+
dataset_type: DatasetType,
79+
) -> set['ReferenceDataset']:
80+
return {
81+
dataset
82+
for dataset in cls.for_reference_genome_dataset_type_annotations(
83+
reference_genome,
84+
dataset_type,
85+
)
86+
if not CONFIG[dataset].get(EXCLUDE_FROM_ANNOTATIONS_UPDATES, False)
87+
}
88+
7289
@property
73-
def is_keyed_by_interval(self) -> bool:
74-
return CONFIG[self].get(IS_INTERVAL, False)
90+
def formatting_annotation(self) -> Callable | None:
91+
return CONFIG[self].get(FORMATTING_ANNOTATION)
7592

7693
@property
7794
def access_control(self) -> AccessControl:
@@ -154,9 +171,10 @@ class ReferenceDataset(BaseReferenceDataset, StrEnum):
154171
gnomad_coding_and_noncoding = 'gnomad_coding_and_noncoding'
155172
gnomad_exomes = 'gnomad_exomes'
156173
gnomad_genomes = 'gnomad_genomes'
157-
gnomad_qc = 'gnomad_qc'
158174
gnomad_mito = 'gnomad_mito'
159175
gnomad_non_coding_constraint = 'gnomad_non_coding_constraint'
176+
gnomad_qc = 'gnomad_qc'
177+
gnomad_svs = 'gnomad_svs'
160178
screen = 'screen'
161179
local_constraint_mito = 'local_constraint_mito'
162180
mitomap = 'mitomap'
@@ -394,7 +412,7 @@ def get_ht(
394412
},
395413
},
396414
ReferenceDataset.gnomad_non_coding_constraint: {
397-
IS_INTERVAL: True,
415+
FORMATTING_ANNOTATION: snv_indel.gnomad_non_coding_constraint,
398416
ReferenceGenome.GRCh38: {
399417
DATASET_TYPES: frozenset([DatasetType.SNV_INDEL]),
400418
VERSION: '1.0',
@@ -414,7 +432,7 @@ def get_ht(
414432
'low-DNase',
415433
],
416434
},
417-
IS_INTERVAL: True,
435+
FORMATTING_ANNOTATION: snv_indel.screen,
418436
ReferenceGenome.GRCh38: {
419437
DATASET_TYPES: frozenset([DatasetType.SNV_INDEL]),
420438
VERSION: '1.0',
@@ -428,6 +446,15 @@ def get_ht(
428446
PATH: 'https://www.biorxiv.org/content/biorxiv/early/2023/01/27/2022.12.16.520778/DC3/embed/media-3.zip',
429447
},
430448
},
449+
ReferenceDataset.gnomad_svs: {
450+
EXCLUDE_FROM_ANNOTATIONS_UPDATES: True,
451+
FORMATTING_ANNOTATION: sv.gnomad_svs,
452+
ReferenceGenome.GRCh38: {
453+
DATASET_TYPES: frozenset([DatasetType.SV]),
454+
VERSION: '1.0',
455+
PATH: 'gs://gcp-public-data--gnomad/release/4.1/genome_sv/gnomad.v4.1.sv.sites.vcf.gz',
456+
},
457+
},
431458
}
432459
CONFIG[ReferenceDatasetQuery.clinvar_path_variants] = {
433460
EXCLUDE_FROM_ANNOTATIONS: True,

v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
)
99
from v03_pipeline.lib.reference_datasets.reference_dataset import (
1010
BaseReferenceDataset,
11+
ReferenceDataset,
1112
ReferenceDatasetQuery,
1213
)
1314
from v03_pipeline.lib.tasks.base.base_update import BaseUpdateTask
@@ -77,17 +78,13 @@ def update_table(self, ht: hl.Table) -> hl.Table:
7778
def annotate_globals(
7879
self,
7980
ht: hl.Table,
81+
reference_datasets: set[ReferenceDataset],
8082
) -> hl.Table:
8183
ht = ht.annotate_globals(
8284
versions=hl.Struct(),
8385
enums=hl.Struct(),
8486
)
85-
for (
86-
reference_dataset
87-
) in BaseReferenceDataset.for_reference_genome_dataset_type_annotations(
88-
self.reference_genome,
89-
self.dataset_type,
90-
):
87+
for reference_dataset in reference_datasets:
9188
rd_ht = hl.read_table(
9289
valid_reference_dataset_path(self.reference_genome, reference_dataset),
9390
)

v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset.py

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ def __init__(self, *args, **kwargs):
2929
def complete(self) -> bool:
3030
reference_dataset_names = {
3131
rd.name
32-
for rd in BaseReferenceDataset.for_reference_genome_dataset_type_annotations(
32+
for rd in BaseReferenceDataset.for_reference_genome_dataset_type_annotations_updates(
3333
self.reference_genome,
3434
self.dataset_type,
3535
)
@@ -65,18 +65,11 @@ def update_table(self, ht: hl.Table) -> hl.Table:
6565
reference_dataset_ht = hl.read_table(
6666
valid_reference_dataset_path(self.reference_genome, reference_dataset),
6767
)
68-
if reference_dataset.is_keyed_by_interval:
69-
formatting_fn = next(
70-
x
71-
for x in self.dataset_type.formatting_annotation_fns(
72-
self.reference_genome,
73-
)
74-
if x.__name__ == reference_dataset.name
75-
)
68+
if reference_dataset.formatting_annotation:
7669
ht = ht.annotate(
7770
**get_fields(
7871
ht,
79-
[formatting_fn],
72+
[reference_dataset.formatting_annotation],
8073
**{f'{reference_dataset.name}_ht': reference_dataset_ht},
8174
**self.param_kwargs,
8275
),
@@ -103,4 +96,10 @@ def update_table(self, ht: hl.Table) -> hl.Table:
10396
)
10497
ht = ht.join(reference_dataset_ht, 'left')
10598

106-
return self.annotate_globals(ht)
99+
return self.annotate_globals(
100+
ht,
101+
BaseReferenceDataset.for_reference_genome_dataset_type_annotations_updates(
102+
self.reference_genome,
103+
self.dataset_type,
104+
),
105+
)

v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@
99
lookup_table_path,
1010
new_variants_table_path,
1111
)
12+
from v03_pipeline.lib.reference_datasets.reference_dataset import (
13+
BaseReferenceDataset,
14+
)
1215
from v03_pipeline.lib.tasks.base.base_loading_run_params import (
1316
BaseLoadingRunParams,
1417
)
@@ -104,7 +107,13 @@ def update_table(self, ht: hl.Table) -> hl.Table:
104107
ht = non_callset_variants_ht.union(callset_variants_ht, unify=True)
105108

106109
# Fix up the globals and mark the table as updated with these callset/project pairs.
107-
ht = self.annotate_globals(ht)
110+
ht = self.annotate_globals(
111+
ht,
112+
BaseReferenceDataset.for_reference_genome_dataset_type_annotations(
113+
self.reference_genome,
114+
self.dataset_type,
115+
),
116+
)
108117
return ht.annotate_globals(
109118
updates=ht.updates.union(
110119
{

v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -935,8 +935,9 @@ def test_sv_update_vat(
935935
ht.globals.collect(),
936936
[
937937
hl.Struct(
938-
versions=hl.Struct(),
938+
versions=hl.Struct(gnomad_svs='1.0'),
939939
enums=hl.Struct(
940+
gnomad_svs=hl.Struct(),
940941
sv_type=SV_TYPES,
941942
sv_type_detail=SV_TYPE_DETAILS,
942943
sorted_gene_consequences=hl.Struct(
@@ -1548,8 +1549,12 @@ def test_sv_update_vat(
15481549
Het=590,
15491550
),
15501551
gnomad_svs=hl.Struct(
1551-
AF=hl.eval(hl.float32(0.06896299868822098)),
1552-
ID='gnomAD-SV_v2.1_INS_chr1_65',
1552+
ID='gnomAD-SV_v3_BND_chr1_0001f5b7',
1553+
AF=3.2e-05,
1554+
AC=4,
1555+
AN=126092,
1556+
N_HET=4,
1557+
N_HOMREF=63042,
15531558
),
15541559
rg37_locus=hl.Locus(
15551560
contig=1,

v03_pipeline/lib/tasks/write_new_variants_table.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,7 @@ def create_table(self) -> hl.Table:
168168
self.reference_genome,
169169
self.dataset_type,
170170
):
171-
if reference_dataset.is_keyed_by_interval:
171+
if reference_dataset.formatting_annotation:
172172
continue
173173
reference_dataset_ht = self.annotation_dependencies[
174174
f'{reference_dataset.value}_ht'

v03_pipeline/lib/tasks/write_variant_annotations_vcf_test.py

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,9 @@
1010
from v03_pipeline.lib.tasks.write_variant_annotations_vcf import (
1111
WriteVariantAnnotationsVCF,
1212
)
13-
from v03_pipeline.lib.test.mock_complete_task import MockCompleteTask
14-
from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase
13+
from v03_pipeline.lib.test.mocked_reference_datasets_testcase import (
14+
MockedReferenceDatasetsTestCase,
15+
)
1516

1617
TEST_SV_VCF = 'v03_pipeline/var/test/callsets/sv_1.vcf'
1718
TEST_PEDIGREE_5 = 'v03_pipeline/var/test/pedigrees/test_pedigree_5.tsv'
@@ -36,20 +37,15 @@
3637
}
3738

3839

39-
class WriteVariantAnnotationsVCFTest(MockedDatarootTestCase):
40+
class WriteVariantAnnotationsVCFTest(MockedReferenceDatasetsTestCase):
4041
@patch(
4142
'v03_pipeline.lib.tasks.write_new_variants_table.load_gencode_gene_symbol_to_gene_id',
4243
)
43-
@patch(
44-
'v03_pipeline.lib.tasks.base.base_update_variant_annotations_table.UpdatedReferenceDatasetQueryTask',
45-
)
4644
def test_sv_export_vcf(
4745
self,
48-
mock_rd_query_task: Mock,
4946
mock_load_gencode: Mock,
5047
) -> None:
5148
mock_load_gencode.return_value = GENE_ID_MAPPING
52-
mock_rd_query_task.return_value = MockCompleteTask()
5349
worker = luigi.worker.Worker()
5450
update_variant_annotations_task = (
5551
UpdateVariantAnnotationsTableWithNewSamplesTask(

0 commit comments

Comments
 (0)