Skip to content

Commit 9e9f2b3

Browse files
authored
Merge pull request #1050 from broadinstitute/dev
Dev
2 parents 189a1cc + aa0027b commit 9e9f2b3

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

41 files changed

+160
-8
lines changed

v03_pipeline/lib/annotations/sv.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,7 @@ def gnomad_svs(
180180
gnomad_svs_ht: hl.Table,
181181
**_: Any,
182182
) -> hl.Expression:
183+
gnomad_svs_ht = gnomad_svs_ht.drop('locus', 'alleles')
183184
return gnomad_svs_ht.annotate(
184185
ID=gnomad_svs_ht.KEY,
185186
)[ht['info.GNOMAD_V4.1_TRUTH_VID']]

v03_pipeline/lib/reference_datasets/gnomad_svs.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,5 +14,4 @@ def get_ht(path: str, reference_genome: ReferenceGenome) -> hl.Table:
1414
N_HET=ht.info.N_HET,
1515
N_HOMREF=ht.info.N_HOMREF,
1616
)
17-
ht = ht.key_by('KEY')
18-
return ht.drop('locus', 'alleles')
17+
return ht.key_by('KEY')
Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
import unittest
2+
from unittest.mock import patch
3+
4+
import hail as hl
5+
6+
from v03_pipeline.lib.model import ReferenceGenome
7+
from v03_pipeline.lib.reference_datasets.reference_dataset import ReferenceDataset
8+
9+
TEST_GNOMAD_SVS_RAW_HT = (
10+
'v03_pipeline/var/test/reference_datasets/raw/gnomad_svs_from_vcf.ht'
11+
)
12+
13+
14+
class GnomadSVsTest(unittest.TestCase):
15+
@patch('v03_pipeline.lib.reference_datasets.gnomad_svs.vcf_to_ht')
16+
def test_gnomad_svs(self, mock_vcf_to_ht):
17+
mock_vcf_to_ht.return_value = hl.read_table(TEST_GNOMAD_SVS_RAW_HT)
18+
ht = ReferenceDataset.gnomad_svs.get_ht(ReferenceGenome.GRCh38)
19+
self.assertEqual(
20+
ht.collect(),
21+
[
22+
hl.Struct(
23+
KEY='gnomAD-SV_v3_BND_chr1_1a45f73a',
24+
locus=hl.Locus(
25+
contig='chr1',
26+
position=10434,
27+
reference_genome=ReferenceGenome.GRCh38,
28+
),
29+
alleles=['N', '<BND>'],
30+
AF=0.11413399875164032,
31+
AC=8474,
32+
AN=74246,
33+
N_HET=8426,
34+
N_HOMREF=28673,
35+
),
36+
hl.Struct(
37+
KEY='gnomAD-SV_v3_BND_chr1_3fa36917',
38+
locus=hl.Locus(
39+
contig='chr1',
40+
position=10440,
41+
reference_genome=ReferenceGenome.GRCh38,
42+
),
43+
alleles=['N', '<BND>'],
44+
AF=0.004201000090688467,
45+
AC=466,
46+
AN=110936,
47+
N_HET=466,
48+
N_HOMREF=55002,
49+
),
50+
hl.Struct(
51+
KEY='gnomAD-SV_v3_BND_chr1_7bbf34b5',
52+
locus=hl.Locus(
53+
contig='chr1',
54+
position=10464,
55+
reference_genome=ReferenceGenome.GRCh38,
56+
),
57+
alleles=['N', '<BND>'],
58+
AF=0.03698499873280525,
59+
AC=3119,
60+
AN=84332,
61+
N_HET=3115,
62+
N_HOMREF=39049,
63+
),
64+
hl.Struct(
65+
KEY='gnomAD-SV_v3_BND_chr1_933a2971',
66+
locus=hl.Locus(
67+
contig='chr1',
68+
position=10450,
69+
reference_genome=ReferenceGenome.GRCh38,
70+
),
71+
alleles=['N', '<BND>'],
72+
AF=0.3238990008831024,
73+
AC=21766,
74+
AN=67200,
75+
N_HET=21616,
76+
N_HOMREF=11909,
77+
),
78+
hl.Struct(
79+
KEY='gnomAD-SV_v3_DUP_chr1_01c2781c',
80+
locus=hl.Locus(
81+
contig='chr1',
82+
position=10000,
83+
reference_genome=ReferenceGenome.GRCh38,
84+
),
85+
alleles=['N', '<DUP>'],
86+
AF=0.0019970000721514225,
87+
AC=139,
88+
AN=69594,
89+
N_HET=139,
90+
N_HOMREF=34658,
91+
),
92+
],
93+
)

v03_pipeline/lib/reference_datasets/reference_dataset.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,12 @@ def version(self, reference_genome: ReferenceGenome) -> str:
104104
)
105105
return version
106106

107+
def dataset_types(
108+
self,
109+
reference_genome: ReferenceGenome,
110+
) -> frozenset[DatasetType]:
111+
return CONFIG[self][reference_genome][DATASET_TYPES]
112+
107113
@property
108114
def enums(self) -> dict | None:
109115
return CONFIG[self].get(ENUMS)
@@ -143,11 +149,9 @@ def get_ht(
143149
if enum_selects:
144150
ht = ht.transmute(**enum_selects)
145151
ht = filter_contigs(ht, reference_genome)
146-
# Reference Datasets are DatasetType agnostic, but these
147-
# methods (in theory) support SV/GCNV. SNV_INDEL
148-
# is passed as a proxy for non-SV/GCNV.
149-
validate_allele_type(ht, DatasetType.SNV_INDEL)
150-
validate_no_duplicate_variants(ht, reference_genome, DatasetType.SNV_INDEL)
152+
for dataset_type in self.dataset_types(reference_genome):
153+
validate_allele_type(ht, dataset_type)
154+
validate_no_duplicate_variants(ht, reference_genome, dataset_type)
151155
# NB: we do not filter with "filter" here
152156
# ReferenceDatasets are DatasetType agnostic and that
153157
# filter is only used at annotation time.
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
import hail as hl
2+
3+
from v03_pipeline.lib.annotations import sv
4+
from v03_pipeline.lib.migration.base_migration import BaseMigration
5+
from v03_pipeline.lib.model import DatasetType, ReferenceGenome
6+
from v03_pipeline.lib.reference_datasets.reference_dataset import ReferenceDataset
7+
8+
# This vcf was generated with the gatk command:
9+
#
10+
# gatk SVConcordance --verbosity DEBUG --evaluation /var/seqr/phase4.seqr.gnomad_v4_tmp.vcf.gz
11+
# --truth /var/seqr/gnomad.v4.1.sv.sites.modified.vcf.bgz
12+
# --sequence-dictionary gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict
13+
#
14+
# Followed by:
15+
# bcftools annotate --rename-annots /var/seqr/remap /var/seqr/phase4.seqr.gnomad_v4_tmp.vcf.gz | bgzip > /var/seqr/phase4.seqr.gnomad_v4.vcf.gz
16+
#
17+
# where remap contains "INFO/TRUTH_VID GNOMAD_V4.1_TRUTH_VID"
18+
PHASE_4_CALLSET_WITH_GNOMAD_V4 = 'gs://seqr-loading-temp/phase4.seqr.gnomad_v4.vcf.gz'
19+
20+
21+
class AddGnomadSVs(BaseMigration):
22+
reference_genome_dataset_types: frozenset[
23+
tuple[ReferenceGenome, DatasetType]
24+
] = frozenset(
25+
((ReferenceGenome.GRCh38, DatasetType.SV),),
26+
)
27+
28+
@staticmethod
29+
def migrate(ht: hl.Table, **_) -> hl.Table:
30+
mapping_ht = (
31+
hl.import_vcf(
32+
PHASE_4_CALLSET_WITH_GNOMAD_V4,
33+
reference_genome=ReferenceGenome.GRCh38.value,
34+
force_bgz=True,
35+
)
36+
.key_rows_by('rsid')
37+
.rows()
38+
)
39+
ht = ht.annotate(
40+
**{
41+
'info.GNOMAD_V4.1_TRUTH_VID': mapping_ht[ht.key].info[
42+
'GNOMAD_V4.1_TRUTH_VID'
43+
],
44+
},
45+
)
46+
gnomad_svs_ht = ReferenceDataset.gnomad_svs.get_ht(ReferenceGenome.GRCh38)
47+
ht = ht.annotate(gnomad_svs=sv.gnomad_svs(ht, gnomad_svs_ht))
48+
ht = ht.drop('info.GNOMAD_V4.1_TRUTH_VID')
49+
return ht.annotate_globals(
50+
versions=ht.globals.versions.annotate(gnomad_svs='1.0'),
51+
enums=ht.globals.enums.annotate(gnomad_svs=hl.Struct()),
52+
)
Binary file not shown.
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
This folder comprises a Hail (www.hail.is) native Table or MatrixTable.
22
Written with version 0.2.133-4c60fddb171a
3-
Created at 2025/02/16 18:40:38
3+
Created at 2025/03/05 12:27:53
Binary file not shown.
Binary file not shown.
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
This folder comprises a Hail (www.hail.is) native Table or MatrixTable.
2+
Written with version 0.2.133-4c60fddb171a
3+
Created at 2025/03/04 14:19:46

v03_pipeline/var/test/reference_datasets/raw/gnomad_svs_from_vcf.ht/_SUCCESS

Whitespace-only changes.

0 commit comments

Comments
 (0)