Skip to content

Commit a6a560c

Browse files
authored
main <- dev (#860)
* Move vep files (#844) * Add mito local constraint (#845) * Add mito local constraint * Fix tests * lint * Benb/migration task (#834) * split import and validate * lint and share function * ruff * change dep * tweak update * lint * wrong method * correct method * mocks * change sample type annotation on test * hack on migration * sort return list * move the migration * still hacking * better! * getting there * Cleaner * ruff * Finish it off * migration * rename var * add migrations to annotations table * fix test import * actually fix the test * add migrations * not used here * use globals * missed one * a hilarious typo * Update migrate_variant_annotations_table.py * correct sign * add lookup migration * Add lookup table migration * adjust migration * ruff * Add to tasks * ensure a migration cannot run before a previous migration! * ruff * fix bug * lint * add referencegenomedatasetype * Annoying but fixed * Add new SV annotations for VCF export. (#857) * Add SV annotations * ruff * push * ruff * Update update_variant_annotations_table_with_new_samples_test.py * Add a task to export the SV annotations table to VCF. (#858) * Export VCF task * Fix test * lint * Resolve the assumption in the pipeline that remap/pedigree files are immutable. (#856) * add remap_pedigree hash * add func * all the imports * ruff * Fix it * support missing remap * ruff * ruff * ruff * tweak the type * tweak the type * Fix test * ruff * add remap pedigree hash * Explicit int32 * lint * Update io.py * ruff * lint * hash * Flappy test * wrong pedigree * bad colon * finish tests * add a test * add pedigree * Fix test
1 parent 94fc293 commit a6a560c

File tree

47 files changed

+1372
-51
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

47 files changed

+1372
-51
lines changed

pyproject.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,9 @@ inline-quotes = "single"
6464
'SLF001', # allow private access
6565
'PLR0913', # allow high arity functions
6666
]
67+
'*migration*' = [
68+
'N999', # allow invalid module names
69+
]
6770

6871
[tool.ruff.pylint]
6972
max-args = 6

v03_pipeline/lib/annotations/sv.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,41 @@ def _sv_types(ht: hl.Table) -> hl.ArrayExpression:
8181
return ht.alleles[1].replace('[<>]', '').split(':', 2)
8282

8383

84+
def alleles(ht: hl.Table, **_: Any) -> hl.ArrayExpression:
    """Build the two-element ``[ref, alt]`` allele array for an SV row.

    The first element is always the literal ``'N'``. The second is a
    symbolic allele: ``<TYPE:DETAIL>`` when ``sv_type_detail_id`` is defined
    and the SV type is not ``'CPX'``, otherwise ``<TYPE>``.
    """
    sv_type = hl.array(SV_TYPES)[ht.sv_type_id]
    sv_type_detail = hl.array(SV_TYPE_DETAILS)[ht.sv_type_detail_id]
    # CPX rows are exported with the bare type even if a detail id is set.
    use_detail = hl.is_defined(ht.sv_type_detail_id) & (sv_type != 'CPX')
    alt_allele = hl.if_else(
        use_detail,
        hl.format('<%s:%s>', sv_type, sv_type_detail),
        hl.format('<%s>', sv_type),
    )
    return hl.array(['N', alt_allele])
102+
103+
104+
def info(ht: hl.Table, **_: Any) -> hl.StructExpression:
    """Assemble the INFO struct for an SV row.

    ``END`` is read from the start locus, while the end locus supplies
    ``CHR2`` and ``END2``. ``SVTYPE`` is the ``SV_TYPES`` entry selected by
    ``sv_type_id``.
    """
    start = ht.start_locus
    end = ht.end_locus
    sv_type = hl.array(SV_TYPES)[ht.sv_type_id]
    return hl.Struct(
        ALGORITHMS=ht.algorithms,
        END=start.position,
        CHR2=end.contig,
        END2=end.position,
        SVTYPE=sv_type,
        SVLEN=ht.sv_len,
    )
113+
114+
115+
def locus(ht: hl.Table, **_: Any) -> hl.LocusExpression:
    """Return the row's ``start_locus`` field as the locus annotation."""
    start_locus = ht.start_locus
    return start_locus
117+
118+
84119
def algorithms(ht: hl.Table, **_: Any) -> hl.Expression:
    """Join the entries of the ``info.ALGORITHMS`` field with commas."""
    separator = hl.str(',')
    return separator.join(ht['info.ALGORITHMS'])
86121

@@ -205,6 +240,10 @@ def strvctvre(ht: hl.Table, **_: Any) -> hl.Expression:
205240
return hl.struct(score=hl.parse_float32(ht['info.StrVCTVRE']))
206241

207242

243+
def sv_len(ht: hl.Table, **_: Any) -> hl.Expression:
    """Return the row's ``info.SVLEN`` field unchanged."""
    length = ht['info.SVLEN']
    return length
245+
246+
208247
def sv_type_id(ht: hl.Table, **_: Any) -> hl.Expression:
    """Look up the ``SV_TYPES_LOOKUP`` entry for the first element of ``_sv_types(ht)``."""
    first_type = _sv_types(ht)[0]
    return SV_TYPES_LOOKUP[first_type]
210249

Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
import unittest
2+
3+
import hail as hl
4+
5+
from v03_pipeline.lib.annotations.fields import get_fields
6+
from v03_pipeline.lib.model import DatasetType
7+
8+
9+
class SVTest(unittest.TestCase):
    def test_sv_export_annotations(self) -> None:
        # Runs the SV VCF-export annotation functions over a four-row table
        # and compares the selected locus / alleles / info fields.
        def grch38(contig: str, position: int) -> hl.Locus:
            return hl.Locus(
                contig=contig,
                position=position,
                reference_genome='GRCh38',
            )

        # (id, end_locus, start_locus, sv_len, sv_type_id, sv_type_detail_id)
        input_rows = [
            (0, grch38('chr5', 20404), grch38('chr1', 180928), 123, 2, None),
            (1, grch38('chr1', 789481), grch38('chr1', 789481), 245, 2, None),
            (2, grch38('chr1', 6559723), grch38('chr1', 6558902), 245, 3, 2),
            (3, grch38('chr1', 6559723), grch38('chr1', 6558902), 245, 7, 6),
        ]
        row_type = hl.tstruct(
            id=hl.tint32,
            algorithms=hl.tstr,
            end_locus=hl.tlocus('GRCh38'),
            start_locus=hl.tlocus('GRCh38'),
            sv_len=hl.tint32,
            sv_type_id=hl.tint32,
            sv_type_detail_id=hl.tint32,
        )
        ht = hl.Table.parallelize(
            [
                hl.Struct(
                    id=row_id,
                    algorithms='manta',
                    end_locus=end_locus,
                    start_locus=start_locus,
                    sv_len=length,
                    sv_type_id=type_id,
                    sv_type_detail_id=detail_id,
                )
                for (
                    row_id,
                    end_locus,
                    start_locus,
                    length,
                    type_id,
                    detail_id,
                ) in input_rows
            ],
            row_type,
            key='id',
        )
        export_fields = get_fields(ht, DatasetType.SV.export_vcf_annotation_fns)
        ht = ht.select(**export_fields)

        # (id, locus, alt allele, END, CHR2, END2, SVTYPE, SVLEN)
        expected_rows = [
            (0, grch38('chr1', 180928), '<BND>', 180928, 'chr5', 20404, 'BND', 123),
            (1, grch38('chr1', 789481), '<BND>', 789481, 'chr1', 789481, 'BND', 245),
            (2, grch38('chr1', 6558902), '<CPX>', 6558902, 'chr1', 6559723, 'CPX', 245),
            (
                3,
                grch38('chr1', 6558902),
                '<INS:ME:SVA>',
                6558902,
                'chr1',
                6559723,
                'INS',
                245,
            ),
        ]
        expected = [
            hl.Struct(
                id=row_id,
                locus=row_locus,
                alleles=['N', alt_allele],
                info=hl.Struct(
                    ALGORITHMS='manta',
                    END=end,
                    CHR2=chr2,
                    END2=end2,
                    SVTYPE=sv_type,
                    SVLEN=length,
                ),
            )
            for (
                row_id,
                row_locus,
                alt_allele,
                end,
                chr2,
                end2,
                sv_type,
                length,
            ) in expected_rows
        ]
        self.assertEqual(ht.collect(), expected)

v03_pipeline/lib/migration/__init__.py

Whitespace-only changes.
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
from abc import ABC, abstractmethod
2+
3+
import hail as hl
4+
5+
from v03_pipeline.lib.model import DatasetType, ReferenceGenome
6+
7+
8+
class BaseMigration(ABC):
    """Abstract base class for table migrations.

    Declares a ``reference_genome_dataset_types`` class attribute and an
    abstract static ``migrate`` hook that maps a Hail table to a Hail table.
    """

    # Set of (ReferenceGenome, DatasetType) pairs. It is None on this base
    # class; presumably concrete migrations override it -- confirm with callers.
    reference_genome_dataset_types: frozenset[
        tuple[ReferenceGenome, DatasetType]
    ] = None

    @staticmethod
    @abstractmethod
    def migrate(ht: hl.Table) -> hl.Table:
        """Take a table and return a table; the base implementation does nothing."""

v03_pipeline/lib/migration/misc.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
import inspect
2+
import pkgutil
3+
import re
4+
5+
from v03_pipeline.lib.migration.base_migration import BaseMigration
6+
7+
MIGRATION_NAME_PATTERN = r'(\d\d\d\d_.*)'
8+
9+
10+
def list_migrations(
    path: str,
) -> list[tuple[str, type[BaseMigration]]]:
    """Discover the migration classes defined in the modules under ``path``.

    Every module whose name matches ``MIGRATION_NAME_PATTERN`` is executed,
    and the first ``BaseMigration`` subclass found in its namespace (other
    than ``BaseMigration`` itself) is collected. Modules without such a
    class are ignored.

    Args:
        path: Directory to scan for migration modules.

    Returns:
        ``(migration_name, migration_class)`` tuples sorted by name, where
        the name is the text captured by ``MIGRATION_NAME_PATTERN``.
    """
    # Function-scope imports: both are stdlib and only needed for loading.
    import importlib.util
    import sys

    migrations = []
    for finder, name, _ in pkgutil.iter_modules([path]):
        match = re.search(MIGRATION_NAME_PATTERN, name)
        if not match:
            continue
        # The finder's ``find_module()`` was removed in Python 3.12 and
        # ``load_module()`` is deprecated, so load through the module spec.
        spec = finder.find_spec(name)
        module = importlib.util.module_from_spec(spec)
        # ``load_module()`` registered the module in ``sys.modules``; keep
        # doing so, but with a fresh module object for every load.
        sys.modules[name] = module
        spec.loader.exec_module(module)
        implemented_migration = next(
            (
                m
                for m in vars(module).values()
                # Keep classes that subclass BaseMigration but are not
                # BaseMigration itself (which every module imports).
                if inspect.isclass(m)
                and issubclass(m, BaseMigration)
                and m is not BaseMigration
            ),
            None,
        )
        if implemented_migration is not None:
            migrations.append((match.group(1), implemented_migration))
    return sorted(migrations, key=lambda x: x[0])
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
import os
2+
import shutil
3+
import tempfile
4+
import unittest
5+
6+
from v03_pipeline.lib.migration.base_migration import BaseMigration
7+
from v03_pipeline.lib.migration.misc import list_migrations
8+
9+
10+
class TestListMigrations(unittest.TestCase):
    # Source written verbatim into every fixture file created by setUp.
    MIGRATION_SOURCE = (
        '\n'
        'from v03_pipeline.lib.migration.base_migration import BaseMigration\n'
        'class ImplementedMigration(BaseMigration):\n'
        '    pass\n'
    )

    # Only the two four-digit-prefixed ``.py`` names are expected back from
    # list_migrations; the rest are decoys.
    FIXTURE_FILENAMES = (
        '__init__.py',
        '1111_a_migration.py',
        '0000_migration.py',
        '001_test.py',
        'abcd_test.py',
        '0000_migration.txt',
    )

    def setUp(self):
        self.tmpdir = tempfile.TemporaryDirectory()
        for filename in self.FIXTURE_FILENAMES:
            fixture_path = os.path.join(self.tmpdir.name, filename)
            with open(fixture_path, 'w') as f:
                f.write(self.MIGRATION_SOURCE)

    def tearDown(self):
        if not os.path.isdir(self.tmpdir.name):
            return
        shutil.rmtree(self.tmpdir.name)

    def test_list_migrations(self):
        discovered = list_migrations(self.tmpdir.name)
        summary = [
            (name, issubclass(migration, BaseMigration))
            for name, migration in discovered
        ]
        self.assertEqual(
            summary,
            [
                ('0000_migration', True),
                ('1111_a_migration', True),
            ],
        )
        # Second discovery pass, as in the original test.
        rediscovered = list_migrations(self.tmpdir.name)
        self.assertTrue(
            all(hasattr(entry[1], 'migrate') for entry in rediscovered),
        )

v03_pipeline/lib/misc/io.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
1+
import hashlib
12
import math
23
import os
34
import uuid
45

56
import hail as hl
7+
import hailtop.fs as hfs
68

79
from v03_pipeline.lib.misc.gcnv import parse_gcnv_genes
810
from v03_pipeline.lib.misc.nested_field import parse_nested_field
@@ -200,6 +202,17 @@ def import_pedigree(pedigree_path: str) -> hl.Table:
200202
)
201203

202204

205+
def remap_pedigree_hash(remap_path: str, pedigree_path: str) -> hl.Int32Expression:
    """Hash the contents of the remap and pedigree files into an int32 expression.

    The remap file is included only when it exists; the pedigree file is
    always read. Each file's text is UTF-8 encoded and fed, remap first, into
    one SHA-256 digest.

    Args:
        remap_path: Path of the remap file, which may be absent.
        pedigree_path: Path of the pedigree file.

    Returns:
        ``hl.int32`` applied to the first eight hex digits of the digest.
    """
    digest = hashlib.sha256()
    paths_to_hash = [remap_path] if hfs.exists(remap_path) else []
    paths_to_hash.append(pedigree_path)
    for file_path in paths_to_hash:
        with hfs.open(file_path) as f:
            digest.update(f.read().encode('utf8'))
    # Eight hex digits are four bytes: the most an int32 can hold.
    leading_hex = digest.hexdigest()[:8]
    return hl.int32(int(leading_hex, 16))
214+
215+
203216
def checkpoint(t: hl.Table | hl.MatrixTable) -> tuple[hl.Table | hl.MatrixTable, str]:
204217
suffix = 'mt' if isinstance(t, hl.MatrixTable) else 'ht'
205218
read_fn = hl.read_matrix_table if isinstance(t, hl.MatrixTable) else hl.read_table

v03_pipeline/lib/misc/io_test.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,16 @@
66
compute_hail_n_partitions,
77
file_size_bytes,
88
import_imputed_sex,
9+
remap_pedigree_hash,
910
)
1011

1112
TEST_IMPUTED_SEX = 'v03_pipeline/var/test/sex_check/test_imputed_sex.tsv'
1213
TEST_IMPUTED_SEX_UNEXPECTED_VALUE = (
1314
'v03_pipeline/var/test/sex_check/test_imputed_sex_unexpected_value.tsv'
1415
)
16+
TEST_PEDIGREE_3 = 'v03_pipeline/var/test/pedigrees/test_pedigree_3.tsv'
1517
TEST_MITO_MT = 'v03_pipeline/var/test/callsets/mito_1.mt'
18+
TEST_REMAP = 'v03_pipeline/var/test/remaps/test_remap_1.tsv'
1619
TEST_SV_VCF = 'v03_pipeline/var/test/callsets/sv_1.vcf'
1720

1821

@@ -46,3 +49,14 @@ def test_import_imputed_sex_unexpected_value(self) -> None:
4649
'Found unexpected value Unknown in imputed sex file',
4750
ht.collect,
4851
)
52+
53+
def test_remap_pedigree_hash(self) -> None:
    # Evaluate the Hail expression to a plain Python value before comparing.
    hash_expression = remap_pedigree_hash(TEST_REMAP, TEST_PEDIGREE_3)
    actual = hl.eval(hash_expression)
    self.assertEqual(actual, -560434714)

0 commit comments

Comments
 (0)