Skip to content

Commit bec9d16

Browse files
authored
Benb/add key to pipeline (#1086)
* first pass * progress * tests passing * missed some tests
1 parent 9b962a6 commit bec9d16

8 files changed

+65
-0
lines changed

v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ def initialize_table(self) -> hl.Table:
6969
),
7070
),
7171
migrations=hl.empty_array(hl.tstr),
72+
max_key_=hl.int64(-1),
7273
),
7374
)
7475

@@ -100,5 +101,6 @@ def annotate_globals(
100101
),
101102
updates=ht.globals.updates,
102103
migrations=ht.globals.migrations,
104+
max_key_=ht.globals.max_key_,
103105
)
104106
return annotate_enums(ht, self.reference_genome, self.dataset_type)

v03_pipeline/lib/tasks/migrate_variant_annotations_table.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,5 +40,6 @@ def initialize_table(self) -> hl.Table:
4040
),
4141
),
4242
migrations=hl.empty_array(hl.tstr),
43+
max_key_=hl.int64(-1),
4344
),
4445
)

v03_pipeline/lib/tasks/migrate_variant_annotations_table_test.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ def test_mock_migration(
6969
enums=hl.Struct(),
7070
updates=set(),
7171
migrations=['0012_mock_migration'],
72+
max_key_=-1,
7273
mock_migration='a mock migration',
7374
),
7475
],
@@ -108,6 +109,7 @@ def test_migration_is_noop_for_other_dataset_types(
108109
enums=hl.Struct(),
109110
updates=set(),
110111
migrations=[],
112+
max_key_=-1,
111113
),
112114
],
113115
)
@@ -143,6 +145,7 @@ def test_migration_dependency(
143145
enums=hl.Struct(),
144146
updates=set(),
145147
migrations=['0012_mock_migration', '0013_mock_migration2'],
148+
max_key_=-1,
146149
mock_migration='a mock migration',
147150
mock_migration2='a second mock migration',
148151
),

v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@ def test_create_empty_annotations_table(self):
9191
**BASE_ENUMS,
9292
),
9393
migrations=[],
94+
max_key_=-1,
9495
updates=set(),
9596
),
9697
],
@@ -125,6 +126,7 @@ def test_update_vat_snv_indel_38(
125126
enums=hl.Struct(),
126127
updates=hl.empty_set(hl.tstruct(callset=hl.tstr, project_guid=hl.tstr)),
127128
migrations=hl.empty_array(hl.tstr),
129+
max_key_=0,
128130
),
129131
)
130132

@@ -173,6 +175,7 @@ def test_update_vat_snv_indel_38(
173175
),
174176
migrations=[],
175177
updates=set(),
178+
max_key_=0,
176179
),
177180
],
178181
)
@@ -276,6 +279,7 @@ def test_update_vat_mito_38(
276279
enums=hl.Struct(),
277280
updates=hl.empty_set(hl.tstruct(callset=hl.tstr, project_guid=hl.tstr)),
278281
migrations=hl.empty_array(hl.tstr),
282+
max_key_=0,
279283
),
280284
)
281285

@@ -325,6 +329,7 @@ def test_update_vat_mito_38(
325329
),
326330
migrations=[],
327331
updates=set(),
332+
max_key_=0,
328333
),
329334
],
330335
)
@@ -404,6 +409,7 @@ def test_update_vat_snv_indel_37(
404409
enums=hl.Struct(),
405410
updates=hl.empty_set(hl.tstruct(callset=hl.tstr, project_guid=hl.tstr)),
406411
migrations=hl.empty_array(hl.tstr),
412+
max_key_=0,
407413
),
408414
)
409415

@@ -452,6 +458,7 @@ def test_update_vat_snv_indel_37(
452458
),
453459
migrations=[],
454460
updates=set(),
461+
max_key_=0,
455462
),
456463
],
457464
)

v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,4 +137,5 @@ def update_table(self, ht: hl.Table) -> hl.Table:
137137
for i, project_guid in enumerate(self.project_guids)
138138
},
139139
),
140+
max_key_=ht.aggregate(hl.agg.max(ht.key_)),
140141
)

v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -335,6 +335,10 @@ def test_multiple_update_vat(
335335
},
336336
],
337337
)
338+
self.assertEqual(
339+
hl.eval(ht.globals.max_key_),
340+
29,
341+
)
338342

339343
# Ensure that new variants are added correctly to the table.
340344
uvatwns_task_4 = UpdateVariantAnnotationsTableWithNewSamplesTask(
@@ -504,6 +508,7 @@ def test_multiple_update_vat(
504508
hgmd='1.0',
505509
),
506510
migrations=[],
511+
max_key_=29,
507512
enums=hl.Struct(
508513
clinvar=ReferenceDataset.clinvar.enum_globals,
509514
dbnsfp=ReferenceDataset.dbnsfp.enum_globals,
@@ -691,6 +696,7 @@ def test_update_vat_grch37(
691696
hgmd=None,
692697
gt_stats=hl.Struct(AC=0, AN=6, AF=0.0, hom=0),
693698
CAID=None,
699+
key_=0,
694700
),
695701
)
696702

@@ -838,6 +844,7 @@ def test_mito_update_vat(
838844
mitotip=hl.Struct(trna_prediction=MITOTIP_PATHOGENICITIES),
839845
),
840846
migrations=[],
847+
max_key_=4,
841848
updates={
842849
hl.Struct(
843850
callset='v03_pipeline/var/test/callsets/mito_1.mt',
@@ -888,6 +895,7 @@ def test_mito_update_vat(
888895
AN=4,
889896
),
890897
local_constraint_mito=None,
898+
key_=0,
891899
),
892900
)
893901

@@ -936,6 +944,7 @@ def test_sv_multiple_vcf_update_vat(
936944
),
937945
),
938946
migrations=[],
947+
max_key_=12,
939948
updates={
940949
hl.Struct(
941950
callset=TEST_SV_VCF,
@@ -995,6 +1004,7 @@ def test_sv_multiple_vcf_update_vat(
9951004
sv_type_id=2,
9961005
sv_type_detail_id=None,
9971006
xpos=1000180928,
1007+
key_=0,
9981008
),
9991009
hl.Struct(
10001010
variant_id='BND_chr1_9',
@@ -1038,6 +1048,7 @@ def test_sv_multiple_vcf_update_vat(
10381048
sv_type_id=2,
10391049
sv_type_detail_id=None,
10401050
xpos=1000789481,
1051+
key_=1,
10411052
),
10421053
hl.Struct(
10431054
variant_id='CPX_chr1_22',
@@ -1091,6 +1102,7 @@ def test_sv_multiple_vcf_update_vat(
10911102
sv_type_id=3,
10921103
sv_type_detail_id=2,
10931104
xpos=1006558902,
1105+
key_=2,
10941106
),
10951107
hl.Struct(
10961108
variant_id='CPX_chr1_251',
@@ -1147,6 +1159,7 @@ def test_sv_multiple_vcf_update_vat(
11471159
sv_type_id=3,
11481160
sv_type_detail_id=9,
11491161
xpos=1180540234,
1162+
key_=3,
11501163
),
11511164
hl.Struct(
11521165
variant_id='CPX_chr1_41',
@@ -1200,6 +1213,7 @@ def test_sv_multiple_vcf_update_vat(
12001213
sv_type_id=3,
12011214
sv_type_detail_id=12,
12021215
xpos=1016088760,
1216+
key_=4,
12031217
),
12041218
hl.Struct(
12051219
variant_id='CPX_chr1_54',
@@ -1258,6 +1272,7 @@ def test_sv_multiple_vcf_update_vat(
12581272
sv_type_id=3,
12591273
sv_type_detail_id=13,
12601274
xpos=1021427498,
1275+
key_=5,
12611276
),
12621277
hl.Struct(
12631278
variant_id='CPX_chrX_251',
@@ -1326,6 +1341,7 @@ def test_sv_multiple_vcf_update_vat(
13261341
position=2699041,
13271342
reference_genome='GRCh37',
13281343
),
1344+
key_=6,
13291345
),
13301346
hl.Struct(
13311347
variant_id='CPX_chrX_252',
@@ -1398,6 +1414,7 @@ def test_sv_multiple_vcf_update_vat(
13981414
position=2699941,
13991415
reference_genome='GRCh37',
14001416
),
1417+
key_=7,
14011418
),
14021419
],
14031420
)
@@ -1432,6 +1449,7 @@ def test_sv_multiple_vcf_update_vat(
14321449
),
14331450
),
14341451
migrations=[],
1452+
max_key_=13,
14351453
updates={
14361454
hl.Struct(
14371455
callset=TEST_SV_VCF,
@@ -1500,6 +1518,7 @@ def test_sv_multiple_vcf_update_vat(
15001518
sv_type_id=2,
15011519
sv_type_detail_id=None,
15021520
xpos=1000180928,
1521+
key_=0,
15031522
),
15041523
],
15051524
)
@@ -1582,6 +1601,7 @@ def test_gcnv_update_vat_multiple(
15821601
),
15831602
),
15841603
migrations=[],
1604+
max_key_=1,
15851605
updates={
15861606
hl.Struct(
15871607
callset=TEST_GCNV_BED_FILE,
@@ -1637,6 +1657,7 @@ def test_gcnv_update_vat_multiple(
16371657
strvctvre=hl.Struct(score=hl.eval(hl.float32(0.583))),
16381658
sv_type_id=5,
16391659
xpos=1100006937,
1660+
key_=0,
16401661
),
16411662
hl.Struct(
16421663
variant_id='suffix_16457_DEL',
@@ -1675,6 +1696,7 @@ def test_gcnv_update_vat_multiple(
16751696
strvctvre=hl.Struct(score=0.5070000290870667),
16761697
sv_type_id=5,
16771698
xpos=1100017586,
1699+
key_=1,
16781700
),
16791701
],
16801702
)

v03_pipeline/lib/tasks/write_new_variants_table.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,12 @@ def create_table(self) -> hl.Table:
192192
},
193193
)
194194
new_variants_ht = new_variants_ht.join(reference_dataset_ht, 'left')
195+
196+
# Add serial integer index
197+
new_variants_ht = new_variants_ht.add_index(name='key_')
198+
new_variants_ht = new_variants_ht.transmute(
199+
key_=new_variants_ht.key_ + annotations_ht.index_globals().max_key_ + 1,
200+
)
195201
return new_variants_ht.select_globals(
196202
updates={
197203
hl.Struct(
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
import hail as hl
2+
3+
from v03_pipeline.lib.migration.base_migration import BaseMigration
4+
from v03_pipeline.lib.model import DatasetType, ReferenceGenome
5+
6+
7+
class AddKeyField(BaseMigration):
    """Migration that adds a serial integer ``key_`` row field to a table.

    The migration also records the largest assigned key in the ``max_key_``
    global.
    """

    # (reference genome, dataset type) pairs this migration applies to.
    reference_genome_dataset_types: frozenset[tuple[ReferenceGenome, DatasetType]] = (
        frozenset(
            (
                (ReferenceGenome.GRCh37, DatasetType.SNV_INDEL),
                (ReferenceGenome.GRCh38, DatasetType.SNV_INDEL),
                (ReferenceGenome.GRCh38, DatasetType.MITO),
                (ReferenceGenome.GRCh38, DatasetType.GCNV),
                (ReferenceGenome.GRCh38, DatasetType.SV),
            ),
        )
    )

    @staticmethod
    def migrate(ht: hl.Table, **_) -> hl.Table:
        """Index every row with ``key_`` and store the highest index globally.

        Args:
            ht: The table to migrate.
            **_: Extra keyword arguments; accepted and ignored.

        Returns:
            The table with a ``key_`` row field (0-based row index) and a
            ``max_key_`` global equal to ``row count - 1``; an empty table
            therefore gets ``max_key_ == -1``.
        """
        ht = ht.add_index(name='key_')
        # A bare Python int would be imputed as int32 when it fits; cast
        # explicitly so the global has the same int64 type as the ``key_``
        # field produced by ``add_index``.
        return ht.annotate_globals(max_key_=hl.int64(ht.count() - 1))

0 commit comments

Comments
 (0)