Skip to content

Commit 24eda59

Browse files
authored
new task for updating annotations w/ rdc data (#676)
* refactor globals and create new task to update vat
* see if tests pass
* test for new task
* getting Unfulfilled dependency at runtime
* load_gencode
* fix 1 test
* a few more tests for the task
* big test
* tests
* oops
* most tests should pass.
* do not clear globals if updating annotations table with rdc update
* fix da tests
* pr comments
* oops
* refactor
* honestly cleaner
* even cleaner
1 parent 92f2713 commit 24eda59

36 files changed

+822
-122
lines changed

v03_pipeline/lib/model/reference_dataset_collection.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,3 +94,16 @@ def for_reference_genome_dataset_type(
9494
if not Env.ACCESS_PRIVATE_REFERENCE_DATASETS:
9595
return [rdc for rdc in rdcs if rdc.access_control == AccessControl.PUBLIC]
9696
return rdcs
97+
98+
@classmethod
def for_dataset(
    cls,
    dataset: str,
    dataset_type: DatasetType,
) -> 'ReferenceDatasetCollection':
    """Return the first collection whose datasets for `dataset_type` include `dataset`.

    Collections are checked in iteration order of the class.

    Raises:
        ValueError: if no collection lists `dataset` for `dataset_type`.
    """
    owner = next(
        (
            candidate
            for candidate in cls
            if dataset in candidate.datasets(dataset_type)
        ),
        None,
    )
    if owner is None:
        err_msg = f'Dataset "{dataset}" not found in any reference dataset collection'
        raise ValueError(err_msg)
    return owner

v03_pipeline/lib/tasks/base/base_variant_annotations_table.py

Lines changed: 52 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,29 @@
77
variant_annotations_table_path,
88
)
99
from v03_pipeline.lib.tasks.base.base_update_task import BaseUpdateTask
10-
from v03_pipeline.lib.tasks.files import GCSorLocalTarget, HailTableTask
10+
from v03_pipeline.lib.tasks.files import GCSorLocalTarget
11+
from v03_pipeline.lib.tasks.reference_data.updated_reference_dataset_collection import (
12+
UpdatedReferenceDatasetCollectionTask,
13+
)
1114

1215

1316
class BaseVariantAnnotationsTableTask(BaseUpdateTask):
17+
@property
def rdc_annotation_dependencies(self) -> dict[str, hl.Table]:
    """Read the table of every collection available to this task.

    Returns a dict keyed by '<collection value>_ht', with one entry per
    collection returned by `for_reference_genome_dataset_type` for this
    task's reference genome and dataset type. Each access reads the tables
    again; nothing is cached here.
    """
    collections = ReferenceDatasetCollection.for_reference_genome_dataset_type(
        self.reference_genome,
        self.dataset_type,
    )
    return {
        f'{collection.value}_ht': hl.read_table(
            valid_reference_dataset_collection_path(
                self.reference_genome,
                self.dataset_type,
                collection,
            ),
        )
        for collection in collections
    }
32+
1433
def output(self) -> luigi.Target:
1534
return GCSorLocalTarget(
1635
variant_annotations_table_path(
@@ -21,12 +40,11 @@ def output(self) -> luigi.Target:
2140

2241
def requires(self) -> list[luigi.Task]:
2342
return [
24-
HailTableTask(
25-
valid_reference_dataset_collection_path(
26-
self.reference_genome,
27-
self.dataset_type,
28-
rdc,
29-
),
43+
UpdatedReferenceDatasetCollectionTask(
44+
self.reference_genome,
45+
self.dataset_type,
46+
self.sample_type,
47+
rdc,
3048
)
3149
for rdc in ReferenceDatasetCollection.for_reference_genome_dataset_type(
3250
self.reference_genome,
@@ -50,3 +68,30 @@ def initialize_table(self) -> hl.Table:
5068

5169
# Base implementation: hands the table back unchanged. Subclasses override
# this to apply their own updates to `ht`.
def update_table(self, ht: hl.Table) -> hl.Table:
5270
return ht
71+
72+
def annotate_reference_dataset_collection_globals(
    self,
    ht: hl.Table,
) -> hl.Table:
    """Fold each reference dataset collection's globals into `ht`'s globals.

    For every collection available to this task's reference genome and
    dataset type, the `paths`, `versions` and `enums` structs of the
    collection table's globals are merged into the matching structs on `ht`.
    `updates` is carried over unchanged. The globals are rebuilt with
    `select_globals`, so only these four fields remain afterwards.

    Args:
        ht: table whose globals already have `paths`, `versions`, `enums`
            and `updates` fields.

    Returns:
        `ht` with the merged globals.
    """
    # `rdc_annotation_dependencies` re-reads every collection table on each
    # access, so read it once rather than once per loop iteration.
    rdc_hts = self.rdc_annotation_dependencies
    for rdc in ReferenceDatasetCollection.for_reference_genome_dataset_type(
        self.reference_genome,
        self.dataset_type,
    ):
        rdc_globals = rdc_hts[f'{rdc.value}_ht'].index_globals()
        ht = ht.select_globals(
            paths=hl.Struct(
                **ht.globals.paths,
                **rdc_globals.paths,
            ),
            versions=hl.Struct(
                **ht.globals.versions,
                **rdc_globals.versions,
            ),
            enums=hl.Struct(
                **ht.globals.enums,
                **rdc_globals.enums,
            ),
            updates=ht.globals.updates,
        )
    return ht

v03_pipeline/lib/tasks/base/base_variant_annotations_table_test.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import shutil
2+
from unittest.mock import patch
23

34
import hail as hl
45
import luigi.worker
@@ -14,6 +15,7 @@
1415
BaseVariantAnnotationsTableTask,
1516
)
1617
from v03_pipeline.lib.tasks.files import GCSorLocalFolderTarget
18+
from v03_pipeline.lib.test.mock_complete_task import MockCompleteTask
1719
from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase
1820

1921
TEST_COMBINED_1 = 'v03_pipeline/var/test/reference_data/test_combined_1.ht'
@@ -49,7 +51,11 @@ def setUp(self) -> None:
4951
),
5052
)
5153

52-
def test_should_create_initialized_table(self) -> None:
54+
@patch(
55+
'v03_pipeline.lib.tasks.base.base_variant_annotations_table.UpdatedReferenceDatasetCollectionTask',
56+
)
57+
def test_should_create_initialized_table(self, mock_update_rdc_task) -> None:
58+
mock_update_rdc_task.return_value = MockCompleteTask()
5359
vat_task = BaseVariantAnnotationsTableTask(
5460
reference_genome=ReferenceGenome.GRCh38,
5561
dataset_type=DatasetType.SNV_INDEL,
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
import hail as hl
2+
3+
from v03_pipeline.lib.model import ReferenceDatasetCollection
4+
from v03_pipeline.lib.reference_data.compare_globals import (
5+
Globals,
6+
get_datasets_to_update,
7+
)
8+
from v03_pipeline.lib.tasks.base.base_variant_annotations_table import (
9+
BaseVariantAnnotationsTableTask,
10+
)
11+
12+
13+
class UpdateVariantAnnotationsTableWithUpdatedReferenceDataset(
    BaseVariantAnnotationsTableTask,
):
    """Re-annotate the variant annotations table with changed reference datasets.

    `complete()` works out which datasets differ between the annotations
    table's globals and the reference dataset collection tables.
    `update_table()` then replaces exactly those dataset fields and refreshes
    the globals.
    """

    # Written by complete() and read by update_table(). There is deliberately
    # no class-level default, so instances never share a mutable list.
    _datasets_to_update: list[str]

    @property
    def reference_dataset_collections(self) -> list[ReferenceDatasetCollection]:
        """Collections for this genome/dataset type with `requires_annotation` falsy."""
        return [
            rdc
            for rdc in ReferenceDatasetCollection.for_reference_genome_dataset_type(
                self.reference_genome,
                self.dataset_type,
            )
            if not rdc.requires_annotation
        ]

    def complete(self) -> bool:
        """Report whether the annotations table is up to date.

        As a side effect, `_datasets_to_update` is reset and filled with the
        datasets that `update_table()` must (re)join.

        Returns:
            False when the base task is incomplete (every dataset of the
            selected collections is queued) or when any dataset differs from
            its collection table. True otherwise.
        """
        self._datasets_to_update = []
        rdcs = self.reference_dataset_collections

        if not super().complete():
            for rdc in rdcs:
                self._datasets_to_update.extend(
                    rdc.datasets(
                        self.dataset_type,
                    ),
                )
            return False

        if not rdcs:
            # Nothing to compare, so no tables need to be read.
            return True

        # Both reads are loop-invariant: read the annotations table and the
        # collection tables once instead of once per collection.
        annotations_ht = hl.read_table(self.output().path)
        rdc_hts = self.rdc_annotation_dependencies
        for rdc in rdcs:
            annotations_ht_globals = Globals.from_ht(
                annotations_ht,
                rdc,
                self.dataset_type,
            )
            rdc_ht_globals = Globals.from_ht(
                rdc_hts[f'{rdc.value}_ht'],
                rdc,
                self.dataset_type,
            )
            self._datasets_to_update.extend(
                get_datasets_to_update(
                    rdc,
                    annotations_ht_globals,
                    rdc_ht_globals,
                    self.dataset_type,
                ),
            )
        return not self._datasets_to_update

    def update_table(self, ht: hl.Table) -> hl.Table:
        """Replace each queued dataset field on `ht` and refresh the globals.

        For every dataset queued by `complete()`, any existing field of that
        name is dropped. The field is then left-joined back from the table of
        the collection that owns the dataset.
        """
        if not hasattr(self, '_datasets_to_update'):
            # complete() has not run on this instance yet. Compute the list
            # now instead of failing with AttributeError.
            self.complete()

        datasets = self._datasets_to_update
        # Only read the collection tables when there is something to join.
        rdc_hts = self.rdc_annotation_dependencies if datasets else {}
        for dataset in datasets:
            rdc = ReferenceDatasetCollection.for_dataset(dataset, self.dataset_type)
            rdc_ht = rdc_hts[f'{rdc.value}_ht']

            if dataset in ht.row:
                ht = ht.drop(dataset)

            ht = ht.join(rdc_ht.select(dataset), 'left')

        return self.annotate_reference_dataset_collection_globals(ht)

0 commit comments

Comments
 (0)