Skip to content

Commit 97257c9

Browse files
authored
Ensure rows are deleted after deleting samples! (#770)
* Delete project tasks * cleanup * ruff format * well * rename * hacking away * almost there! * ruff * Fix missing updates change * ruff * Remove debug code * remove bad merge * more precision in test * project table * allow for missing project * remove some unnecessary checks * test already deleted family * Lots of renames * More updates * Sketch * Flesh out test * fix paths * Rename base hail table * a bunch more renames * delete project table * Add delete project families * add comment * test it! * Fix * add dep * Lookup table filtering * Ensure rows with no projects/families defined are removed * ruff * remove mock * Remove mocks from args * tweak tests
1 parent 91de7c5 commit 97257c9

10 files changed

+47
-192
lines changed

v03_pipeline/lib/misc/lookup.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -98,15 +98,15 @@ def remove_family_guids(
9898
)
9999
),
100100
)
101-
project_i = ht.project_guids.index(
102-
project_guid,
103-
) # double reference because new expression
101+
ht = ht.filter(
102+
hl.any(ht.project_stats.map(lambda fs: hl.any(fs.map(hl.is_defined)))),
103+
)
104104
return ht.annotate_globals(
105105
project_families=hl.dict(
106-
hl.enumerate(ht.project_families.items()).starmap(
107-
lambda i, item: (
106+
ht.project_families.items().map(
107+
lambda item: (
108108
hl.if_else(
109-
i != project_i,
109+
item[0] != project_guid,
110110
item,
111111
(
112112
item[0],
@@ -140,6 +140,7 @@ def remove_project(
140140
)
141141
),
142142
)
143+
ht = ht.filter(hl.any(ht.project_stats.map(hl.is_defined)))
143144
return ht.annotate_globals(
144145
project_guids=project_indexes_to_keep.map(
145146
lambda i: ht.project_guids[i],

v03_pipeline/lib/misc/lookup_test.py

Lines changed: 4 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -96,16 +96,8 @@ def test_remove_family_guids(self) -> None:
9696
'id': 0,
9797
'project_stats': [
9898
[
99-
hl.Struct(
100-
ref_samples=0,
101-
heteroplasmic_samples=0,
102-
homoplasmic_samples=0,
103-
),
104-
hl.Struct(
105-
ref_samples=1,
106-
heteroplasmic_samples=1,
107-
homoplasmic_samples=1,
108-
),
99+
None,
100+
None,
109101
hl.Struct(
110102
ref_samples=2,
111103
heteroplasmic_samples=2,
@@ -125,11 +117,7 @@ def test_remove_family_guids(self) -> None:
125117
'id': 1,
126118
'project_stats': [
127119
[
128-
hl.Struct(
129-
ref_samples=0,
130-
heteroplasmic_samples=0,
131-
homoplasmic_samples=0,
132-
),
120+
None,
133121
hl.Struct(
134122
ref_samples=1,
135123
heteroplasmic_samples=1,
@@ -201,19 +189,6 @@ def test_remove_family_guids(self) -> None:
201189
self.assertCountEqual(
202190
lookup_ht.collect(),
203191
[
204-
hl.Struct(
205-
id=0,
206-
project_stats=[
207-
[
208-
hl.Struct(
209-
ref_samples=1,
210-
heteroplasmic_samples=1,
211-
homoplasmic_samples=1,
212-
),
213-
],
214-
[],
215-
],
216-
),
217192
hl.Struct(
218193
id=1,
219194
project_stats=[
@@ -253,13 +228,7 @@ def test_remove_project(self) -> None:
253228
homoplasmic_samples=2,
254229
),
255230
],
256-
[
257-
hl.Struct(
258-
ref_samples=3,
259-
heteroplasmic_samples=3,
260-
homoplasmic_samples=3,
261-
),
262-
],
231+
None,
263232
],
264233
},
265234
{
@@ -330,18 +299,6 @@ def test_remove_project(self) -> None:
330299
self.assertCountEqual(
331300
lookup_ht.collect(),
332301
[
333-
hl.Struct(
334-
id=0,
335-
project_stats=[
336-
[
337-
hl.Struct(
338-
ref_samples=3,
339-
heteroplasmic_samples=3,
340-
homoplasmic_samples=3,
341-
),
342-
],
343-
],
344-
),
345302
hl.Struct(
346303
id=1,
347304
project_stats=[

v03_pipeline/lib/tasks/update_lookup_table_with_deleted_families.py

Lines changed: 0 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -7,27 +7,12 @@
77
from v03_pipeline.lib.tasks.base.base_update_lookup_table import (
88
BaseUpdateLookupTableTask,
99
)
10-
from v03_pipeline.lib.tasks.update_variant_annotations_table_with_deleted_families import (
11-
UpdateVariantAnnotationsTableWithDeletedFamiliesTask,
12-
)
1310

1411

1512
class UpdateLookupTableWithDeletedFamiliesTask(BaseUpdateLookupTableTask):
1613
project_guid = luigi.Parameter()
1714
family_guids = luigi.ListParameter()
1815

19-
def requires(self) -> luigi.Task:
20-
# We require updating the annotations table first so that
21-
# we are able to use the lookup table to determine which rows
22-
# of the annotations table require re-annotation.
23-
return UpdateVariantAnnotationsTableWithDeletedFamiliesTask(
24-
dataset_type=self.dataset_type,
25-
sample_type=self.sample_type,
26-
reference_genome=self.reference_genome,
27-
project_guid=self.project_guid,
28-
family_guids=self.family_guids,
29-
)
30-
3116
def complete(self) -> bool:
3217
return super().complete() and hl.eval(
3318
hl.bind(

v03_pipeline/lib/tasks/update_lookup_table_with_deleted_families_test.py

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,19 +7,13 @@
77
from v03_pipeline.lib.tasks.update_lookup_table_with_deleted_families import (
88
UpdateLookupTableWithDeletedFamiliesTask,
99
)
10-
from v03_pipeline.lib.test.mock_complete_task import MockCompleteTask
1110
from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase
1211

1312

14-
@mock.patch(
15-
'v03_pipeline.lib.tasks.update_lookup_table_with_deleted_families.UpdateVariantAnnotationsTableWithDeletedFamiliesTask',
16-
)
1713
class UpdateLookupTableWithDeletedProjectTaskTest(MockedDatarootTestCase):
1814
def test_delete_project_empty_table(
1915
self,
20-
mock_update_lookup_table_task: mock.Mock,
2116
) -> None:
22-
mock_update_lookup_table_task.return_value = MockCompleteTask()
2317
worker = luigi.worker.Worker()
2418
task = UpdateLookupTableWithDeletedFamiliesTask(
2519
dataset_type=DatasetType.SNV_INDEL,
@@ -51,9 +45,7 @@ def test_delete_project_empty_table(
5145
def test_delete_project(
5246
self,
5347
mock_initialize_table: mock.Mock,
54-
mock_update_lookup_table_task: mock.Mock,
5548
) -> None:
56-
mock_update_lookup_table_task.return_value = MockCompleteTask()
5749
mock_initialize_table.return_value = hl.Table.parallelize(
5850
[
5951
{

v03_pipeline/lib/tasks/update_lookup_table_with_deleted_project.py

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -7,22 +7,11 @@
77
from v03_pipeline.lib.tasks.base.base_update_lookup_table import (
88
BaseUpdateLookupTableTask,
99
)
10-
from v03_pipeline.lib.tasks.update_variant_annotations_table_with_deleted_project import (
11-
UpdateVariantAnnotationsTableWithDeletedProjectTask,
12-
)
1310

1411

1512
class UpdateLookupTableWithDeletedProjectTask(BaseUpdateLookupTableTask):
1613
project_guid = luigi.Parameter()
1714

18-
def requires(self) -> luigi.Task:
19-
return UpdateVariantAnnotationsTableWithDeletedProjectTask(
20-
dataset_type=self.dataset_type,
21-
sample_type=self.sample_type,
22-
reference_genome=self.reference_genome,
23-
project_guid=self.project_guid,
24-
)
25-
2615
def complete(self) -> bool:
2716
return super().complete() and hl.eval(
2817
~hl.read_table(self.output().path).updates.project_guid.contains(

v03_pipeline/lib/tasks/update_lookup_table_with_deleted_project_test.py

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,19 +7,13 @@
77
from v03_pipeline.lib.tasks.update_lookup_table_with_deleted_project import (
88
UpdateLookupTableWithDeletedProjectTask,
99
)
10-
from v03_pipeline.lib.test.mock_complete_task import MockCompleteTask
1110
from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase
1211

1312

14-
@mock.patch(
15-
'v03_pipeline.lib.tasks.update_lookup_table_with_deleted_project.UpdateVariantAnnotationsTableWithDeletedProjectTask',
16-
)
1713
class UpdateLookupTableWithDeletedProjectTaskTest(MockedDatarootTestCase):
1814
def test_delete_project_empty_table(
1915
self,
20-
mock_update_lookup_table_task: mock.Mock,
2116
) -> None:
22-
mock_update_lookup_table_task.return_value = MockCompleteTask()
2317
worker = luigi.worker.Worker()
2418
task = UpdateLookupTableWithDeletedProjectTask(
2519
dataset_type=DatasetType.SNV_INDEL,
@@ -50,9 +44,7 @@ def test_delete_project_empty_table(
5044
def test_delete_project(
5145
self,
5246
mock_initialize_table: mock.Mock,
53-
mock_update_lookup_table_task: mock.Mock,
5447
) -> None:
55-
mock_update_lookup_table_task.return_value = MockCompleteTask()
5648
mock_initialize_table.return_value = hl.Table.parallelize(
5749
[
5850
{

v03_pipeline/lib/tasks/update_variant_annotations_table_with_deleted_families.py

Lines changed: 15 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,12 @@
22
import luigi
33

44
from v03_pipeline.lib.annotations.fields import get_fields
5-
from v03_pipeline.lib.misc.lookup import (
6-
remove_family_guids,
7-
)
8-
from v03_pipeline.lib.paths import (
9-
lookup_table_path,
10-
)
115
from v03_pipeline.lib.tasks.base.base_update_variant_annotations_table import (
126
BaseUpdateVariantAnnotationsTableTask,
137
)
14-
from v03_pipeline.lib.tasks.files import HailTableTask
8+
from v03_pipeline.lib.tasks.update_lookup_table_with_deleted_families import (
9+
UpdateLookupTableWithDeletedFamiliesTask,
10+
)
1511

1612

1713
class UpdateVariantAnnotationsTableWithDeletedFamiliesTask(
@@ -25,72 +21,33 @@ def __init__(self, *args, **kwargs):
2521
self.done = False
2622

2723
def requires(self) -> luigi.Task:
28-
return HailTableTask(
29-
lookup_table_path(
30-
self.reference_genome,
31-
self.dataset_type,
32-
),
24+
return UpdateLookupTableWithDeletedFamiliesTask(
25+
dataset_type=self.dataset_type,
26+
sample_type=self.sample_type,
27+
reference_genome=self.reference_genome,
28+
project_guid=self.project_guid,
29+
family_guids=self.family_guids,
3330
)
3431

3532
def complete(self) -> bool:
3633
if not self.dataset_type.has_lookup_table:
3734
return True
38-
return super().complete() and (
39-
# We don't have the concept of families being present or
40-
# not present in annotations table, thus we check the "lookup"
41-
# table OR a done flag to prevent the task from looping over itself.
42-
self.done
43-
or hl.eval(
44-
hl.bind(
45-
lambda family_guids: (
46-
hl.is_missing(family_guids) # The project itself is missing
47-
| hl.all(
48-
hl.array(list(self.family_guids)).map(
49-
lambda family_guid: ~hl.set(family_guids).contains(
50-
family_guid,
51-
),
52-
),
53-
)
54-
),
55-
hl.read_table(self.input().path).globals.project_families.get(
56-
self.project_guid,
57-
),
58-
),
59-
)
60-
)
35+
# We don't have the concept of families being present or not present
36+
# in annotations table a done flag to prevent the task from looping over itself.
37+
return super().complete() and self.done
6138

6239
def update_table(self, ht: hl.Table) -> hl.Table:
6340
if not self.dataset_type.has_lookup_table:
6441
return ht
6542
lookup_ht = hl.read_table(self.input().path)
66-
project_i = hl.eval(lookup_ht.globals.project_guids.index(self.project_guid))
67-
68-
# Filter lookup table to only the rows where any of the requested families are defined
69-
# (A family may be set to missing during lookup table construction if all values for a family are 0)
70-
family_guids_set = hl.set(list(self.family_guids))
71-
family_indexes = (
72-
hl.enumerate(lookup_ht.globals.project_families[self.project_guid])
73-
.filter(lambda item: family_guids_set.contains(item[1]))
74-
.map(lambda item: item[0])
75-
)
76-
lookup_ht = lookup_ht.filter(
77-
hl.any(
78-
family_indexes.map(
79-
lambda i: hl.is_defined(lookup_ht.project_stats[project_i][i]),
80-
),
81-
),
82-
)
83-
lookup_ht = remove_family_guids(lookup_ht, self.project_guid, family_guids_set)
84-
family_variants_ht = ht.semi_join(lookup_ht)
85-
family_variants_ht = family_variants_ht.annotate(
43+
ht = ht.semi_join(lookup_ht)
44+
ht = ht.annotate(
8645
**get_fields(
87-
family_variants_ht,
46+
ht,
8847
self.dataset_type.lookup_table_annotation_fns,
8948
lookup_ht=lookup_ht,
9049
**self.param_kwargs,
9150
),
9251
)
93-
ht = ht.anti_join(lookup_ht)
94-
ht = ht.union(family_variants_ht)
9552
self.done = True
9653
return ht

v03_pipeline/lib/tasks/update_variant_annotations_table_with_deleted_families_test.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,10 @@ def setUp(self) -> None:
8484
globals=hl.Struct(
8585
project_guids=['project_a', 'project_b'],
8686
project_families={'project_a': ['1', '2', '3'], 'project_b': ['4']},
87+
updates={
88+
hl.Struct(callset='abc', project_guid='project_a'),
89+
hl.Struct(callset='123', project_guid='project_b'),
90+
},
8791
),
8892
)
8993
ht.write(
@@ -158,10 +162,6 @@ def test_update_annotations_with_deleted_project(self) -> None:
158162
ht.collect(),
159163
[
160164
hl.Struct(id=0, gt_stats=hl.Struct(AC=9, AN=18, AF=0.5, hom=3)),
161-
hl.Struct(id=1, gt_stats=hl.Struct(AC=0, AN=1, AF=0.25, hom=0)),
162-
hl.Struct(
163-
id=2,
164-
gt_stats=hl.Struct(AC=0, AN=1, AF=0.25, hom=0),
165-
),
165+
hl.Struct(id=1, gt_stats=hl.Struct(AC=12, AN=24, AF=0.5, hom=4)),
166166
],
167167
)

0 commit comments

Comments
 (0)