Skip to content

Commit ed4364f

Browse files
authored
Benb/use metadata as source of family table load (#936)
* use run metadata as source of family table load
* ruff
1 parent 799ff8f commit ed4364f

File tree

3 files changed

+86
-28
lines changed

3 files changed

+86
-28
lines changed

v03_pipeline/lib/tasks/write_project_family_tables.py

Lines changed: 22 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,13 @@
22
import luigi
33
import luigi.util
44

5-
from v03_pipeline.lib.misc.io import import_pedigree
6-
from v03_pipeline.lib.misc.pedigree import parse_pedigree_ht_to_families
5+
from v03_pipeline.lib.paths import remapped_and_subsetted_callset_path
76
from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams
8-
from v03_pipeline.lib.tasks.files import RawFileTask
97
from v03_pipeline.lib.tasks.update_project_table import UpdateProjectTableTask
108
from v03_pipeline.lib.tasks.write_family_table import WriteFamilyTableTask
9+
from v03_pipeline.lib.tasks.write_remapped_and_subsetted_callset import (
10+
WriteRemappedAndSubsettedCallsetTask,
11+
)
1112

1213

1314
@luigi.util.inherits(BaseLoadingRunParams)
@@ -26,27 +27,26 @@ def complete(self) -> bool:
2627
for write_family_table_task in self.dynamic_write_family_table_tasks
2728
)
2829

29-
def run(self):
30-
# https://luigi.readthedocs.io/en/stable/tasks.html#dynamic-dependencies
31-
# Fetch family guids from project table
32-
update_project_table_task: luigi.Target = yield self.clone(
33-
UpdateProjectTableTask,
34-
)
35-
project_ht = hl.read_table(update_project_table_task.path)
36-
family_guids_in_project_table = set(hl.eval(project_ht.globals.family_guids))
30+
def requires(self) -> list[luigi.Task]:
31+
return [
32+
self.clone(
33+
WriteRemappedAndSubsettedCallsetTask,
34+
),
35+
self.clone(
36+
UpdateProjectTableTask,
37+
),
38+
]
3739

38-
# Fetch family guids from pedigree
39-
pedigree_ht_task: luigi.Target = yield RawFileTask(self.project_pedigree_path)
40-
pedigree_ht = import_pedigree(pedigree_ht_task.path)
41-
families_guids_in_pedigree = {
42-
f.family_guid for f in parse_pedigree_ht_to_families(pedigree_ht)
43-
}
44-
45-
# Intersect them
46-
family_guids_to_load = (
47-
family_guids_in_project_table & families_guids_in_pedigree
40+
def run(self):
41+
ht = hl.read_matrix_table(
42+
remapped_and_subsetted_callset_path(
43+
self.reference_genome,
44+
self.dataset_type,
45+
self.callset_path,
46+
self.project_guid,
47+
),
4848
)
49-
for family_guid in family_guids_to_load:
49+
for family_guid in set(hl.eval(ht.globals.family_samples).keys()):
5050
self.dynamic_write_family_table_tasks.add(
5151
self.clone(WriteFamilyTableTask, family_guid=family_guid),
5252
)

v03_pipeline/lib/tasks/write_project_family_tables_test.py

Lines changed: 62 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,10 @@
22
import luigi.worker
33

44
from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType
5-
from v03_pipeline.lib.paths import project_table_path
5+
from v03_pipeline.lib.paths import (
6+
project_table_path,
7+
remapped_and_subsetted_callset_path,
8+
)
69
from v03_pipeline.lib.tasks.write_project_family_tables import (
710
WriteProjectFamilyTablesTask,
811
)
@@ -38,6 +41,33 @@ def test_snv_write_project_family_tables_task(self) -> None:
3841
hl.read_table(write_family_table_task.output().path)
3942
for write_family_table_task in write_project_family_tables.dynamic_write_family_table_tasks
4043
]
44+
# Validate remapped and subsetted callset families
45+
remapped_and_subsetted_callset = hl.read_matrix_table(
46+
remapped_and_subsetted_callset_path(
47+
ReferenceGenome.GRCh38,
48+
DatasetType.SNV_INDEL,
49+
TEST_SNV_INDEL_VCF,
50+
'R0113_test_project',
51+
),
52+
)
53+
self.assertCountEqual(
54+
hl.eval(remapped_and_subsetted_callset.globals.family_samples.keys()),
55+
{
56+
'123_1',
57+
'234_1',
58+
'345_1',
59+
'456_1',
60+
'567_1',
61+
'678_1',
62+
'789_1',
63+
'890_1',
64+
'901_1',
65+
'bcd_1',
66+
'cde_1',
67+
'def_1',
68+
'efg_1',
69+
},
70+
)
4171
self.assertCountEqual(
4272
[ht.globals.sample_ids.collect() for ht in hts],
4373
[
@@ -73,13 +103,39 @@ def test_snv_write_project_family_tables_task(self) -> None:
73103
worker.run()
74104
self.assertTrue(write_project_family_tables_subset.complete())
75105
hts = [
76-
hl.read_table(write_family_table_task.output().path)
106+
write_family_table_task.output().path
77107
for write_family_table_task in write_project_family_tables_subset.dynamic_write_family_table_tasks
78108
]
79-
# Only one family table written
80-
self.assertEqual(
81-
len(hts),
82-
1,
109+
self.assertTrue(len(hts))
110+
self.assertTrue(
111+
'123_1' in hts[0],
112+
)
113+
# Validate remapped and subsetted callset families
114+
# (and that it was re-written)
115+
remapped_and_subsetted_callset = hl.read_matrix_table(
116+
remapped_and_subsetted_callset_path(
117+
ReferenceGenome.GRCh38,
118+
DatasetType.SNV_INDEL,
119+
TEST_SNV_INDEL_VCF,
120+
'R0113_test_project',
121+
),
122+
)
123+
self.assertCountEqual(
124+
hl.eval(remapped_and_subsetted_callset.globals.family_samples.keys()),
125+
{'123_1'},
126+
)
127+
self.assertCountEqual(
128+
hl.eval(remapped_and_subsetted_callset.globals.failed_family_samples),
129+
hl.Struct(
130+
missing_samples={
131+
'234_1': {
132+
'reasons': ["Missing samples: {'NA19678_999'}"],
133+
'samples': ['NA19678_1', 'NA19678_999'],
134+
},
135+
},
136+
relatedness_check={},
137+
sex_check={},
138+
),
83139
)
84140
# Project table still contains all family guids
85141
self.assertCountEqual(
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,4 @@
11
Project_GUID Family_GUID Family_ID Individual_ID Paternal_ID Maternal_ID Sex
22
R0114_project4 123_1 123 NA19675_1 F
3+
R0114_project4 234_1 234 NA19678_1 M
4+
R0114_project4 234_1 234 NA19678_999 F

0 commit comments

Comments
 (0)