Skip to content

Commit 91de7c5

Browse files
authored
[optimization] read family tables directly from project table. (#769)
* Delete project tasks
* cleanup
* ruff format
* well
* rename
* hacking away
* almost there!
* ruff
* Fix missing updates change
* ruff
* Remove debug code
* remove bad merge
* more precision in test
* project table
* allow for missing project
* remove some unnecessary checks
* test already deleted family
* Lots of renames
* More updates
* Sketch
* Flesh out test
* fix paths
* Rename base hail table
* a bunch more renames
* delete project table
* Add delete project families
* is it that simple?
* add comment
* test it!
* Fix
* add dep
1 parent b9b075f commit 91de7c5

File tree

2 files changed

+17
-45
lines changed

2 files changed

+17
-45
lines changed

v03_pipeline/lib/tasks/write_family_table.py

Lines changed: 10 additions & 37 deletions
Original file line number | Diff line number | Diff line change
@@ -1,16 +1,11 @@
11
import hail as hl
22
import luigi
33

4-
from v03_pipeline.lib.annotations.fields import get_fields
5-
from v03_pipeline.lib.misc.family_entries import compute_callset_family_entries_ht
6-
from v03_pipeline.lib.misc.io import import_pedigree
7-
from v03_pipeline.lib.misc.pedigree import parse_pedigree_ht_to_families
8-
from v03_pipeline.lib.misc.sample_ids import subset_samples
94
from v03_pipeline.lib.paths import family_table_path
105
from v03_pipeline.lib.tasks.base.base_write import BaseWriteTask
116
from v03_pipeline.lib.tasks.files import GCSorLocalTarget
12-
from v03_pipeline.lib.tasks.write_remapped_and_subsetted_callset import (
13-
WriteRemappedAndSubsettedCallsetTask,
7+
from v03_pipeline.lib.tasks.update_project_table import (
8+
UpdateProjectTableTask,
149
)
1510

1611

@@ -56,50 +51,28 @@ def complete(self) -> bool:
5651
)
5752

5853
def requires(self) -> luigi.Task:
59-
return WriteRemappedAndSubsettedCallsetTask(
54+
return UpdateProjectTableTask(
6055
self.reference_genome,
6156
self.dataset_type,
6257
self.sample_type,
63-
self.callset_path,
6458
self.project_guid,
59+
self.callset_path,
6560
self.project_remap_path,
6661
self.project_pedigree_path,
6762
self.ignore_missing_samples_when_subsetting,
6863
self.ignore_missing_samples_when_remapping,
6964
self.validate,
7065
False,
66+
self.is_new_gcnv_joint_call,
7167
)
7268

7369
def create_table(self) -> hl.Table:
74-
callset_mt = hl.read_matrix_table(self.input().path)
75-
pedigree_ht = import_pedigree(self.project_pedigree_path)
76-
families = parse_pedigree_ht_to_families(pedigree_ht)
77-
family = next(
78-
iter(
79-
family for family in families if family.family_guid == self.family_guid
80-
),
81-
)
82-
callset_mt = subset_samples(
83-
callset_mt,
84-
hl.Table.parallelize(
85-
[{'s': sample_id} for sample_id in family.samples],
86-
hl.tstruct(s=hl.dtype('str')),
87-
key='s',
88-
),
89-
False,
90-
)
91-
ht = compute_callset_family_entries_ht(
92-
self.dataset_type,
93-
callset_mt,
94-
get_fields(
95-
callset_mt,
96-
self.dataset_type.genotype_entry_annotation_fns,
97-
**self.param_kwargs,
98-
),
99-
)
100-
ht = ht.transmute(
101-
entries=hl.flatten(ht.family_entries),
70+
project_ht = hl.read_table(self.input().path)
71+
family_i = project_ht.globals.family_guids.index(self.family_guid)
72+
ht = project_ht.transmute(
73+
entries=project_ht.family_entries[family_i],
10274
)
75+
ht = ht.filter(hl.is_defined(ht.entries))
10376
return ht.select_globals(
10477
sample_ids=ht.family_samples[self.family_guid],
10578
sample_type=self.sample_type.value,

v03_pipeline/lib/tasks/write_project_family_tables.py

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,8 @@
22
import luigi
33

44
from v03_pipeline.lib.tasks.base.base_hail_table import BaseHailTableTask
5+
from v03_pipeline.lib.tasks.update_project_table import UpdateProjectTableTask
56
from v03_pipeline.lib.tasks.write_family_table import WriteFamilyTableTask
6-
from v03_pipeline.lib.tasks.write_remapped_and_subsetted_callset import (
7-
WriteRemappedAndSubsettedCallsetTask,
8-
)
97

108

119
class WriteProjectFamilyTablesTask(BaseHailTableTask):
@@ -50,22 +48,23 @@ def complete(self) -> bool:
5048

5149
def run(self):
5250
# https://luigi.readthedocs.io/en/stable/tasks.html#dynamic-dependencies
53-
rmsct_output: luigi.Target = yield WriteRemappedAndSubsettedCallsetTask(
51+
update_project_table_task: luigi.Target = yield UpdateProjectTableTask(
5452
self.reference_genome,
5553
self.dataset_type,
5654
self.sample_type,
57-
self.callset_path,
5855
self.project_guid,
56+
self.callset_path,
5957
self.project_remap_path,
6058
self.project_pedigree_path,
6159
self.ignore_missing_samples_when_subsetting,
6260
self.ignore_missing_samples_when_remapping,
6361
self.validate,
6462
False,
63+
self.is_new_gcnv_joint_call,
6564
)
66-
callset_mt = hl.read_matrix_table(rmsct_output.path)
67-
family_samples = hl.eval(callset_mt.globals.family_samples)
68-
for family_guid in family_samples:
65+
project_ht = hl.read_table(update_project_table_task.path)
66+
family_guids = hl.eval(project_ht.globals.family_guids)
67+
for family_guid in family_guids:
6968
self.dynamic_write_family_table_tasks.add(
7069
WriteFamilyTableTask(
7170
**self.param_kwargs,

0 commit comments

Comments
 (0)