Skip to content

Commit 6e2d31e

Browse files
committed
making progress
1 parent e3cae7a commit 6e2d31e

9 files changed

+71
-290
lines changed

v03_pipeline/lib/misc/callsets.py

Lines changed: 3 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,11 @@
66
from v03_pipeline.lib.paths import remapped_and_subsetted_callset_path
77

88

9-
def get_callset_ht( # noqa: PLR0913
9+
def get_callset_ht(
1010
reference_genome: ReferenceGenome,
1111
dataset_type: DatasetType,
12-
callset_paths: list[str],
12+
callset_path: str,
1313
project_guids: list[str],
14-
project_remap_paths: list[str],
15-
project_pedigree_paths: list[str],
16-
imputed_sex_paths: list[str] | None,
1714
):
1815
callset_hts = [
1916
hl.read_matrix_table(
@@ -24,58 +21,10 @@ def get_callset_ht( # noqa: PLR0913
2421
project_guid,
2522
),
2623
).rows()
27-
for (callset_path, project_guid, _, _, _) in callset_project_pairs(
28-
callset_paths,
29-
project_guids,
30-
project_remap_paths,
31-
project_pedigree_paths,
32-
imputed_sex_paths,
33-
)
24+
for project_guid in project_guids
3425
]
3526
callset_ht = functools.reduce(
3627
(lambda ht1, ht2: ht1.union(ht2, unify=True)),
3728
callset_hts,
3829
)
3930
return callset_ht.distinct()
40-
41-
42-
def callset_project_pairs(
43-
callset_paths: list[str],
44-
project_guids: list[str],
45-
project_remap_paths: list[str],
46-
project_pedigree_paths: list[str],
47-
imputed_sex_paths: list[str] | None,
48-
):
49-
if len(callset_paths) == len(project_guids):
50-
return zip(
51-
callset_paths,
52-
project_guids,
53-
project_remap_paths,
54-
project_pedigree_paths,
55-
imputed_sex_paths
56-
if imputed_sex_paths is not None
57-
else [None] * len(callset_paths),
58-
strict=True,
59-
)
60-
return (
61-
(
62-
callset_path,
63-
project_guid,
64-
project_remap_path,
65-
project_pedigree_path,
66-
imputed_sex_path,
67-
)
68-
for callset_path, imputed_sex_path in zip(
69-
callset_paths,
70-
imputed_sex_paths
71-
if imputed_sex_paths is not None
72-
else [None] * len(callset_paths),
73-
strict=False,
74-
)
75-
for (project_guid, project_remap_path, project_pedigree_path) in zip(
76-
project_guids,
77-
project_remap_paths,
78-
project_pedigree_paths,
79-
strict=True,
80-
)
81-
)

v03_pipeline/lib/tasks/base/base_loading_run_params.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,3 +34,11 @@ class BaseLoadingRunParams(luigi.Task):
3434
default=False,
3535
parsing=luigi.BoolParameter.EXPLICIT_PARSING,
3636
)
37+
is_new_gcnv_joint_call = luigi.BoolParameter(
38+
default=False,
39+
description='Is this a fully joint-called callset.',
40+
)
41+
liftover_ref_path = luigi.OptionalParameter(
42+
default='gs://hail-common/references/grch38_to_grch37.over.chain.gz',
43+
description='Path to GRCh38 to GRCh37 coordinates file',
44+
)

v03_pipeline/lib/tasks/update_project_table.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import hail as hl
22
import luigi
3+
import luigi.util
34

45
from v03_pipeline.lib.annotations.fields import get_fields
56
from v03_pipeline.lib.misc.family_entries import (
@@ -14,8 +15,10 @@
1415
from v03_pipeline.lib.tasks.write_remapped_and_subsetted_callset import (
1516
WriteRemappedAndSubsettedCallsetTask,
1617
)
18+
from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams
1719

1820

21+
@luigi.util.inherits(BaseLoadingRunParams)
1922
class UpdateProjectTableTask(BaseUpdateProjectTableTask):
2023
sample_type = luigi.EnumParameter(enum=SampleType)
2124
callset_path = luigi.Parameter()
v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples.py

Lines changed: 10 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -1,64 +1,33 @@
11
import hail as hl
22
import luigi
3+
import luigi.util
34

45
from v03_pipeline.lib.annotations.fields import get_fields
5-
from v03_pipeline.lib.misc.callsets import callset_project_pairs, get_callset_ht
6-
from v03_pipeline.lib.model import SampleType
6+
from v03_pipeline.lib.misc.callsets import get_callset_ht
77
from v03_pipeline.lib.paths import (
88
lookup_table_path,
99
new_variants_table_path,
1010
)
11+
from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams
1112
from v03_pipeline.lib.tasks.base.base_update_variant_annotations_table import (
1213
BaseUpdateVariantAnnotationsTableTask,
1314
)
1415
from v03_pipeline.lib.tasks.write_new_variants_table import WriteNewVariantsTableTask
1516

1617

18+
@luigi.util.inherits(BaseLoadingRunParams)
1719
class UpdateVariantAnnotationsTableWithNewSamplesTask(
1820
BaseUpdateVariantAnnotationsTableTask,
1921
):
20-
sample_type = luigi.EnumParameter(enum=SampleType)
21-
callset_paths = luigi.ListParameter()
2222
project_guids = luigi.ListParameter()
2323
project_remap_paths = luigi.ListParameter()
2424
project_pedigree_paths = luigi.ListParameter()
25-
imputed_sex_paths = luigi.ListParameter(default=None)
26-
ignore_missing_samples_when_remapping = luigi.BoolParameter(
27-
default=False,
28-
parsing=luigi.BoolParameter.EXPLICIT_PARSING,
29-
)
30-
validate = luigi.BoolParameter(
31-
default=True,
32-
parsing=luigi.BoolParameter.EXPLICIT_PARSING,
33-
)
34-
force = luigi.BoolParameter(
35-
default=False,
36-
parsing=luigi.BoolParameter.EXPLICIT_PARSING,
37-
)
38-
liftover_ref_path = luigi.OptionalParameter(
39-
default='gs://hail-common/references/grch38_to_grch37.over.chain.gz',
40-
description='Path to GRCh38 to GRCh37 coordinates file',
41-
)
4225
run_id = luigi.Parameter()
4326

4427
def requires(self) -> list[luigi.Task]:
4528
return [
4629
*super().requires(),
47-
WriteNewVariantsTableTask(
48-
self.reference_genome,
49-
self.dataset_type,
50-
self.sample_type,
51-
self.callset_paths,
52-
self.project_guids,
53-
self.project_remap_paths,
54-
self.project_pedigree_paths,
55-
self.imputed_sex_paths,
56-
self.ignore_missing_samples_when_remapping,
57-
self.validate,
58-
self.force,
59-
self.liftover_ref_path,
60-
self.run_id,
61-
),
30+
self.clone(WriteNewVariantsTableTask),
6231
]
6332

6433
def complete(self) -> bool:
@@ -71,23 +40,11 @@ def complete(self) -> bool:
7140
[
7241
updates.contains(
7342
hl.Struct(
74-
callset=callset_path,
43+
callset=self.callset_path,
7544
project_guid=project_guid,
7645
),
7746
)
78-
for (
79-
callset_path,
80-
project_guid,
81-
_,
82-
_,
83-
_,
84-
) in callset_project_pairs(
85-
self.callset_paths,
86-
self.project_guids,
87-
self.project_remap_paths,
88-
self.project_pedigree_paths,
89-
self.imputed_sex_paths,
90-
)
47+
for project_guid in self.project_guids
9148
],
9249
),
9350
hl.read_table(self.output().path).updates,
@@ -110,11 +67,8 @@ def update_table(self, ht: hl.Table) -> hl.Table:
11067
callset_ht = get_callset_ht(
11168
self.reference_genome,
11269
self.dataset_type,
113-
self.callset_paths,
70+
self.callset_path,
11471
self.project_guids,
115-
self.project_remap_paths,
116-
self.project_pedigree_paths,
117-
self.imputed_sex_paths,
11872
)
11973
# new_variants_ht consists of variants present in the new callset, fully annotated,
12074
# but NOT present in the existing annotations table.
@@ -142,20 +96,8 @@ def update_table(self, ht: hl.Table) -> hl.Table:
14296
return ht.annotate_globals(
14397
updates=ht.updates.union(
14498
{
145-
hl.Struct(callset=callset_path, project_guid=project_guid)
146-
for (
147-
callset_path,
148-
project_guid,
149-
_,
150-
_,
151-
_,
152-
) in callset_project_pairs(
153-
self.callset_paths,
154-
self.project_guids,
155-
self.project_remap_paths,
156-
self.project_pedigree_paths,
157-
self.imputed_sex_paths,
158-
)
99+
hl.Struct(callset=self.callset_path, project_guid=project_guid)
100+
for project_guid in self.project_guids
159101
},
160102
),
161103
)

v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,7 @@ def test_missing_pedigree(
155155
reference_genome=ReferenceGenome.GRCh38,
156156
dataset_type=DatasetType.SNV_INDEL,
157157
sample_type=SampleType.WGS,
158-
callset_paths=[TEST_SNV_INDEL_VCF],
158+
callset_path=TEST_SNV_INDEL_VCF,
159159
project_guids=['R0113_test_project'],
160160
project_remap_paths=[TEST_REMAP],
161161
project_pedigree_paths=['bad_pedigree'],
@@ -189,7 +189,7 @@ def test_missing_interval_reference(
189189
reference_genome=ReferenceGenome.GRCh38,
190190
dataset_type=DatasetType.SNV_INDEL,
191191
sample_type=SampleType.WGS,
192-
callset_paths=[TEST_SNV_INDEL_VCF],
192+
callset_path=TEST_SNV_INDEL_VCF,
193193
project_guids=['R0113_test_project'],
194194
project_remap_paths=[TEST_REMAP],
195195
project_pedigree_paths=[TEST_PEDIGREE_3],
@@ -351,7 +351,7 @@ def test_multiple_update_vat(
351351
reference_genome=ReferenceGenome.GRCh38,
352352
dataset_type=DatasetType.SNV_INDEL,
353353
sample_type=SampleType.WGS,
354-
callset_paths=[TEST_SNV_INDEL_VCF],
354+
callset_path=TEST_SNV_INDEL_VCF,
355355
project_guids=['R0113_test_project'],
356356
project_remap_paths=[TEST_REMAP],
357357
project_pedigree_paths=[TEST_PEDIGREE_3],
@@ -403,7 +403,7 @@ def test_multiple_update_vat(
403403
reference_genome=ReferenceGenome.GRCh38,
404404
dataset_type=DatasetType.SNV_INDEL,
405405
sample_type=SampleType.WGS,
406-
callset_paths=[TEST_SNV_INDEL_VCF],
406+
callset_path=TEST_SNV_INDEL_VCF,
407407
project_guids=['R0114_project4'],
408408
project_remap_paths=[TEST_REMAP],
409409
project_pedigree_paths=[TEST_PEDIGREE_4],
@@ -662,7 +662,7 @@ def test_update_vat_grch37(
662662
reference_genome=ReferenceGenome.GRCh37,
663663
dataset_type=DatasetType.SNV_INDEL,
664664
sample_type=SampleType.WGS,
665-
callset_paths=[TEST_SNV_INDEL_VCF],
665+
callset_path=TEST_SNV_INDEL_VCF,
666666
project_guids=['R0113_test_project'],
667667
project_remap_paths=[TEST_REMAP],
668668
project_pedigree_paths=[TEST_PEDIGREE_3],
@@ -735,7 +735,7 @@ def test_update_vat_without_accessing_private_datasets(
735735
reference_genome=ReferenceGenome.GRCh38,
736736
dataset_type=DatasetType.SNV_INDEL,
737737
sample_type=SampleType.WGS,
738-
callset_paths=[TEST_SNV_INDEL_VCF],
738+
callset_path=TEST_SNV_INDEL_VCF,
739739
project_guids=['R0113_test_project'],
740740
project_remap_paths=[TEST_REMAP],
741741
project_pedigree_paths=[TEST_PEDIGREE_3],
@@ -793,7 +793,7 @@ def test_mito_update_vat(
793793
reference_genome=ReferenceGenome.GRCh38,
794794
dataset_type=DatasetType.MITO,
795795
sample_type=SampleType.WGS,
796-
callset_paths=[TEST_MITO_MT],
796+
callset_path=TEST_MITO_MT,
797797
project_guids=['R0115_test_project2'],
798798
project_remap_paths=['not_a_real_file'],
799799
project_pedigree_paths=[TEST_PEDIGREE_5],
@@ -1058,7 +1058,7 @@ def test_sv_update_vat(
10581058
reference_genome=ReferenceGenome.GRCh38,
10591059
dataset_type=DatasetType.SV,
10601060
sample_type=SampleType.WGS,
1061-
callset_paths=[TEST_SV_VCF],
1061+
callset_path=TEST_SV_VCF,
10621062
project_guids=['R0115_test_project2'],
10631063
project_remap_paths=['not_a_real_file'],
10641064
project_pedigree_paths=[TEST_PEDIGREE_5],
@@ -1620,7 +1620,7 @@ def test_gcnv_update_vat(
16201620
reference_genome=ReferenceGenome.GRCh38,
16211621
dataset_type=DatasetType.GCNV,
16221622
sample_type=SampleType.WES,
1623-
callset_paths=[TEST_GCNV_BED_FILE],
1623+
callset_path=TEST_GCNV_BED_FILE,
16241624
project_guids=['R0115_test_project2'],
16251625
project_remap_paths=['not_a_real_file'],
16261626
project_pedigree_paths=[TEST_PEDIGREE_5],

v03_pipeline/lib/tasks/write_family_table.py

Lines changed: 4 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,35 +1,21 @@
11
import hail as hl
22
import luigi
3+
import luigi.util
34

4-
from v03_pipeline.lib.model import SampleType
55
from v03_pipeline.lib.paths import family_table_path
6+
from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams
67
from v03_pipeline.lib.tasks.base.base_write import BaseWriteTask
78
from v03_pipeline.lib.tasks.files import GCSorLocalTarget
89
from v03_pipeline.lib.tasks.update_project_table import (
910
UpdateProjectTableTask,
1011
)
1112

1213

14+
@luigi.util.inherits(BaseLoadingRunParams)
1315
class WriteFamilyTableTask(BaseWriteTask):
14-
sample_type = luigi.EnumParameter(enum=SampleType)
15-
callset_path = luigi.Parameter()
1616
project_guid = luigi.Parameter()
1717
project_remap_path = luigi.Parameter()
1818
project_pedigree_path = luigi.Parameter()
19-
imputed_sex_path = luigi.Parameter(default=None)
20-
ignore_missing_samples_when_remapping = luigi.BoolParameter(
21-
parsing=luigi.BoolParameter.EXPLICIT_PARSING,
22-
)
23-
validate = luigi.BoolParameter(
24-
parsing=luigi.BoolParameter.EXPLICIT_PARSING,
25-
)
26-
force = luigi.BoolParameter(
27-
default=False,
28-
parsing=luigi.BoolParameter.EXPLICIT_PARSING,
29-
)
30-
is_new_gcnv_joint_call = luigi.BoolParameter(
31-
description='Is this a fully joint-called callset.',
32-
)
3319
family_guid = luigi.Parameter()
3420

3521
def output(self) -> luigi.Target:
@@ -51,20 +37,7 @@ def complete(self) -> bool:
5137
)
5238

5339
def requires(self) -> luigi.Task:
54-
return UpdateProjectTableTask(
55-
self.reference_genome,
56-
self.dataset_type,
57-
self.project_guid,
58-
self.sample_type,
59-
self.callset_path,
60-
self.project_remap_path,
61-
self.project_pedigree_path,
62-
self.imputed_sex_path,
63-
self.ignore_missing_samples_when_remapping,
64-
self.validate,
65-
False,
66-
self.is_new_gcnv_joint_call,
67-
)
40+
return self.clone(UpdateProjectTableTask, force=False)
6841

6942
def create_table(self) -> hl.Table:
7043
project_ht = hl.read_table(self.input().path)

0 commit comments

Comments
 (0)