Skip to content

Commit be76f32

Browse files
committed
Merge branch 'main' of github.com:broadinstitute/seqr-loading-pipelines
2 parents f5ca273 + 9801838 commit be76f32

15 files changed

+154
-55
lines changed

v03_pipeline/lib/model/definitions.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,7 @@ class Sex(Enum):
1717
class PipelineVersion(Enum):
1818
V02 = 'v02'
1919
V03 = 'v03'
20+
V3_1 = 'v3.1'
2021

2122

2223
class ReferenceGenome(Enum):

v03_pipeline/lib/paths.py

Lines changed: 17 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -14,14 +14,14 @@
1414
)
1515

1616

17-
def _v03_pipeline_prefix(
17+
def _pipeline_prefix(
1818
root: str,
1919
reference_genome: ReferenceGenome,
2020
dataset_type: DatasetType,
2121
) -> str:
2222
return os.path.join(
2323
root,
24-
PipelineVersion.V03.value,
24+
PipelineVersion.V3_1.value,
2525
reference_genome.value,
2626
dataset_type.value,
2727
)
@@ -62,15 +62,17 @@ def cached_reference_dataset_query_path(
6262
def family_table_path(
6363
reference_genome: ReferenceGenome,
6464
dataset_type: DatasetType,
65+
sample_type: SampleType,
6566
family_guid: str,
6667
) -> str:
6768
return os.path.join(
68-
_v03_pipeline_prefix(
69+
_pipeline_prefix(
6970
Env.HAIL_SEARCH_DATA,
7071
reference_genome,
7172
dataset_type,
7273
),
7374
'families',
75+
sample_type.value,
7476
f'{family_guid}.ht',
7577
)
7678

@@ -81,7 +83,7 @@ def imputed_sex_path(
8183
callset_path: str,
8284
) -> str:
8385
return os.path.join(
84-
_v03_pipeline_prefix(
86+
_pipeline_prefix(
8587
Env.LOADING_DATASETS,
8688
reference_genome,
8789
dataset_type,
@@ -97,7 +99,7 @@ def imported_callset_path(
9799
callset_path: str,
98100
) -> str:
99101
return os.path.join(
100-
_v03_pipeline_prefix(
102+
_pipeline_prefix(
101103
Env.LOADING_DATASETS,
102104
reference_genome,
103105
dataset_type,
@@ -125,15 +127,17 @@ def metadata_for_run_path(
125127
def project_table_path(
126128
reference_genome: ReferenceGenome,
127129
dataset_type: DatasetType,
130+
sample_type: SampleType,
128131
project_guid: str,
129132
) -> str:
130133
return os.path.join(
131-
_v03_pipeline_prefix(
134+
_pipeline_prefix(
132135
Env.HAIL_SEARCH_DATA,
133136
reference_genome,
134137
dataset_type,
135138
),
136139
'projects',
140+
sample_type.value,
137141
f'{project_guid}.ht',
138142
)
139143

@@ -144,7 +148,7 @@ def relatedness_check_table_path(
144148
callset_path: str,
145149
) -> str:
146150
return os.path.join(
147-
_v03_pipeline_prefix(
151+
_pipeline_prefix(
148152
Env.LOADING_DATASETS,
149153
reference_genome,
150154
dataset_type,
@@ -161,7 +165,7 @@ def remapped_and_subsetted_callset_path(
161165
project_guid: str,
162166
) -> str:
163167
return os.path.join(
164-
_v03_pipeline_prefix(
168+
_pipeline_prefix(
165169
Env.LOADING_DATASETS,
166170
reference_genome,
167171
dataset_type,
@@ -177,7 +181,7 @@ def lookup_table_path(
177181
dataset_type: DatasetType,
178182
) -> str:
179183
return os.path.join(
180-
_v03_pipeline_prefix(
184+
_pipeline_prefix(
181185
Env.HAIL_SEARCH_DATA,
182186
reference_genome,
183187
dataset_type,
@@ -191,7 +195,7 @@ def runs_path(
191195
dataset_type: DatasetType,
192196
) -> str:
193197
return os.path.join(
194-
_v03_pipeline_prefix(
198+
_pipeline_prefix(
195199
Env.HAIL_SEARCH_DATA,
196200
reference_genome,
197201
dataset_type,
@@ -206,7 +210,7 @@ def sex_check_table_path(
206210
callset_path: str,
207211
) -> str:
208212
return os.path.join(
209-
_v03_pipeline_prefix(
213+
_pipeline_prefix(
210214
Env.LOADING_DATASETS,
211215
reference_genome,
212216
dataset_type,
@@ -260,7 +264,7 @@ def variant_annotations_table_path(
260264
dataset_type: DatasetType,
261265
) -> str:
262266
return os.path.join(
263-
_v03_pipeline_prefix(
267+
_pipeline_prefix(
264268
Env.HAIL_SEARCH_DATA,
265269
reference_genome,
266270
dataset_type,
@@ -274,7 +278,7 @@ def variant_annotations_vcf_path(
274278
dataset_type: DatasetType,
275279
) -> str:
276280
return os.path.join(
277-
_v03_pipeline_prefix(
281+
_pipeline_prefix(
278282
Env.HAIL_SEARCH_DATA,
279283
reference_genome,
280284
dataset_type,

v03_pipeline/lib/paths_test.py

Lines changed: 16 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -42,19 +42,21 @@ def test_family_table_path(self) -> None:
4242
family_table_path(
4343
ReferenceGenome.GRCh37,
4444
DatasetType.SNV_INDEL,
45+
SampleType.WES,
4546
'franklin',
4647
),
47-
'/hail-search-data/v03/GRCh37/SNV_INDEL/families/franklin.ht',
48+
'/hail-search-data/v3.1/GRCh37/SNV_INDEL/families/WES/franklin.ht',
4849
)
4950
with patch('v03_pipeline.lib.paths.Env') as mock_env:
5051
mock_env.HAIL_SEARCH_DATA = 'gs://seqr-datasets/'
5152
self.assertEqual(
5253
family_table_path(
5354
ReferenceGenome.GRCh37,
5455
DatasetType.SNV_INDEL,
56+
SampleType.WES,
5557
'franklin',
5658
),
57-
'gs://seqr-datasets/v03/GRCh37/SNV_INDEL/families/franklin.ht',
59+
'gs://seqr-datasets/v3.1/GRCh37/SNV_INDEL/families/WES/franklin.ht',
5860
)
5961

6062
def test_valid_filters_path(self) -> None:
@@ -82,9 +84,10 @@ def test_project_table_path(self) -> None:
8284
project_table_path(
8385
ReferenceGenome.GRCh38,
8486
DatasetType.MITO,
87+
SampleType.WES,
8588
'R0652_pipeline_test',
8689
),
87-
'/hail-search-data/v03/GRCh38/MITO/projects/R0652_pipeline_test.ht',
90+
'/hail-search-data/v3.1/GRCh38/MITO/projects/WES/R0652_pipeline_test.ht',
8891
)
8992

9093
def test_valid_reference_dataset_collection_path(self) -> None:
@@ -113,7 +116,7 @@ def test_lookup_table_path(self) -> None:
113116
ReferenceGenome.GRCh37,
114117
DatasetType.SV,
115118
),
116-
'/hail-search-data/v03/GRCh37/SV/lookup.ht',
119+
'/hail-search-data/v3.1/GRCh37/SV/lookup.ht',
117120
)
118121

119122
def test_sex_check_table_path(self) -> None:
@@ -123,7 +126,7 @@ def test_sex_check_table_path(self) -> None:
123126
DatasetType.SNV_INDEL,
124127
'gs://abc.efg/callset.vcf.gz',
125128
),
126-
'/seqr-loading-temp/v03/GRCh38/SNV_INDEL/sex_check/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.ht',
129+
'/seqr-loading-temp/v3.1/GRCh38/SNV_INDEL/sex_check/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.ht',
127130
)
128131

129132
def test_relatedness_check_table_path(self) -> None:
@@ -133,7 +136,7 @@ def test_relatedness_check_table_path(self) -> None:
133136
DatasetType.SNV_INDEL,
134137
'gs://abc.efg/callset.vcf.gz',
135138
),
136-
'/seqr-loading-temp/v03/GRCh38/SNV_INDEL/relatedness_check/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.ht',
139+
'/seqr-loading-temp/v3.1/GRCh38/SNV_INDEL/relatedness_check/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.ht',
137140
)
138141

139142
def test_metadata_for_run_path(self) -> None:
@@ -143,7 +146,7 @@ def test_metadata_for_run_path(self) -> None:
143146
DatasetType.SNV_INDEL,
144147
'manual__2023-06-26T18:30:09.349671+00:00',
145148
),
146-
'/hail-search-data/v03/GRCh38/SNV_INDEL/runs/manual__2023-06-26T18:30:09.349671+00:00/metadata.json',
149+
'/hail-search-data/v3.1/GRCh38/SNV_INDEL/runs/manual__2023-06-26T18:30:09.349671+00:00/metadata.json',
147150
)
148151

149152
def test_variant_annotations_table_path(self) -> None:
@@ -152,7 +155,7 @@ def test_variant_annotations_table_path(self) -> None:
152155
ReferenceGenome.GRCh38,
153156
DatasetType.GCNV,
154157
),
155-
'/hail-search-data/v03/GRCh38/GCNV/annotations.ht',
158+
'/hail-search-data/v3.1/GRCh38/GCNV/annotations.ht',
156159
)
157160

158161
def test_remapped_and_subsetted_callset_path(self) -> None:
@@ -163,7 +166,7 @@ def test_remapped_and_subsetted_callset_path(self) -> None:
163166
'gs://abc.efg/callset.vcf.gz',
164167
'R0111_tgg_bblanken_wes',
165168
),
166-
'/seqr-loading-temp/v03/GRCh38/GCNV/remapped_and_subsetted_callsets/R0111_tgg_bblanken_wes/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.mt',
169+
'/seqr-loading-temp/v3.1/GRCh38/GCNV/remapped_and_subsetted_callsets/R0111_tgg_bblanken_wes/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.mt',
167170
)
168171
self.assertEqual(
169172
remapped_and_subsetted_callset_path(
@@ -172,7 +175,7 @@ def test_remapped_and_subsetted_callset_path(self) -> None:
172175
'gs://abc.efg/callset/*.vcf.gz',
173176
'R0111_tgg_bblanken_wes',
174177
),
175-
'/seqr-loading-temp/v03/GRCh38/GCNV/remapped_and_subsetted_callsets/R0111_tgg_bblanken_wes/bce53ccdb49a5ed2513044e1d0c6224e3ffcc323f770dc807d9175fd3c70a050.mt',
178+
'/seqr-loading-temp/v3.1/GRCh38/GCNV/remapped_and_subsetted_callsets/R0111_tgg_bblanken_wes/bce53ccdb49a5ed2513044e1d0c6224e3ffcc323f770dc807d9175fd3c70a050.mt',
176179
)
177180

178181
def test_imported_callset_path(self) -> None:
@@ -182,7 +185,7 @@ def test_imported_callset_path(self) -> None:
182185
DatasetType.SNV_INDEL,
183186
'gs://abc.efg/callset.vcf.gz',
184187
),
185-
'/seqr-loading-temp/v03/GRCh38/SNV_INDEL/imported_callsets/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.mt',
188+
'/seqr-loading-temp/v3.1/GRCh38/SNV_INDEL/imported_callsets/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.mt',
186189
)
187190

188191
def test_imputed_sex_path(self) -> None:
@@ -192,7 +195,7 @@ def test_imputed_sex_path(self) -> None:
192195
DatasetType.SNV_INDEL,
193196
'gs://abc.efg/callset.vcf.gz',
194197
),
195-
'/seqr-loading-temp/v03/GRCh38/SNV_INDEL/imputed_sex/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.tsv',
198+
'/seqr-loading-temp/v3.1/GRCh38/SNV_INDEL/imputed_sex/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.tsv',
196199
)
197200

198201
def test_new_variants_table_path(self) -> None:
@@ -202,5 +205,5 @@ def test_new_variants_table_path(self) -> None:
202205
DatasetType.SNV_INDEL,
203206
'manual__2023-06-26T18:30:09.349671+00:00',
204207
),
205-
'/hail-search-data/v03/GRCh38/SNV_INDEL/runs/manual__2023-06-26T18:30:09.349671+00:00/new_variants.ht',
208+
'/hail-search-data/v3.1/GRCh38/SNV_INDEL/runs/manual__2023-06-26T18:30:09.349671+00:00/new_variants.ht',
206209
)

v03_pipeline/lib/tasks/base/base_update_project_table.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,19 +1,22 @@
11
import hail as hl
22
import luigi
33

4+
from v03_pipeline.lib.model import SampleType
45
from v03_pipeline.lib.paths import project_table_path
56
from v03_pipeline.lib.tasks.base.base_update import BaseUpdateTask
67
from v03_pipeline.lib.tasks.files import GCSorLocalTarget
78

89

910
class BaseUpdateProjectTableTask(BaseUpdateTask):
11+
sample_type = luigi.EnumParameter(enum=SampleType)
1012
project_guid = luigi.Parameter()
1113

1214
def output(self) -> luigi.Target:
1315
return GCSorLocalTarget(
1416
project_table_path(
1517
self.reference_genome,
1618
self.dataset_type,
19+
self.sample_type,
1720
self.project_guid,
1821
),
1922
)

v03_pipeline/lib/tasks/delete_family_table.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,18 +1,21 @@
11
import luigi
22

3+
from v03_pipeline.lib.model import SampleType
34
from v03_pipeline.lib.paths import family_table_path
45
from v03_pipeline.lib.tasks.base.base_delete_table import BaseDeleteTableTask
56
from v03_pipeline.lib.tasks.files import GCSorLocalTarget
67

78

89
class DeleteFamilyTableTask(BaseDeleteTableTask):
10+
sample_type = luigi.EnumParameter(enum=SampleType)
911
family_guid = luigi.Parameter()
1012

1113
def output(self) -> luigi.Target:
1214
return GCSorLocalTarget(
1315
family_table_path(
1416
self.reference_genome,
1517
self.dataset_type,
18+
self.sample_type,
1619
self.family_guid,
1720
),
1821
)

v03_pipeline/lib/tasks/delete_family_table_test.py

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,7 @@
33
import hail as hl
44
import luigi.worker
55

6-
from v03_pipeline.lib.model import DatasetType, ReferenceGenome
6+
from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType
77
from v03_pipeline.lib.paths import family_table_path
88
from v03_pipeline.lib.tasks.delete_family_table import DeleteFamilyTableTask
99
from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase
@@ -41,6 +41,7 @@ def setUp(self) -> None:
4141
family_table_path(
4242
ReferenceGenome.GRCh38,
4343
DatasetType.SNV_INDEL,
44+
SampleType.WES,
4445
'abc_1',
4546
),
4647
)
@@ -50,6 +51,7 @@ def test_delete_family_table_task(self) -> None:
5051
task = DeleteFamilyTableTask(
5152
reference_genome=ReferenceGenome.GRCh38,
5253
dataset_type=DatasetType.SNV_INDEL,
54+
sample_type=SampleType.WES,
5355
family_guid='abc_1',
5456
)
5557
worker.add(task)
@@ -60,6 +62,7 @@ def test_delete_family_table_task(self) -> None:
6062
family_table_path(
6163
ReferenceGenome.GRCh38,
6264
DatasetType.SNV_INDEL,
65+
SampleType.WES,
6366
'abc_1',
6467
),
6568
).exists(),

v03_pipeline/lib/tasks/delete_family_tables.py

Lines changed: 11 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
import luigi
22

3+
from v03_pipeline.lib.model import SampleType
34
from v03_pipeline.lib.tasks.base.base_hail_table import BaseHailTableTask
45
from v03_pipeline.lib.tasks.delete_family_table import DeleteFamilyTableTask
56

@@ -18,12 +19,14 @@ def complete(self) -> bool:
1819
)
1920

2021
def run(self):
21-
for family_guid in self.family_guids:
22-
self.dynamic_delete_family_table_tasks.add(
23-
DeleteFamilyTableTask(
24-
reference_genome=self.reference_genome,
25-
dataset_type=self.dataset_type,
26-
family_guid=family_guid,
27-
),
28-
)
22+
for sample_type in SampleType:
23+
for family_guid in self.family_guids:
24+
self.dynamic_delete_family_table_tasks.add(
25+
DeleteFamilyTableTask(
26+
reference_genome=self.reference_genome,
27+
dataset_type=self.dataset_type,
28+
sample_type=sample_type,
29+
family_guid=family_guid,
30+
),
31+
)
2932
yield self.dynamic_delete_family_table_tasks

0 commit comments

Comments
 (0)