Skip to content

Commit 61eda3b

Browse files
committed
Merge branch 'main' of github.com:broadinstitute/seqr-loading-pipelines into dev
2 parents 1ab1a25 + 993356d commit 61eda3b

11 files changed

+147
-40
lines changed

v03_pipeline/bin/download_vep_reference_data.bash

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,13 +49,15 @@ if [ -f $VEP_REFERENCE_DATASETS_DIR/$REFERENCE_GENOME/_SUCCESS ]; then
4949
fi
5050

5151
mkdir -p $VEP_REFERENCE_DATASETS_DIR/$REFERENCE_GENOME;
52+
rm -rf $VEP_REFERENCE_DATASETS_DIR/$REFERENCE_GENOME/*;
53+
5254
for vep_reference_data_file in ${VEP_REFERENCE_DATA_FILES[@]}; do
5355
if [[ $vep_reference_data_file == *.tar.gz ]]; then
5456
echo "Downloading and extracting" $vep_reference_data_file;
5557
gsutil cat $vep_reference_data_file | tar -xzf - -C $VEP_REFERENCE_DATASETS_DIR/$REFERENCE_GENOME/ &
5658
else
5759
echo "Downloading" $vep_reference_data_file;
58-
gsutil cp $vep_reference_data_file $VEP_DATA/$REFERENCE_GENOME/ &
60+
gsutil cp $vep_reference_data_file $VEP_REFERENCE_DATASETS_DIR/$REFERENCE_GENOME/ &
5961
fi
6062
done;
6163
wait
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
import os
2+
3+
import hail as hl
4+
import hailtop.fs as hfs
5+
6+
from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType
7+
from v03_pipeline.lib.paths import project_table_path
8+
9+
10+
def get_valid_project_tables(
11+
reference_genome: ReferenceGenome,
12+
dataset_type: DatasetType,
13+
project_guid: str,
14+
) -> set[tuple[hl.Table, SampleType]]:
15+
project_tables = set()
16+
for sample_type in SampleType:
17+
project_ht_path = project_table_path(
18+
reference_genome,
19+
dataset_type,
20+
sample_type,
21+
project_guid,
22+
)
23+
if hfs.exists(project_ht_path) and hfs.exists(
24+
os.path.join(project_ht_path, '_SUCCESS'),
25+
):
26+
project_ht = hl.read_table(project_ht_path)
27+
project_tables.add((project_ht, sample_type))
28+
if len(project_tables) == 0:
29+
msg = f'No project tables found for {project_guid}'
30+
raise RuntimeError(msg)
31+
return project_tables

v03_pipeline/lib/model/environment.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,13 @@
1111
'gs://hail-common/references/grch38_to_grch37.over.chain.gz',
1212
)
1313
HAIL_TMP_DIR = os.environ.get('HAIL_TMP_DIR', '/tmp') # noqa: S108
14-
HAIL_SEARCH_DATA_DIR = os.environ.get('HAIL_SEARCH_DATA_DIR', '/seqr/hail-search-data')
14+
HAIL_SEARCH_DATA_DIR = os.environ.get(
15+
'HAIL_SEARCH_DATA_DIR',
16+
'/seqr/seqr-hail-search-data',
17+
)
1518
LOADING_DATASETS_DIR = os.environ.get('LOADING_DATASETS_DIR', '/seqr/seqr-loading-temp')
16-
PRIVATE_REFERENCE_DATASETS_DIR_DIR = os.environ.get(
17-
'PRIVATE_REFERENCE_DATASETS_DIR_DIR',
19+
PRIVATE_REFERENCE_DATASETS_DIR = os.environ.get(
20+
'PRIVATE_REFERENCE_DATASETS_DIR',
1821
'/seqr/seqr-reference-data-private',
1922
)
2023
REFERENCE_DATASETS_DIR = os.environ.get(
@@ -50,7 +53,7 @@ class Env:
5053
GRCH37_TO_GRCH38_LIFTOVER_REF_PATH: str = GRCH37_TO_GRCH38_LIFTOVER_REF_PATH
5154
GRCH38_TO_GRCH37_LIFTOVER_REF_PATH: str = GRCH38_TO_GRCH37_LIFTOVER_REF_PATH
5255
LOADING_DATASETS_DIR: str = LOADING_DATASETS_DIR
53-
PRIVATE_REFERENCE_DATASETS_DIR_DIR: str = PRIVATE_REFERENCE_DATASETS_DIR_DIR
56+
PRIVATE_REFERENCE_DATASETS_DIR: str = PRIVATE_REFERENCE_DATASETS_DIR
5457
PROJECT_ID: str | None = PROJECT_ID
5558
REFERENCE_DATASETS_DIR: str = REFERENCE_DATASETS_DIR
5659
SHOULD_REGISTER_ALLELES: bool = SHOULD_REGISTER_ALLELES

v03_pipeline/lib/paths.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ def _v03_reference_data_prefix(
3232
reference_genome: ReferenceGenome,
3333
) -> str:
3434
root = (
35-
Env.PRIVATE_REFERENCE_DATASETS_DIR_DIR
35+
Env.PRIVATE_REFERENCE_DATASETS_DIR
3636
if access_control == AccessControl.PRIVATE
3737
else Env.REFERENCE_DATASETS_DIR
3838
)

v03_pipeline/lib/paths_test.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ def test_family_table_path(self) -> None:
4848
SampleType.WES,
4949
'franklin',
5050
),
51-
'/seqr/hail-search-data/v3.1/GRCh37/SNV_INDEL/families/WES/franklin.ht',
51+
'/seqr/seqr-hail-search-data/v3.1/GRCh37/SNV_INDEL/families/WES/franklin.ht',
5252
)
5353
with patch('v03_pipeline.lib.paths.Env') as mock_env:
5454
mock_env.HAIL_SEARCH_DATA_DIR = 'gs://seqr-datasets/'
@@ -90,7 +90,7 @@ def test_project_table_path(self) -> None:
9090
SampleType.WES,
9191
'R0652_pipeline_test',
9292
),
93-
'/seqr/hail-search-data/v3.1/GRCh38/MITO/projects/WES/R0652_pipeline_test.ht',
93+
'/seqr/seqr-hail-search-data/v3.1/GRCh38/MITO/projects/WES/R0652_pipeline_test.ht',
9494
)
9595

9696
def test_valid_reference_dataset_collection_path(self) -> None:
@@ -119,7 +119,7 @@ def test_lookup_table_path(self) -> None:
119119
ReferenceGenome.GRCh37,
120120
DatasetType.SV,
121121
),
122-
'/seqr/hail-search-data/v3.1/GRCh37/SV/lookup.ht',
122+
'/seqr/seqr-hail-search-data/v3.1/GRCh37/SV/lookup.ht',
123123
)
124124

125125
def test_sex_check_table_path(self) -> None:
@@ -159,7 +159,7 @@ def test_metadata_for_run_path(self) -> None:
159159
DatasetType.SNV_INDEL,
160160
'manual__2023-06-26T18:30:09.349671+00:00',
161161
),
162-
'/seqr/hail-search-data/v3.1/GRCh38/SNV_INDEL/runs/manual__2023-06-26T18:30:09.349671+00:00/metadata.json',
162+
'/seqr/seqr-hail-search-data/v3.1/GRCh38/SNV_INDEL/runs/manual__2023-06-26T18:30:09.349671+00:00/metadata.json',
163163
)
164164

165165
def test_variant_annotations_table_path(self) -> None:
@@ -168,7 +168,7 @@ def test_variant_annotations_table_path(self) -> None:
168168
ReferenceGenome.GRCh38,
169169
DatasetType.GCNV,
170170
),
171-
'/seqr/hail-search-data/v3.1/GRCh38/GCNV/annotations.ht',
171+
'/seqr/seqr-hail-search-data/v3.1/GRCh38/GCNV/annotations.ht',
172172
)
173173

174174
def test_remapped_and_subsetted_callset_path(self) -> None:
@@ -218,7 +218,7 @@ def test_new_variants_table_path(self) -> None:
218218
DatasetType.SNV_INDEL,
219219
'manual__2023-06-26T18:30:09.349671+00:00',
220220
),
221-
'/seqr/hail-search-data/v3.1/GRCh38/SNV_INDEL/runs/manual__2023-06-26T18:30:09.349671+00:00/new_variants.ht',
221+
'/seqr/seqr-hail-search-data/v3.1/GRCh38/SNV_INDEL/runs/manual__2023-06-26T18:30:09.349671+00:00/new_variants.ht',
222222
)
223223

224224
def test_project_remap_path(self) -> None:

v03_pipeline/lib/tasks/__init__.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from v03_pipeline.lib.tasks.delete_project_family_tables import (
44
DeleteProjectFamilyTablesTask,
55
)
6-
from v03_pipeline.lib.tasks.delete_project_table import DeleteProjectTableTask
6+
from v03_pipeline.lib.tasks.delete_project_tables import DeleteProjectTablesTask
77
from v03_pipeline.lib.tasks.migrate_all_lookup_tables import MigrateAllLookupTablesTask
88
from v03_pipeline.lib.tasks.migrate_all_variant_annotations_tables import (
99
MigrateAllVariantAnnotationsTablesTask,
@@ -21,8 +21,8 @@
2121
UpdateLookupTableWithDeletedProjectTask,
2222
)
2323
from v03_pipeline.lib.tasks.update_project_table import UpdateProjectTableTask
24-
from v03_pipeline.lib.tasks.update_project_table_with_deleted_families import (
25-
UpdateProjectTableWithDeletedFamiliesTask,
24+
from v03_pipeline.lib.tasks.update_project_tables_with_deleted_families import (
25+
UpdateProjectTablesWithDeletedFamiliesTask,
2626
)
2727
from v03_pipeline.lib.tasks.update_variant_annotations_table_with_deleted_families import (
2828
UpdateVariantAnnotationsTableWithDeletedFamiliesTask,
@@ -42,11 +42,11 @@
4242
'DeleteFamilyTableTask',
4343
'DeleteFamilyTablesTask',
4444
'DeleteProjectFamilyTablesTask',
45-
'DeleteProjectTableTask',
45+
'DeleteProjectTablesTask',
4646
'MigrateAllLookupTablesTask',
4747
'MigrateAllVariantAnnotationsTablesTask',
4848
'UpdateProjectTableTask',
49-
'UpdateProjectTableWithDeletedFamiliesTask',
49+
'UpdateProjectTablesWithDeletedFamiliesTask',
5050
'UpdateLookupTableTask',
5151
'UpdateLookupTableWithDeletedProjectTask',
5252
'UpdateLookupTableWithDeletedFamiliesTask',

v03_pipeline/lib/tasks/delete_project_family_tables.py

Lines changed: 6 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,12 @@
11
import hail as hl
2-
import hailtop.fs as hfs
32
import luigi
43
import luigi.util
54

6-
from v03_pipeline.lib.model import SampleType
7-
from v03_pipeline.lib.paths import project_table_path
5+
from v03_pipeline.lib.misc.project_tables import get_valid_project_tables
86
from v03_pipeline.lib.tasks.base.base_loading_pipeline_params import (
97
BaseLoadingPipelineParams,
108
)
119
from v03_pipeline.lib.tasks.delete_family_table import DeleteFamilyTableTask
12-
from v03_pipeline.lib.tasks.files import HailTableTask
1310

1411

1512
@luigi.util.inherits(BaseLoadingPipelineParams)
@@ -27,23 +24,11 @@ def complete(self) -> bool:
2724
)
2825

2926
def run(self):
30-
project_tables = set()
31-
for sample_type in SampleType:
32-
project_ht_path = project_table_path(
33-
self.reference_genome,
34-
self.dataset_type,
35-
sample_type,
36-
self.project_guid,
37-
)
38-
if hfs.exists(project_ht_path):
39-
project_table_task: luigi.Target = yield HailTableTask(project_ht_path)
40-
project_ht = hl.read_table(project_table_task.path)
41-
project_tables.add((project_ht, sample_type))
42-
43-
if len(project_tables) == 0:
44-
msg = f'No project tables found for {self.project_guid}'
45-
raise RuntimeError(msg)
46-
27+
project_tables = get_valid_project_tables(
28+
self.reference_genome,
29+
self.dataset_type,
30+
self.project_guid,
31+
)
4732
for project_ht, sample_type in project_tables:
4833
family_guids = hl.eval(project_ht.globals.family_guids)
4934
for family_guid in family_guids:
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
import luigi
2+
3+
from v03_pipeline.lib.model import SampleType
4+
from v03_pipeline.lib.tasks.base.base_loading_pipeline_params import (
5+
BaseLoadingPipelineParams,
6+
)
7+
from v03_pipeline.lib.tasks.delete_project_table import DeleteProjectTableTask
8+
9+
10+
@luigi.util.inherits(BaseLoadingPipelineParams)
11+
class DeleteProjectTablesTask(luigi.Task):
12+
project_guid = luigi.Parameter()
13+
14+
def __init__(self, *args, **kwargs):
15+
super().__init__(*args, **kwargs)
16+
self.dynamic_delete_project_table_tasks = set()
17+
18+
def complete(self) -> bool:
19+
return len(self.dynamic_delete_project_table_tasks) >= 1 and all(
20+
delete_project_table_task.complete()
21+
for delete_project_table_task in self.dynamic_delete_project_table_tasks
22+
)
23+
24+
def run(self):
25+
for sample_type in SampleType:
26+
self.dynamic_delete_project_table_tasks.add(
27+
DeleteProjectTableTask(
28+
reference_genome=self.reference_genome,
29+
dataset_type=self.dataset_type,
30+
sample_type=sample_type,
31+
project_guid=self.project_guid,
32+
),
33+
)
34+
yield self.dynamic_delete_project_table_tasks

v03_pipeline/lib/tasks/update_lookup_table.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import luigi
33
import luigi.util
44

5-
from v03_pipeline.lib.misc.io import remap_pedigree_hash
5+
from v03_pipeline.lib.misc.io import checkpoint, remap_pedigree_hash
66
from v03_pipeline.lib.misc.lookup import (
77
compute_callset_lookup_ht,
88
join_lookup_hts,
@@ -111,4 +111,5 @@ def update_table(self, ht: hl.Table) -> hl.Table:
111111
),
112112
migrations=ht.migrations,
113113
)
114+
ht, _ = checkpoint(ht)
114115
return ht
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
import luigi
2+
import luigi.util
3+
4+
from v03_pipeline.lib.misc.project_tables import get_valid_project_tables
5+
from v03_pipeline.lib.tasks.base.base_loading_pipeline_params import (
6+
BaseLoadingPipelineParams,
7+
)
8+
from v03_pipeline.lib.tasks.update_project_table_with_deleted_families import (
9+
UpdateProjectTableWithDeletedFamiliesTask,
10+
)
11+
12+
13+
@luigi.util.inherits(BaseLoadingPipelineParams)
14+
class UpdateProjectTablesWithDeletedFamiliesTask(luigi.Task):
15+
project_guid = luigi.Parameter()
16+
family_guids = luigi.ListParameter()
17+
18+
def __init__(self, *args, **kwargs):
19+
super().__init__(*args, **kwargs)
20+
self.dynamic_update_project_table_tasks = set()
21+
22+
def complete(self) -> bool:
23+
return len(self.dynamic_update_project_table_tasks) >= 1 and all(
24+
update_project_table_task.complete()
25+
for update_project_table_task in self.dynamic_update_project_table_tasks
26+
)
27+
28+
def run(self):
29+
project_tables = get_valid_project_tables(
30+
self.reference_genome,
31+
self.dataset_type,
32+
self.project_guid,
33+
)
34+
for _, sample_type in project_tables:
35+
self.dynamic_update_project_table_tasks.add(
36+
UpdateProjectTableWithDeletedFamiliesTask(
37+
reference_genome=self.reference_genome,
38+
dataset_type=self.dataset_type,
39+
sample_type=sample_type,
40+
project_guid=self.project_guid,
41+
family_guids=self.family_guids,
42+
),
43+
)
44+
yield self.dynamic_update_project_table_tasks

0 commit comments

Comments
 (0)