Skip to content

Commit 96725e4

Browse files
committed
Merge branch 'dev' of github.com:broadinstitute/seqr-loading-pipelines
2 parents 3d5e4ac + 97b90a6 commit 96725e4

36 files changed

+48
-79
lines changed

v03_pipeline/lib/misc/io.py

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -200,21 +200,24 @@ def import_pedigree(pedigree_path: str) -> hl.Table:
200200
)
201201

202202

203-
def write(
204-
t: hl.Table | hl.MatrixTable,
205-
destination_path: str,
206-
) -> hl.Table | hl.MatrixTable:
203+
def checkpoint(t: hl.Table | hl.MatrixTable) -> tuple[hl.Table | hl.MatrixTable, str]:
207204
suffix = 'mt' if isinstance(t, hl.MatrixTable) else 'ht'
208205
read_fn = hl.read_matrix_table if isinstance(t, hl.MatrixTable) else hl.read_table
209206
checkpoint_path = os.path.join(
210207
Env.HAIL_TMPDIR,
211208
f'{uuid.uuid4()}.{suffix}',
212209
)
213-
# not using checkpoint to read/write here because the checkpoint codec is different, leading to a different on disk size.
214210
t.write(checkpoint_path)
215-
t = read_fn(checkpoint_path)
211+
return read_fn(checkpoint_path), checkpoint_path
212+
213+
214+
def write(
215+
t: hl.Table | hl.MatrixTable,
216+
destination_path: str,
217+
) -> hl.Table | hl.MatrixTable:
218+
t, path = checkpoint(t)
216219
t = t.repartition(
217-
compute_hail_n_partitions(file_size_bytes(checkpoint_path)),
220+
compute_hail_n_partitions(file_size_bytes(path)),
218221
shuffle=False,
219222
)
220223
return t.write(destination_path, overwrite=True)

v03_pipeline/lib/reference_data/clinvar.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,7 @@ def download_and_import_clinvar_submission_summary() -> hl.Table:
183183
os.path.basename(tmp_file.name),
184184
)
185185
safely_move_to_gcs(tmp_file.name, gcs_tmp_file_name)
186-
return hl.import_table(
186+
ht = hl.import_table(
187187
gcs_tmp_file_name,
188188
force=True,
189189
filter='^(#[^:]*:|^##).*$', # removes all comments except for the header line
@@ -193,5 +193,7 @@ def download_and_import_clinvar_submission_summary() -> hl.Table:
193193
'ReportedPhenotypeInfo': hl.tstr,
194194
},
195195
missing='-',
196-
min_partitions=MIN_HT_PARTITIONS,
197196
)
197+
# NB: min_partitions fails with force=True during `import_table`, but
198+
# an immediate repartition here works.
199+
return ht.repartition(MIN_HT_PARTITIONS)

v03_pipeline/lib/reference_data/dataset_table_operations.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import hail as hl
55
import pytz
66

7+
from v03_pipeline.lib.misc.io import checkpoint
78
from v03_pipeline.lib.misc.nested_field import parse_nested_field
89
from v03_pipeline.lib.model import (
910
DatasetType,
@@ -36,6 +37,7 @@ def update_or_create_joined_ht(
3637

3738
# Join the new one!
3839
dataset_ht = get_dataset_ht(dataset, reference_genome)
40+
dataset_ht, _ = checkpoint(dataset_ht)
3941
joined_ht = joined_ht.join(dataset_ht, 'outer')
4042
joined_ht = annotate_dataset_globals(joined_ht, dataset, dataset_ht)
4143

@@ -213,6 +215,7 @@ def join_hts(
213215
)
214216
for dataset in reference_dataset_collection.datasets(dataset_type):
215217
dataset_ht = get_dataset_ht(dataset, reference_genome)
218+
dataset_ht, _ = checkpoint(dataset_ht)
216219
joined_ht = joined_ht.join(dataset_ht, 'outer')
217220
joined_ht = annotate_dataset_globals(joined_ht, dataset, dataset_ht)
218221
return joined_ht

v03_pipeline/lib/tasks/base/base_hail_table.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import luigi
33

44
from v03_pipeline.lib.logger import get_logger
5-
from v03_pipeline.lib.model import DatasetType, Env, ReferenceGenome, SampleType
5+
from v03_pipeline.lib.model import DatasetType, Env, ReferenceGenome
66
from v03_pipeline.lib.tasks.files import GCSorLocalFolderTarget
77

88
logger = get_logger(__name__)
@@ -11,7 +11,6 @@
1111
class BaseHailTableTask(luigi.Task):
1212
reference_genome = luigi.EnumParameter(enum=ReferenceGenome)
1313
dataset_type = luigi.EnumParameter(enum=DatasetType)
14-
sample_type = luigi.EnumParameter(enum=SampleType)
1514

1615
def output(self) -> luigi.Target:
1716
raise NotImplementedError

v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,6 @@ def requires(self) -> list[luigi.Task]:
3939
UpdatedReferenceDatasetCollectionTask(
4040
self.reference_genome,
4141
self.dataset_type,
42-
self.sample_type,
4342
rdc,
4443
)
4544
if Env.REFERENCE_DATA_AUTO_UPDATE

v03_pipeline/lib/tasks/base/base_update_variant_annotations_table_test.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
DatasetType,
99
ReferenceDatasetCollection,
1010
ReferenceGenome,
11-
SampleType,
1211
)
1312
from v03_pipeline.lib.paths import valid_reference_dataset_collection_path
1413
from v03_pipeline.lib.tasks.base.base_update_variant_annotations_table import (
@@ -59,7 +58,6 @@ def test_should_create_initialized_table(self, mock_update_rdc_task) -> None:
5958
vat_task = BaseUpdateVariantAnnotationsTableTask(
6059
reference_genome=ReferenceGenome.GRCh38,
6160
dataset_type=DatasetType.SNV_INDEL,
62-
sample_type=SampleType.WGS,
6361
)
6462
self.assertTrue('annotations.ht' in vat_task.output().path)
6563
self.assertTrue(DatasetType.SNV_INDEL.value in vat_task.output().path)

v03_pipeline/lib/tasks/delete_family_table_test.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import hail as hl
44
import luigi.worker
55

6-
from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType
6+
from v03_pipeline.lib.model import DatasetType, ReferenceGenome
77
from v03_pipeline.lib.paths import family_table_path
88
from v03_pipeline.lib.tasks.delete_family_table import DeleteFamilyTableTask
99
from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase
@@ -50,7 +50,6 @@ def test_delete_family_table_task(self) -> None:
5050
task = DeleteFamilyTableTask(
5151
reference_genome=ReferenceGenome.GRCh38,
5252
dataset_type=DatasetType.SNV_INDEL,
53-
sample_type=SampleType.WGS,
5453
family_guid='abc_1',
5554
)
5655
worker.add(task)

v03_pipeline/lib/tasks/delete_family_tables.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@ def run(self):
2323
DeleteFamilyTableTask(
2424
reference_genome=self.reference_genome,
2525
dataset_type=self.dataset_type,
26-
sample_type=self.sample_type,
2726
family_guid=family_guid,
2827
),
2928
)

v03_pipeline/lib/tasks/delete_family_tables_test.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import hail as hl
44
import luigi.worker
55

6-
from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType
6+
from v03_pipeline.lib.model import DatasetType, ReferenceGenome
77
from v03_pipeline.lib.paths import family_table_path
88
from v03_pipeline.lib.tasks.delete_family_tables import (
99
DeleteFamilyTablesTask,
@@ -38,7 +38,6 @@ def test_delete_project_family_tables_task(self) -> None:
3838
task = DeleteFamilyTablesTask(
3939
reference_genome=ReferenceGenome.GRCh38,
4040
dataset_type=DatasetType.SNV_INDEL,
41-
sample_type=SampleType.WGS,
4241
family_guids=['family_a', 'family_b'],
4342
)
4443
worker.add(task)

v03_pipeline/lib/tasks/delete_project_family_tables.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,6 @@ def run(self):
3535
DeleteFamilyTableTask(
3636
reference_genome=self.reference_genome,
3737
dataset_type=self.dataset_type,
38-
sample_type=self.sample_type,
3938
family_guid=family_guid,
4039
),
4140
)

v03_pipeline/lib/tasks/delete_project_family_tables_test.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import hail as hl
44
import luigi.worker
55

6-
from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType
6+
from v03_pipeline.lib.model import DatasetType, ReferenceGenome
77
from v03_pipeline.lib.paths import family_table_path, project_table_path
88
from v03_pipeline.lib.tasks.delete_project_family_tables import (
99
DeleteProjectFamilyTablesTask,
@@ -149,7 +149,6 @@ def test_delete_project_family_tables_task(self) -> None:
149149
task = DeleteProjectFamilyTablesTask(
150150
reference_genome=ReferenceGenome.GRCh38,
151151
dataset_type=DatasetType.SNV_INDEL,
152-
sample_type=SampleType.WGS,
153152
project_guid='project_a',
154153
)
155154
worker.add(task)

v03_pipeline/lib/tasks/delete_project_table.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@ def requires(self) -> luigi.Task:
1515
return DeleteProjectFamilyTablesTask(
1616
self.reference_genome,
1717
self.dataset_type,
18-
self.sample_type,
1918
self.project_guid,
2019
)
2120

v03_pipeline/lib/tasks/reference_data/update_cached_reference_dataset_queries.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
CachedReferenceDatasetQuery,
55
DatasetType,
66
ReferenceGenome,
7-
SampleType,
87
)
98
from v03_pipeline.lib.tasks.reference_data.updated_cached_reference_dataset_query import (
109
UpdatedCachedReferenceDatasetQuery,
@@ -14,7 +13,6 @@
1413
class UpdateCachedReferenceDatasetQueries(luigi.Task):
1514
reference_genome = luigi.EnumParameter(enum=ReferenceGenome)
1615
dataset_type = luigi.EnumParameter(enum=DatasetType)
17-
sample_type = luigi.EnumParameter(enum=SampleType)
1816

1917
def __init__(self, *args, **kwargs):
2018
super().__init__(*args, **kwargs)

v03_pipeline/lib/tasks/reference_data/update_cached_reference_dataset_queries_test.py

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
CachedReferenceDatasetQuery,
88
DatasetType,
99
ReferenceGenome,
10-
SampleType,
1110
)
1211
from v03_pipeline.lib.tasks.reference_data.update_cached_reference_dataset_queries import (
1312
UpdateCachedReferenceDatasetQueries,
@@ -25,7 +24,6 @@ def test_37_snv_indel(self, mock_crdq_task):
2524
task = UpdateCachedReferenceDatasetQueries(
2625
reference_genome=ReferenceGenome.GRCh37,
2726
dataset_type=DatasetType.SNV_INDEL,
28-
sample_type=SampleType.WGS,
2927
)
3028
worker.add(task)
3129
worker.run()
@@ -35,25 +33,21 @@ def test_37_snv_indel(self, mock_crdq_task):
3533
mock.call(
3634
reference_genome=ReferenceGenome.GRCh37,
3735
dataset_type=DatasetType.SNV_INDEL,
38-
sample_type=SampleType.WGS,
3936
crdq=CachedReferenceDatasetQuery.CLINVAR_PATH_VARIANTS,
4037
),
4138
mock.call(
4239
reference_genome=ReferenceGenome.GRCh37,
4340
dataset_type=DatasetType.SNV_INDEL,
44-
sample_type=SampleType.WGS,
4541
crdq=CachedReferenceDatasetQuery.GNOMAD_CODING_AND_NONCODING_VARIANTS,
4642
),
4743
mock.call(
4844
reference_genome=ReferenceGenome.GRCh37,
4945
dataset_type=DatasetType.SNV_INDEL,
50-
sample_type=SampleType.WGS,
5146
crdq=CachedReferenceDatasetQuery.GNOMAD_QC,
5247
),
5348
mock.call(
5449
reference_genome=ReferenceGenome.GRCh37,
5550
dataset_type=DatasetType.SNV_INDEL,
56-
sample_type=SampleType.WGS,
5751
crdq=CachedReferenceDatasetQuery.HIGH_AF_VARIANTS,
5852
),
5953
],
@@ -65,7 +59,6 @@ def test_38_snv_indel(self, mock_crdq_task):
6559
task = UpdateCachedReferenceDatasetQueries(
6660
reference_genome=ReferenceGenome.GRCh38,
6761
dataset_type=DatasetType.SNV_INDEL,
68-
sample_type=SampleType.WGS,
6962
)
7063
worker.add(task)
7164
worker.run()
@@ -75,25 +68,21 @@ def test_38_snv_indel(self, mock_crdq_task):
7568
mock.call(
7669
reference_genome=ReferenceGenome.GRCh38,
7770
dataset_type=DatasetType.SNV_INDEL,
78-
sample_type=SampleType.WGS,
7971
crdq=CachedReferenceDatasetQuery.CLINVAR_PATH_VARIANTS,
8072
),
8173
mock.call(
8274
reference_genome=ReferenceGenome.GRCh38,
8375
dataset_type=DatasetType.SNV_INDEL,
84-
sample_type=SampleType.WGS,
8576
crdq=CachedReferenceDatasetQuery.GNOMAD_CODING_AND_NONCODING_VARIANTS,
8677
),
8778
mock.call(
8879
reference_genome=ReferenceGenome.GRCh38,
8980
dataset_type=DatasetType.SNV_INDEL,
90-
sample_type=SampleType.WGS,
9181
crdq=CachedReferenceDatasetQuery.GNOMAD_QC,
9282
),
9383
mock.call(
9484
reference_genome=ReferenceGenome.GRCh38,
9585
dataset_type=DatasetType.SNV_INDEL,
96-
sample_type=SampleType.WGS,
9786
crdq=CachedReferenceDatasetQuery.HIGH_AF_VARIANTS,
9887
),
9988
],
@@ -105,7 +94,6 @@ def test_38_mito(self, mock_crdq_task):
10594
task = UpdateCachedReferenceDatasetQueries(
10695
reference_genome=ReferenceGenome.GRCh38,
10796
dataset_type=DatasetType.MITO,
108-
sample_type=SampleType.WGS,
10997
)
11098
worker.add(task)
11199
worker.run()
@@ -115,7 +103,6 @@ def test_38_mito(self, mock_crdq_task):
115103
mock.call(
116104
reference_genome=ReferenceGenome.GRCh38,
117105
dataset_type=DatasetType.MITO,
118-
sample_type=SampleType.WGS,
119106
crdq=CachedReferenceDatasetQuery.CLINVAR_PATH_VARIANTS,
120107
),
121108
],
@@ -127,7 +114,6 @@ def test_38_sv(self, mock_crdq_task):
127114
task = UpdateCachedReferenceDatasetQueries(
128115
reference_genome=ReferenceGenome.GRCh38,
129116
dataset_type=DatasetType.SV,
130-
sample_type=SampleType.WGS,
131117
)
132118
worker.add(task)
133119
worker.run()

v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919
DatasetType,
2020
ReferenceDatasetCollection,
2121
ReferenceGenome,
22-
SampleType,
2322
)
2423
from v03_pipeline.lib.paths import valid_reference_dataset_collection_path
2524
from v03_pipeline.lib.reference_data.clinvar import CLINVAR_ASSERTIONS
@@ -734,7 +733,6 @@ def test_update_vat_with_updated_rdc_snv_indel_38(
734733
task = UpdateVariantAnnotationsTableWithUpdatedReferenceDataset(
735734
reference_genome=ReferenceGenome.GRCh38,
736735
dataset_type=DatasetType.SNV_INDEL,
737-
sample_type=SampleType.WGS,
738736
)
739737
worker = luigi.worker.Worker()
740738
worker.add(task)
@@ -941,7 +939,6 @@ def test_update_vat_with_updated_rdc_mito_38(
941939
task = UpdateVariantAnnotationsTableWithUpdatedReferenceDataset(
942940
reference_genome=ReferenceGenome.GRCh38,
943941
dataset_type=DatasetType.MITO,
944-
sample_type=SampleType.WGS,
945942
)
946943
worker = luigi.worker.Worker()
947944
worker.add(task)
@@ -1084,7 +1081,6 @@ def test_update_vat_with_updated_rdc_snv_indel_37(
10841081
task = UpdateVariantAnnotationsTableWithUpdatedReferenceDataset(
10851082
reference_genome=ReferenceGenome.GRCh37,
10861083
dataset_type=DatasetType.SNV_INDEL,
1087-
sample_type=SampleType.WGS,
10881084
)
10891085
worker = luigi.worker.Worker()
10901086
worker.add(task)

v03_pipeline/lib/tasks/reference_data/updated_cached_reference_dataset_query.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,6 @@ def requires(self) -> luigi.Task:
6868
return UpdatedReferenceDatasetCollectionTask(
6969
self.reference_genome,
7070
self.dataset_type,
71-
self.sample_type,
7271
ReferenceDatasetCollection.COMBINED,
7372
)
7473
if self.crdq.query_raw_dataset:

v03_pipeline/lib/tasks/reference_data/updated_cached_reference_dataset_query_test.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
DatasetType,
1212
ReferenceDatasetCollection,
1313
ReferenceGenome,
14-
SampleType,
1514
)
1615
from v03_pipeline.lib.paths import (
1716
cached_reference_dataset_query_path,
@@ -109,7 +108,6 @@ def test_gnomad_qc(
109108
task = UpdatedCachedReferenceDatasetQuery(
110109
reference_genome=ReferenceGenome.GRCh38,
111110
dataset_type=DatasetType.SNV_INDEL,
112-
sample_type=SampleType.WGS,
113111
crdq=CachedReferenceDatasetQuery.GNOMAD_QC,
114112
)
115113
worker.add(task)
@@ -199,7 +197,6 @@ def _clinvar_path_variants(table, **_: Any):
199197
task = UpdatedCachedReferenceDatasetQuery(
200198
reference_genome=ReferenceGenome.GRCh38,
201199
dataset_type=DatasetType.SNV_INDEL,
202-
sample_type=SampleType.WGS,
203200
crdq=CachedReferenceDatasetQuery.CLINVAR_PATH_VARIANTS,
204201
)
205202
worker.add(task)

v03_pipeline/lib/tasks/reference_data/updated_reference_dataset_collection_test.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
DatasetType,
1111
ReferenceDatasetCollection,
1212
ReferenceGenome,
13-
SampleType,
1413
)
1514
from v03_pipeline.lib.paths import valid_reference_dataset_collection_path
1615
from v03_pipeline.lib.reference_data.clinvar import CLINVAR_ASSERTIONS
@@ -170,7 +169,6 @@ def test_update_task_with_empty_reference_data_table(
170169
task = UpdatedReferenceDatasetCollectionTask(
171170
reference_genome=ReferenceGenome.GRCh38,
172171
dataset_type=DatasetType.SNV_INDEL,
173-
sample_type=SampleType.WGS,
174172
reference_dataset_collection=ReferenceDatasetCollection.COMBINED,
175173
)
176174
worker.add(task)
@@ -280,7 +278,6 @@ def test_update_task_with_existing_reference_dataset_collection_table(
280278
task = UpdatedReferenceDatasetCollectionTask(
281279
reference_genome=ReferenceGenome.GRCh38,
282280
dataset_type=DatasetType.SNV_INDEL,
283-
sample_type=SampleType.WGS,
284281
reference_dataset_collection=ReferenceDatasetCollection.COMBINED,
285282
)
286283
worker.add(task)

0 commit comments

Comments
 (0)