Skip to content

Commit 879294c

Browse files
authored
Merge pull request #895 from broadinstitute/dev ("Dev")
2 parents 6f70a53 + 552acdd · commit 879294c

17 files changed: +53 −40 lines

.github/workflows/unit-tests.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ jobs:
3636
run: ruff . --output-format github
3737
- name: Unit Tests
3838
run: |
39+
export HAIL_TMP_DIR=/tmp
3940
export GRCH37_TO_GRCH38_LIFTOVER_REF_PATH=v03_pipeline/var/test/liftover/grch37_to_grch38.over.chain.gz
4041
export GRCH38_TO_GRCH37_LIFTOVER_REF_PATH=v03_pipeline/var/test/liftover/grch38_to_grch37.over.chain.gz
4142
export ACCESS_PRIVATE_REFERENCE_DATASETS=1

v03_pipeline/lib/misc/allele_registry.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -77,8 +77,8 @@ def register_alleles(
7777
base_url: str,
7878
) -> hl.Table:
7979
uuid4 = uuid.uuid4()
80-
raw_vcf_file_name = f'{Env.HAIL_TMPDIR}/r_{uuid4}.vcf'
81-
formatted_vcf_file_name = f'{Env.HAIL_TMPDIR}/f_{uuid4}.vcf'
80+
raw_vcf_file_name = f'{Env.HAIL_TMP_DIR}/r_{uuid4}.vcf'
81+
formatted_vcf_file_name = f'{Env.HAIL_TMP_DIR}/f_{uuid4}.vcf'
8282

8383
# Export the variants to a VCF
8484
hl.export_vcf(ht, raw_vcf_file_name)

v03_pipeline/lib/misc/allele_registry_test.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ def test_register_alleles_38(
3939
mock_put_request: Mock,
4040
):
4141
mock_get_credentials.return_value = ('', '')
42-
mock_env.HAIL_TMPDIR = self.temp_dir.name
42+
mock_env.HAIL_TMP_DIR = self.temp_dir.name
4343

4444
new_variants_ht = hl.Table.parallelize(
4545
[

v03_pipeline/lib/misc/io.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -217,7 +217,7 @@ def checkpoint(t: hl.Table | hl.MatrixTable) -> tuple[hl.Table | hl.MatrixTable,
217217
suffix = 'mt' if isinstance(t, hl.MatrixTable) else 'ht'
218218
read_fn = hl.read_matrix_table if isinstance(t, hl.MatrixTable) else hl.read_table
219219
checkpoint_path = os.path.join(
220-
Env.HAIL_TMPDIR,
220+
Env.HAIL_TMP_DIR,
221221
f'{uuid.uuid4()}.{suffix}',
222222
)
223223
t.write(checkpoint_path)

v03_pipeline/lib/misc/validation.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,7 @@ def validate_sample_type(
155155
msg = f'Sample type validation error: dataset contains noncoding variants but is missing common coding variants for {reference_genome.value}. Please verify that the dataset contains coding variants.'
156156
raise SeqrValidationError(msg)
157157
if has_coding and not has_noncoding and sample_type != SampleType.WES:
158-
msg = 'Sample type validation error: dataset sample-type is specified as WGS but appears to be WES because it contains many common coding variants'
158+
msg = 'Sample type validation error: dataset sample-type is specified as WGS but appears to be WES because it contains many common coding variants but is missing common non-coding variants'
159159
raise SeqrValidationError(msg)
160160
if has_noncoding and has_coding and sample_type != SampleType.WGS:
161161
msg = 'Sample type validation error: dataset sample-type is specified as WES but appears to be WGS because it contains many common non-coding variants'

v03_pipeline/lib/model/environment.py

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,6 @@
22
from dataclasses import dataclass
33

44
# NB: using os.environ.get inside the dataclass defaults gives a lint error.
5-
HAIL_TMPDIR = os.environ.get('HAIL_TMPDIR', '/tmp') # noqa: S108
6-
HAIL_SEARCH_DATA = os.environ.get('HAIL_SEARCH_DATA', '/seqr/hail-search-data')
75
GRCH37_TO_GRCH38_LIFTOVER_REF_PATH = os.environ.get(
86
'GRCH37_TO_GRCH38_LIFTOVER_REF_PATH',
97
'gs://hail-common/references/grch37_to_grch38.over.chain.gz',
@@ -12,15 +10,18 @@
1210
'GRCH38_TO_GRCH37_LIFTOVER_REF_PATH',
1311
'gs://hail-common/references/grch38_to_grch37.over.chain.gz',
1412
)
15-
LOADING_DATASETS = os.environ.get('LOADING_DATASETS', '/seqr/seqr-loading-temp')
16-
PRIVATE_REFERENCE_DATASETS = os.environ.get(
17-
'PRIVATE_REFERENCE_DATASETS',
13+
HAIL_TMP_DIR = os.environ.get('HAIL_TMP_DIR', '/seqr/tmp')
14+
HAIL_SEARCH_DATA_DIR = os.environ.get('HAIL_SEARCH_DATA_DIR', '/seqr/hail-search-data')
15+
LOADING_DATASETS_DIR = os.environ.get('LOADING_DATASETS_DIR', '/seqr/seqr-loading-temp')
16+
PRIVATE_REFERENCE_DATASETS_DIR_DIR = os.environ.get(
17+
'PRIVATE_REFERENCE_DATASETS_DIR_DIR',
1818
'/seqr/seqr-reference-data-private',
1919
)
20-
REFERENCE_DATASETS = os.environ.get(
21-
'REFERENCE_DATASETS',
20+
REFERENCE_DATASETS_DIR = os.environ.get(
21+
'REFERENCE_DATASETS_DIR',
2222
'/seqr/seqr-reference-data',
2323
)
24+
2425
# Allele registry secrets :/
2526
ALLELE_REGISTRY_SECRET_NAME = os.environ.get('ALLELE_REGISTRY_SECRET_NAME', None)
2627
PROJECT_ID = os.environ.get('PROJECT_ID', None)
@@ -40,12 +41,12 @@ class Env:
4041
ALLELE_REGISTRY_SECRET_NAME: str | None = ALLELE_REGISTRY_SECRET_NAME
4142
CHECK_SEX_AND_RELATEDNESS: bool = CHECK_SEX_AND_RELATEDNESS
4243
EXPECT_WES_FILTERS: bool = EXPECT_WES_FILTERS
43-
HAIL_TMPDIR: str = HAIL_TMPDIR
44-
HAIL_SEARCH_DATA: str = HAIL_SEARCH_DATA
44+
HAIL_TMP_DIR: str = HAIL_TMP_DIR
45+
HAIL_SEARCH_DATA_DIR: str = HAIL_SEARCH_DATA_DIR
4546
GRCH37_TO_GRCH38_LIFTOVER_REF_PATH: str = GRCH37_TO_GRCH38_LIFTOVER_REF_PATH
4647
GRCH38_TO_GRCH37_LIFTOVER_REF_PATH: str = GRCH38_TO_GRCH37_LIFTOVER_REF_PATH
47-
LOADING_DATASETS: str = LOADING_DATASETS
48-
PRIVATE_REFERENCE_DATASETS: str = PRIVATE_REFERENCE_DATASETS
48+
LOADING_DATASETS_DIR: str = LOADING_DATASETS_DIR
49+
PRIVATE_REFERENCE_DATASETS_DIR_DIR: str = PRIVATE_REFERENCE_DATASETS_DIR_DIR
4950
PROJECT_ID: str | None = PROJECT_ID
50-
REFERENCE_DATASETS: str = REFERENCE_DATASETS
51+
REFERENCE_DATASETS_DIR: str = REFERENCE_DATASETS_DIR
5152
SHOULD_REGISTER_ALLELES: bool = SHOULD_REGISTER_ALLELES

v03_pipeline/lib/paths.py

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,9 @@ def _v03_reference_data_prefix(
3232
reference_genome: ReferenceGenome,
3333
) -> str:
3434
root = (
35-
Env.PRIVATE_REFERENCE_DATASETS
35+
Env.PRIVATE_REFERENCE_DATASETS_DIR_DIR
3636
if access_control == AccessControl.PRIVATE
37-
else Env.REFERENCE_DATASETS
37+
else Env.REFERENCE_DATASETS_DIR
3838
)
3939
return os.path.join(
4040
root,
@@ -67,7 +67,7 @@ def family_table_path(
6767
) -> str:
6868
return os.path.join(
6969
_pipeline_prefix(
70-
Env.HAIL_SEARCH_DATA,
70+
Env.HAIL_SEARCH_DATA_DIR,
7171
reference_genome,
7272
dataset_type,
7373
),
@@ -84,7 +84,7 @@ def imputed_sex_path(
8484
) -> str:
8585
return os.path.join(
8686
_pipeline_prefix(
87-
Env.LOADING_DATASETS,
87+
Env.LOADING_DATASETS_DIR,
8888
reference_genome,
8989
dataset_type,
9090
),
@@ -100,7 +100,7 @@ def imported_callset_path(
100100
) -> str:
101101
return os.path.join(
102102
_pipeline_prefix(
103-
Env.LOADING_DATASETS,
103+
Env.LOADING_DATASETS_DIR,
104104
reference_genome,
105105
dataset_type,
106106
),
@@ -132,7 +132,7 @@ def project_table_path(
132132
) -> str:
133133
return os.path.join(
134134
_pipeline_prefix(
135-
Env.HAIL_SEARCH_DATA,
135+
Env.HAIL_SEARCH_DATA_DIR,
136136
reference_genome,
137137
dataset_type,
138138
),
@@ -149,7 +149,7 @@ def relatedness_check_table_path(
149149
) -> str:
150150
return os.path.join(
151151
_pipeline_prefix(
152-
Env.LOADING_DATASETS,
152+
Env.LOADING_DATASETS_DIR,
153153
reference_genome,
154154
dataset_type,
155155
),
@@ -166,7 +166,7 @@ def remapped_and_subsetted_callset_path(
166166
) -> str:
167167
return os.path.join(
168168
_pipeline_prefix(
169-
Env.LOADING_DATASETS,
169+
Env.LOADING_DATASETS_DIR,
170170
reference_genome,
171171
dataset_type,
172172
),
@@ -182,7 +182,7 @@ def lookup_table_path(
182182
) -> str:
183183
return os.path.join(
184184
_pipeline_prefix(
185-
Env.HAIL_SEARCH_DATA,
185+
Env.HAIL_SEARCH_DATA_DIR,
186186
reference_genome,
187187
dataset_type,
188188
),
@@ -196,7 +196,7 @@ def runs_path(
196196
) -> str:
197197
return os.path.join(
198198
_pipeline_prefix(
199-
Env.HAIL_SEARCH_DATA,
199+
Env.HAIL_SEARCH_DATA_DIR,
200200
reference_genome,
201201
dataset_type,
202202
),
@@ -211,7 +211,7 @@ def sex_check_table_path(
211211
) -> str:
212212
return os.path.join(
213213
_pipeline_prefix(
214-
Env.LOADING_DATASETS,
214+
Env.LOADING_DATASETS_DIR,
215215
reference_genome,
216216
dataset_type,
217217
),
@@ -265,7 +265,7 @@ def variant_annotations_table_path(
265265
) -> str:
266266
return os.path.join(
267267
_pipeline_prefix(
268-
Env.HAIL_SEARCH_DATA,
268+
Env.HAIL_SEARCH_DATA_DIR,
269269
reference_genome,
270270
dataset_type,
271271
),
@@ -279,7 +279,7 @@ def variant_annotations_vcf_path(
279279
) -> str:
280280
return os.path.join(
281281
_pipeline_prefix(
282-
Env.HAIL_SEARCH_DATA,
282+
Env.HAIL_SEARCH_DATA_DIR,
283283
reference_genome,
284284
dataset_type,
285285
),
@@ -304,7 +304,7 @@ def new_variants_table_path(
304304

305305
def clinvar_dataset_path(reference_genome: ReferenceGenome, etag: str) -> str:
306306
return os.path.join(
307-
Env.HAIL_TMPDIR,
307+
Env.HAIL_TMP_DIR,
308308
f'clinvar-{reference_genome.value}-{etag}.ht',
309309
)
310310

@@ -317,7 +317,7 @@ def project_remap_path(
317317
) -> str:
318318
return os.path.join(
319319
_pipeline_prefix(
320-
Env.LOADING_DATASETS,
320+
Env.LOADING_DATASETS_DIR,
321321
reference_genome,
322322
dataset_type,
323323
),
@@ -335,7 +335,7 @@ def project_pedigree_path(
335335
) -> str:
336336
return os.path.join(
337337
_pipeline_prefix(
338-
Env.LOADING_DATASETS,
338+
Env.LOADING_DATASETS_DIR,
339339
reference_genome,
340340
dataset_type,
341341
),
@@ -347,7 +347,7 @@ def project_pedigree_path(
347347

348348
def loading_pipeline_queue_path() -> str:
349349
return os.path.join(
350-
Env.LOADING_DATASETS,
350+
Env.LOADING_DATASETS_DIR,
351351
'loading_pipeline_queue',
352352
'request.json',
353353
)

v03_pipeline/lib/paths_test.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ def test_family_table_path(self) -> None:
5050
'/seqr/hail-search-data/v3.1/GRCh37/SNV_INDEL/families/WES/franklin.ht',
5151
)
5252
with patch('v03_pipeline.lib.paths.Env') as mock_env:
53-
mock_env.HAIL_SEARCH_DATA = 'gs://seqr-datasets/'
53+
mock_env.HAIL_SEARCH_DATA_DIR = 'gs://seqr-datasets/'
5454
self.assertEqual(
5555
family_table_path(
5656
ReferenceGenome.GRCh37,

v03_pipeline/lib/reference_data/clinvar.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,7 @@ def download_and_import_latest_clinvar_vcf(
139139
with tempfile.NamedTemporaryFile(suffix='.vcf.gz', delete=False) as tmp_file:
140140
urllib.request.urlretrieve(clinvar_url, tmp_file.name) # noqa: S310
141141
gcs_tmp_file_name = os.path.join(
142-
Env.HAIL_TMPDIR,
142+
Env.HAIL_TMP_DIR,
143143
os.path.basename(tmp_file.name),
144144
)
145145
safely_move_to_gcs(tmp_file.name, gcs_tmp_file_name)
@@ -203,7 +203,7 @@ def download_and_import_clinvar_submission_summary() -> hl.Table:
203203
shutil.copyfileobj(f_in, f_out)
204204

205205
gcs_tmp_file_name = os.path.join(
206-
Env.HAIL_TMPDIR,
206+
Env.HAIL_TMP_DIR,
207207
os.path.basename(unzipped_tmp_file.name),
208208
)
209209
safely_move_to_gcs(unzipped_tmp_file.name, gcs_tmp_file_name)

v03_pipeline/lib/tasks/base/base_hail_table.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ def complete(self) -> bool:
2323

2424
def init_hail(self):
2525
# Need to use the GCP bucket as temp storage for very large callset joins
26-
hl.init(tmp_dir=Env.HAIL_TMPDIR, idempotent=True)
26+
hl.init(tmp_dir=Env.HAIL_TMP_DIR, idempotent=True)
2727

2828
# Interval ref data join causes shuffle death, this prevents it
2929
hl._set_flags(use_new_shuffle='1', no_whole_stage_codegen='1') # noqa: SLF001

v03_pipeline/lib/tasks/base/base_update_lookup_table.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,5 +46,6 @@ def initialize_table(self) -> hl.Table:
4646
remap_pedigree_hash=hl.tint32,
4747
),
4848
),
49+
migrations=hl.empty_array(hl.tstr),
4950
),
5051
)

v03_pipeline/lib/tasks/update_lookup_table.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,5 +115,6 @@ def update_table(self, ht: hl.Table) -> hl.Table:
115115
),
116116
),
117117
),
118+
migrations=ht.migrations,
118119
)
119120
return ht

v03_pipeline/lib/tasks/update_lookup_table_test.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ def test_skip_update_lookup_table_task(self) -> None:
5151
),
5252
),
5353
},
54+
migrations=[],
5455
),
5556
],
5657
)
@@ -89,6 +90,7 @@ def test_update_lookup_table_task(self) -> None:
8990
),
9091
),
9192
},
93+
migrations=[],
9294
),
9395
],
9496
)

v03_pipeline/lib/tasks/update_lookup_table_with_deleted_families_test.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ def test_delete_project_empty_table(
3333
project_sample_types=[],
3434
project_families={},
3535
updates=set(),
36+
migrations=[],
3637
),
3738
],
3839
)
@@ -137,6 +138,7 @@ def test_delete_project(
137138
remap_pedigree_hash=123,
138139
),
139140
},
141+
migrations=hl.empty_array(hl.tstr),
140142
),
141143
)
142144
worker = luigi.worker.Worker()
@@ -172,6 +174,7 @@ def test_delete_project(
172174
remap_pedigree_hash=123,
173175
),
174176
},
177+
migrations=[],
175178
),
176179
],
177180
)

v03_pipeline/lib/tasks/update_lookup_table_with_deleted_project_test.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ def test_delete_project_empty_table(
3232
project_sample_types=[],
3333
project_families={},
3434
updates=set(),
35+
migrations=[],
3536
),
3637
],
3738
)
@@ -136,6 +137,7 @@ def test_delete_project(
136137
remap_pedigree_hash=123,
137138
),
138139
},
140+
migrations=hl.empty_array(hl.tstr),
139141
),
140142
)
141143
worker = luigi.worker.Worker()
@@ -162,6 +164,7 @@ def test_delete_project(
162164
remap_pedigree_hash=123,
163165
),
164166
},
167+
migrations=[],
165168
),
166169
],
167170
)

v03_pipeline/lib/tasks/update_variant_annotations_table_with_deleted_project_test.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,7 @@ def setUp(self) -> None:
107107
remap_pedigree_hash=123,
108108
),
109109
},
110+
migrations=hl.empty_array(hl.tstr),
110111
),
111112
)
112113
ht.write(

v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1221,7 +1221,7 @@ def test_sv_update_vat(
12211221
self.assertTrue(update_variant_annotations_task.complete())
12221222
self.assertFalse(
12231223
GCSorLocalFolderTarget(
1224-
f'{self.mock_env.REFERENCE_DATASETS}/v03/GRCh38/SV/lookup.ht',
1224+
f'{self.mock_env.REFERENCE_DATASETS_DIR}/v03/GRCh38/SV/lookup.ht',
12251225
).exists(),
12261226
)
12271227
ht = hl.read_table(update_variant_annotations_task.output().path)
@@ -1800,7 +1800,7 @@ def test_gcnv_update_vat(
18001800
self.assertTrue(update_variant_annotations_task.complete())
18011801
self.assertFalse(
18021802
GCSorLocalFolderTarget(
1803-
f'{self.mock_env.REFERENCE_DATASETS}/v03/GRCh38/GCNV/lookup.ht',
1803+
f'{self.mock_env.REFERENCE_DATASETS_DIR}/v03/GRCh38/GCNV/lookup.ht',
18041804
).exists(),
18051805
)
18061806
ht = hl.read_table(update_variant_annotations_task.output().path)

Comments (0)