Skip to content

Commit 907eee4

Browse files
committed
use predetermined filters/imputed_sex paths
1 parent 9657823 commit 907eee4

File tree

6 files changed

+105
-24
lines changed

6 files changed

+105
-24
lines changed

v03_pipeline/lib/model/environment.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,6 @@
22
from dataclasses import dataclass
33

44
# NB: using os.environ.get inside the dataclass defaults gives a lint error.
5-
ACCESS_PRIVATE_REFERENCE_DATASETS = (
6-
os.environ.get('ACCESS_PRIVATE_REFERENCE_DATASETS') == '1'
7-
)
8-
REFERENCE_DATA_AUTO_UPDATE = os.environ.get('REFERENCE_DATA_AUTO_UPDATE') == '1'
95
HAIL_TMPDIR = os.environ.get('HAIL_TMPDIR', '/tmp') # noqa: S108
106
HAIL_SEARCH_DATA = os.environ.get('HAIL_SEARCH_DATA', '/hail-search-data')
117
LOADING_DATASETS = os.environ.get('LOADING_DATASETS', '/seqr-loading-temp')
@@ -19,22 +15,32 @@
1915
)
2016
VEP_CONFIG_PATH = os.environ.get('VEP_CONFIG_PATH', None)
2117
VEP_CONFIG_URI = os.environ.get('VEP_CONFIG_URI', None)
22-
SHOULD_REGISTER_ALLELES = os.environ.get('SHOULD_REGISTER_ALLELES') == '1'
18+
19+
# Allele registry secrets :/
2320
ALLELE_REGISTRY_SECRET_NAME = os.environ.get('ALLELE_REGISTRY_SECRET_NAME', None)
2421
PROJECT_ID = os.environ.get('PROJECT_ID', None)
2522

23+
# Feature Flags
24+
ACCESS_PRIVATE_REFERENCE_DATASETS = (
25+
os.environ.get('ACCESS_PRIVATE_REFERENCE_DATASETS') == '1'
26+
)
27+
EXPECT_WES_FILTERS = os.environ.get('EXPECT_WES_FILTERS') == '1'
28+
REFERENCE_DATA_AUTO_UPDATE = os.environ.get('REFERENCE_DATA_AUTO_UPDATE') == '1'
29+
SHOULD_REGISTER_ALLELES = os.environ.get('SHOULD_REGISTER_ALLELES') == '1'
30+
2631

2732
@dataclass
2833
class Env:
2934
ACCESS_PRIVATE_REFERENCE_DATASETS: bool = ACCESS_PRIVATE_REFERENCE_DATASETS
3035
ALLELE_REGISTRY_SECRET_NAME: str | None = ALLELE_REGISTRY_SECRET_NAME
31-
REFERENCE_DATA_AUTO_UPDATE: bool = REFERENCE_DATA_AUTO_UPDATE
36+
EXPECT_WES_FILTERS: bool = EXPECT_WES_FILTERS
3237
HAIL_TMPDIR: str = HAIL_TMPDIR
3338
HAIL_SEARCH_DATA: str = HAIL_SEARCH_DATA
3439
LOADING_DATASETS: str = LOADING_DATASETS
3540
PRIVATE_REFERENCE_DATASETS: str = PRIVATE_REFERENCE_DATASETS
3641
PROJECT_ID: str | None = PROJECT_ID
3742
REFERENCE_DATASETS: str = REFERENCE_DATASETS
43+
REFERENCE_DATA_AUTO_UPDATE: bool = REFERENCE_DATA_AUTO_UPDATE
3844
SHOULD_REGISTER_ALLELES: bool = SHOULD_REGISTER_ALLELES
3945
VEP_CONFIG_PATH: str | None = VEP_CONFIG_PATH
4046
VEP_CONFIG_URI: str | None = VEP_CONFIG_URI

v03_pipeline/lib/paths.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import hashlib
22
import os
3+
import re
34

45
from v03_pipeline.lib.model import (
56
AccessControl,
@@ -9,6 +10,7 @@
910
PipelineVersion,
1011
ReferenceDatasetCollection,
1112
ReferenceGenome,
13+
SampleType,
1214
)
1315

1416

@@ -73,6 +75,22 @@ def family_table_path(
7375
)
7476

7577

78+
def imputed_sex_path(
    reference_genome: ReferenceGenome,
    dataset_type: DatasetType,
    callset_path: str,
) -> str:
    """Return the path of the imputed sex TSV associated with a callset.

    The file lives in an ``imputed_sex`` directory under the pipeline prefix
    built from ``Env.LOADING_DATASETS``, the reference genome and the dataset
    type.  Its name is the SHA-256 hex digest of the UTF-8 encoded
    ``callset_path`` followed by ``.tsv``.
    """
    loading_prefix = _v03_pipeline_prefix(
        Env.LOADING_DATASETS,
        reference_genome,
        dataset_type,
    )
    callset_digest = hashlib.sha256(callset_path.encode('utf8')).hexdigest()
    return os.path.join(loading_prefix, 'imputed_sex', f'{callset_digest}.tsv')
92+
93+
7694
def imported_callset_path(
7795
reference_genome: ReferenceGenome,
7896
dataset_type: DatasetType,
@@ -198,6 +216,25 @@ def sex_check_table_path(
198216
)
199217

200218

219+
def valid_filters_path(
    dataset_type: DatasetType,
    sample_type: SampleType,
    callset_path: str,
) -> str | None:
    """Derive the glob of filter VCF shards that accompanies a callset.

    Args:
        dataset_type: dataset type of the callset; only ``SNV_INDEL`` has filters.
        sample_type: sample type of the callset; only ``WES`` has filters.
        callset_path: path of the callset being loaded.

    Returns:
        ``callset_path`` with everything from ``part_one_outputs/`` onward
        replaced by ``part_two_outputs/*.filtered.*.vcf.gz``, or ``None`` when
        ``Env.EXPECT_WES_FILTERS`` is off, the dataset/sample type does not
        match, or the path has no ``part_one_outputs/`` component.
    """
    if (
        not Env.EXPECT_WES_FILTERS
        or dataset_type != DatasetType.SNV_INDEL
        or sample_type != SampleType.WES
    ):
        return None
    # Use the substitution count rather than a separate substring check: a
    # path that merely contains "part_one_outputs" without the trailing "/"
    # would otherwise come back unchanged, i.e. the callset path itself would
    # be reported as the filters path.
    filters_path, n_substitutions = re.subn(
        r'part_one_outputs/.*$',
        'part_two_outputs/*.filtered.*.vcf.gz',
        callset_path,
    )
    return filters_path if n_substitutions else None
236+
237+
201238
def valid_reference_dataset_collection_path(
202239
reference_genome: ReferenceGenome,
203240
dataset_type: DatasetType,

v03_pipeline/lib/paths_test.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,18 +6,21 @@
66
DatasetType,
77
ReferenceDatasetCollection,
88
ReferenceGenome,
9+
SampleType,
910
)
1011
from v03_pipeline.lib.paths import (
1112
cached_reference_dataset_query_path,
1213
family_table_path,
1314
imported_callset_path,
15+
imputed_sex_path,
1416
lookup_table_path,
1517
metadata_for_run_path,
1618
new_variants_table_path,
1719
project_table_path,
1820
relatedness_check_table_path,
1921
remapped_and_subsetted_callset_path,
2022
sex_check_table_path,
23+
valid_filters_path,
2124
valid_reference_dataset_collection_path,
2225
variant_annotations_table_path,
2326
)
@@ -54,6 +57,24 @@ def test_family_table_path(self) -> None:
5457
'gs://seqr-datasets/v03/GRCh37/SNV_INDEL/families/franklin.ht',
5558
)
5659

60+
def test_valid_filters_path(self) -> None:
    # valid_filters_path takes (dataset_type, sample_type, callset_path);
    # all three are required positional arguments.
    callset_path = 'gs://bucket/RDG_Broad_WES_Internal_Oct2023/part_one_outputs/chr*/*.vcf.gz'
    # Without the EXPECT_WES_FILTERS flag enabled, no filters path is derived.
    self.assertEqual(
        valid_filters_path(
            DatasetType.SNV_INDEL,
            SampleType.WES,
            callset_path,
        ),
        None,
    )
    with patch('v03_pipeline.lib.paths.Env') as mock_env:
        mock_env.EXPECT_WES_FILTERS = True
        self.assertEqual(
            valid_filters_path(
                DatasetType.SNV_INDEL,
                SampleType.WES,
                callset_path,
            ),
            'gs://bucket/RDG_Broad_WES_Internal_Oct2023/part_two_outputs/*.filtered.*.vcf.gz',
        )
        # A callset outside a part_one_outputs directory has no filters,
        # even when the flag is on.
        self.assertEqual(
            valid_filters_path(
                DatasetType.SNV_INDEL,
                SampleType.WES,
                'gs://bucket/callset.vcf.gz',
            ),
            None,
        )
77+
5778
def test_project_table_path(self) -> None:
5879
self.assertEqual(
5980
project_table_path(
@@ -162,6 +183,16 @@ def test_imported_callset_path(self) -> None:
162183
'/seqr-loading-temp/v03/GRCh38/SNV_INDEL/imported_callsets/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.mt',
163184
)
164185

186+
def test_imputed_sex_path(self) -> None:
    # The expected file name is the SHA-256 hex digest of the callset path.
    expected = '/seqr-loading-temp/v03/GRCh38/SNV_INDEL/imputed_sex/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.tsv'
    actual = imputed_sex_path(
        ReferenceGenome.GRCh38,
        DatasetType.SNV_INDEL,
        'gs://abc.efg/callset.vcf.gz',
    )
    self.assertEqual(actual, expected)
195+
165196
def test_new_variants_table_path(self) -> None:
166197
self.assertEqual(
167198
new_variants_table_path(

v03_pipeline/lib/tasks/base/base_loading_run_params.py

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -10,16 +10,6 @@ class BaseLoadingRunParams(luigi.Task):
1010
# but nothing else.
1111
sample_type = luigi.EnumParameter(enum=SampleType)
1212
callset_path = luigi.Parameter()
13-
# HINT: OptionalParameter vs Parameter is significant here.
14-
# The default Parameter will cast `None` to the string "None".
15-
imputed_sex_path = luigi.OptionalParameter(
16-
default=None,
17-
description='Optional path to a tsv of imputed sex values from the DRAGEN GVS pipeline.',
18-
)
19-
filters_path = luigi.OptionalParameter(
20-
default=None,
21-
description='Optional path to part two outputs from callset (VCF shards containing filter information)',
22-
)
2313
ignore_missing_samples_when_remapping = luigi.BoolParameter(
2414
default=False,
2515
parsing=luigi.BoolParameter.EXPLICIT_PARSING,

v03_pipeline/lib/tasks/write_imported_callset.py

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,13 @@
1616
validate_sample_type,
1717
)
1818
from v03_pipeline.lib.misc.vets import annotate_vets
19-
from v03_pipeline.lib.model import CachedReferenceDatasetQuery
19+
from v03_pipeline.lib.model import CachedReferenceDatasetQuery, DatasetType, SampleType
2020
from v03_pipeline.lib.model.environment import Env
2121
from v03_pipeline.lib.paths import (
2222
cached_reference_dataset_query_path,
2323
imported_callset_path,
2424
sex_check_table_path,
25+
valid_filters_path,
2526
)
2627
from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams
2728
from v03_pipeline.lib.tasks.base.base_write import BaseWriteTask
@@ -53,10 +54,18 @@ def output(self) -> luigi.Target:
5354

5455
def requires(self) -> list[luigi.Task]:
5556
requirements = []
56-
if self.filters_path:
57+
if (
58+
Env.EXPECT_WES_FILTERS
59+
and self.dataset_type == DatasetType.SNV_INDEL
60+
and self.sample_type == SampleType.WES
61+
):
5762
requirements = [
5863
*requirements,
59-
CallsetTask(self.filters_path),
64+
CallsetTask(
65+
valid_filters_path(
66+
self.dataset_type, self.sample_type, self.callset_path,
67+
),
68+
),
6069
]
6170
if self.validate and self.dataset_type.can_run_validation:
6271
requirements = [
@@ -108,11 +117,14 @@ def additional_row_fields(self, mt):
108117
}
109118

110119
def create_table(self) -> hl.MatrixTable:
120+
filters_path = valid_filters_path(
121+
self.dataset_type, self.sample_type, self.callset_path,
122+
)
111123
mt = import_callset(
112124
self.callset_path,
113125
self.reference_genome,
114126
self.dataset_type,
115-
self.filters_path,
127+
filters_path,
116128
)
117129
mt = select_relevant_fields(
118130
mt,
@@ -174,6 +186,6 @@ def create_table(self) -> hl.MatrixTable:
174186
)
175187
return mt.annotate_globals(
176188
callset_path=self.callset_path,
177-
filters_path=self.filters_path or hl.missing(hl.tstr),
189+
filters_path=filters_path or hl.missing(hl.tstr),
178190
sample_type=self.sample_type.value,
179191
)

v03_pipeline/lib/tasks/write_sex_check_table.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,13 @@
22
import luigi
33

44
from v03_pipeline.lib.misc.io import import_imputed_sex
5-
from v03_pipeline.lib.paths import sex_check_table_path
5+
from v03_pipeline.lib.paths import imputed_sex_path, sex_check_table_path
66
from v03_pipeline.lib.tasks.base.base_write import BaseWriteTask
77
from v03_pipeline.lib.tasks.files import GCSorLocalTarget, RawFileTask
88

99

1010
class WriteSexCheckTableTask(BaseWriteTask):
1111
callset_path = luigi.Parameter()
12-
imputed_sex_path = luigi.Parameter()
1312

1413
def output(self) -> luigi.Target:
1514
return GCSorLocalTarget(
@@ -21,7 +20,13 @@ def output(self) -> luigi.Target:
2120
)
2221

2322
def requires(self) -> luigi.Task:
24-
return RawFileTask(self.imputed_sex_path)
23+
return RawFileTask(
24+
imputed_sex_path(
25+
self.reference_genome,
26+
self.dataset_type,
27+
self.callset_path,
28+
),
29+
)
2530

2631
def create_table(self) -> hl.Table:
    """Import the imputed sex file produced by this task's requirement."""
    imputed_sex_target = self.input()
    return import_imputed_sex(imputed_sex_target.path)

0 commit comments

Comments
 (0)