Skip to content

Commit 0fa712a

Browse files
committed
Merge branch 'dev' of github.com:broadinstitute/seqr-loading-pipelines into dev
2 parents cc50910 + e2d6433 commit 0fa712a

25 files changed

+322
-578
lines changed

v03_pipeline/lib/misc/callsets.py

Lines changed: 3 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,11 @@
66
from v03_pipeline.lib.paths import remapped_and_subsetted_callset_path
77

88

9-
def get_callset_ht( # noqa: PLR0913
9+
def get_callset_ht(
1010
reference_genome: ReferenceGenome,
1111
dataset_type: DatasetType,
12-
callset_paths: list[str],
12+
callset_path: str,
1313
project_guids: list[str],
14-
project_remap_paths: list[str],
15-
project_pedigree_paths: list[str],
16-
imputed_sex_paths: list[str] | None,
1714
):
1815
callset_hts = [
1916
hl.read_matrix_table(
@@ -24,58 +21,10 @@ def get_callset_ht( # noqa: PLR0913
2421
project_guid,
2522
),
2623
).rows()
27-
for (callset_path, project_guid, _, _, _) in callset_project_pairs(
28-
callset_paths,
29-
project_guids,
30-
project_remap_paths,
31-
project_pedigree_paths,
32-
imputed_sex_paths,
33-
)
24+
for project_guid in project_guids
3425
]
3526
callset_ht = functools.reduce(
3627
(lambda ht1, ht2: ht1.union(ht2, unify=True)),
3728
callset_hts,
3829
)
3930
return callset_ht.distinct()
40-
41-
42-
def callset_project_pairs(
43-
callset_paths: list[str],
44-
project_guids: list[str],
45-
project_remap_paths: list[str],
46-
project_pedigree_paths: list[str],
47-
imputed_sex_paths: list[str] | None,
48-
):
49-
if len(callset_paths) == len(project_guids):
50-
return zip(
51-
callset_paths,
52-
project_guids,
53-
project_remap_paths,
54-
project_pedigree_paths,
55-
imputed_sex_paths
56-
if imputed_sex_paths is not None
57-
else [None] * len(callset_paths),
58-
strict=True,
59-
)
60-
return (
61-
(
62-
callset_path,
63-
project_guid,
64-
project_remap_path,
65-
project_pedigree_path,
66-
imputed_sex_path,
67-
)
68-
for callset_path, imputed_sex_path in zip(
69-
callset_paths,
70-
imputed_sex_paths
71-
if imputed_sex_paths is not None
72-
else [None] * len(callset_paths),
73-
strict=False,
74-
)
75-
for (project_guid, project_remap_path, project_pedigree_path) in zip(
76-
project_guids,
77-
project_remap_paths,
78-
project_pedigree_paths,
79-
strict=True,
80-
)
81-
)

v03_pipeline/lib/misc/io.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,6 @@ def import_callset(
121121
callset_path: str,
122122
reference_genome: ReferenceGenome,
123123
dataset_type: DatasetType,
124-
filters_path: str | None = None,
125124
) -> hl.MatrixTable:
126125
if dataset_type == DatasetType.GCNV:
127126
mt = import_gcnv_bed_file(callset_path)
@@ -131,9 +130,6 @@ def import_callset(
131130
mt = hl.read_matrix_table(callset_path)
132131
if dataset_type == DatasetType.SV:
133132
mt = mt.annotate_rows(variant_id=mt.rsid)
134-
if filters_path:
135-
filters_ht = import_vcf(filters_path, reference_genome).rows()
136-
mt = mt.annotate_rows(filters=filters_ht[mt.row_key].filters)
137133
return mt.key_rows_by(*dataset_type.table_key_type(reference_genome).fields)
138134

139135

v03_pipeline/lib/model/dataset_type.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import hail as hl
55

66
from v03_pipeline.lib.annotations import gcnv, mito, shared, snv_indel, sv
7-
from v03_pipeline.lib.model.definitions import ReferenceGenome
7+
from v03_pipeline.lib.model.definitions import ReferenceGenome, SampleType
88

99
MITO_MIN_HOM_THRESHOLD = 0.95
1010
ZERO = 0.0
@@ -155,6 +155,12 @@ def has_gencode_ensembl_to_refseq_id_mapping(
155155
self == DatasetType.SNV_INDEL and reference_genome == ReferenceGenome.GRCh38
156156
)
157157

158+
def expect_filters(
    self,
    sample_type: SampleType,
) -> bool:
    """Return True only for the SNV_INDEL dataset type paired with WES samples.

    Every other dataset type / sample type combination yields False.
    """
    # Guard on the dataset type first; the sample type only matters for SNV_INDEL.
    if self != DatasetType.SNV_INDEL:
        return False
    return sample_type == SampleType.WES
163+
158164
@property
159165
def has_gencode_gene_symbol_to_gene_id_mapping(self) -> bool:
160166
return self == DatasetType.SV

v03_pipeline/lib/model/environment.py

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,12 @@
22
from dataclasses import dataclass
33

44
# NB: using os.environ.get inside the dataclass defaults gives a lint error.
5-
ACCESS_PRIVATE_REFERENCE_DATASETS = (
6-
os.environ.get('ACCESS_PRIVATE_REFERENCE_DATASETS') == '1'
7-
)
8-
REFERENCE_DATA_AUTO_UPDATE = os.environ.get('REFERENCE_DATA_AUTO_UPDATE') == '1'
95
HAIL_TMPDIR = os.environ.get('HAIL_TMPDIR', '/tmp') # noqa: S108
106
HAIL_SEARCH_DATA = os.environ.get('HAIL_SEARCH_DATA', '/hail-search-data')
7+
LIFTOVER_REF_PATH = os.environ.get(
8+
'LIFTOVER_REF_PATH',
9+
'gs://hail-common/references/grch38_to_grch37.over.chain.gz',
10+
)
1111
LOADING_DATASETS = os.environ.get('LOADING_DATASETS', '/seqr-loading-temp')
1212
PRIVATE_REFERENCE_DATASETS = os.environ.get(
1313
'PRIVATE_REFERENCE_DATASETS',
@@ -19,21 +19,34 @@
1919
)
2020
VEP_CONFIG_PATH = os.environ.get('VEP_CONFIG_PATH', None)
2121
VEP_CONFIG_URI = os.environ.get('VEP_CONFIG_URI', None)
22-
SHOULD_REGISTER_ALLELES = os.environ.get('SHOULD_REGISTER_ALLELES') == '1'
22+
23+
# Allele registry secrets :/
2324
ALLELE_REGISTRY_SECRET_NAME = os.environ.get('ALLELE_REGISTRY_SECRET_NAME', None)
2425
PROJECT_ID = os.environ.get('PROJECT_ID', None)
2526

27+
# Feature Flags
28+
ACCESS_PRIVATE_REFERENCE_DATASETS = (
29+
os.environ.get('ACCESS_PRIVATE_REFERENCE_DATASETS') == '1'
30+
)
31+
CHECK_SEX_AND_RELATEDNESS = os.environ.get('CHECK_SEX_AND_RELATEDNESS') == '1'
32+
EXPECT_WES_FILTERS = os.environ.get('EXPECT_WES_FILTERS') == '1'
33+
REFERENCE_DATA_AUTO_UPDATE = os.environ.get('REFERENCE_DATA_AUTO_UPDATE') == '1'
34+
SHOULD_REGISTER_ALLELES = os.environ.get('SHOULD_REGISTER_ALLELES') == '1'
35+
2636

2737
@dataclass
2838
class Env:
2939
ACCESS_PRIVATE_REFERENCE_DATASETS: bool = ACCESS_PRIVATE_REFERENCE_DATASETS
3040
ALLELE_REGISTRY_SECRET_NAME: str | None = ALLELE_REGISTRY_SECRET_NAME
31-
REFERENCE_DATA_AUTO_UPDATE: bool = REFERENCE_DATA_AUTO_UPDATE
41+
CHECK_SEX_AND_RELATEDNESS: bool = CHECK_SEX_AND_RELATEDNESS
42+
EXPECT_WES_FILTERS: bool = EXPECT_WES_FILTERS
3243
HAIL_TMPDIR: str = HAIL_TMPDIR
3344
HAIL_SEARCH_DATA: str = HAIL_SEARCH_DATA
45+
LIFTOVER_REF_PATH: str = LIFTOVER_REF_PATH
3446
LOADING_DATASETS: str = LOADING_DATASETS
3547
PRIVATE_REFERENCE_DATASETS: str = PRIVATE_REFERENCE_DATASETS
3648
PROJECT_ID: str | None = PROJECT_ID
49+
REFERENCE_DATA_AUTO_UPDATE: bool = REFERENCE_DATA_AUTO_UPDATE
3750
REFERENCE_DATASETS: str = REFERENCE_DATASETS
3851
SHOULD_REGISTER_ALLELES: bool = SHOULD_REGISTER_ALLELES
3952
VEP_CONFIG_PATH: str | None = VEP_CONFIG_PATH

v03_pipeline/lib/paths.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import hashlib
22
import os
3+
import re
34

45
from v03_pipeline.lib.model import (
56
AccessControl,
@@ -9,6 +10,7 @@
910
PipelineVersion,
1011
ReferenceDatasetCollection,
1112
ReferenceGenome,
13+
SampleType,
1214
)
1315

1416

@@ -73,6 +75,22 @@ def family_table_path(
7375
)
7476

7577

78+
def imputed_sex_path(
    reference_genome: ReferenceGenome,
    dataset_type: DatasetType,
    callset_path: str,
) -> str:
    """Build the path of the imputed sex TSV associated with a callset.

    The file sits in an ``imputed_sex`` directory beneath the pipeline prefix
    computed from ``Env.LOADING_DATASETS``, ``reference_genome`` and
    ``dataset_type``. Its name is the SHA-256 hex digest of the UTF-8 encoded
    ``callset_path`` with a ``.tsv`` extension.
    """
    prefix = _v03_pipeline_prefix(
        Env.LOADING_DATASETS,
        reference_genome,
        dataset_type,
    )
    callset_digest = hashlib.sha256(callset_path.encode('utf8')).hexdigest()
    file_name = f'{callset_digest}.tsv'
    return os.path.join(prefix, 'imputed_sex', file_name)
92+
93+
7694
def imported_callset_path(
7795
reference_genome: ReferenceGenome,
7896
dataset_type: DatasetType,
@@ -198,6 +216,24 @@ def sex_check_table_path(
198216
)
199217

200218

219+
def valid_filters_path(
    dataset_type: DatasetType,
    sample_type: SampleType,
    callset_path: str,
) -> str | None:
    """Derive the filters VCF glob that corresponds to a callset path.

    Everything from ``part_one_outputs/`` to the end of ``callset_path`` is
    replaced with ``part_two_outputs/*.filtered.*.vcf.gz``.

    Returns ``None`` when any of the following holds:
      * ``Env.EXPECT_WES_FILTERS`` is falsy,
      * ``dataset_type.expect_filters(sample_type)`` is falsy,
      * ``callset_path`` has no ``part_one_outputs/`` segment to rewrite.
    """
    if (
        not Env.EXPECT_WES_FILTERS
        or not dataset_type.expect_filters(sample_type)
    ):
        return None
    # Use the substitution count rather than a separate substring check: a
    # path containing "part_one_outputs" without the trailing slash would
    # otherwise pass the guard, match nothing, and be returned unchanged as
    # if the callset itself were the filters file.
    filters_path, n_replaced = re.subn(
        r'part_one_outputs/.*$',
        'part_two_outputs/*.filtered.*.vcf.gz',
        callset_path,
    )
    return filters_path if n_replaced else None
235+
236+
201237
def valid_reference_dataset_collection_path(
202238
reference_genome: ReferenceGenome,
203239
dataset_type: DatasetType,

v03_pipeline/lib/paths_test.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,18 +6,21 @@
66
DatasetType,
77
ReferenceDatasetCollection,
88
ReferenceGenome,
9+
SampleType,
910
)
1011
from v03_pipeline.lib.paths import (
1112
cached_reference_dataset_query_path,
1213
family_table_path,
1314
imported_callset_path,
15+
imputed_sex_path,
1416
lookup_table_path,
1517
metadata_for_run_path,
1618
new_variants_table_path,
1719
project_table_path,
1820
relatedness_check_table_path,
1921
remapped_and_subsetted_callset_path,
2022
sex_check_table_path,
23+
valid_filters_path,
2124
valid_reference_dataset_collection_path,
2225
variant_annotations_table_path,
2326
)
@@ -54,6 +57,26 @@ def test_family_table_path(self) -> None:
5457
'gs://seqr-datasets/v03/GRCh37/SNV_INDEL/families/franklin.ht',
5558
)
5659

60+
def test_valid_filters_path(self) -> None:
    """Check the None result for MITO and the rewritten path for WES SNV_INDEL."""
    callset_path = 'gs://bucket/RDG_Broad_WES_Internal_Oct2023/part_one_outputs/chr*/*.vcf.gz'

    # MITO dataset type with Env left unpatched: no filters path is produced.
    mito_result = valid_filters_path(
        DatasetType.MITO,
        SampleType.WES,
        callset_path,
    )
    self.assertEqual(mito_result, None)

    # SNV_INDEL + WES with the EXPECT_WES_FILTERS flag switched on: the
    # part_one_outputs suffix is rewritten to the part_two_outputs glob.
    with patch('v03_pipeline.lib.paths.Env') as mock_env:
        mock_env.EXPECT_WES_FILTERS = True
        wes_result = valid_filters_path(
            DatasetType.SNV_INDEL,
            SampleType.WES,
            callset_path,
        )
        self.assertEqual(
            wes_result,
            'gs://bucket/RDG_Broad_WES_Internal_Oct2023/part_two_outputs/*.filtered.*.vcf.gz',
        )
79+
5780
def test_project_table_path(self) -> None:
5881
self.assertEqual(
5982
project_table_path(
@@ -162,6 +185,16 @@ def test_imported_callset_path(self) -> None:
162185
'/seqr-loading-temp/v03/GRCh38/SNV_INDEL/imported_callsets/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.mt',
163186
)
164187

188+
def test_imputed_sex_path(self) -> None:
    """Check the imputed sex TSV path produced for a GRCh38 SNV_INDEL callset."""
    expected_path = '/seqr-loading-temp/v03/GRCh38/SNV_INDEL/imputed_sex/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.tsv'
    actual_path = imputed_sex_path(
        ReferenceGenome.GRCh38,
        DatasetType.SNV_INDEL,
        'gs://abc.efg/callset.vcf.gz',
    )
    self.assertEqual(actual_path, expected_path)
197+
165198
def test_new_variants_table_path(self) -> None:
166199
self.assertEqual(
167200
new_variants_table_path(
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
import luigi
2+
3+
from v03_pipeline.lib.model import SampleType
4+
5+
6+
def _explicit_bool_param() -> luigi.BoolParameter:
    # Shared shape of the boolean switches below: default False, with
    # explicit parsing of the supplied value.
    return luigi.BoolParameter(
        default=False,
        parsing=luigi.BoolParameter.EXPLICIT_PARSING,
    )


class BaseLoadingRunParams(luigi.Task):
    """Parameter declarations shared by the loading-run tasks."""

    # NB:
    # These params are "inherited" with the special
    # luigi.util.inherits function, copying params
    # but nothing else.
    sample_type = luigi.EnumParameter(enum=SampleType)
    callset_path = luigi.Parameter()

    # Parameters are created in the same order as before.
    ignore_missing_samples_when_remapping = _explicit_bool_param()
    force = _explicit_bool_param()
    skip_check_sex_and_relatedness = _explicit_bool_param()
    skip_expect_filters = _explicit_bool_param()
    skip_validation = _explicit_bool_param()

    # Unlike the switches above, this one keeps luigi's default parsing mode.
    is_new_gcnv_joint_call = luigi.BoolParameter(
        default=False,
        description='Is this a fully joint-called callset.',
    )

0 commit comments

Comments
 (0)