Skip to content

Commit 7b81551

Browse files
authored
Merge pull request #903 from broadinstitute/dev
Dev
2 parents 8c18a89 + 24667eb commit 7b81551

13 files changed

+366
-45
lines changed

v03_pipeline/bin/dataproc_vep_init.bash

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -52,9 +52,9 @@ EOF
5252
gcc -Wall -Werror -O2 /vep.c -o /vep
5353
chmod u+s /vep
5454

55-
gcloud storage cp gs://seqr-luigi/releases/$ENVIRONMENT/latest/bin/download_vep_data.bash /download_vep_data.bash
56-
chmod +x /download_vep_data.bash
57-
./download_vep_data.bash $REFERENCE_GENOME
55+
gcloud storage cp gs://seqr-luigi/releases/$ENVIRONMENT/latest/bin/download_vep_reference_data.bash /download_vep_reference_data.bash
56+
chmod +x /download_vep_reference_data.bash
57+
./download_vep_reference_data.bash $REFERENCE_GENOME
5858

5959
gcloud storage cp gs://seqr-luigi/releases/$ENVIRONMENT/latest/bin/vep /vep.bash
6060
chmod +x /vep.bash

v03_pipeline/bin/download_vep_data.bash renamed to v03_pipeline/bin/download_vep_reference_data.bash

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
set -eux
44

55
REFERENCE_GENOME=$1
6-
VEP_DATA=/seqr/vep_data
6+
VEP_REFERENCE_DATASETS_DIR=${VEP_REFERENCE_DATASETS_DIR:-/seqr/vep-reference-data}
77

88
case $REFERENCE_GENOME in
99
GRCh38)
@@ -43,20 +43,20 @@ case $REFERENCE_GENOME in
4343
exit 1
4444
esac
4545

46-
if [ -f $VEP_DATA/$REFERENCE_GENOME/_SUCCESS ]; then
46+
if [ -f $VEP_REFERENCE_DATASETS_DIR/$REFERENCE_GENOME/_SUCCESS ]; then
4747
echo "Skipping download because already successful"
4848
exit 0;
4949
fi
5050

51-
mkdir -p $VEP_DATA/$REFERENCE_GENOME;
51+
mkdir -p $VEP_REFERENCE_DATASETS_DIR/$REFERENCE_GENOME;
5252
for vep_reference_data_file in ${VEP_REFERENCE_DATA_FILES[@]}; do
5353
if [[ $vep_reference_data_file == *.tar.gz ]]; then
5454
echo "Downloading and extracting" $vep_reference_data_file;
55-
gsutil cat $vep_reference_data_file | tar -xzf - -C $VEP_DATA/$REFERENCE_GENOME/ &
55+
gsutil cat $vep_reference_data_file | tar -xzf - -C $VEP_REFERENCE_DATASETS_DIR/$REFERENCE_GENOME/ &
5656
else
5757
echo "Downloading" $vep_reference_data_file;
5858
gsutil cp $vep_reference_data_file $VEP_DATA/$REFERENCE_GENOME/ &
5959
fi
6060
done;
6161
wait
62-
touch $VEP_DATA/$REFERENCE_GENOME/_SUCCESS
62+
touch $VEP_REFERENCE_DATASETS_DIR/$REFERENCE_GENOME/_SUCCESS

v03_pipeline/bin/vep

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
set -eux
44

55
REFERENCE_GENOME=$1
6-
VEP_DATA=/seqr/vep_data
6+
VEP_REFERENCE_DATASETS_DIR=${VEP_REFERENCE_DATASETS_DIR:-/seqr/vep-reference-data}
77
VEP_DOCKER_IMAGE="gcr.io/seqr-project/vep-docker-image"
88

99
case $REFERENCE_GENOME in
@@ -17,5 +17,5 @@ case $REFERENCE_GENOME in
1717
esac
1818

1919
shift # Remove the REFERENCE_GENOME arg.
20-
docker run --platform linux/amd64 -i -v $VEP_DATA/$REFERENCE_GENOME:/opt/vep/.vep/:ro $VEP_DOCKER_IMAGE:$REFERENCE_GENOME \
20+
docker run --platform linux/amd64 -i -v $VEP_REFERENCE_DATASETS_DIR/$REFERENCE_GENOME:/opt/vep/.vep/:ro $VEP_DOCKER_IMAGE:$REFERENCE_GENOME \
2121
/opt/vep/src/ensembl-vep/vep $@

v03_pipeline/lib/misc/family_loading_failures.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,14 @@
33
import hail as hl
44
import numpy as np
55

6+
from v03_pipeline.lib.logger import get_logger
67
from v03_pipeline.lib.misc.pedigree import Family, Relation, Sample
78
from v03_pipeline.lib.model import Sex
89

910
RELATEDNESS_TOLERANCE = 0.2
1011

12+
logger = get_logger(__name__)
13+
1114

1215
def passes_relatedness_check(
1316
relatedness_check_lookup: dict[tuple[str, str], list],
@@ -175,10 +178,19 @@ def get_families_failed_sex_check(
175178
failed_families = defaultdict(list)
176179
for family in families:
177180
for sample_id in family.samples:
178-
if family.samples[sample_id].sex not in {
179-
sex_check_lookup[sample_id],
180-
Sex.UNKNOWN,
181-
}: # NB: Unknown samples in pedigree are excluded from sex check.
181+
# NB: Both Unknown samples in pedigree and Unknown
182+
# samples in the predicted_sex are precluded from
183+
# failing the sex check.
184+
if (
185+
sex_check_lookup[sample_id] == Sex.UNKNOWN # noqa: PLR1714
186+
or family.samples[sample_id].sex == Sex.UNKNOWN
187+
):
188+
logger.info(
189+
f'Encountered sample with Unknown sex excluded from sex check: {sample_id}',
190+
)
191+
continue
192+
193+
if family.samples[sample_id].sex != sex_check_lookup[sample_id]:
182194
failed_families[family].append(
183195
f'Sample {sample_id} has pedigree sex {family.samples[sample_id].sex.value} but imputed sex {sex_check_lookup[sample_id].value}',
184196
)

v03_pipeline/lib/misc/family_loading_failures_test.py

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -56,12 +56,12 @@ def test_build_relatedness_check_lookup(self):
5656
def test_build_sex_check_lookup(self):
5757
ht = hl.Table.parallelize(
5858
[
59-
{'s': 'remapped_id', 'predicted_sex': 'M'},
60-
{'s': 'ROS_006_18Y03227_D1', 'predicted_sex': 'M'},
61-
{'s': 'ROS_006_18Y03228_D1', 'predicted_sex': 'M'},
62-
{'s': 'ROS_007_19Y05919_D1', 'predicted_sex': 'M'},
63-
{'s': 'ROS_007_19Y05939_D1', 'predicted_sex': 'F'},
64-
{'s': 'ROS_007_19Y05987_D1', 'predicted_sex': 'M'},
59+
{'s': 'ROS_006_18Y03226_D1', 'predicted_sex': 'F'},
60+
{'s': 'ROS_006_18Y03227_D1', 'predicted_sex': 'F'},
61+
{'s': 'ROS_006_18Y03228_D1', 'predicted_sex': 'F'},
62+
{'s': 'ROS_007_19Y05919_D1', 'predicted_sex': 'F'},
63+
{'s': 'ROS_007_19Y05939_D1', 'predicted_sex': 'M'},
64+
{'s': 'ROS_007_19Y05987_D1', 'predicted_sex': 'U'},
6565
],
6666
hl.tstruct(
6767
s=hl.tstr,
@@ -72,12 +72,12 @@ def test_build_sex_check_lookup(self):
7272
self.assertEqual(
7373
build_sex_check_lookup(ht, hl.dict({'ROS_006_18Y03226_D1': 'remapped_id'})),
7474
{
75-
'remapped_id': Sex.MALE,
76-
'ROS_006_18Y03227_D1': Sex.MALE,
77-
'ROS_006_18Y03228_D1': Sex.MALE,
78-
'ROS_007_19Y05919_D1': Sex.MALE,
79-
'ROS_007_19Y05939_D1': Sex.FEMALE,
80-
'ROS_007_19Y05987_D1': Sex.MALE,
75+
'remapped_id': Sex.FEMALE,
76+
'ROS_006_18Y03227_D1': Sex.FEMALE,
77+
'ROS_006_18Y03228_D1': Sex.FEMALE,
78+
'ROS_007_19Y05919_D1': Sex.FEMALE,
79+
'ROS_007_19Y05939_D1': Sex.MALE,
80+
'ROS_007_19Y05987_D1': Sex.UNKNOWN,
8181
},
8282
)
8383

@@ -178,12 +178,12 @@ def test_all_relatedness_checks(self):
178178
def test_get_families_failed_sex_check(self):
179179
sex_check_ht = hl.Table.parallelize(
180180
[
181-
{'s': 'ROS_006_18Y03226_D1', 'predicted_sex': 'M'},
182-
{'s': 'ROS_006_18Y03227_D1', 'predicted_sex': 'F'},
181+
{'s': 'ROS_006_18Y03226_D1', 'predicted_sex': 'F'},
182+
{'s': 'ROS_006_18Y03227_D1', 'predicted_sex': 'F'}, # Pedigree Sex U
183183
{'s': 'ROS_006_18Y03228_D1', 'predicted_sex': 'F'},
184184
{'s': 'ROS_007_19Y05919_D1', 'predicted_sex': 'F'},
185-
{'s': 'ROS_007_19Y05939_D1', 'predicted_sex': 'F'},
186-
{'s': 'ROS_007_19Y05987_D1', 'predicted_sex': 'F'},
185+
{'s': 'ROS_007_19Y05939_D1', 'predicted_sex': 'M'},
186+
{'s': 'ROS_007_19Y05987_D1', 'predicted_sex': 'U'}, # Pedigree Sex F
187187
],
188188
hl.tstruct(
189189
s=hl.tstr,
@@ -201,7 +201,7 @@ def test_get_families_failed_sex_check(self):
201201
failed_families.values(),
202202
[
203203
[
204-
'Sample ROS_006_18Y03226_D1 has pedigree sex F but imputed sex M',
204+
'Sample ROS_007_19Y05939_D1 has pedigree sex F but imputed sex M',
205205
],
206206
],
207207
)

v03_pipeline/lib/misc/io.py

Lines changed: 59 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,46 @@
11
import hashlib
22
import math
33
import os
4+
import re
45
import uuid
6+
from collections.abc import Callable
7+
from string import Template
58

69
import hail as hl
710
import hailtop.fs as hfs
811

912
from v03_pipeline.lib.misc.gcnv import parse_gcnv_genes
1013
from v03_pipeline.lib.misc.nested_field import parse_nested_field
14+
from v03_pipeline.lib.misc.validation import SeqrValidationError
1115
from v03_pipeline.lib.model import DatasetType, Env, ReferenceGenome, Sex
1216

1317
BIALLELIC = 2
1418
B_PER_MB = 1 << 20 # 1024 * 1024
1519
MB_PER_PARTITION = 128
1620
MAX_SAMPLES_SPLIT_MULTI_SHUFFLE = 100
1721

18-
MALE = 'Male'
19-
FEMALE = 'Female'
22+
23+
def validated_hl_function(
24+
regex_to_msg: dict[str, str | Template],
25+
) -> Callable[[Callable], Callable]:
26+
def decorator(fn: Callable) -> Callable:
27+
def wrapper(*args, **kwargs) -> hl.Table | hl.MatrixTable:
28+
try:
29+
t, _ = checkpoint(fn(*args, **kwargs))
30+
except Exception as e:
31+
for regex, msg in regex_to_msg.items():
32+
match = re.search(regex, str(e))
33+
if match and isinstance(msg, Template):
34+
msg = msg.substitute(match=match.group(1)) # noqa: PLW2901
35+
if match:
36+
raise SeqrValidationError(msg) from e
37+
raise
38+
else:
39+
return t
40+
41+
return wrapper
42+
43+
return decorator
2044

2145

2246
def does_file_exist(path: str) -> bool:
@@ -49,7 +73,15 @@ def compute_hail_n_partitions(file_size_b: int) -> int:
4973
return math.ceil(file_size_b / B_PER_MB / MB_PER_PARTITION)
5074

5175

52-
def split_multi_hts(mt: hl.MatrixTable) -> hl.MatrixTable:
76+
@validated_hl_function(
77+
{
78+
'RVD error! Keys found out of order': 'Your callset failed while attempting to split multiallelic sites. This error can occur if the dataset contains both multiallelic variants and duplicated loci.',
79+
},
80+
)
81+
def split_multi_hts(
82+
mt: hl.MatrixTable,
83+
max_samples_split_multi_shuffle=MAX_SAMPLES_SPLIT_MULTI_SHUFFLE,
84+
) -> hl.MatrixTable:
5385
bi = mt.filter_rows(hl.len(mt.alleles) == BIALLELIC)
5486
# split_multi_hts filters star alleles by default, but we
5587
# need that behavior for bi-allelic variants in addition to
@@ -59,7 +91,7 @@ def split_multi_hts(mt: hl.MatrixTable) -> hl.MatrixTable:
5991
multi = mt.filter_rows(hl.len(mt.alleles) > BIALLELIC)
6092
split = hl.split_multi_hts(
6193
multi,
62-
permit_shuffle=mt.count()[1] < MAX_SAMPLES_SPLIT_MULTI_SHUFFLE,
94+
permit_shuffle=mt.count()[1] < max_samples_split_multi_shuffle,
6395
)
6496
mt = split.union_rows(bi)
6597
return mt.distinct_by_row()
@@ -103,6 +135,15 @@ def import_gcnv_bed_file(callset_path: str) -> hl.MatrixTable:
103135
return mt.unfilter_entries()
104136

105137

138+
@validated_hl_function(
139+
{
140+
'.*FileNotFoundException|GoogleJsonResponseException: 403 Forbidden|arguments refer to no files.*': 'Unable to access the VCF in cloud storage.',
141+
# NB: ?: is non-capturing group.
142+
'.*(?:InvalidHeader|VCFParseError): (.*)$': Template(
143+
'VCF failed file format validation: $match',
144+
),
145+
},
146+
)
106147
def import_vcf(
107148
callset_path: str,
108149
reference_genome: ReferenceGenome,
@@ -139,6 +180,13 @@ def import_callset(
139180
return mt.key_rows_by(*dataset_type.table_key_type(reference_genome).fields)
140181

141182

183+
@validated_hl_function(
184+
{
185+
'instance has no field (.*)': Template(
186+
'Your callset is missing a required field: $match',
187+
),
188+
},
189+
)
142190
def select_relevant_fields(
143191
mt: hl.MatrixTable,
144192
dataset_type: DatasetType,
@@ -165,12 +213,17 @@ def select_relevant_fields(
165213

166214
def import_imputed_sex(imputed_sex_path: str) -> hl.Table:
167215
ht = hl.import_table(imputed_sex_path)
216+
imputed_sex_lookup = hl.dict(
217+
{s.imputed_sex_value: s.value for s in Sex},
218+
)
168219
ht = ht.select(
169220
s=ht.collaborator_sample_id,
170221
predicted_sex=(
171222
hl.case()
172-
.when(ht.predicted_sex == FEMALE, Sex.FEMALE.value)
173-
.when(ht.predicted_sex == MALE, Sex.MALE.value)
223+
.when(
224+
imputed_sex_lookup.contains(ht.predicted_sex),
225+
imputed_sex_lookup[ht.predicted_sex],
226+
)
174227
.or_error(
175228
hl.format(
176229
'Found unexpected value %s in imputed sex file',

0 commit comments

Comments
 (0)