Skip to content

Dev #903

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Sep 19, 2024
Merged

Dev #903

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions v03_pipeline/bin/dataproc_vep_init.bash
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,9 @@ EOF
gcc -Wall -Werror -O2 /vep.c -o /vep
chmod u+s /vep

gcloud storage cp gs://seqr-luigi/releases/$ENVIRONMENT/latest/bin/download_vep_data.bash /download_vep_data.bash
chmod +x /download_vep_data.bash
./download_vep_data.bash $REFERENCE_GENOME
gcloud storage cp gs://seqr-luigi/releases/$ENVIRONMENT/latest/bin/download_vep_reference_data.bash /download_vep_reference_data.bash
chmod +x /download_vep_reference_data.bash
./download_vep_reference_data.bash $REFERENCE_GENOME

gcloud storage cp gs://seqr-luigi/releases/$ENVIRONMENT/latest/bin/vep /vep.bash
chmod +x /vep.bash
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
set -eux

REFERENCE_GENOME=$1
VEP_DATA=/seqr/vep_data
VEP_REFERENCE_DATASETS_DIR=${VEP_REFERENCE_DATASETS_DIR:-/seqr/vep-reference-data}

case $REFERENCE_GENOME in
GRCh38)
Expand Down Expand Up @@ -43,20 +43,20 @@ case $REFERENCE_GENOME in
exit 1
esac

if [ -f $VEP_DATA/$REFERENCE_GENOME/_SUCCESS ]; then
if [ -f $VEP_REFERENCE_DATASETS_DIR/$REFERENCE_GENOME/_SUCCESS ]; then
echo "Skipping download because already successful"
exit 0;
fi

mkdir -p $VEP_DATA/$REFERENCE_GENOME;
mkdir -p $VEP_REFERENCE_DATASETS_DIR/$REFERENCE_GENOME;
for vep_reference_data_file in ${VEP_REFERENCE_DATA_FILES[@]}; do
if [[ $vep_reference_data_file == *.tar.gz ]]; then
echo "Downloading and extracting" $vep_reference_data_file;
gsutil cat $vep_reference_data_file | tar -xzf - -C $VEP_DATA/$REFERENCE_GENOME/ &
gsutil cat $vep_reference_data_file | tar -xzf - -C $VEP_REFERENCE_DATASETS_DIR/$REFERENCE_GENOME/ &
else
echo "Downloading" $vep_reference_data_file;
gsutil cp $vep_reference_data_file $VEP_DATA/$REFERENCE_GENOME/ &
fi
done;
wait
touch $VEP_DATA/$REFERENCE_GENOME/_SUCCESS
touch $VEP_REFERENCE_DATASETS_DIR/$REFERENCE_GENOME/_SUCCESS
4 changes: 2 additions & 2 deletions v03_pipeline/bin/vep
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
set -eux

REFERENCE_GENOME=$1
VEP_DATA=/seqr/vep_data
VEP_REFERENCE_DATASETS_DIR=${VEP_REFERENCE_DATASETS_DIR:-/seqr/vep-reference-data}
VEP_DOCKER_IMAGE="gcr.io/seqr-project/vep-docker-image"

case $REFERENCE_GENOME in
Expand All @@ -17,5 +17,5 @@ case $REFERENCE_GENOME in
esac

shift # Remove the REFERENCE_GENOME arg.
docker run --platform linux/amd64 -i -v $VEP_DATA/$REFERENCE_GENOME:/opt/vep/.vep/:ro $VEP_DOCKER_IMAGE:$REFERENCE_GENOME \
docker run --platform linux/amd64 -i -v $VEP_REFERENCE_DATASETS_DIR/$REFERENCE_GENOME:/opt/vep/.vep/:ro $VEP_DOCKER_IMAGE:$REFERENCE_GENOME \
/opt/vep/src/ensembl-vep/vep $@
20 changes: 16 additions & 4 deletions v03_pipeline/lib/misc/family_loading_failures.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,14 @@
import hail as hl
import numpy as np

from v03_pipeline.lib.logger import get_logger
from v03_pipeline.lib.misc.pedigree import Family, Relation, Sample
from v03_pipeline.lib.model import Sex

RELATEDNESS_TOLERANCE = 0.2

logger = get_logger(__name__)


def passes_relatedness_check(
relatedness_check_lookup: dict[tuple[str, str], list],
Expand Down Expand Up @@ -175,10 +178,19 @@ def get_families_failed_sex_check(
failed_families = defaultdict(list)
for family in families:
for sample_id in family.samples:
if family.samples[sample_id].sex not in {
sex_check_lookup[sample_id],
Sex.UNKNOWN,
}: # NB: Unknown samples in pedigree are excluded from sex check.
# NB: Both Unknown samples in pedigree and Unknown
# samples in the predicted_sex are precluded from
# failing the sex check.
if (
sex_check_lookup[sample_id] == Sex.UNKNOWN # noqa: PLR1714
or family.samples[sample_id].sex == Sex.UNKNOWN
):
logger.info(
f'Encountered sample with Unknown sex excluded from sex check: {sample_id}',
)
continue

if family.samples[sample_id].sex != sex_check_lookup[sample_id]:
failed_families[family].append(
f'Sample {sample_id} has pedigree sex {family.samples[sample_id].sex.value} but imputed sex {sex_check_lookup[sample_id].value}',
)
Expand Down
34 changes: 17 additions & 17 deletions v03_pipeline/lib/misc/family_loading_failures_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,12 +56,12 @@ def test_build_relatedness_check_lookup(self):
def test_build_sex_check_lookup(self):
ht = hl.Table.parallelize(
[
{'s': 'remapped_id', 'predicted_sex': 'M'},
{'s': 'ROS_006_18Y03227_D1', 'predicted_sex': 'M'},
{'s': 'ROS_006_18Y03228_D1', 'predicted_sex': 'M'},
{'s': 'ROS_007_19Y05919_D1', 'predicted_sex': 'M'},
{'s': 'ROS_007_19Y05939_D1', 'predicted_sex': 'F'},
{'s': 'ROS_007_19Y05987_D1', 'predicted_sex': 'M'},
{'s': 'ROS_006_18Y03226_D1', 'predicted_sex': 'F'},
{'s': 'ROS_006_18Y03227_D1', 'predicted_sex': 'F'},
{'s': 'ROS_006_18Y03228_D1', 'predicted_sex': 'F'},
{'s': 'ROS_007_19Y05919_D1', 'predicted_sex': 'F'},
{'s': 'ROS_007_19Y05939_D1', 'predicted_sex': 'M'},
{'s': 'ROS_007_19Y05987_D1', 'predicted_sex': 'U'},
],
hl.tstruct(
s=hl.tstr,
Expand All @@ -72,12 +72,12 @@ def test_build_sex_check_lookup(self):
self.assertEqual(
build_sex_check_lookup(ht, hl.dict({'ROS_006_18Y03226_D1': 'remapped_id'})),
{
'remapped_id': Sex.MALE,
'ROS_006_18Y03227_D1': Sex.MALE,
'ROS_006_18Y03228_D1': Sex.MALE,
'ROS_007_19Y05919_D1': Sex.MALE,
'ROS_007_19Y05939_D1': Sex.FEMALE,
'ROS_007_19Y05987_D1': Sex.MALE,
'remapped_id': Sex.FEMALE,
'ROS_006_18Y03227_D1': Sex.FEMALE,
'ROS_006_18Y03228_D1': Sex.FEMALE,
'ROS_007_19Y05919_D1': Sex.FEMALE,
'ROS_007_19Y05939_D1': Sex.MALE,
'ROS_007_19Y05987_D1': Sex.UNKNOWN,
},
)

Expand Down Expand Up @@ -178,12 +178,12 @@ def test_all_relatedness_checks(self):
def test_get_families_failed_sex_check(self):
sex_check_ht = hl.Table.parallelize(
[
{'s': 'ROS_006_18Y03226_D1', 'predicted_sex': 'M'},
{'s': 'ROS_006_18Y03227_D1', 'predicted_sex': 'F'},
{'s': 'ROS_006_18Y03226_D1', 'predicted_sex': 'F'},
{'s': 'ROS_006_18Y03227_D1', 'predicted_sex': 'F'}, # Pedigree Sex U
{'s': 'ROS_006_18Y03228_D1', 'predicted_sex': 'F'},
{'s': 'ROS_007_19Y05919_D1', 'predicted_sex': 'F'},
{'s': 'ROS_007_19Y05939_D1', 'predicted_sex': 'F'},
{'s': 'ROS_007_19Y05987_D1', 'predicted_sex': 'F'},
{'s': 'ROS_007_19Y05939_D1', 'predicted_sex': 'M'},
{'s': 'ROS_007_19Y05987_D1', 'predicted_sex': 'U'}, # Pedigree Sex F
],
hl.tstruct(
s=hl.tstr,
Expand All @@ -201,7 +201,7 @@ def test_get_families_failed_sex_check(self):
failed_families.values(),
[
[
'Sample ROS_006_18Y03226_D1 has pedigree sex F but imputed sex M',
'Sample ROS_007_19Y05939_D1 has pedigree sex F but imputed sex M',
],
],
)
65 changes: 59 additions & 6 deletions v03_pipeline/lib/misc/io.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,46 @@
import hashlib
import math
import os
import re
import uuid
from collections.abc import Callable
from string import Template

import hail as hl
import hailtop.fs as hfs

from v03_pipeline.lib.misc.gcnv import parse_gcnv_genes
from v03_pipeline.lib.misc.nested_field import parse_nested_field
from v03_pipeline.lib.misc.validation import SeqrValidationError
from v03_pipeline.lib.model import DatasetType, Env, ReferenceGenome, Sex

BIALLELIC = 2
B_PER_MB = 1 << 20 # 1024 * 1024
MB_PER_PARTITION = 128
MAX_SAMPLES_SPLIT_MULTI_SHUFFLE = 100

MALE = 'Male'
FEMALE = 'Female'

def validated_hl_function(
regex_to_msg: dict[str, str | Template],
) -> Callable[[Callable], Callable]:
def decorator(fn: Callable) -> Callable:
def wrapper(*args, **kwargs) -> hl.Table | hl.MatrixTable:
try:
t, _ = checkpoint(fn(*args, **kwargs))
except Exception as e:
for regex, msg in regex_to_msg.items():
match = re.search(regex, str(e))
if match and isinstance(msg, Template):
msg = msg.substitute(match=match.group(1)) # noqa: PLW2901
if match:
raise SeqrValidationError(msg) from e
raise
else:
return t

return wrapper

return decorator


def does_file_exist(path: str) -> bool:
Expand Down Expand Up @@ -49,7 +73,15 @@ def compute_hail_n_partitions(file_size_b: int) -> int:
return math.ceil(file_size_b / B_PER_MB / MB_PER_PARTITION)


def split_multi_hts(mt: hl.MatrixTable) -> hl.MatrixTable:
@validated_hl_function(
{
'RVD error! Keys found out of order': 'Your callset failed while attempting to split multiallelic sites. This error can occur if the dataset contains both multiallelic variants and duplicated loci.',
},
)
def split_multi_hts(
mt: hl.MatrixTable,
max_samples_split_multi_shuffle=MAX_SAMPLES_SPLIT_MULTI_SHUFFLE,
) -> hl.MatrixTable:
bi = mt.filter_rows(hl.len(mt.alleles) == BIALLELIC)
# split_multi_hts filters star alleles by default, but we
# need that behavior for bi-allelic variants in addition to
Expand All @@ -59,7 +91,7 @@ def split_multi_hts(mt: hl.MatrixTable) -> hl.MatrixTable:
multi = mt.filter_rows(hl.len(mt.alleles) > BIALLELIC)
split = hl.split_multi_hts(
multi,
permit_shuffle=mt.count()[1] < MAX_SAMPLES_SPLIT_MULTI_SHUFFLE,
permit_shuffle=mt.count()[1] < max_samples_split_multi_shuffle,
)
mt = split.union_rows(bi)
return mt.distinct_by_row()
Expand Down Expand Up @@ -103,6 +135,15 @@ def import_gcnv_bed_file(callset_path: str) -> hl.MatrixTable:
return mt.unfilter_entries()


@validated_hl_function(
{
'.*FileNotFoundException|GoogleJsonResponseException: 403 Forbidden|arguments refer to no files.*': 'Unable to access the VCF in cloud storage.',
# NB: ?: is non-capturing group.
'.*(?:InvalidHeader|VCFParseError): (.*)$': Template(
'VCF failed file format validation: $match',
),
},
)
def import_vcf(
callset_path: str,
reference_genome: ReferenceGenome,
Expand Down Expand Up @@ -139,6 +180,13 @@ def import_callset(
return mt.key_rows_by(*dataset_type.table_key_type(reference_genome).fields)


@validated_hl_function(
{
'instance has no field (.*)': Template(
'Your callset is missing a required field: $match',
),
},
)
def select_relevant_fields(
mt: hl.MatrixTable,
dataset_type: DatasetType,
Expand All @@ -165,12 +213,17 @@ def select_relevant_fields(

def import_imputed_sex(imputed_sex_path: str) -> hl.Table:
ht = hl.import_table(imputed_sex_path)
imputed_sex_lookup = hl.dict(
{s.imputed_sex_value: s.value for s in Sex},
)
ht = ht.select(
s=ht.collaborator_sample_id,
predicted_sex=(
hl.case()
.when(ht.predicted_sex == FEMALE, Sex.FEMALE.value)
.when(ht.predicted_sex == MALE, Sex.MALE.value)
.when(
imputed_sex_lookup.contains(ht.predicted_sex),
imputed_sex_lookup[ht.predicted_sex],
)
.or_error(
hl.format(
'Found unexpected value %s in imputed sex file',
Expand Down
Loading
Loading