diff --git a/v03_pipeline/bin/dataproc_vep_init.bash b/v03_pipeline/bin/dataproc_vep_init.bash index 2be6c7b7e..fe880f260 100755 --- a/v03_pipeline/bin/dataproc_vep_init.bash +++ b/v03_pipeline/bin/dataproc_vep_init.bash @@ -52,9 +52,9 @@ EOF gcc -Wall -Werror -O2 /vep.c -o /vep chmod u+s /vep -gcloud storage cp gs://seqr-luigi/releases/$ENVIRONMENT/latest/bin/download_vep_data.bash /download_vep_data.bash -chmod +x /download_vep_data.bash -./download_vep_data.bash $REFERENCE_GENOME +gcloud storage cp gs://seqr-luigi/releases/$ENVIRONMENT/latest/bin/download_vep_reference_data.bash /download_vep_reference_data.bash +chmod +x /download_vep_reference_data.bash +./download_vep_reference_data.bash $REFERENCE_GENOME gcloud storage cp gs://seqr-luigi/releases/$ENVIRONMENT/latest/bin/vep /vep.bash chmod +x /vep.bash diff --git a/v03_pipeline/bin/download_vep_data.bash b/v03_pipeline/bin/download_vep_reference_data.bash similarity index 88% rename from v03_pipeline/bin/download_vep_data.bash rename to v03_pipeline/bin/download_vep_reference_data.bash index 57b65427d..98dbad91b 100755 --- a/v03_pipeline/bin/download_vep_data.bash +++ b/v03_pipeline/bin/download_vep_reference_data.bash @@ -3,7 +3,7 @@ set -eux REFERENCE_GENOME=$1 -VEP_DATA=/seqr/vep_data +VEP_REFERENCE_DATASETS_DIR=${VEP_REFERENCE_DATASETS_DIR:-/seqr/vep-reference-data} case $REFERENCE_GENOME in GRCh38) @@ -43,20 +43,20 @@ case $REFERENCE_GENOME in exit 1 esac -if [ -f $VEP_DATA/$REFERENCE_GENOME/_SUCCESS ]; then +if [ -f $VEP_REFERENCE_DATASETS_DIR/$REFERENCE_GENOME/_SUCCESS ]; then echo "Skipping download because already successful" exit 0; fi -mkdir -p $VEP_DATA/$REFERENCE_GENOME; +mkdir -p $VEP_REFERENCE_DATASETS_DIR/$REFERENCE_GENOME; for vep_reference_data_file in ${VEP_REFERENCE_DATA_FILES[@]}; do if [[ $vep_reference_data_file == *.tar.gz ]]; then echo "Downloading and extracting" $vep_reference_data_file; - gsutil cat $vep_reference_data_file | tar -xzf - -C $VEP_DATA/$REFERENCE_GENOME/ & + gsutil 
cat $vep_reference_data_file | tar -xzf - -C $VEP_REFERENCE_DATASETS_DIR/$REFERENCE_GENOME/ & else echo "Downloading" $vep_reference_data_file; - gsutil cp $vep_reference_data_file $VEP_DATA/$REFERENCE_GENOME/ & + gsutil cp $vep_reference_data_file $VEP_REFERENCE_DATASETS_DIR/$REFERENCE_GENOME/ & fi done; wait -touch $VEP_DATA/$REFERENCE_GENOME/_SUCCESS +touch $VEP_REFERENCE_DATASETS_DIR/$REFERENCE_GENOME/_SUCCESS diff --git a/v03_pipeline/bin/vep b/v03_pipeline/bin/vep index b3558545e..33996bf27 100755 --- a/v03_pipeline/bin/vep +++ b/v03_pipeline/bin/vep @@ -3,7 +3,7 @@ set -eux REFERENCE_GENOME=$1 -VEP_DATA=/seqr/vep_data +VEP_REFERENCE_DATASETS_DIR=${VEP_REFERENCE_DATASETS_DIR:-/seqr/vep-reference-data} VEP_DOCKER_IMAGE="gcr.io/seqr-project/vep-docker-image" case $REFERENCE_GENOME in @@ -17,5 +17,5 @@ case $REFERENCE_GENOME in esac shift # Remove the REFERENCE_GENOME arg. -docker run --platform linux/amd64 -i -v $VEP_DATA/$REFERENCE_GENOME:/opt/vep/.vep/:ro $VEP_DOCKER_IMAGE:$REFERENCE_GENOME \ +docker run --platform linux/amd64 -i -v $VEP_REFERENCE_DATASETS_DIR/$REFERENCE_GENOME:/opt/vep/.vep/:ro $VEP_DOCKER_IMAGE:$REFERENCE_GENOME \ /opt/vep/src/ensembl-vep/vep $@ diff --git a/v03_pipeline/lib/misc/family_loading_failures.py b/v03_pipeline/lib/misc/family_loading_failures.py index 80190b327..e3b1b59db 100644 --- a/v03_pipeline/lib/misc/family_loading_failures.py +++ b/v03_pipeline/lib/misc/family_loading_failures.py @@ -3,11 +3,14 @@ import hail as hl import numpy as np +from v03_pipeline.lib.logger import get_logger from v03_pipeline.lib.misc.pedigree import Family, Relation, Sample from v03_pipeline.lib.model import Sex RELATEDNESS_TOLERANCE = 0.2 +logger = get_logger(__name__) + def passes_relatedness_check( relatedness_check_lookup: dict[tuple[str, str], list], @@ -175,10 +178,19 @@ def get_families_failed_sex_check( failed_families = defaultdict(list) for family in families: for sample_id in family.samples: - if family.samples[sample_id].sex not in { - sex_check_lookup[sample_id], - Sex.UNKNOWN, - }: # NB: Unknown samples in pedigree are 
excluded from sex check. + # NB: Both Unknown samples in pedigree and Unknown + # samples in the predicted_sex are precluded from + # failing the sex check. + if ( + sex_check_lookup[sample_id] == Sex.UNKNOWN # noqa: PLR1714 + or family.samples[sample_id].sex == Sex.UNKNOWN + ): + logger.info( + f'Encountered sample with Unknown sex excluded from sex check: {sample_id}', + ) + continue + + if family.samples[sample_id].sex != sex_check_lookup[sample_id]: failed_families[family].append( f'Sample {sample_id} has pedigree sex {family.samples[sample_id].sex.value} but imputed sex {sex_check_lookup[sample_id].value}', ) diff --git a/v03_pipeline/lib/misc/family_loading_failures_test.py b/v03_pipeline/lib/misc/family_loading_failures_test.py index 3f3dbda4e..1a970a8c8 100644 --- a/v03_pipeline/lib/misc/family_loading_failures_test.py +++ b/v03_pipeline/lib/misc/family_loading_failures_test.py @@ -56,12 +56,12 @@ def test_build_relatedness_check_lookup(self): def test_build_sex_check_lookup(self): ht = hl.Table.parallelize( [ - {'s': 'remapped_id', 'predicted_sex': 'M'}, - {'s': 'ROS_006_18Y03227_D1', 'predicted_sex': 'M'}, - {'s': 'ROS_006_18Y03228_D1', 'predicted_sex': 'M'}, - {'s': 'ROS_007_19Y05919_D1', 'predicted_sex': 'M'}, - {'s': 'ROS_007_19Y05939_D1', 'predicted_sex': 'F'}, - {'s': 'ROS_007_19Y05987_D1', 'predicted_sex': 'M'}, + {'s': 'ROS_006_18Y03226_D1', 'predicted_sex': 'F'}, + {'s': 'ROS_006_18Y03227_D1', 'predicted_sex': 'F'}, + {'s': 'ROS_006_18Y03228_D1', 'predicted_sex': 'F'}, + {'s': 'ROS_007_19Y05919_D1', 'predicted_sex': 'F'}, + {'s': 'ROS_007_19Y05939_D1', 'predicted_sex': 'M'}, + {'s': 'ROS_007_19Y05987_D1', 'predicted_sex': 'U'}, ], hl.tstruct( s=hl.tstr, @@ -72,12 +72,12 @@ def test_build_sex_check_lookup(self): self.assertEqual( build_sex_check_lookup(ht, hl.dict({'ROS_006_18Y03226_D1': 'remapped_id'})), { - 'remapped_id': Sex.MALE, - 'ROS_006_18Y03227_D1': Sex.MALE, - 'ROS_006_18Y03228_D1': Sex.MALE, - 'ROS_007_19Y05919_D1': Sex.MALE, - 
'ROS_007_19Y05939_D1': Sex.FEMALE, - 'ROS_007_19Y05987_D1': Sex.MALE, + 'remapped_id': Sex.FEMALE, + 'ROS_006_18Y03227_D1': Sex.FEMALE, + 'ROS_006_18Y03228_D1': Sex.FEMALE, + 'ROS_007_19Y05919_D1': Sex.FEMALE, + 'ROS_007_19Y05939_D1': Sex.MALE, + 'ROS_007_19Y05987_D1': Sex.UNKNOWN, }, ) @@ -178,12 +178,12 @@ def test_all_relatedness_checks(self): def test_get_families_failed_sex_check(self): sex_check_ht = hl.Table.parallelize( [ - {'s': 'ROS_006_18Y03226_D1', 'predicted_sex': 'M'}, - {'s': 'ROS_006_18Y03227_D1', 'predicted_sex': 'F'}, + {'s': 'ROS_006_18Y03226_D1', 'predicted_sex': 'F'}, + {'s': 'ROS_006_18Y03227_D1', 'predicted_sex': 'F'}, # Pedigree Sex U {'s': 'ROS_006_18Y03228_D1', 'predicted_sex': 'F'}, {'s': 'ROS_007_19Y05919_D1', 'predicted_sex': 'F'}, - {'s': 'ROS_007_19Y05939_D1', 'predicted_sex': 'F'}, - {'s': 'ROS_007_19Y05987_D1', 'predicted_sex': 'F'}, + {'s': 'ROS_007_19Y05939_D1', 'predicted_sex': 'M'}, + {'s': 'ROS_007_19Y05987_D1', 'predicted_sex': 'U'}, # Pedigree Sex F ], hl.tstruct( s=hl.tstr, @@ -201,7 +201,7 @@ def test_get_families_failed_sex_check(self): failed_families.values(), [ [ - 'Sample ROS_006_18Y03226_D1 has pedigree sex F but imputed sex M', + 'Sample ROS_007_19Y05939_D1 has pedigree sex F but imputed sex M', ], ], ) diff --git a/v03_pipeline/lib/misc/io.py b/v03_pipeline/lib/misc/io.py index ef2b26ecd..bea1b9bc5 100644 --- a/v03_pipeline/lib/misc/io.py +++ b/v03_pipeline/lib/misc/io.py @@ -1,13 +1,17 @@ import hashlib import math import os +import re import uuid +from collections.abc import Callable +from string import Template import hail as hl import hailtop.fs as hfs from v03_pipeline.lib.misc.gcnv import parse_gcnv_genes from v03_pipeline.lib.misc.nested_field import parse_nested_field +from v03_pipeline.lib.misc.validation import SeqrValidationError from v03_pipeline.lib.model import DatasetType, Env, ReferenceGenome, Sex BIALLELIC = 2 @@ -15,8 +19,28 @@ MB_PER_PARTITION = 128 MAX_SAMPLES_SPLIT_MULTI_SHUFFLE = 100 -MALE = 
'Male' -FEMALE = 'Female' + +def validated_hl_function( + regex_to_msg: dict[str, str | Template], +) -> Callable[[Callable], Callable]: + def decorator(fn: Callable) -> Callable: + def wrapper(*args, **kwargs) -> hl.Table | hl.MatrixTable: + try: + t, _ = checkpoint(fn(*args, **kwargs)) + except Exception as e: + for regex, msg in regex_to_msg.items(): + match = re.search(regex, str(e)) + if match and isinstance(msg, Template): + msg = msg.substitute(match=match.group(1)) # noqa: PLW2901 + if match: + raise SeqrValidationError(msg) from e + raise + else: + return t + + return wrapper + + return decorator def does_file_exist(path: str) -> bool: @@ -49,7 +73,15 @@ def compute_hail_n_partitions(file_size_b: int) -> int: return math.ceil(file_size_b / B_PER_MB / MB_PER_PARTITION) -def split_multi_hts(mt: hl.MatrixTable) -> hl.MatrixTable: +@validated_hl_function( + { + 'RVD error! Keys found out of order': 'Your callset failed while attempting to split multiallelic sites. This error can occur if the dataset contains both multiallelic variants and duplicated loci.', + }, +) +def split_multi_hts( + mt: hl.MatrixTable, + max_samples_split_multi_shuffle=MAX_SAMPLES_SPLIT_MULTI_SHUFFLE, +) -> hl.MatrixTable: bi = mt.filter_rows(hl.len(mt.alleles) == BIALLELIC) # split_multi_hts filters star alleles by default, but we # need that behavior for bi-allelic variants in addition to @@ -59,7 +91,7 @@ def split_multi_hts(mt: hl.MatrixTable) -> hl.MatrixTable: multi = mt.filter_rows(hl.len(mt.alleles) > BIALLELIC) split = hl.split_multi_hts( multi, - permit_shuffle=mt.count()[1] < MAX_SAMPLES_SPLIT_MULTI_SHUFFLE, + permit_shuffle=mt.count()[1] < max_samples_split_multi_shuffle, ) mt = split.union_rows(bi) return mt.distinct_by_row() @@ -103,6 +135,15 @@ def import_gcnv_bed_file(callset_path: str) -> hl.MatrixTable: return mt.unfilter_entries() +@validated_hl_function( + { + '.*FileNotFoundException|GoogleJsonResponseException: 403 Forbidden|arguments refer to no files.*': 'Unable 
to access the VCF in cloud storage.', + # NB: ?: is non-capturing group. + '.*(?:InvalidHeader|VCFParseError): (.*)$': Template( + 'VCF failed file format validation: $match', + ), + }, +) def import_vcf( callset_path: str, reference_genome: ReferenceGenome, @@ -139,6 +180,13 @@ def import_callset( return mt.key_rows_by(*dataset_type.table_key_type(reference_genome).fields) +@validated_hl_function( + { + 'instance has no field (.*)': Template( + 'Your callset is missing a required field: $match', + ), + }, +) def select_relevant_fields( mt: hl.MatrixTable, dataset_type: DatasetType, @@ -165,12 +213,17 @@ def select_relevant_fields( def import_imputed_sex(imputed_sex_path: str) -> hl.Table: ht = hl.import_table(imputed_sex_path) + imputed_sex_lookup = hl.dict( + {s.imputed_sex_value: s.value for s in Sex}, + ) ht = ht.select( s=ht.collaborator_sample_id, predicted_sex=( hl.case() - .when(ht.predicted_sex == FEMALE, Sex.FEMALE.value) - .when(ht.predicted_sex == MALE, Sex.MALE.value) + .when( + imputed_sex_lookup.contains(ht.predicted_sex), + imputed_sex_lookup[ht.predicted_sex], + ) .or_error( hl.format( 'Found unexpected value %s in imputed sex file', diff --git a/v03_pipeline/lib/misc/io_test.py b/v03_pipeline/lib/misc/io_test.py index ab0638d8c..24792755a 100644 --- a/v03_pipeline/lib/misc/io_test.py +++ b/v03_pipeline/lib/misc/io_test.py @@ -1,4 +1,5 @@ import unittest +from unittest import mock import hail as hl @@ -6,13 +7,19 @@ compute_hail_n_partitions, file_size_bytes, import_imputed_sex, + import_vcf, remap_pedigree_hash, + select_relevant_fields, + split_multi_hts, ) +from v03_pipeline.lib.misc.validation import SeqrValidationError +from v03_pipeline.lib.model import DatasetType, ReferenceGenome TEST_IMPUTED_SEX = 'v03_pipeline/var/test/sex_check/test_imputed_sex.tsv' TEST_IMPUTED_SEX_UNEXPECTED_VALUE = ( 'v03_pipeline/var/test/sex_check/test_imputed_sex_unexpected_value.tsv' ) +TEST_INVALID_VCF = 'v03_pipeline/var/test/callsets/improperly_formatted.vcf' 
TEST_PEDIGREE_3 = 'v03_pipeline/var/test/pedigrees/test_pedigree_3.tsv' TEST_MITO_MT = 'v03_pipeline/var/test/callsets/mito_1.mt' TEST_REMAP = 'v03_pipeline/var/test/remaps/test_remap_1.tsv' @@ -38,7 +45,7 @@ def test_import_imputed_sex(self) -> None: [ hl.Struct(s='abc_1', predicted_sex='M'), hl.Struct(s='abc_2', predicted_sex='F'), - hl.Struct(s='abc_3', predicted_sex='M'), + hl.Struct(s='abc_3', predicted_sex='U'), ], ) @@ -46,7 +53,7 @@ def test_import_imputed_sex_unexpected_value(self) -> None: ht = import_imputed_sex(TEST_IMPUTED_SEX_UNEXPECTED_VALUE) self.assertRaisesRegex( hl.utils.java.HailUserError, - 'Found unexpected value Unknown in imputed sex file', + 'Found unexpected value UNKNOWN in imputed sex file', ht.collect, ) @@ -60,3 +67,108 @@ def test_remap_pedigree_hash(self) -> None: ), -560434714, ) + + def test_import_vcf(self) -> None: + self.assertRaisesRegex( + TypeError, + 'missing 1 required positional argument', + import_vcf, + 'abc', + ) + self.assertRaisesRegex( + SeqrValidationError, + 'Unable to access the VCF in cloud storage', + import_vcf, + 'bad.vcf', + ReferenceGenome.GRCh38, + ) + with mock.patch('v03_pipeline.lib.misc.io.hl.read_table') as mock_read_table: + mock_read_table.side_effect = hl.utils.java.FatalError( + 'GoogleJsonResponseException: 403 Forbidden', + ) + self.assertRaisesRegex( + SeqrValidationError, + 'Unable to access the VCF in cloud storage', + import_vcf, + 'abc123/bad.vcf', + ReferenceGenome.GRCh38, + ) + self.assertRaisesRegex( + SeqrValidationError, + 'VCF failed file format validation: Your input file has a malformed header: We never saw the required CHROM header line \\(starting with one #\\) for the input VCF file', + import_vcf, + TEST_PEDIGREE_3, + ReferenceGenome.GRCh38, + ) + self.assertRaisesRegex( + SeqrValidationError, + "VCF failed file format validation: invalid character 'N' in integer literal", + import_vcf, + TEST_INVALID_VCF, + ReferenceGenome.GRCh38, + ) + + def test_select_missing_field(self) -> 
None: + self.assertRaisesRegex( + SeqrValidationError, + "Your callset is missing a required field: 'a magic field'", + select_relevant_fields, + hl.MatrixTable.from_parts( + rows={ + 'locus': [ + hl.Locus( + contig='chr1', + position=1, + reference_genome='GRCh38', + ), + ], + 'alleles': [ + ['A', 'C'], + ], + 'rsid': ['rs1233'], + 'filters': [{'PASS'}], + }, + cols={'s': ['sample_1']}, + entries={ + 'GT': [[hl.Call([0, 0])]], + 'AD': [[[0, 20]]], + 'GQ': [[99]], + }, + ).key_rows_by('locus', 'alleles'), + DatasetType.SNV_INDEL, + {'a magic field': hl.tint32}, + ) + + def test_split_multi_failure(self) -> None: + self.assertRaisesRegex( + SeqrValidationError, + 'Your callset failed while attempting to split multiallelic sites. This error can occur if the dataset contains both multiallelic variants and duplicated loci.', + split_multi_hts, + hl.MatrixTable.from_parts( + rows={ + 'locus': [ + hl.Locus( + contig='chr1', + position=1, + reference_genome='GRCh38', + ), + hl.Locus( + contig='chr1', + position=1, + reference_genome='GRCh38', + ), + ], + 'alleles': [ + ['A', 'G', 'AC'], + ['A', 'AT', 'C', 'G'], + ], + }, + cols={'s': ['sample_1']}, + entries={ + 'GQ': [[99], [98]], + }, + ) + .key_rows_by('locus', 'alleles') + .repartition(1), + 1, + ) diff --git a/v03_pipeline/lib/model/definitions.py b/v03_pipeline/lib/model/definitions.py index da2ab08c9..1bad09e28 100644 --- a/v03_pipeline/lib/model/definitions.py +++ b/v03_pipeline/lib/model/definitions.py @@ -13,6 +13,14 @@ class Sex(str, Enum): MALE = 'M' UNKNOWN = 'U' + @property + def imputed_sex_value(self): + return { + Sex.MALE: 'Male', + Sex.FEMALE: 'Female', + Sex.UNKNOWN: 'Unknown', + }[self] + class PipelineVersion(str, Enum): V02 = 'v02' diff --git a/v03_pipeline/lib/model/environment.py b/v03_pipeline/lib/model/environment.py index 91e69a6a2..7b5a9792d 100644 --- a/v03_pipeline/lib/model/environment.py +++ b/v03_pipeline/lib/model/environment.py @@ -10,7 +10,7 @@ 'GRCH38_TO_GRCH37_LIFTOVER_REF_PATH', 
'gs://hail-common/references/grch38_to_grch37.over.chain.gz', ) -HAIL_TMP_DIR = os.environ.get('HAIL_TMP_DIR', '/seqr/tmp') +HAIL_TMP_DIR = os.environ.get('HAIL_TMP_DIR', '/tmp') # noqa: S108 HAIL_SEARCH_DATA_DIR = os.environ.get('HAIL_SEARCH_DATA_DIR', '/seqr/hail-search-data') LOADING_DATASETS_DIR = os.environ.get('LOADING_DATASETS_DIR', '/seqr/seqr-loading-temp') PRIVATE_REFERENCE_DATASETS_DIR_DIR = os.environ.get( @@ -21,6 +21,10 @@ 'REFERENCE_DATASETS_DIR', '/seqr/seqr-reference-data', ) +VEP_REFERENCE_DATASETS_DIR = os.environ.get( + 'VEP_REFERENCE_DATASETS_DIR', + '/seqr/vep-reference-data', +) # Allele registry secrets :/ ALLELE_REGISTRY_SECRET_NAME = os.environ.get('ALLELE_REGISTRY_SECRET_NAME', None) @@ -50,3 +54,4 @@ class Env: PROJECT_ID: str | None = PROJECT_ID REFERENCE_DATASETS_DIR: str = REFERENCE_DATASETS_DIR SHOULD_REGISTER_ALLELES: bool = SHOULD_REGISTER_ALLELES + VEP_REFERENCE_DATASETS_DIR: str = VEP_REFERENCE_DATASETS_DIR diff --git a/v03_pipeline/lib/vep.py b/v03_pipeline/lib/vep.py index 6f84d4646..bc9befd32 100644 --- a/v03_pipeline/lib/vep.py +++ b/v03_pipeline/lib/vep.py @@ -2,10 +2,10 @@ import hail as hl -from v03_pipeline.lib.model import DatasetType, ReferenceGenome +from v03_pipeline.lib.model import DatasetType, Env, ReferenceGenome VEP_CONFIG_URI = Template( - 'file:///seqr/vep_data/$reference_genome/vep-$reference_genome.json', + 'file://$vep_reference_datasets_dir/$reference_genome/vep-$reference_genome.json', ) @@ -18,7 +18,10 @@ def run_vep( return ht return hl.vep( ht, - config=VEP_CONFIG_URI.substitute(reference_genome=reference_genome.value), + config=VEP_CONFIG_URI.substitute( + vep_reference_datasets_dir=Env.VEP_REFERENCE_DATASETS_DIR, + reference_genome=reference_genome.value, + ), name='vep', block_size=1000, tolerate_parse_error=True, diff --git a/v03_pipeline/var/test/callsets/improperly_formatted.vcf b/v03_pipeline/var/test/callsets/improperly_formatted.vcf new file mode 100644 index 000000000..3d834dc5f --- /dev/null 
+++ b/v03_pipeline/var/test/callsets/improperly_formatted.vcf @@ -0,0 +1,128 @@ +##fileformat=VCFv4.2 +##hailversion=0.2.8-70304a52d33d +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG00731 HG00732 HG00733 NA19675 NA19678 NA19679 NA20870 NA20872 NA20874 NA20875 NA20876 NA20877 NA20878 NA20881 NA20885 NA20888 +1 871269 . 
A C 368.47 PASS AC=1;AF=3.10000e-02;AN=32;BaseQRankSum=-1.74060e+01;DP=351;Dels=0.00000e+00;FS=9.28030e+01;HaplotypeScore=5.13800e-01;InbreedingCoeff=-3.32000e-02;MQ=5.93000e+01;MQ0=0;MQRankSum=2.37300e+00;QD=4.80000e-01;ReadPosRankSum=-1.17980e+01;SNPEFF_AMINO_ACID_CHANGE=R141;SNPEFF_CODON_CHANGE=cgA/cgC;SNPEFF_EFFECT=SYNONYMOUS_CODING;SNPEFF_EXON_ID=exon_1_871152_871276;SNPEFF_FUNCTIONAL_CLASS=SILENT;SNPEFF_GENE_BIOTYPE=protein_coding;SNPEFF_GENE_NAME=SAMD11;SNPEFF_IMPACT=LOW;SNPEFF_TRANSCRIPT_ID=ENST00000342066;VQSLOD=-8.16700e-01;culprit=QD;CSQ=C|ENSG00000187634|ENST00000455979|Transcript|upstream_gene_variant|||||||1|3386|1|SAMD11|HGNC|28706|protein_coding||||ENSP00000412228|||UPI000155D479||||||||||||||||||||||||EXON_INTRON_UNDEF|||||||,C|ENSG00000187634|ENST00000420190|Transcript|synonymous_variant|512|423|141|R|cgA/cgC||1||1|SAMD11|HGNC|28706|protein_coding||||ENSP00000411579||Q5SV95_HUMAN&I7FV93_HUMAN&A6PWC8_HUMAN|UPI000155D47C|||5/7|||ENST00000420190.1:c.423N>C|ENST00000420190.1:c.423N>C(p.%3D)|||||||||||||||POSITION:0.787709497206704||NON_CAN_SPLICE_SURR|||||||,C|ENSG00000268179|ENST00000598827|Transcript|upstream_gene_variant|||||||1|4824|-1|AL645608.1|Clone_based_ensembl_gene||protein_coding|YES|||ENSP00000471152||M0R0C9_HUMAN|UPI0000D61E05||||||||||||||||||||||||EXON_INTRON_UNDEF|||||||,C|ENSG00000187634|ENST00000437963|Transcript|downstream_gene_variant|||||||1|96|1|SAMD11|HGNC|28706|protein_coding||||ENSP00000393181||Q5SV95_HUMAN&I7FV93_HUMAN|UPI000155D47B||||||||||||||||||||||||EXON_INTRON_UNDEF|||||||,C|ENSG00000187634|ENST00000478729|Transcript|upstream_gene_variant|||||||1|4457|1|SAMD11|HGNC|28706|processed_transcript||||||||||||||||||||||||||||||||||||||,C|ENSG00000187634|ENST00000342066|Transcript|synonymous_variant|506|423|141|R|cgA/cgC||1||1|SAMD11|HGNC|28706|protein_coding|YES||CCDS2.2|ENSP00000342313|SAM11_HUMAN|Q5SV95_HUMAN&I7FV93_HUMAN&A6PWC8_HUMAN|UPI0000D61E04|||5/14|||ENST00000342066.3:c.423N>C|ENST00000342066.3:c.423N>C(p.%3D)||||||||
|||||||POSITION:0.206744868035191||NON_CAN_SPLICE_SURR|||||||,C|ENSG00000187634|ENST00000341065|Transcript|synonymous_variant|194|195|65|R|cgA/cgC||1||1|SAMD11|HGNC|28706|protein_coding||||ENSP00000349216|||UPI000155D47A|||3/12|||ENST00000341065.4:c.194N>C|ENST00000341065.4:c.194N>C(p.%3D)|||||||||||||||POSITION:0.110231769361221||NON_CAN_SPLICE_SURR|||||||,C||ENSR00000528855|RegulatoryFeature|regulatory_region_variant|||||||1||||||regulatory_region|||||||||||||||||||||||||||||||||||||| GT:AD:DP:GQ:PL 0/0:34,0:34:99:0,102,1073 0/0:34,0:34:99:0,102,1064 0/0:37,0:37:99:0,108,1155 0/0:8,3:11:24:0,24,226 0/1:11,4:16:32:32,0,300 0/0:10,0:10:30:0,30,306 0/0:13,0:13:39:0,39,410 0/0:11,0:11:33:0,33,323 0/0:21,3:23:12:0,12,434 0/0:19,0:19:57:0,57,581 0/0:25,4:28:27:0,27,553 0/0:17,1:18:51:0,51,524 0/0:25,0:25:75:0,75,759 0/0:21,0:21:63:0,63,687 0/0:23,4:27:69:0,69,709 0/0:22,2:24:60:0,60,NABC diff --git a/v03_pipeline/var/test/sex_check/test_imputed_sex.tsv b/v03_pipeline/var/test/sex_check/test_imputed_sex.tsv index 5d5aea039..e745fbddd 100644 --- a/v03_pipeline/var/test/sex_check/test_imputed_sex.tsv +++ b/v03_pipeline/var/test/sex_check/test_imputed_sex.tsv @@ -1,4 +1,4 @@ entity:sample_id collaborator_participant_id collaborator_sample_id contamination_rate coverage_region_1_metrics_file crai_path cram_md5_path cram_path datarepo_row_id dragen_version import:snapshot_id import:timestamp mapped_percentage mapping_metrics_file material_type mean_coverage original_material_type participant_id pass_fail_value pdo percent_bases_at_20x percent_callability predicted_sex product receipt_date reported_sex research_project single_sample_vcf_index_path single_sample_vcf_md5_path single_sample_vcf_path total_bases variant_calling_metrics_file SM-DM66X abc_1 abc_1 0E+00 gs://datarepo-9cafeffd-bucket/f511b131-3f0d-4eb7-a7f0-b2b3d73dca3a/6f30a41f-1d91-44d1-915c-5c10c6d87fcd/WAL_LIS6100_LIS6101.qc-coverage-region-1_coverage_metrics.csv 
gs://datarepo-9cafeffd-bucket/f511b131-3f0d-4eb7-a7f0-b2b3d73dca3a/3e204a66-f044-4bdc-ade4-1671a0269214/WAL_LIS6100_LIS6101.cram.crai gs://datarepo-9cafeffd-bucket/f511b131-3f0d-4eb7-a7f0-b2b3d73dca3a/a6ed4850-6a69-412e-a071-bf8cce04fca0/WAL_LIS6100_LIS6101.cram.md5sum gs://datarepo-9cafeffd-bucket/f511b131-3f0d-4eb7-a7f0-b2b3d73dca3a/c51bbfd6-42f0-40ca-aa0c-b5eece935516/WAL_LIS6100_LIS6101.cram 8a07ce00-16a1-40f4-8666-c4cfaad1bbe1 07.021.604.3.7.8 cc9d9ed9-785a-407d-910e-d9bd46936fa6 2024-04-17T14:58:10 98.450000000 gs://datarepo-9cafeffd-bucket/f511b131-3f0d-4eb7-a7f0-b2b3d73dca3a/9e745b1d-2c00-44ce-bbfb-31c44369f4fe/WAL_LIS6100_LIS6101.mapping_metrics.csv DNA:DNA Genomic 35.730000000 Whole Blood:Whole Blood PT-24FB4 Pass PDO-32851 96.140000000 97.850000000 Male P-WG-0139 2017-03-15 04:00:00 Male RP-3071 gs://datarepo-9cafeffd-bucket/f511b131-3f0d-4eb7-a7f0-b2b3d73dca3a/360ec721-0af8-4085-a677-38c018069559/WAL_LIS6100_LIS6101.vcf.gz.tbi gs://datarepo-9cafeffd-bucket/f511b131-3f0d-4eb7-a7f0-b2b3d73dca3a/8da8cda2-497f-4a8b-a642-af4a4ad28aac/WAL_LIS6100_LIS6101.vcf.gz.md5sum gs://datarepo-9cafeffd-bucket/f511b131-3f0d-4eb7-a7f0-b2b3d73dca3a/0a2d93fb-8837-4b6f-ac68-a6b9701f9a08/WAL_LIS6100_LIS6101.vcf.gz 134324623400.000000000 gs://datarepo-9cafeffd-bucket/f511b131-3f0d-4eb7-a7f0-b2b3d73dca3a/f7b62337-1339-4c2e-8280-281c48604e07/WAL_LIS6100_LIS6101.vc_metrics.csv SM-DM69X abc_2 abc_2 0E+00 gs://datarepo-556a9c15-bucket/2a4202b0-93f5-4ebe-8d2b-fd4cfb2b881d/c4c07edf-7735-4aa7-9283-7cb2607b60a2/GLE-5774-3-3.qc-coverage-region-1_coverage_metrics.csv gs://datarepo-556a9c15-bucket/2a4202b0-93f5-4ebe-8d2b-fd4cfb2b881d/dcd4c271-0249-47f1-8e91-81f74735c5a1/GLE-5774-3-3.cram.crai gs://datarepo-556a9c15-bucket/2a4202b0-93f5-4ebe-8d2b-fd4cfb2b881d/ec41ec06-673f-4fe2-a063-23dc5fe1dcce/GLE-5774-3-3.cram.md5sum gs://datarepo-556a9c15-bucket/2a4202b0-93f5-4ebe-8d2b-fd4cfb2b881d/aad0e270-2ad5-4f39-b968-9b4beafeb5cc/GLE-5774-3-3.cram a4b04a39-9234-4028-a155-442c4acf12a0 
07.021.604.3.7.8 ce74d94c-c33d-49d7-85c9-5f3cbd08aff7 2024-04-17T15:02:46 99.800000000 gs://datarepo-556a9c15-bucket/2a4202b0-93f5-4ebe-8d2b-fd4cfb2b881d/c3a9e6f2-4c68-410b-823d-46ca406e5061/GLE-5774-3-3.mapping_metrics.csv DNA:DNA Genomic 35.300000000 Whole Blood:Whole Blood PT-24OHM Pass PDO-32755 96.320000000 97.340000000 Female P-WG-0139 2017-04-12 04:00:00 Female RP-3061 gs://datarepo-556a9c15-bucket/2a4202b0-93f5-4ebe-8d2b-fd4cfb2b881d/c71cd2a1-c789-4715-9ebc-dbfc40d9f2e2/GLE-5774-3-3.vcf.gz.tbi gs://datarepo-556a9c15-bucket/2a4202b0-93f5-4ebe-8d2b-fd4cfb2b881d/957a99cb-c9a9-4fc5-a0ec-53f9e461469e/GLE-5774-3-3.vcf.gz.md5sum gs://datarepo-556a9c15-bucket/2a4202b0-93f5-4ebe-8d2b-fd4cfb2b881d/df520949-5f2b-4976-9d46-80d1cc299813/GLE-5774-3-3.vcf.gz 133253714921.000000000 gs://datarepo-556a9c15-bucket/2a4202b0-93f5-4ebe-8d2b-fd4cfb2b881d/2e98e51b-9394-4e64-977f-e9010a4e16dc/GLE-5774-3-3.vc_metrics.csv -SM-DPB5G abc_3 abc_3 0E+00 gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/432f8354-77e0-4381-9bb5-dfdc0633b5b2/PIE_OGI1433_002628_1.qc-coverage-region-1_coverage_metrics.csv gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/3dc623fa-2a45-4b3d-a0f8-fcdec09f9418/PIE_OGI1433_002628_1.cram.crai gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/895966ef-c705-4c18-952d-03863243a184/PIE_OGI1433_002628_1.cram.md5sum gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/96ca6d5f-fb23-4102-bb5e-c7bbfd194e1c/PIE_OGI1433_002628_1.cram ffb50687-165e-425a-a545-c3797d3a28d4 07.021.604.3.7.8 55729ba9-3ce4-47b3-9c3b-1148737ae40f 2024-04-17T15:07:57 99.670000000 gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/30f8e208-5d2d-4ce8-b835-695b5ed673f4/PIE_OGI1433_002628_1.mapping_metrics.csv DNA:DNA Genomic 41.910000000 Whole Blood:Whole Blood PT-25BR5 Pass PDO-32756 92.920000000 97.990000000 Male P-WG-0139 2017-05-19 04:00:00 Male RP-3062 
gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/1641d1b2-1035-4cc3-9c8b-0c8cb430f56b/PIE_OGI1433_002628_1.vcf.gz.tbi gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/f5ba2708-899e-42e8-b287-fdf72c2e404d/PIE_OGI1433_002628_1.vcf.gz.md5sum gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/e925ee5d-a75e-471f-adfd-2756c8690069/PIE_OGI1433_002628_1.vcf.gz 156149580126.000000000 gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/df076bc5-9db8-44f0-a3fe-f693370634cc/PIE_OGI1433_002628_1.vc_metrics.csv +SM-DPB5G abc_3 abc_3 0E+00 gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/432f8354-77e0-4381-9bb5-dfdc0633b5b2/PIE_OGI1433_002628_1.qc-coverage-region-1_coverage_metrics.csv gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/3dc623fa-2a45-4b3d-a0f8-fcdec09f9418/PIE_OGI1433_002628_1.cram.crai gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/895966ef-c705-4c18-952d-03863243a184/PIE_OGI1433_002628_1.cram.md5sum gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/96ca6d5f-fb23-4102-bb5e-c7bbfd194e1c/PIE_OGI1433_002628_1.cram ffb50687-165e-425a-a545-c3797d3a28d4 07.021.604.3.7.8 55729ba9-3ce4-47b3-9c3b-1148737ae40f 2024-04-17T15:07:57 99.670000000 gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/30f8e208-5d2d-4ce8-b835-695b5ed673f4/PIE_OGI1433_002628_1.mapping_metrics.csv DNA:DNA Genomic 41.910000000 Whole Blood:Whole Blood PT-25BR5 Pass PDO-32756 92.920000000 97.990000000 Unknown P-WG-0139 2017-05-19 04:00:00 Unknown RP-3062 gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/1641d1b2-1035-4cc3-9c8b-0c8cb430f56b/PIE_OGI1433_002628_1.vcf.gz.tbi gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/f5ba2708-899e-42e8-b287-fdf72c2e404d/PIE_OGI1433_002628_1.vcf.gz.md5sum gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/e925ee5d-a75e-471f-adfd-2756c8690069/PIE_OGI1433_002628_1.vcf.gz 
156149580126.000000000 gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/df076bc5-9db8-44f0-a3fe-f693370634cc/PIE_OGI1433_002628_1.vc_metrics.csv diff --git a/v03_pipeline/var/test/sex_check/test_imputed_sex_unexpected_value.tsv b/v03_pipeline/var/test/sex_check/test_imputed_sex_unexpected_value.tsv index 7ab98ac61..03a5f120e 100644 --- a/v03_pipeline/var/test/sex_check/test_imputed_sex_unexpected_value.tsv +++ b/v03_pipeline/var/test/sex_check/test_imputed_sex_unexpected_value.tsv @@ -1,2 +1,2 @@ entity:sample_id collaborator_participant_id collaborator_sample_id contamination_rate coverage_region_1_metrics_file crai_path cram_md5_path cram_path datarepo_row_id dragen_version import:snapshot_id import:timestamp mapped_percentage mapping_metrics_file material_type mean_coverage original_material_type participant_id pass_fail_value pdo percent_bases_at_20x percent_callability predicted_sex product receipt_date reported_sex research_project single_sample_vcf_index_path single_sample_vcf_md5_path single_sample_vcf_path total_bases variant_calling_metrics_file -SM-DM69X abc_2 abc_2 0E+00 gs://datarepo-556a9c15-bucket/2a4202b0-93f5-4ebe-8d2b-fd4cfb2b881d/c4c07edf-7735-4aa7-9283-7cb2607b60a2/GLE-5774-3-3.qc-coverage-region-1_coverage_metrics.csv gs://datarepo-556a9c15-bucket/2a4202b0-93f5-4ebe-8d2b-fd4cfb2b881d/dcd4c271-0249-47f1-8e91-81f74735c5a1/GLE-5774-3-3.cram.crai gs://datarepo-556a9c15-bucket/2a4202b0-93f5-4ebe-8d2b-fd4cfb2b881d/ec41ec06-673f-4fe2-a063-23dc5fe1dcce/GLE-5774-3-3.cram.md5sum gs://datarepo-556a9c15-bucket/2a4202b0-93f5-4ebe-8d2b-fd4cfb2b881d/aad0e270-2ad5-4f39-b968-9b4beafeb5cc/GLE-5774-3-3.cram a4b04a39-9234-4028-a155-442c4acf12a0 07.021.604.3.7.8 ce74d94c-c33d-49d7-85c9-5f3cbd08aff7 2024-04-17T15:02:46 99.800000000 gs://datarepo-556a9c15-bucket/2a4202b0-93f5-4ebe-8d2b-fd4cfb2b881d/c3a9e6f2-4c68-410b-823d-46ca406e5061/GLE-5774-3-3.mapping_metrics.csv DNA:DNA Genomic 35.300000000 Whole Blood:Whole Blood PT-24OHM Pass PDO-32755 
96.320000000 97.340000000 Unknown P-WG-0139 2017-04-12 04:00:00 Unknown RP-3061 gs://datarepo-556a9c15-bucket/2a4202b0-93f5-4ebe-8d2b-fd4cfb2b881d/c71cd2a1-c789-4715-9ebc-dbfc40d9f2e2/GLE-5774-3-3.vcf.gz.tbi gs://datarepo-556a9c15-bucket/2a4202b0-93f5-4ebe-8d2b-fd4cfb2b881d/957a99cb-c9a9-4fc5-a0ec-53f9e461469e/GLE-5774-3-3.vcf.gz.md5sum gs://datarepo-556a9c15-bucket/2a4202b0-93f5-4ebe-8d2b-fd4cfb2b881d/df520949-5f2b-4976-9d46-80d1cc299813/GLE-5774-3-3.vcf.gz 133253714921.000000000 gs://datarepo-556a9c15-bucket/2a4202b0-93f5-4ebe-8d2b-fd4cfb2b881d/2e98e51b-9394-4e64-977f-e9010a4e16dc/GLE-5774-3-3.vc_metrics.csv +SM-DM69X abc_2 abc_2 0E+00 gs://datarepo-556a9c15-bucket/2a4202b0-93f5-4ebe-8d2b-fd4cfb2b881d/c4c07edf-7735-4aa7-9283-7cb2607b60a2/GLE-5774-3-3.qc-coverage-region-1_coverage_metrics.csv gs://datarepo-556a9c15-bucket/2a4202b0-93f5-4ebe-8d2b-fd4cfb2b881d/dcd4c271-0249-47f1-8e91-81f74735c5a1/GLE-5774-3-3.cram.crai gs://datarepo-556a9c15-bucket/2a4202b0-93f5-4ebe-8d2b-fd4cfb2b881d/ec41ec06-673f-4fe2-a063-23dc5fe1dcce/GLE-5774-3-3.cram.md5sum gs://datarepo-556a9c15-bucket/2a4202b0-93f5-4ebe-8d2b-fd4cfb2b881d/aad0e270-2ad5-4f39-b968-9b4beafeb5cc/GLE-5774-3-3.cram a4b04a39-9234-4028-a155-442c4acf12a0 07.021.604.3.7.8 ce74d94c-c33d-49d7-85c9-5f3cbd08aff7 2024-04-17T15:02:46 99.800000000 gs://datarepo-556a9c15-bucket/2a4202b0-93f5-4ebe-8d2b-fd4cfb2b881d/c3a9e6f2-4c68-410b-823d-46ca406e5061/GLE-5774-3-3.mapping_metrics.csv DNA:DNA Genomic 35.300000000 Whole Blood:Whole Blood PT-24OHM Pass PDO-32755 96.320000000 97.340000000 UNKNOWN P-WG-0139 2017-04-12 04:00:00 UNKNOWN RP-3061 gs://datarepo-556a9c15-bucket/2a4202b0-93f5-4ebe-8d2b-fd4cfb2b881d/c71cd2a1-c789-4715-9ebc-dbfc40d9f2e2/GLE-5774-3-3.vcf.gz.tbi gs://datarepo-556a9c15-bucket/2a4202b0-93f5-4ebe-8d2b-fd4cfb2b881d/957a99cb-c9a9-4fc5-a0ec-53f9e461469e/GLE-5774-3-3.vcf.gz.md5sum gs://datarepo-556a9c15-bucket/2a4202b0-93f5-4ebe-8d2b-fd4cfb2b881d/df520949-5f2b-4976-9d46-80d1cc299813/GLE-5774-3-3.vcf.gz 
133253714921.000000000 gs://datarepo-556a9c15-bucket/2a4202b0-93f5-4ebe-8d2b-fd4cfb2b881d/2e98e51b-9394-4e64-977f-e9010a4e16dc/GLE-5774-3-3.vc_metrics.csv