Skip to content

Commit 820a573

Browse files
authored
Merge pull request #779 from broadinstitute/dev
main <- dev
2 parents d4af227 + 9893364 commit 820a573

File tree

68 files changed

+557
-290
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

68 files changed

+557
-290
lines changed

v03_pipeline/bin/vep-110-GRCh38.sh

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ export PROJECT="$(gcloud config get-value project)"
1212
export VEP_CONFIG_PATH="$(/usr/share/google/get_metadata_value attributes/VEP_CONFIG_PATH)"
1313
export VEP_REPLICATE="$(/usr/share/google/get_metadata_value attributes/VEP_REPLICATE)"
1414
export ASSEMBLY=GRCh38
15-
export VEP_DOCKER_IMAGE=gcr.io/seqr-project/vep-docker-image
15+
export VEP_DOCKER_IMAGE=gcr.io/seqr-project/vep-docker-image:110
1616

1717
mkdir -p /vep_data
1818

@@ -61,7 +61,7 @@ gcloud storage cat --billing-project $PROJECT gs://seqr-reference-data/vep/110/h
6161
# gzip -d Homo_sapiens.${ASSEMBLY}.dna.primary_assembly.fa.gz
6262
# bgzip Homo_sapiens.${ASSEMBLY}.dna.primary_assembly.fa
6363
# samtools faidx Homo_sapiens.${ASSEMBLY}.dna.primary_assembly.fa.gz
64-
gcloud storage cp --billing-project $PROJECT 'gs://seqr-reference-data/vep/110/Homo_sapiens.${ASSEMBLY}.dna.primary_assembly.fa.*' /vep_data/ &
64+
gcloud storage cp --billing-project $PROJECT "gs://seqr-reference-data/vep/110/Homo_sapiens.${ASSEMBLY}.dna.primary_assembly.fa.*" /vep_data/ &
6565
docker pull ${VEP_DOCKER_IMAGE} &
6666
wait
6767

@@ -88,3 +88,4 @@ docker run -i -v /vep_data/:/opt/vep/.vep/:ro ${VEP_DOCKER_IMAGE} \
8888
/opt/vep/src/ensembl-vep/vep "\$@"
8989
EOF
9090
chmod +x /vep.sh
91+

v03_pipeline/lib/methods/sex_check.py

Lines changed: 0 additions & 58 deletions
This file was deleted.

v03_pipeline/lib/methods/sex_check_test.py

Lines changed: 0 additions & 79 deletions
This file was deleted.

v03_pipeline/lib/misc/callsets.py

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,14 @@
66
from v03_pipeline.lib.paths import remapped_and_subsetted_callset_path
77

88

9-
def get_callset_ht(
9+
def get_callset_ht( # noqa: PLR0913
1010
reference_genome: ReferenceGenome,
1111
dataset_type: DatasetType,
1212
callset_paths: list[str],
1313
project_guids: list[str],
1414
project_remap_paths: list[str],
1515
project_pedigree_paths: list[str],
16+
imputed_sex_paths: list[str] | None,
1617
):
1718
callset_hts = [
1819
hl.read_matrix_table(
@@ -23,11 +24,12 @@ def get_callset_ht(
2324
project_guid,
2425
),
2526
).rows()
26-
for (callset_path, project_guid, _, _) in callset_project_pairs(
27+
for (callset_path, project_guid, _, _, _) in callset_project_pairs(
2728
callset_paths,
2829
project_guids,
2930
project_remap_paths,
3031
project_pedigree_paths,
32+
imputed_sex_paths,
3133
)
3234
]
3335

@@ -49,18 +51,34 @@ def callset_project_pairs(
4951
project_guids: list[str],
5052
project_remap_paths: list[str],
5153
project_pedigree_paths: list[str],
54+
imputed_sex_paths: list[str] | None,
5255
):
5356
if len(callset_paths) == len(project_guids):
5457
return zip(
5558
callset_paths,
5659
project_guids,
5760
project_remap_paths,
5861
project_pedigree_paths,
62+
imputed_sex_paths
63+
if imputed_sex_paths is not None
64+
else [None] * len(callset_paths),
5965
strict=True,
6066
)
6167
return (
62-
(callset_path, project_guid, project_remap_path, project_pedigree_path)
63-
for callset_path in callset_paths
68+
(
69+
callset_path,
70+
project_guid,
71+
project_remap_path,
72+
project_pedigree_path,
73+
imputed_sex_path,
74+
)
75+
for callset_path, imputed_sex_path in zip(
76+
callset_paths,
77+
imputed_sex_paths
78+
if imputed_sex_paths is not None
79+
else [None] * len(callset_paths),
80+
strict=False,
81+
)
6482
for (project_guid, project_remap_path, project_pedigree_path) in zip(
6583
project_guids,
6684
project_remap_paths,

v03_pipeline/lib/misc/family_loading_failures.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,6 @@ def build_relatedness_check_lookup(
107107
relatedness_check_ht: hl.Table,
108108
remap_lookup: hl.dict,
109109
) -> dict[tuple[str, str], list]:
110-
# Build relatedness check lookup
111110
relatedness_check_ht = relatedness_check_ht.key_by(
112111
i=remap_lookup.get(relatedness_check_ht.i, relatedness_check_ht.i),
113112
j=remap_lookup.get(relatedness_check_ht.j, relatedness_check_ht.j),
@@ -124,12 +123,11 @@ def build_sex_check_lookup(
124123
sex_check_ht: hl.Table,
125124
remap_lookup: hl.dict,
126125
) -> dict[str, Sex]:
127-
# Build sex check lookup
128126
sex_check_ht = sex_check_ht.key_by(
129127
s=remap_lookup.get(sex_check_ht.s, sex_check_ht.s),
130128
)
131-
sex_check_ht = sex_check_ht.select('sex')
132-
return {r.s: Sex(r.sex) for r in sex_check_ht.collect()}
129+
sex_check_ht = sex_check_ht.select('predicted_sex')
130+
return {r.s: Sex(r.predicted_sex) for r in sex_check_ht.collect()}
133131

134132

135133
def get_families_failed_missing_samples(

v03_pipeline/lib/misc/family_loading_failures_test.py

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -56,16 +56,16 @@ def test_build_relatedness_check_lookup(self):
5656
def test_build_sex_check_lookup(self):
5757
ht = hl.Table.parallelize(
5858
[
59-
{'s': 'remapped_id', 'sex': 'M'},
60-
{'s': 'ROS_006_18Y03227_D1', 'sex': 'M'},
61-
{'s': 'ROS_006_18Y03228_D1', 'sex': 'M'},
62-
{'s': 'ROS_007_19Y05919_D1', 'sex': 'M'},
63-
{'s': 'ROS_007_19Y05939_D1', 'sex': 'F'},
64-
{'s': 'ROS_007_19Y05987_D1', 'sex': 'M'},
59+
{'s': 'remapped_id', 'predicted_sex': 'M'},
60+
{'s': 'ROS_006_18Y03227_D1', 'predicted_sex': 'M'},
61+
{'s': 'ROS_006_18Y03228_D1', 'predicted_sex': 'M'},
62+
{'s': 'ROS_007_19Y05919_D1', 'predicted_sex': 'M'},
63+
{'s': 'ROS_007_19Y05939_D1', 'predicted_sex': 'F'},
64+
{'s': 'ROS_007_19Y05987_D1', 'predicted_sex': 'M'},
6565
],
6666
hl.tstruct(
6767
s=hl.tstr,
68-
sex=hl.tstr,
68+
predicted_sex=hl.tstr,
6969
),
7070
key='s',
7171
)
@@ -178,16 +178,16 @@ def test_all_relatedness_checks(self):
178178
def test_get_families_failed_sex_check(self):
179179
sex_check_ht = hl.Table.parallelize(
180180
[
181-
{'s': 'ROS_006_18Y03226_D1', 'sex': 'M'},
182-
{'s': 'ROS_006_18Y03227_D1', 'sex': 'F'},
183-
{'s': 'ROS_006_18Y03228_D1', 'sex': 'F'},
184-
{'s': 'ROS_007_19Y05919_D1', 'sex': 'F'},
185-
{'s': 'ROS_007_19Y05939_D1', 'sex': 'F'},
186-
{'s': 'ROS_007_19Y05987_D1', 'sex': 'F'},
181+
{'s': 'ROS_006_18Y03226_D1', 'predicted_sex': 'M'},
182+
{'s': 'ROS_006_18Y03227_D1', 'predicted_sex': 'F'},
183+
{'s': 'ROS_006_18Y03228_D1', 'predicted_sex': 'F'},
184+
{'s': 'ROS_007_19Y05919_D1', 'predicted_sex': 'F'},
185+
{'s': 'ROS_007_19Y05939_D1', 'predicted_sex': 'F'},
186+
{'s': 'ROS_007_19Y05987_D1', 'predicted_sex': 'F'},
187187
],
188188
hl.tstruct(
189189
s=hl.tstr,
190-
sex=hl.tstr,
190+
predicted_sex=hl.tstr,
191191
),
192192
key='s',
193193
)

v03_pipeline/lib/misc/io.py

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,15 @@
55
import hail as hl
66

77
from v03_pipeline.lib.misc.gcnv import parse_gcnv_genes
8-
from v03_pipeline.lib.model import DatasetType, Env, ReferenceGenome
8+
from v03_pipeline.lib.model import DatasetType, Env, ReferenceGenome, Sex
99

1010
BIALLELIC = 2
1111
B_PER_MB = 1 << 20 # 1024 * 1024
1212
MB_PER_PARTITION = 128
1313

14+
MALE = 'Male'
15+
FEMALE = 'Female'
16+
1417

1518
def does_file_exist(path: str) -> bool:
1619
if path.startswith('gs://'):
@@ -156,6 +159,25 @@ def select_relevant_fields(
156159
return mt.select_entries(*dataset_type.entries_fields)
157160

158161

162+
def import_imputed_sex(imputed_sex_path: str) -> hl.Table:
    """Load an imputed sex file as a Hail table keyed by sample id.

    The result has two fields: ``s`` (taken from the file's
    ``collaborator_sample_id`` column) and ``predicted_sex`` (the file's
    ``predicted_sex`` column translated to the matching ``Sex`` enum value).

    A ``predicted_sex`` value other than ``FEMALE`` or ``MALE`` produces a
    Hail error carrying the offending value. The check is part of the Hail
    expression, so it is presumably only triggered once the table is
    actually evaluated (e.g. on ``collect``).
    """
    raw_ht = hl.import_table(imputed_sex_path)

    # Error branch of the case expression: report the value that could not
    # be mapped onto a known Sex.
    unexpected_value_message = hl.format(
        'Found unexpected value %s in imputed sex file',
        raw_ht.predicted_sex,
    )
    sex_value = (
        hl.case()
        .when(raw_ht.predicted_sex == FEMALE, Sex.FEMALE.value)
        .when(raw_ht.predicted_sex == MALE, Sex.MALE.value)
        .or_error(unexpected_value_message)
    )

    selected_ht = raw_ht.select(
        s=raw_ht.collaborator_sample_id,
        predicted_sex=sex_value,
    )
    return selected_ht.key_by(selected_ht.s)
179+
180+
159181
def import_remap(remap_path: str) -> hl.Table:
160182
ht = hl.import_table(remap_path)
161183
ht = ht.select(

v03_pipeline/lib/misc/io_test.py

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,17 @@
11
import unittest
22

3-
from v03_pipeline.lib.misc.io import compute_hail_n_partitions, file_size_bytes
3+
import hail as hl
44

5+
from v03_pipeline.lib.misc.io import (
6+
compute_hail_n_partitions,
7+
file_size_bytes,
8+
import_imputed_sex,
9+
)
10+
11+
TEST_IMPUTED_SEX = 'v03_pipeline/var/test/sex_check/test_imputed_sex.tsv'
12+
TEST_IMPUTED_SEX_UNEXPECTED_VALUE = (
13+
'v03_pipeline/var/test/sex_check/test_imputed_sex_unexpected_value.tsv'
14+
)
515
TEST_MITO_MT = 'v03_pipeline/var/test/callsets/mito_1.mt'
616
TEST_SV_VCF = 'v03_pipeline/var/test/callsets/sv_1.vcf'
717

@@ -17,3 +27,22 @@ def test_compute_hail_n_partitions(self) -> None:
1727
self.assertEqual(compute_hail_n_partitions(23), 1)
1828
self.assertEqual(compute_hail_n_partitions(191310), 1)
1929
self.assertEqual(compute_hail_n_partitions(1913100000), 15)
30+
31+
def test_import_imputed_sex(self) -> None:
    """The imputed sex fixture imports to the expected (s, predicted_sex) rows."""
    expected_rows = [
        hl.Struct(s='abc_1', predicted_sex='M'),
        hl.Struct(s='abc_2', predicted_sex='F'),
        hl.Struct(s='abc_3', predicted_sex='M'),
    ]
    imputed_sex_ht = import_imputed_sex(TEST_IMPUTED_SEX)
    self.assertListEqual(imputed_sex_ht.collect(), expected_rows)
41+
42+
def test_import_imputed_sex_unexpected_value(self) -> None:
    """Collecting a table built from a file with an unknown sex value fails."""
    imputed_sex_ht = import_imputed_sex(TEST_IMPUTED_SEX_UNEXPECTED_VALUE)
    # Only collect() is wrapped: building the table above is expected to
    # succeed, and the error should surface when the rows are materialised.
    with self.assertRaisesRegex(
        hl.utils.java.HailUserError,
        'Found unexpected value Unknown in imputed sex file',
    ):
        imputed_sex_ht.collect()

0 commit comments

Comments
 (0)