Skip to content

Commit 7d14822

Browse files
committed
Merge remote-tracking branch 'origin/dev' into sample-qc-filtered-callrate
2 parents 170af80 + 7e8311d commit 7d14822

File tree

9 files changed: +147 additions, -41 deletions

v03_pipeline/lib/misc/nested_field.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,11 @@
22

33

44
def parse_nested_field(t: hl.MatrixTable | hl.Table, fields: str):
5-
# Grab the field and continually select it from the hail table.
65
expression = t
7-
for field in fields.split('.'):
6+
# Behavior here allows only a single nested field.
7+
# Additional nesting is considered to be part of the
8+
# name of the field. e.g. `gnomadv4.1_AF`.
9+
for field in fields.split('.', maxsplit=1):
810
# Select from multi-allelic list.
911
if field.endswith('#'):
1012
expression = expression[field[:-1]][

v03_pipeline/lib/misc/nested_field_test.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ def test_parse_nested_field(self):
1818
'alleles': ['A', 'C'],
1919
'a': hl.Struct(d=1),
2020
'b': hl.Struct(e=[2, 9]),
21+
'h': hl.Struct(**{'i.j': 1}),
2122
'a_index': 1,
2223
},
2324
{
@@ -29,6 +30,7 @@ def test_parse_nested_field(self):
2930
'alleles': ['A', 'C'],
3031
'a': hl.Struct(d=3),
3132
'b': hl.Struct(e=[4, 5]),
33+
'h': hl.Struct(**{'i.j': 2}),
3234
'a_index': 1,
3335
},
3436
],
@@ -37,6 +39,7 @@ def test_parse_nested_field(self):
3739
alleles=hl.tarray(hl.tstr),
3840
a=hl.tstruct(d=hl.tint32),
3941
b=hl.tstruct(e=hl.tarray(hl.tint32)),
42+
h=hl.tstruct(**{'i.j': hl.tint32}),
4043
a_index=hl.tint32,
4144
),
4245
key=['locus', 'alleles'],
@@ -45,6 +48,7 @@ def test_parse_nested_field(self):
4548
d=parse_nested_field(ht, 'a.d'),
4649
e=parse_nested_field(ht, 'b.e#'),
4750
f=parse_nested_field(ht, 'a'),
51+
g=parse_nested_field(ht, 'h.i.j'),
4852
)
4953
self.assertListEqual(
5054
ht.collect(),
@@ -59,6 +63,7 @@ def test_parse_nested_field(self):
5963
d=1,
6064
e=2,
6165
f=hl.Struct(d=1),
66+
g=1,
6267
),
6368
hl.Struct(
6469
locus=hl.Locus(
@@ -70,6 +75,7 @@ def test_parse_nested_field(self):
7075
d=3,
7176
e=4,
7277
f=hl.Struct(d=3),
78+
g=2,
7379
),
7480
],
7581
)

v03_pipeline/lib/misc/pedigree.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,9 @@ class Family:
8080
def __hash__(self):
8181
return hash(self.family_guid)
8282

83+
def __eq__(self, other):
    """Return whether ``other`` is the same family, judged by ``family_guid``.

    Equality is keyed on ``family_guid`` only, matching ``__hash__``.
    ``NotImplemented`` is returned for objects of an unrelated type so
    that Python falls back to its default comparison instead of this
    method raising ``AttributeError`` on a missing ``family_guid``.
    """
    if not isinstance(other, type(self)):
        return NotImplemented
    return self.family_guid == other.family_guid
85+
8386
@staticmethod
8487
def parse_direct_lineage(rows: list[hl.Struct]) -> dict[str, Sample]:
8588
samples = {}
@@ -162,7 +165,7 @@ def parse_pedigree_ht_to_families(
162165
) -> set[Family]:
163166
families = set()
164167
for family_guid, rows in itertools.groupby(
165-
pedigree_ht.collect(),
168+
sorted(pedigree_ht.collect(), key=lambda x: x.family_guid),
166169
lambda x: x.family_guid,
167170
):
168171
families.add(Family.parse(family_guid, list(rows)))

v03_pipeline/lib/misc/pedigree_test.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
TEST_PEDIGREE_1 = 'v03_pipeline/var/test/pedigrees/test_pedigree_1.tsv'
1010
TEST_PEDIGREE_2 = 'v03_pipeline/var/test/pedigrees/test_pedigree_2.tsv'
11+
TEST_PEDIGREE_9 = 'v03_pipeline/var/test/pedigrees/test_pedigree_9.tsv'
1112

1213

1314
class PedigreesTest(unittest.TestCase):
@@ -490,3 +491,20 @@ def test_subsetted_pedigree_with_removed_parent(self) -> None:
490491
family.samples['BBL_BC1-000345_01_D1'].mother,
491492
'BBL_BC1-000345_03_D1',
492493
)
494+
495+
def test_pedigree_ungrouped_families(self) -> None:
    # NOTE(review): going by the test name, TEST_PEDIGREE_9 presumably lists
    # rows of the same family non-contiguously -- confirm against the fixture.
    pedigree_ht = import_pedigree(TEST_PEDIGREE_9)
    parsed_pedigree = parse_pedigree_ht_to_families(pedigree_ht)
    self.assertEqual(len(parsed_pedigree), 2)
    # A generator avoids building an intermediate list just to take its
    # first element.
    family = next(
        family
        for family in parsed_pedigree
        if family.family_guid == 'family_2_1'
    )
    # assertSetEqual reports the differing members on failure, unlike
    # assertTrue(a == b).
    self.assertSetEqual(
        set(family.samples.keys()),
        {'RGP_164_1', 'RGP_164_2', 'RGP_164_4'},
    )

v03_pipeline/lib/paths.py

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
import os
33
import re
44

5+
import hailtop.fs as hfs
6+
57
from v03_pipeline.lib.model import (
68
AccessControl,
79
DatasetType,
@@ -59,6 +61,25 @@ def _v03_reference_dataset_prefix(
5961
)
6062

6163

64+
def _callset_path_hash(callset_path: str) -> str:
    """Return the SHA-256 hex digest used to name artifacts for a callset.

    The hashed key is ``callset_path`` itself, with the most recent
    modification time of the listed callset shards appended when at
    least one shard with a known modification time exists.

    Args:
        callset_path: Path of the callset; may contain a wildcard.

    Returns:
        The hex-encoded SHA-256 digest of the key.
    """
    try:
        # hfs.ls throws FileNotFoundError if a non-wildcard is passed
        # but not found, but does not throw if a wildcard is passed and
        # there are no results.
        shards = hfs.ls(callset_path)
    except FileNotFoundError:
        shards = []
    # NOTE(review): hailtop appears to type modification_time as optional --
    # confirm. A None mixed with real timestamps would make max() raise
    # TypeError, so entries without a modification time are skipped.
    modification_times = [
        f.modification_time for f in shards if f.modification_time is not None
    ]
    key = callset_path
    if modification_times:
        key += str(max(modification_times))
    return hashlib.sha256(key.encode('utf8')).hexdigest()
81+
82+
6283
def family_table_path(
6384
reference_genome: ReferenceGenome,
6485
dataset_type: DatasetType,
@@ -114,7 +135,7 @@ def imported_callset_path(
114135
dataset_type,
115136
),
116137
'imported_callsets',
117-
f'{hashlib.sha256(callset_path.encode("utf8")).hexdigest()}.mt',
138+
f'{_callset_path_hash(callset_path)}.mt',
118139
)
119140

120141

@@ -178,7 +199,7 @@ def relatedness_check_table_path(
178199
dataset_type,
179200
),
180201
'relatedness_check',
181-
f'{hashlib.sha256(callset_path.encode("utf8")).hexdigest()}.ht',
202+
f'{_callset_path_hash(callset_path)}.ht',
182203
)
183204

184205

@@ -194,7 +215,7 @@ def relatedness_check_tsv_path(
194215
dataset_type,
195216
),
196217
'relatedness_check',
197-
f'{hashlib.sha256(callset_path.encode("utf8")).hexdigest()}.tsv',
218+
f'{_callset_path_hash(callset_path)}.tsv',
198219
)
199220

200221

@@ -228,7 +249,7 @@ def remapped_and_subsetted_callset_path(
228249
),
229250
'remapped_and_subsetted_callsets',
230251
project_guid,
231-
f'{hashlib.sha256(callset_path.encode("utf8")).hexdigest()}.mt',
252+
f'{_callset_path_hash(callset_path)}.mt',
232253
)
233254

234255

@@ -272,7 +293,7 @@ def sex_check_table_path(
272293
dataset_type,
273294
),
274295
'sex_check',
275-
f'{hashlib.sha256(callset_path.encode("utf8")).hexdigest()}.ht',
296+
f'{_callset_path_hash(callset_path)}.ht',
276297
)
277298

278299

v03_pipeline/lib/paths_test.py

Lines changed: 39 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
import unittest
22
from unittest.mock import patch
33

4+
import hailtop.fs as hfs
5+
46
from v03_pipeline.lib.model import (
57
DatasetType,
68
ReferenceGenome,
@@ -24,6 +26,8 @@
2426
variant_annotations_table_path,
2527
)
2628

29+
TEST_VCF = 'v03_pipeline/var/test/callsets/1kg_30varia*.vcf'
30+
2731

2832
class TestPaths(unittest.TestCase):
2933
def test_family_table_path(self) -> None:
@@ -39,15 +43,15 @@ def test_family_table_path(self) -> None:
3943
with patch('v03_pipeline.lib.paths.Env') as mock_env, patch(
4044
'v03_pipeline.lib.paths.FeatureFlag',
4145
) as mock_ff:
42-
mock_env.HAIL_SEARCH_DATA_DIR = 'gs://seqr-datasets/'
46+
mock_env.HAIL_SEARCH_DATA_DIR = '/var/bucket/'
4347
self.assertEqual(
4448
family_table_path(
4549
ReferenceGenome.GRCh37,
4650
DatasetType.SNV_INDEL,
4751
SampleType.WES,
4852
'franklin',
4953
),
50-
'gs://seqr-datasets/v3.1/GRCh37/SNV_INDEL/families/WES/franklin.ht',
54+
'/var/bucket/v3.1/GRCh37/SNV_INDEL/families/WES/franklin.ht',
5155
)
5256
mock_ff.INCLUDE_PIPELINE_VERSION_IN_PREFIX = False
5357
self.assertEqual(
@@ -57,15 +61,15 @@ def test_family_table_path(self) -> None:
5761
SampleType.WES,
5862
'franklin',
5963
),
60-
'gs://seqr-datasets/GRCh37/SNV_INDEL/families/WES/franklin.ht',
64+
'/var/bucket/GRCh37/SNV_INDEL/families/WES/franklin.ht',
6165
)
6266

6367
def test_valid_filters_path(self) -> None:
6468
self.assertEqual(
6569
valid_filters_path(
6670
DatasetType.MITO,
6771
SampleType.WES,
68-
'gs://bucket/RDG_Broad_WES_Internal_Oct2023/part_one_outputs/chr*/*.vcf.gz',
72+
'/var/bucket/RDG_Broad_WES_Internal_Oct2023/part_one_outputs/chr*/*.vcf.gz',
6973
),
7074
None,
7175
)
@@ -75,9 +79,9 @@ def test_valid_filters_path(self) -> None:
7579
valid_filters_path(
7680
DatasetType.SNV_INDEL,
7781
SampleType.WES,
78-
'gs://bucket/RDG_Broad_WES_Internal_Oct2023/part_one_outputs/chr*/*.vcf.gz',
82+
'/var/bucket/RDG_Broad_WES_Internal_Oct2023/part_one_outputs/chr*/*.vcf.gz',
7983
),
80-
'gs://bucket/RDG_Broad_WES_Internal_Oct2023/part_two_outputs/*.filtered.*.vcf.gz',
84+
'/var/bucket/RDG_Broad_WES_Internal_Oct2023/part_two_outputs/*.filtered.*.vcf.gz',
8185
)
8286

8387
def test_project_table_path(self) -> None:
@@ -105,19 +109,19 @@ def test_sex_check_table_path(self) -> None:
105109
sex_check_table_path(
106110
ReferenceGenome.GRCh38,
107111
DatasetType.SNV_INDEL,
108-
'gs://abc.efg/callset.vcf.gz',
112+
'/var/abc.efg/callset.vcf.gz',
109113
),
110-
'/var/seqr/seqr-loading-temp/v3.1/GRCh38/SNV_INDEL/sex_check/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.ht',
114+
'/var/seqr/seqr-loading-temp/v3.1/GRCh38/SNV_INDEL/sex_check/f92b8ab6b5b8c41fa20d7d49a5626b96dcd2ba79fa6f61eab7ffb80d550d951c.ht',
111115
)
112116

113117
def test_relatedness_check_table_path(self) -> None:
114118
self.assertEqual(
115119
relatedness_check_table_path(
116120
ReferenceGenome.GRCh38,
117121
DatasetType.SNV_INDEL,
118-
'gs://abc.efg/callset.vcf.gz',
122+
'/var/abc.efg/callset.vcf.gz',
119123
),
120-
'/var/seqr/seqr-loading-temp/v3.1/GRCh38/SNV_INDEL/relatedness_check/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.ht',
124+
'/var/seqr/seqr-loading-temp/v3.1/GRCh38/SNV_INDEL/relatedness_check/f92b8ab6b5b8c41fa20d7d49a5626b96dcd2ba79fa6f61eab7ffb80d550d951c.ht',
121125
)
122126

123127
def test_validation_errors_for_run_path(self) -> None:
@@ -154,31 +158,50 @@ def test_remapped_and_subsetted_callset_path(self) -> None:
154158
remapped_and_subsetted_callset_path(
155159
ReferenceGenome.GRCh38,
156160
DatasetType.GCNV,
157-
'gs://abc.efg/callset.vcf.gz',
161+
'/var/abc.efg/callset.vcf.gz',
158162
'R0111_tgg_bblanken_wes',
159163
),
160-
'/var/seqr/seqr-loading-temp/v3.1/GRCh38/GCNV/remapped_and_subsetted_callsets/R0111_tgg_bblanken_wes/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.mt',
164+
'/var/seqr/seqr-loading-temp/v3.1/GRCh38/GCNV/remapped_and_subsetted_callsets/R0111_tgg_bblanken_wes/f92b8ab6b5b8c41fa20d7d49a5626b96dcd2ba79fa6f61eab7ffb80d550d951c.mt',
161165
)
162166
self.assertEqual(
163167
remapped_and_subsetted_callset_path(
164168
ReferenceGenome.GRCh38,
165169
DatasetType.GCNV,
166-
'gs://abc.efg/callset/*.vcf.gz',
170+
'/var/abc.efg/callset/*.vcf.gz',
167171
'R0111_tgg_bblanken_wes',
168172
),
169-
'/var/seqr/seqr-loading-temp/v3.1/GRCh38/GCNV/remapped_and_subsetted_callsets/R0111_tgg_bblanken_wes/bce53ccdb49a5ed2513044e1d0c6224e3ffcc323f770dc807d9175fd3c70a050.mt',
173+
'/var/seqr/seqr-loading-temp/v3.1/GRCh38/GCNV/remapped_and_subsetted_callsets/R0111_tgg_bblanken_wes/26f481b386721f9889250c6549905660728ec9f77be4b8f7eeb6c4facc76282e.mt',
170174
)
171175

172176
def test_imported_callset_path(self) -> None:
173177
self.assertEqual(
174178
imported_callset_path(
175179
ReferenceGenome.GRCh38,
176180
DatasetType.SNV_INDEL,
177-
'gs://abc.efg/callset.vcf.gz',
181+
'/var/abc.efg/callset.vcf.gz',
178182
),
179-
'/var/seqr/seqr-loading-temp/v3.1/GRCh38/SNV_INDEL/imported_callsets/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.mt',
183+
'/var/seqr/seqr-loading-temp/v3.1/GRCh38/SNV_INDEL/imported_callsets/f92b8ab6b5b8c41fa20d7d49a5626b96dcd2ba79fa6f61eab7ffb80d550d951c.mt',
180184
)
181185

186+
with patch('v03_pipeline.lib.paths.hfs.ls') as mock_ls:
187+
mock_ls.return_value = [
188+
hfs.stat_result.FileListEntry(
189+
path='v03_pipeline/var/test/callsets/1kg_30variants.vcf',
190+
owner=None,
191+
size=104481,
192+
typ=hfs.stat_result.FileType(2),
193+
modification_time=1732033623.804012,
194+
),
195+
]
196+
self.assertEqual(
197+
imported_callset_path(
198+
ReferenceGenome.GRCh38,
199+
DatasetType.SNV_INDEL,
200+
TEST_VCF,
201+
),
202+
'/var/seqr/seqr-loading-temp/v3.1/GRCh38/SNV_INDEL/imported_callsets/42f2c9e2025c4b61106b3fecfd30443f882a1849b73c6f6903a7e421c20117e0.mt',
203+
)
204+
182205
def test_tdr_metrics_path(self) -> None:
183206
self.assertEqual(
184207
tdr_metrics_path(

0 commit comments

Comments
 (0)