Skip to content

Commit b975ab7

Browse files
authored
Dev (#1007)
* Add service account credentialing (#997) * Add service account credentialing * ruff * feat: Handle parsing empty predicted sex into Unknown (#1000) * Add helper functions for querying `Terra Data Repository` (#998) * Add service account credentialing * ruff * First pass * tests passing * add coverage of bigquery test * change function names * use generators everywhere * bq requirement * resolver * Update sample id name * Build Sex Check Table from TDR Metrics (#999) * refactor: Move feature flags to FeatureFlag enum. (#1002) * refactor: Move feature flags out of environment to their own dataclass * lint: ruff * ruff * bugfix: exclude samples from relationship checking that are not present in the expected loadable samples (#1003) * bugfix: exclude samples from relationship checking that are not present in the expected loadable samples * cleanup * feat: add remap and family loading failures as validation exceptions … (#1005) * feat: add remap and family loading failures as validation exceptions rather than runtime errors * move on * Update write_remapped_and_subsetted_callset_test.py * ruff * feat: Add ability to run tasks dataproc. (#948) * Support gcs dirs in rsync * ws * Add create dataproc cluster task * add dataproc * ruff * requirements * still struggling * Gencode refactor to remove gcs * bump reqs * Run dataproc job * lib * running * merge requirements * Flip'em * Better exception handling * Cleaner approach if less generalizable * write a test * Fix tests * lint * Add test for success * refactor to use a base class... better for adding support for multiple jobs * cleanup * ruff * Fix missing mock * Fix flapping test * pr comments
1 parent 00af9f5 commit b975ab7

28 files changed

+859
-261
lines changed

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
# This file is autogenerated by pip-compile with Python 3.10
33
# by the following command:
44
#
5-
# pip-compile --resolver=backtracking requirements.in
5+
# pip-compile requirements.in
66
#
77
aiodns==2.0.0
88
# via hail

v03_pipeline/bin/pipeline_worker.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
from v03_pipeline.api.model import LoadingPipelineRequest
99
from v03_pipeline.lib.logger import get_logger
10-
from v03_pipeline.lib.model import Env
10+
from v03_pipeline.lib.model import FeatureFlag
1111
from v03_pipeline.lib.paths import (
1212
loading_pipeline_queue_path,
1313
project_pedigree_path,
@@ -54,7 +54,7 @@ def main():
5454
'run_id': run_id,
5555
**{k: v for k, v in lpr.model_dump().items() if k != 'projects_to_run'},
5656
}
57-
if Env.SHOULD_TRIGGER_HAIL_BACKEND_RELOAD:
57+
if FeatureFlag.SHOULD_TRIGGER_HAIL_BACKEND_RELOAD:
5858
tasks = [
5959
TriggerHailBackendReload(**loading_run_task_params),
6060
]

v03_pipeline/lib/misc/family_loading_failures.py

Lines changed: 47 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -16,93 +16,71 @@ def passes_relatedness_check(
1616
relatedness_check_lookup: dict[tuple[str, str], list],
1717
sample_id: str,
1818
other_id: str,
19-
relation: Relation,
19+
expected_relation: Relation,
20+
additional_allowed_relation: Relation | None,
2021
) -> tuple[bool, str | None]:
2122
# No relationship to check, return true
2223
if other_id is None:
2324
return True, None
2425
coefficients = relatedness_check_lookup.get(
2526
(min(sample_id, other_id), max(sample_id, other_id)),
2627
)
27-
if not coefficients or not np.allclose(
28-
coefficients,
29-
relation.coefficients,
30-
atol=RELATEDNESS_TOLERANCE,
28+
if not coefficients or not any(
29+
np.allclose(
30+
coefficients,
31+
relation.coefficients,
32+
atol=RELATEDNESS_TOLERANCE,
33+
)
34+
for relation in (
35+
[expected_relation, additional_allowed_relation]
36+
if additional_allowed_relation
37+
else [expected_relation]
38+
)
3139
):
3240
return (
3341
False,
34-
f'Sample {sample_id} has expected relation "{relation.value}" to {other_id} but has coefficients {coefficients or []}',
42+
f'Sample {sample_id} has expected relation "{expected_relation.value}" to {other_id} but has coefficients {coefficients or []}',
3543
)
3644
return True, None
3745

3846

39-
def all_relatedness_checks( # noqa: C901
47+
def all_relatedness_checks(
4048
relatedness_check_lookup: dict[tuple[str, str], list],
49+
family: Family,
4150
sample: Sample,
4251
) -> list[str]:
4352
failure_reasons = []
44-
for parent_id in [sample.mother, sample.father]:
45-
success, reason = passes_relatedness_check(
46-
relatedness_check_lookup,
47-
sample.sample_id,
48-
parent_id,
49-
Relation.PARENT,
50-
)
51-
if not success:
52-
failure_reasons.append(reason)
53-
54-
for grandparent_id in [
55-
sample.maternal_grandmother,
56-
sample.maternal_grandfather,
57-
sample.paternal_grandmother,
58-
sample.paternal_grandfather,
53+
for relationship_set, relation, additional_allowed_relation in [
54+
([sample.mother, sample.father], Relation.PARENT_CHILD, None),
55+
(
56+
[
57+
sample.maternal_grandmother,
58+
sample.maternal_grandfather,
59+
sample.paternal_grandmother,
60+
sample.paternal_grandfather,
61+
],
62+
Relation.GRANDPARENT_GRANDCHILD,
63+
None,
64+
),
65+
(sample.siblings, Relation.SIBLING, None),
66+
(sample.half_siblings, Relation.HALF_SIBLING, Relation.SIBLING),
67+
(sample.aunt_nephews, Relation.AUNT_NEPHEW, None),
5968
]:
60-
success, reason = passes_relatedness_check(
61-
relatedness_check_lookup,
62-
sample.sample_id,
63-
grandparent_id,
64-
Relation.GRANDPARENT,
65-
)
66-
if not success:
67-
failure_reasons.append(reason)
68-
69-
for sibling_id in sample.siblings:
70-
success, reason = passes_relatedness_check(
71-
relatedness_check_lookup,
72-
sample.sample_id,
73-
sibling_id,
74-
Relation.SIBLING,
75-
)
76-
if not success:
77-
failure_reasons.append(reason)
78-
79-
for half_sibling_id in sample.half_siblings:
80-
# NB: A "half sibling" parsed from the pedigree may actually be a sibling, so we allow those
81-
# through as well.
82-
success1, _ = passes_relatedness_check(
83-
relatedness_check_lookup,
84-
sample.sample_id,
85-
half_sibling_id,
86-
Relation.SIBLING,
87-
)
88-
success2, reason = passes_relatedness_check(
89-
relatedness_check_lookup,
90-
sample.sample_id,
91-
half_sibling_id,
92-
Relation.HALF_SIBLING,
93-
)
94-
if not success1 and not success2:
95-
failure_reasons.append(reason)
96-
97-
for aunt_nephew_id in sample.aunt_nephews:
98-
success, reason = passes_relatedness_check(
99-
relatedness_check_lookup,
100-
sample.sample_id,
101-
aunt_nephew_id,
102-
Relation.AUNT_NEPHEW,
103-
)
104-
if not success:
105-
failure_reasons.append(reason)
69+
for other_id in relationship_set:
70+
# Handle case where relation is identified in the
71+
# pedigree as a "dummy" but is not included in
72+
# the list of samples to load.
73+
if other_id not in family.samples:
74+
continue
75+
success, reason = passes_relatedness_check(
76+
relatedness_check_lookup,
77+
sample.sample_id,
78+
other_id,
79+
relation,
80+
additional_allowed_relation,
81+
)
82+
if not success:
83+
failure_reasons.append(reason)
10684
return failure_reasons
10785

10886

@@ -162,6 +140,7 @@ def get_families_failed_relatedness_check(
162140
for sample in family.samples.values():
163141
failure_reasons = all_relatedness_checks(
164142
relatedness_check_lookup,
143+
family,
165144
sample,
166145
)
167146
if failure_reasons:

v03_pipeline/lib/misc/family_loading_failures_test.py

Lines changed: 46 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
get_families_failed_sex_check,
1010
)
1111
from v03_pipeline.lib.misc.io import import_pedigree
12-
from v03_pipeline.lib.misc.pedigree import Sample, parse_pedigree_ht_to_families
12+
from v03_pipeline.lib.misc.pedigree import Family, Sample, parse_pedigree_ht_to_families
1313
from v03_pipeline.lib.model import Sex
1414

1515
TEST_PEDIGREE_6 = 'v03_pipeline/var/test/pedigrees/test_pedigree_6.tsv'
@@ -104,7 +104,21 @@ def test_all_relatedness_checks(self):
104104
paternal_grandfather='sample_3',
105105
half_siblings=['sample_4'],
106106
)
107-
failure_reasons = all_relatedness_checks(relatedness_check_lookup, sample)
107+
family = Family(
108+
family_guid='family_1a',
109+
samples={
110+
'sample_1': sample,
111+
'sample_2': Sample(sex=Sex.MALE, sample_id='sample_2'),
112+
'sample_3': Sample(sex=Sex.MALE, sample_id='sample_3'),
113+
'sample_4': Sample(sex=Sex.MALE, sample_id='sample_4'),
114+
'sample_5': Sample(sex=Sex.MALE, sample_id='sample_5'),
115+
},
116+
)
117+
failure_reasons = all_relatedness_checks(
118+
relatedness_check_lookup,
119+
family,
120+
sample,
121+
)
108122
self.assertListEqual(failure_reasons, [])
109123

110124
# Defined grandparent missing in relatedness table
@@ -117,12 +131,13 @@ def test_all_relatedness_checks(self):
117131
)
118132
failure_reasons = all_relatedness_checks(
119133
relatedness_check_lookup,
134+
family,
120135
sample,
121136
)
122137
self.assertListEqual(
123138
failure_reasons,
124139
[
125-
'Sample sample_1 has expected relation "grandparent" to sample_5 but has coefficients []',
140+
'Sample sample_1 has expected relation "grandparent_grandchild" to sample_5 but has coefficients []',
126141
],
127142
)
128143

@@ -140,6 +155,7 @@ def test_all_relatedness_checks(self):
140155
)
141156
failure_reasons = all_relatedness_checks(
142157
relatedness_check_lookup,
158+
family,
143159
sample,
144160
)
145161
self.assertListEqual(
@@ -167,16 +183,42 @@ def test_all_relatedness_checks(self):
167183
)
168184
failure_reasons = all_relatedness_checks(
169185
relatedness_check_lookup,
186+
family,
170187
sample,
171188
)
172189
self.assertListEqual(
173190
failure_reasons,
174191
[
175-
'Sample sample_1 has expected relation "parent" to sample_2 but has coefficients [0.5, 0.5, 0.5, 0.5]',
192+
'Sample sample_1 has expected relation "parent_child" to sample_2 but has coefficients [0.5, 0.5, 0.5, 0.5]',
176193
'Sample sample_1 has expected relation "sibling" to sample_4 but has coefficients [0.5, 0.5, 0, 0.25]',
177194
],
178195
)
179196

197+
# Some samples will include relationships with
198+
# samples that are not expected to be included
199+
# in the callset. These should not trigger relatedness
200+
# failures.
201+
sample = Sample(
202+
sex=Sex.FEMALE,
203+
sample_id='sample_1',
204+
mother='sample_2',
205+
)
206+
family = Family(
207+
family_guid='family_1a',
208+
samples={
209+
'sample_1': sample,
210+
},
211+
)
212+
failure_reasons = all_relatedness_checks(
213+
{},
214+
family,
215+
sample,
216+
)
217+
self.assertListEqual(
218+
failure_reasons,
219+
[],
220+
)
221+
180222
def test_get_families_failed_sex_check(self):
181223
sex_check_ht = hl.Table.parallelize(
182224
[

v03_pipeline/lib/misc/pedigree.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,17 +8,17 @@
88

99

1010
class Relation(Enum):
11-
PARENT = 'parent'
12-
GRANDPARENT = 'grandparent'
11+
PARENT_CHILD = 'parent_child'
12+
GRANDPARENT_GRANDCHILD = 'grandparent_grandchild'
1313
SIBLING = 'sibling'
1414
HALF_SIBLING = 'half_sibling'
1515
AUNT_NEPHEW = 'aunt_nephew'
1616

1717
@property
1818
def coefficients(self):
1919
return {
20-
Relation.PARENT: [0, 1, 0, 0.5],
21-
Relation.GRANDPARENT: [0.5, 0.5, 0, 0.25],
20+
Relation.PARENT_CHILD: [0, 1, 0, 0.5],
21+
Relation.GRANDPARENT_GRANDCHILD: [0.5, 0.5, 0, 0.25],
2222
Relation.SIBLING: [0.25, 0.5, 0.25, 0.5],
2323
Relation.HALF_SIBLING: [0.5, 0.5, 0, 0.25],
2424
Relation.AUNT_NEPHEW: [0.5, 0.5, 0, 0.25],

v03_pipeline/lib/misc/sample_ids.py

Lines changed: 5 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -3,28 +3,16 @@
33
import hail as hl
44

55
from v03_pipeline.lib.logger import get_logger
6+
from v03_pipeline.lib.misc.validation import SeqrValidationError
67

78
logger = get_logger(__name__)
89

910

10-
class MatrixTableSampleSetError(Exception):
11-
def __init__(self, message, missing_samples):
12-
super().__init__(message)
13-
self.missing_samples = missing_samples
14-
15-
16-
def vcf_remap(mt: hl.MatrixTable) -> hl.MatrixTable:
17-
# TODO: add logic from Mike to remap vcf samples delivered from Broad WGS
18-
return mt
19-
20-
2111
def remap_sample_ids(
2212
mt: hl.MatrixTable,
2313
project_remap_ht: hl.Table,
2414
ignore_missing_samples_when_remapping: bool,
2515
) -> hl.MatrixTable:
26-
mt = vcf_remap(mt)
27-
2816
collected_remap = project_remap_ht.collect()
2917
s_dups = [k for k, v in Counter([r.s for r in collected_remap]).items() if v > 1]
3018
seqr_dups = [
@@ -33,7 +21,7 @@ def remap_sample_ids(
3321

3422
if len(s_dups) > 0 or len(seqr_dups) > 0:
3523
msg = f'Duplicate s or seqr_id entries in remap file were found. Duplicate s:{s_dups}. Duplicate seqr_id:{seqr_dups}.'
36-
raise ValueError(msg)
24+
raise SeqrValidationError(msg)
3725

3826
missing_samples = project_remap_ht.anti_join(mt.cols()).collect()
3927
remap_count = len(collected_remap)
@@ -48,7 +36,7 @@ def remap_sample_ids(
4836
if ignore_missing_samples_when_remapping:
4937
logger.info(message)
5038
else:
51-
raise MatrixTableSampleSetError(message, missing_samples)
39+
raise SeqrValidationError(message)
5240

5341
mt = mt.annotate_cols(**project_remap_ht[mt.s])
5442
remap_expr = hl.if_else(hl.is_missing(mt.seqr_id), mt.s, mt.seqr_id)
@@ -67,7 +55,7 @@ def subset_samples(
6755
anti_join_ht_count = anti_join_ht.count()
6856
if subset_count == 0:
6957
message = '0 sample ids found the subset HT, something is probably wrong.'
70-
raise MatrixTableSampleSetError(message, [])
58+
raise SeqrValidationError(message)
7159

7260
if anti_join_ht_count != 0:
7361
missing_samples = anti_join_ht.s.collect()
@@ -77,7 +65,7 @@ def subset_samples(
7765
f"IDs that aren't in the callset: {missing_samples}\n"
7866
f'All callset sample IDs:{mt.s.collect()}'
7967
)
80-
raise MatrixTableSampleSetError(message, missing_samples)
68+
raise SeqrValidationError(message)
8169
logger.info(f'Subsetted to {subset_count} sample ids')
8270
mt = mt.semi_join_cols(sample_subset_ht)
8371
return mt.filter_rows(hl.agg.any(hl.is_defined(mt.GT)))

0 commit comments

Comments
 (0)