Dev #1007

Merged 8 commits on Jan 7, 2025
2 changes: 1 addition & 1 deletion requirements.txt
@@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with Python 3.10
# by the following command:
#
# pip-compile --resolver=backtracking requirements.in
# pip-compile requirements.in
#
aiodns==2.0.0
# via hail
4 changes: 2 additions & 2 deletions v03_pipeline/bin/pipeline_worker.py
@@ -7,7 +7,7 @@

from v03_pipeline.api.model import LoadingPipelineRequest
from v03_pipeline.lib.logger import get_logger
from v03_pipeline.lib.model import Env
from v03_pipeline.lib.model import FeatureFlag
from v03_pipeline.lib.paths import (
loading_pipeline_queue_path,
project_pedigree_path,
@@ -54,7 +54,7 @@ def main():
'run_id': run_id,
**{k: v for k, v in lpr.model_dump().items() if k != 'projects_to_run'},
}
if Env.SHOULD_TRIGGER_HAIL_BACKEND_RELOAD:
if FeatureFlag.SHOULD_TRIGGER_HAIL_BACKEND_RELOAD:
tasks = [
TriggerHailBackendReload(**loading_run_task_params),
]
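This file swaps the Env-based switch for a FeatureFlag-based one when deciding whether to enqueue TriggerHailBackendReload. Below is a minimal standalone sketch of the kind of environment-variable-backed flag that check suggests; the attribute name comes from the diff, but the implementation is an assumption, not the pipeline's actual FeatureFlag class.

import os

# Hypothetical sketch only: an env-var-backed flag of the kind the
# FeatureFlag.SHOULD_TRIGGER_HAIL_BACKEND_RELOAD check suggests.
class FeatureFlag:
    SHOULD_TRIGGER_HAIL_BACKEND_RELOAD: bool = (
        os.environ.get('SHOULD_TRIGGER_HAIL_BACKEND_RELOAD', '') == '1'
    )

if FeatureFlag.SHOULD_TRIGGER_HAIL_BACKEND_RELOAD:
    print('would enqueue TriggerHailBackendReload')
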
115 changes: 47 additions & 68 deletions v03_pipeline/lib/misc/family_loading_failures.py
@@ -16,93 +16,71 @@ def passes_relatedness_check(
relatedness_check_lookup: dict[tuple[str, str], list],
sample_id: str,
other_id: str,
relation: Relation,
expected_relation: Relation,
additional_allowed_relation: Relation | None,
) -> tuple[bool, str | None]:
# No relationship to check, return true
if other_id is None:
return True, None
coefficients = relatedness_check_lookup.get(
(min(sample_id, other_id), max(sample_id, other_id)),
)
if not coefficients or not np.allclose(
coefficients,
relation.coefficients,
atol=RELATEDNESS_TOLERANCE,
if not coefficients or not any(
np.allclose(
coefficients,
relation.coefficients,
atol=RELATEDNESS_TOLERANCE,
)
for relation in (
[expected_relation, additional_allowed_relation]
if additional_allowed_relation
else [expected_relation]
)
):
return (
False,
f'Sample {sample_id} has expected relation "{relation.value}" to {other_id} but has coefficients {coefficients or []}',
f'Sample {sample_id} has expected relation "{expected_relation.value}" to {other_id} but has coefficients {coefficients or []}',
)
return True, None


def all_relatedness_checks( # noqa: C901
def all_relatedness_checks(
relatedness_check_lookup: dict[tuple[str, str], list],
family: Family,
sample: Sample,
) -> list[str]:
failure_reasons = []
for parent_id in [sample.mother, sample.father]:
success, reason = passes_relatedness_check(
relatedness_check_lookup,
sample.sample_id,
parent_id,
Relation.PARENT,
)
if not success:
failure_reasons.append(reason)

for grandparent_id in [
sample.maternal_grandmother,
sample.maternal_grandfather,
sample.paternal_grandmother,
sample.paternal_grandfather,
for relationship_set, relation, additional_allowed_relation in [
([sample.mother, sample.father], Relation.PARENT_CHILD, None),
(
[
sample.maternal_grandmother,
sample.maternal_grandfather,
sample.paternal_grandmother,
sample.paternal_grandfather,
],
Relation.GRANDPARENT_GRANDCHILD,
None,
),
(sample.siblings, Relation.SIBLING, None),
(sample.half_siblings, Relation.HALF_SIBLING, Relation.SIBLING),
(sample.aunt_nephews, Relation.AUNT_NEPHEW, None),
]:
success, reason = passes_relatedness_check(
relatedness_check_lookup,
sample.sample_id,
grandparent_id,
Relation.GRANDPARENT,
)
if not success:
failure_reasons.append(reason)

for sibling_id in sample.siblings:
success, reason = passes_relatedness_check(
relatedness_check_lookup,
sample.sample_id,
sibling_id,
Relation.SIBLING,
)
if not success:
failure_reasons.append(reason)

for half_sibling_id in sample.half_siblings:
# NB: A "half sibling" parsed from the pedigree may actually be a sibling, so we allow those
# through as well.
success1, _ = passes_relatedness_check(
relatedness_check_lookup,
sample.sample_id,
half_sibling_id,
Relation.SIBLING,
)
success2, reason = passes_relatedness_check(
relatedness_check_lookup,
sample.sample_id,
half_sibling_id,
Relation.HALF_SIBLING,
)
if not success1 and not success2:
failure_reasons.append(reason)

for aunt_nephew_id in sample.aunt_nephews:
success, reason = passes_relatedness_check(
relatedness_check_lookup,
sample.sample_id,
aunt_nephew_id,
Relation.AUNT_NEPHEW,
)
if not success:
failure_reasons.append(reason)
for other_id in relationship_set:
# Handle case where relation is identified in the
# pedigree as a "dummy" but is not included in
# the list of samples to load.
if other_id not in family.samples:
continue
success, reason = passes_relatedness_check(
relatedness_check_lookup,
sample.sample_id,
other_id,
relation,
additional_allowed_relation,
)
if not success:
failure_reasons.append(reason)
return failure_reasons


@@ -162,6 +140,7 @@ def get_families_failed_relatedness_check(
for sample in family.samples.values():
failure_reasons = all_relatedness_checks(
relatedness_check_lookup,
family,
sample,
)
if failure_reasons:
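The refactor collapses the per-relationship loops into a single table of (relatives, expected relation, additional allowed relation), and passes_relatedness_check now accepts an optional additional allowed relation so that a pedigree "half sibling" may also match the full sibling coefficients. A standalone sketch of that comparison follows; the coefficient vectors are copied from pedigree.py, while the tolerance value and helper name are assumptions for illustration only.

import numpy as np

RELATEDNESS_TOLERANCE = 0.2  # assumed value, for illustration only
SIBLING = [0.25, 0.5, 0.25, 0.5]
HALF_SIBLING = [0.5, 0.5, 0, 0.25]

def allowed(coefficients, expected, additional=None):
    # Mirrors the any()/np.allclose() logic above: pass if the observed
    # coefficients match the expected relation or, when provided, the
    # additional allowed one.
    candidates = [expected, additional] if additional else [expected]
    return any(
        np.allclose(coefficients, c, atol=RELATEDNESS_TOLERANCE) for c in candidates
    )

# A pair labeled "half sibling" in the pedigree but genetically full siblings
# still passes, because SIBLING is supplied as the additional allowed relation.
print(allowed([0.24, 0.52, 0.24, 0.48], HALF_SIBLING, SIBLING))  # True
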
50 changes: 46 additions & 4 deletions v03_pipeline/lib/misc/family_loading_failures_test.py
@@ -9,7 +9,7 @@
get_families_failed_sex_check,
)
from v03_pipeline.lib.misc.io import import_pedigree
from v03_pipeline.lib.misc.pedigree import Sample, parse_pedigree_ht_to_families
from v03_pipeline.lib.misc.pedigree import Family, Sample, parse_pedigree_ht_to_families
from v03_pipeline.lib.model import Sex

TEST_PEDIGREE_6 = 'v03_pipeline/var/test/pedigrees/test_pedigree_6.tsv'
@@ -104,7 +104,21 @@ def test_all_relatedness_checks(self):
paternal_grandfather='sample_3',
half_siblings=['sample_4'],
)
failure_reasons = all_relatedness_checks(relatedness_check_lookup, sample)
family = Family(
family_guid='family_1a',
samples={
'sample_1': sample,
'sample_2': Sample(sex=Sex.MALE, sample_id='sample_2'),
'sample_3': Sample(sex=Sex.MALE, sample_id='sample_3'),
'sample_4': Sample(sex=Sex.MALE, sample_id='sample_4'),
'sample_5': Sample(sex=Sex.MALE, sample_id='sample_5'),
},
)
failure_reasons = all_relatedness_checks(
relatedness_check_lookup,
family,
sample,
)
self.assertListEqual(failure_reasons, [])

# Defined grandparent missing in relatedness table
@@ -117,12 +131,13 @@ def test_all_relatedness_checks(self):
)
failure_reasons = all_relatedness_checks(
relatedness_check_lookup,
family,
sample,
)
self.assertListEqual(
failure_reasons,
[
'Sample sample_1 has expected relation "grandparent" to sample_5 but has coefficients []',
'Sample sample_1 has expected relation "grandparent_grandchild" to sample_5 but has coefficients []',
],
)

@@ -140,6 +155,7 @@ def test_all_relatedness_checks(self):
)
failure_reasons = all_relatedness_checks(
relatedness_check_lookup,
family,
sample,
)
self.assertListEqual(
@@ -167,16 +183,42 @@ def test_all_relatedness_checks(self):
)
failure_reasons = all_relatedness_checks(
relatedness_check_lookup,
family,
sample,
)
self.assertListEqual(
failure_reasons,
[
'Sample sample_1 has expected relation "parent" to sample_2 but has coefficients [0.5, 0.5, 0.5, 0.5]',
'Sample sample_1 has expected relation "parent_child" to sample_2 but has coefficients [0.5, 0.5, 0.5, 0.5]',
'Sample sample_1 has expected relation "sibling" to sample_4 but has coefficients [0.5, 0.5, 0, 0.25]',
],
)

# Some samples will include relationships with
# samples that are not expected to be included
# in the callset. These should not trigger relatedness
# failures.
sample = Sample(
sex=Sex.FEMALE,
sample_id='sample_1',
mother='sample_2',
)
family = Family(
family_guid='family_1a',
samples={
'sample_1': sample,
},
)
failure_reasons = all_relatedness_checks(
{},
family,
sample,
)
self.assertListEqual(
failure_reasons,
[],
)

def test_get_families_failed_sex_check(self):
sex_check_ht = hl.Table.parallelize(
[
8 changes: 4 additions & 4 deletions v03_pipeline/lib/misc/pedigree.py
@@ -8,17 +8,17 @@


class Relation(Enum):
PARENT = 'parent'
GRANDPARENT = 'grandparent'
PARENT_CHILD = 'parent_child'
GRANDPARENT_GRANDCHILD = 'grandparent_grandchild'
SIBLING = 'sibling'
HALF_SIBLING = 'half_sibling'
AUNT_NEPHEW = 'aunt_nephew'

@property
def coefficients(self):
return {
Relation.PARENT: [0, 1, 0, 0.5],
Relation.GRANDPARENT: [0.5, 0.5, 0, 0.25],
Relation.PARENT_CHILD: [0, 1, 0, 0.5],
Relation.GRANDPARENT_GRANDCHILD: [0.5, 0.5, 0, 0.25],
Relation.SIBLING: [0.25, 0.5, 0.25, 0.5],
Relation.HALF_SIBLING: [0.5, 0.5, 0, 0.25],
Relation.AUNT_NEPHEW: [0.5, 0.5, 0, 0.25],
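The enum members are renamed to name both sides of the relationship, while the coefficient vectors are unchanged. They read like [IBD0, IBD1, IBD2, PI_HAT] proportions of the sort produced by relatedness tools such as Hail's identity_by_descent; that interpretation is an assumption here, not something the diff states, but under it PI_HAT should equal 0.5 * IBD1 + IBD2 for every entry, which the values satisfy:

# Consistency check under the assumed [IBD0, IBD1, IBD2, PI_HAT] reading.
coefficients = {
    'parent_child': [0, 1, 0, 0.5],
    'grandparent_grandchild': [0.5, 0.5, 0, 0.25],
    'sibling': [0.25, 0.5, 0.25, 0.5],
    'half_sibling': [0.5, 0.5, 0, 0.25],
    'aunt_nephew': [0.5, 0.5, 0, 0.25],
}
for name, (ibd0, ibd1, ibd2, pi_hat) in coefficients.items():
    assert abs(pi_hat - (0.5 * ibd1 + ibd2)) < 1e-9, name

Note that grandparent_grandchild, half_sibling, and aunt_nephew share the same expected vector, so the check cannot distinguish those relationships from the coefficients alone.
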
22 changes: 5 additions & 17 deletions v03_pipeline/lib/misc/sample_ids.py
@@ -3,28 +3,16 @@
import hail as hl

from v03_pipeline.lib.logger import get_logger
from v03_pipeline.lib.misc.validation import SeqrValidationError

logger = get_logger(__name__)


class MatrixTableSampleSetError(Exception):
def __init__(self, message, missing_samples):
super().__init__(message)
self.missing_samples = missing_samples


def vcf_remap(mt: hl.MatrixTable) -> hl.MatrixTable:
# TODO: add logic from Mike to remap vcf samples delivered from Broad WGS
return mt


def remap_sample_ids(
mt: hl.MatrixTable,
project_remap_ht: hl.Table,
ignore_missing_samples_when_remapping: bool,
) -> hl.MatrixTable:
mt = vcf_remap(mt)

collected_remap = project_remap_ht.collect()
s_dups = [k for k, v in Counter([r.s for r in collected_remap]).items() if v > 1]
seqr_dups = [
@@ -33,7 +21,7 @@ def remap_sample_ids(

if len(s_dups) > 0 or len(seqr_dups) > 0:
msg = f'Duplicate s or seqr_id entries in remap file were found. Duplicate s:{s_dups}. Duplicate seqr_id:{seqr_dups}.'
raise ValueError(msg)
raise SeqrValidationError(msg)

missing_samples = project_remap_ht.anti_join(mt.cols()).collect()
remap_count = len(collected_remap)
@@ -48,7 +36,7 @@
if ignore_missing_samples_when_remapping:
logger.info(message)
else:
raise MatrixTableSampleSetError(message, missing_samples)
raise SeqrValidationError(message)

mt = mt.annotate_cols(**project_remap_ht[mt.s])
remap_expr = hl.if_else(hl.is_missing(mt.seqr_id), mt.s, mt.seqr_id)
@@ -67,7 +55,7 @@ def subset_samples(
anti_join_ht_count = anti_join_ht.count()
if subset_count == 0:
message = '0 sample ids found the subset HT, something is probably wrong.'
raise MatrixTableSampleSetError(message, [])
raise SeqrValidationError(message)

if anti_join_ht_count != 0:
missing_samples = anti_join_ht.s.collect()
Expand All @@ -77,7 +65,7 @@ def subset_samples(
f"IDs that aren't in the callset: {missing_samples}\n"
f'All callset sample IDs:{mt.s.collect()}'
)
raise MatrixTableSampleSetError(message, missing_samples)
raise SeqrValidationError(message)
logger.info(f'Subsetted to {subset_count} sample ids')
mt = mt.semi_join_cols(sample_subset_ht)
return mt.filter_rows(hl.agg.any(hl.is_defined(mt.GT)))
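With MatrixTableSampleSetError removed, both the remap and subset failure paths now raise SeqrValidationError. A standalone sketch of the duplicate-detection step that raises the first of those errors follows; the helper name and the plain-dict input are assumptions for illustration, whereas the real code collects rows from a Hail Table.

from collections import Counter

class SeqrValidationError(Exception):
    pass

def check_remap_duplicates(remap_rows: list[dict]) -> None:
    # Any s or seqr_id appearing more than once in the remap file is an error.
    s_dups = [k for k, v in Counter(r['s'] for r in remap_rows).items() if v > 1]
    seqr_dups = [k for k, v in Counter(r['seqr_id'] for r in remap_rows).items() if v > 1]
    if s_dups or seqr_dups:
        msg = (
            'Duplicate s or seqr_id entries in remap file were found. '
            f'Duplicate s:{s_dups}. Duplicate seqr_id:{seqr_dups}.'
        )
        raise SeqrValidationError(msg)

check_remap_duplicates([{'s': 'a', 'seqr_id': 'x'}, {'s': 'b', 'seqr_id': 'y'}])  # passes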