Skip to content

Dev #1061

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Mar 20, 2025
50 changes: 50 additions & 0 deletions v03_pipeline/lib/methods/sex_check.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import hail as hl

from v03_pipeline.lib.model import Sex

AMBIGUOUS_THRESHOLD_PERC: float = 0.01 # Fraction of samples identified as "ambiguous_sex" above which an error will be thrown.
AAF_THRESHOLD: float = 0.05 # Alternate allele frequency threshold for `hl.impute_sex`.
BIALLELIC: int = 2
XX_FSTAT_THRESHOLD: float = (
0.5 # F-stat threshold below which a sample will be called XX
)
XY_FSTAT_THRESHOLD: float = (
0.75 # F-stat threshold above which a sample will be called XY.
)


def compute_sex_check_ht(mt: hl.MatrixTable) -> hl.Table:
# Filter to SNVs and biallelics
# NB: We should already have filtered biallelics, but just in case.
mt = mt.filter_rows(
(hl.len(mt.alleles) == BIALLELIC) & hl.is_snp(mt.alleles[0], mt.alleles[1]),
)
mt = mt.filter_cols(hl.agg.all(mt.GT.is_diploid() | hl.is_missing(mt.GT)))

# Filter to PASS variants only (variants with empty or missing filter set)
mt = mt.filter_rows(
hl.is_missing(mt.filters) | (mt.filters.length() == 0),
keep=True,
)
impute_sex_ht = hl.impute_sex(
mt.GT,
male_threshold=XY_FSTAT_THRESHOLD,
female_threshold=XX_FSTAT_THRESHOLD,
aaf_threshold=AAF_THRESHOLD,
)
ht = mt.annotate_cols(**impute_sex_ht[mt.col_key]).cols()
ht = ht.select(
predicted_sex=(
hl.case()
.when(hl.is_missing(ht.is_female), Sex.UNKNOWN.value)
.when(ht.is_female, Sex.FEMALE.value)
.default(Sex.MALE.value)
),
)
ambiguous_perc = ht.aggregate(
hl.agg.fraction(ht.predicted_sex == Sex.UNKNOWN.value),
)
if ambiguous_perc > AMBIGUOUS_THRESHOLD_PERC:
msg = f'{ambiguous_perc:.2%} of samples identified as ambiguous. Please contact the methods team to investigate the callset.'
raise ValueError(msg)
return ht
55 changes: 55 additions & 0 deletions v03_pipeline/lib/methods/sex_check_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import unittest
from unittest.mock import patch

import hail as hl

from v03_pipeline.lib.methods.sex_check import compute_sex_check_ht

TEST_SEX_AND_RELATEDNESS_CALLSET_MT = (
'v03_pipeline/var/test/callsets/sex_and_relatedness_1.mt'
)
TEST_PEDIGREE = 'v03_pipeline/var/test/pedigrees/test_pedigree_6.tsv'


class SexCheckTest(unittest.TestCase):
def test_compute_sex_check_ht(self):
mt = hl.read_matrix_table(TEST_SEX_AND_RELATEDNESS_CALLSET_MT)
ht = compute_sex_check_ht(mt)
self.assertCountEqual(
ht.collect(),
[
hl.Struct(
s='ROS_006_18Y03226_D1',
predicted_sex='M',
),
hl.Struct(
s='ROS_006_18Y03227_D1',
predicted_sex='M',
),
hl.Struct(
s='ROS_006_18Y03228_D1',
predicted_sex='M',
),
hl.Struct(
s='ROS_007_19Y05919_D1',
predicted_sex='M',
),
hl.Struct(
s='ROS_007_19Y05939_D1',
predicted_sex='F',
),
hl.Struct(
s='ROS_007_19Y05987_D1',
predicted_sex='M',
),
],
)

def test_compute_sex_check_ht_ambiguous(self):
mt = hl.read_matrix_table(TEST_SEX_AND_RELATEDNESS_CALLSET_MT)
with patch('v03_pipeline.lib.methods.sex_check.XY_FSTAT_THRESHOLD', 0.95):
self.assertRaises(
ValueError,
compute_sex_check_ht,
mt,
)
41 changes: 41 additions & 0 deletions v03_pipeline/lib/misc/family_loading_failures.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,3 +174,44 @@ def get_families_failed_sex_check(
f'Sample {sample_id} has pedigree sex {family.samples[sample_id].sex.value} but imputed sex {sex_check_lookup[sample_id].value}',
)
return dict(failed_families)


def get_families_failed_imputed_sex_ploidy(
families: set[Family],
mt: hl.MatrixTable,
sex_check_ht: hl.Table,
) -> dict[Family, str]:
mt = mt.select_cols(
discrepant=(
(
# All calls are diploid or missing but the sex is Male
hl.agg.all(mt.GT.is_diploid() | hl.is_missing(mt.GT))
& (sex_check_ht[mt.s].predicted_sex == Sex.MALE.value)
)
| (
# At least one call is haploid but the sex is Female, X0, XXY, XYY, or XXX
hl.agg.any(~mt.GT.is_diploid())
& hl.literal(
{
Sex.FEMALE.value,
Sex.X0.value,
Sex.XYY.value,
Sex.XXY.value,
Sex.XXX.value,
},
).contains(sex_check_ht[mt.s].predicted_sex)
)
),
)
discrepant_samples = mt.aggregate_cols(
hl.agg.filter(mt.discrepant, hl.agg.collect_as_set(mt.s)),
)
failed_families = defaultdict(list)
for family in families:
discrepant_loadable_samples = set(family.samples.keys()) & discrepant_samples
if discrepant_loadable_samples:
sorted_discrepant_samples = sorted(discrepant_loadable_samples)
failed_families[family].append(
f'Found samples with misaligned ploidy with their provided imputed sex: {sorted_discrepant_samples}',
)
return failed_families
192 changes: 192 additions & 0 deletions v03_pipeline/lib/misc/family_loading_failures_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,14 @@
all_relatedness_checks,
build_relatedness_check_lookup,
build_sex_check_lookup,
get_families_failed_imputed_sex_ploidy,
get_families_failed_sex_check,
)
from v03_pipeline.lib.misc.io import import_pedigree
from v03_pipeline.lib.misc.pedigree import Family, Sample, parse_pedigree_ht_to_families
from v03_pipeline.lib.model import Sex

TEST_SEX_CHECK_1 = 'v03_pipeline/var/test/sex_check/test_sex_check_1.ht'
TEST_PEDIGREE_6 = 'v03_pipeline/var/test/pedigrees/test_pedigree_6.tsv'


Expand Down Expand Up @@ -250,3 +252,193 @@ def test_get_families_failed_sex_check(self):
],
],
)

def test_get_families_failed_imputed_sex_ploidy(self) -> None:
female_sample = 'HG00731_1'
male_sample_1 = 'HG00732_1'
male_sample_2 = 'HG00732_1'
x0_sample = 'NA20899_1'
xxy_sample = 'NA20889_1'
xyy_sample = 'NA20891_1'
xxx_sample = 'NA20892_1'

sex_check_ht = hl.read_table(TEST_SEX_CHECK_1)
families = {
Family(
family_guid='',
samples={
female_sample: Sample(female_sample, Sex.FEMALE),
male_sample_1: Sample(male_sample_1, Sex.MALE),
male_sample_2: Sample(male_sample_2, Sex.MALE),
x0_sample: Sample(x0_sample, Sex.X0),
xxy_sample: Sample(xxy_sample, Sex.XXY),
xyy_sample: Sample(xyy_sample, Sex.XYY),
xxx_sample: Sample(xxx_sample, Sex.XXX),
},
),
}

# All calls on X chromosome are valid
mt = (
hl.MatrixTable.from_parts(
rows={
'locus': [
hl.Locus(
contig='chrX',
position=1,
reference_genome='GRCh38',
),
],
},
cols={
's': [
female_sample,
male_sample_1,
x0_sample,
xxy_sample,
xyy_sample,
xxx_sample,
],
},
entries={
'GT': [
[
hl.Call(alleles=[0, 0], phased=False),
hl.Call(alleles=[0], phased=False),
hl.Call(alleles=[0, 0], phased=False), # X0
hl.Call(alleles=[0, 0], phased=False), # XXY
hl.Call(alleles=[0, 0], phased=False), # XYY
hl.Call(alleles=[0, 0], phased=False), # XXX
],
],
},
)
.key_rows_by('locus')
.key_cols_by('s')
)
failed_families = get_families_failed_imputed_sex_ploidy(
families,
mt,
sex_check_ht,
)
self.assertDictEqual(failed_families, {})

# All calls on Y chromosome are valid
mt = (
hl.MatrixTable.from_parts(
rows={
'locus': [
hl.Locus(
contig='chrY',
position=1,
reference_genome='GRCh38',
),
],
},
cols={
's': [
female_sample,
male_sample_1,
x0_sample,
xxy_sample,
xyy_sample,
xxx_sample,
],
},
entries={
'GT': [
[
hl.missing(hl.tcall),
hl.Call(alleles=[0], phased=False),
hl.missing(hl.tcall), # X0
hl.Call(alleles=[0, 0], phased=False), # XXY
hl.Call(alleles=[0, 0], phased=False), # XYY
hl.missing(hl.tcall), # XXX
],
],
},
)
.key_rows_by('locus')
.key_cols_by('s')
)
failed_families = get_families_failed_imputed_sex_ploidy(
families,
mt,
sex_check_ht,
)
self.assertDictEqual(failed_families, {})

# Invalid X chromosome case
mt = (
hl.MatrixTable.from_parts(
rows={
'locus': [
hl.Locus(
contig='chrX',
position=1,
reference_genome='GRCh38',
),
],
},
cols={
's': [
female_sample,
male_sample_1,
male_sample_2,
x0_sample,
xxy_sample,
xyy_sample,
xxx_sample,
],
},
entries={
'GT': [
[
hl.Call(alleles=[0], phased=False), # invalid Female call
hl.Call(alleles=[0], phased=False), # valid Male call
hl.missing(hl.tcall), # invalid Male call
hl.Call(alleles=[0], phased=False), # invalid X0 call
hl.Call(alleles=[0], phased=False), # invalid XXY call
hl.missing(hl.tcall), # valid XYY call
hl.Call(alleles=[0, 0], phased=False), # valid XXX call
],
],
},
)
.key_rows_by('locus')
.key_cols_by('s')
)
failed_families = get_families_failed_imputed_sex_ploidy(
families,
mt,
sex_check_ht,
)
self.assertCountEqual(
failed_families.values(),
[
[
"Found samples with misaligned ploidy with their provided imputed sex: ['HG00731_1', 'HG00732_1', 'NA20889_1', 'NA20899_1']",
],
],
)

# Invalid X chromosome case, but only discrepant family samples are reported
families = {
Family(
family_guid='',
samples={female_sample: Sample(female_sample, Sex.FEMALE)},
),
}
failed_families = get_families_failed_imputed_sex_ploidy(
families,
mt,
sex_check_ht,
)
self.assertCountEqual(
failed_families.values(),
[
[
"Found samples with misaligned ploidy with their provided imputed sex: ['HG00731_1']",
],
],
)
17 changes: 13 additions & 4 deletions v03_pipeline/lib/misc/terra_data_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,10 @@
from v03_pipeline.lib.misc.requests import requests_retry_session

BIGQUERY_METRICS = [
'collaborator_sample_id',
'predicted_sex',
'contamination_rate',
'percent_bases_at_20x',
'collaborator_sample_id',
'mean_coverage',
]
BIGQUERY_RESOURCE = 'bigquery'
Expand Down Expand Up @@ -63,9 +63,18 @@ def bq_metrics_query(bq_table_name: str) -> google.cloud.bigquery.table.RowItera
msg = f'{bq_table_name} does not match expected pattern'
raise ValueError(msg)
client = bigquery.Client()

# not all columns are guaranteed to be present, coalesce if missing
table_ddl = next(
client.query_and_wait(
f"""
SELECT ddl FROM `{bq_table_name}`.INFORMATION_SCHEMA.TABLES where table_name='sample';
""", # noqa: S608
),
)[0]
metrics = [(m if m in table_ddl else f'NULL AS {m}') for m in BIGQUERY_METRICS]
return client.query_and_wait(
f"""
SELECT {','.join(BIGQUERY_METRICS)}
FROM `{bq_table_name}.sample`
""", # noqa: S608
SELECT {','.join(metrics)} FROM `{bq_table_name}.sample`;
""", # noqa: S608
)
Loading