Skip to content

Commit 4ea8740

Browse files
bpblankenjklugherz
andauthored
feat: add support for over-writing genotypes for males on x non par (#1030)
* bump hail version (#1027) * First pass * Finish test * do it correctly :/ * ruff * correct feature flag * bump * fix test * fix svs * ruff * Benb/bugfix allow deleting project to succeed if no project (#1038) * First pass * add comment * ruff * tdr updates for sample qc * dead code (#1045) * delete * fixes * need loadable families * ruff --------- Co-authored-by: Julia Klugherz <juliaklugherz@gmail.com>
1 parent 7396413 commit 4ea8740

14 files changed

+335
-272
lines changed

v03_pipeline/lib/misc/io_test.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@ def test_file_size_mb(self) -> None:
3131
# find v03_pipeline/var/test/callsets/mito_1.mt -type f | grep -v 'crc' | xargs ls -alt {} | awk '{sum += $5; print sum}'
3232
# 191310
3333
self.assertEqual(file_size_bytes(TEST_MITO_MT), 191310)
34-
self.assertEqual(file_size_bytes(TEST_SV_VCF), 20040)
3534

3635
def test_compute_hail_n_partitions(self) -> None:
3736
self.assertEqual(compute_hail_n_partitions(23), 1)

v03_pipeline/lib/misc/sv.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
import hail as hl
2+
3+
from v03_pipeline.lib.annotations import sv
4+
from v03_pipeline.lib.misc.pedigree import Family
5+
from v03_pipeline.lib.model import ReferenceGenome, Sex
6+
7+
8+
def overwrite_male_non_par_calls(
9+
mt: hl.MatrixTable,
10+
families: set[Family],
11+
) -> hl.MatrixTable:
12+
male_sample_ids = {
13+
s.sample_id for f in families for s in f.samples.values() if s.sex == Sex.MALE
14+
}
15+
male_sample_ids = (
16+
hl.set(male_sample_ids) if male_sample_ids else hl.empty_set(hl.str)
17+
)
18+
par_intervals = hl.array(
19+
[
20+
i
21+
for i in hl.get_reference(ReferenceGenome.GRCh38).par
22+
if i.start.contig == ReferenceGenome.GRCh38.x_contig
23+
],
24+
)
25+
non_par_interval = hl.interval(
26+
par_intervals[0].end,
27+
par_intervals[1].start,
28+
)
29+
# NB: making use of existing formatting_annotation_fns.
30+
# We choose to annotate & drop here as the sample level
31+
# fields are dropped by the time we format variants.
32+
mt = mt.annotate_rows(
33+
start_locus=sv.start_locus(mt),
34+
end_locus=sv.end_locus(mt),
35+
)
36+
mt = mt.annotate_entries(
37+
GT=hl.if_else(
38+
(
39+
male_sample_ids.contains(mt.s)
40+
& non_par_interval.overlaps(
41+
hl.interval(
42+
mt.start_locus,
43+
mt.end_locus,
44+
),
45+
)
46+
& mt.GT.is_het()
47+
),
48+
hl.Call([1], phased=False),
49+
mt.GT,
50+
),
51+
)
52+
return mt.drop('start_locus', 'end_locus')

v03_pipeline/lib/misc/sv_test.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
import unittest
2+
3+
import hail as hl
4+
5+
from v03_pipeline.lib.misc.io import import_callset, select_relevant_fields
6+
from v03_pipeline.lib.misc.pedigree import Family, Sample
7+
from v03_pipeline.lib.misc.sample_ids import subset_samples
8+
from v03_pipeline.lib.misc.sv import overwrite_male_non_par_calls
9+
from v03_pipeline.lib.model import DatasetType, ReferenceGenome, Sex
10+
11+
TEST_SV_VCF = 'v03_pipeline/var/test/callsets/sv_1.vcf'
12+
13+
14+
class SVTest(unittest.TestCase):
15+
def test_overwrite_male_non_par_calls(self) -> None:
16+
mt = import_callset(TEST_SV_VCF, ReferenceGenome.GRCh38, DatasetType.SV)
17+
mt = select_relevant_fields(
18+
mt,
19+
DatasetType.SV,
20+
)
21+
mt = subset_samples(
22+
mt,
23+
hl.Table.parallelize(
24+
[{'s': sample_id} for sample_id in ['RGP_164_1', 'RGP_164_2']],
25+
hl.tstruct(s=hl.dtype('str')),
26+
key='s',
27+
),
28+
)
29+
mt = overwrite_male_non_par_calls(
30+
mt,
31+
{
32+
Family(
33+
family_guid='family_1',
34+
samples={
35+
'RGP_164_1': Sample(sample_id='RGP_164_1', sex=Sex.FEMALE),
36+
'RGP_164_2': Sample(sample_id='RGP_164_2', sex=Sex.MALE),
37+
},
38+
),
39+
},
40+
)
41+
mt = mt.filter_rows(mt.locus.contig == 'chrX')
42+
self.assertEqual(
43+
[
44+
hl.Locus(contig='chrX', position=3, reference_genome='GRCh38'),
45+
hl.Locus(contig='chrX', position=2781700, reference_genome='GRCh38'),
46+
],
47+
mt.locus.collect(),
48+
)
49+
self.assertEqual(
50+
[
51+
hl.Call(alleles=[0, 0], phased=False),
52+
# END of this variant < start of the non-par region.
53+
hl.Call(alleles=[0, 1], phased=False),
54+
hl.Call(alleles=[0, 0], phased=False),
55+
hl.Call(alleles=[1], phased=False),
56+
],
57+
mt.GT.collect(),
58+
)
59+
self.assertFalse(
60+
hasattr(mt, 'start_locus'),
61+
)

v03_pipeline/lib/misc/terra_data_repository.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,9 @@
1212
BIGQUERY_METRICS = [
1313
'collaborator_sample_id',
1414
'predicted_sex',
15+
'contamination_rate',
16+
'percent_bases_at_20x',
17+
'mean_coverage',
1518
]
1619
BIGQUERY_RESOURCE = 'bigquery'
1720
TABLE_NAME_VALIDATION_REGEX = r'datarepo-\w+.datarepo_\w+'

v03_pipeline/lib/model/dataset_type.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -387,3 +387,7 @@ def export_vcf_annotation_fns(self) -> list[Callable[..., hl.Expression]]:
387387
sv.info,
388388
],
389389
}[self]
390+
391+
@property
392+
def overwrite_male_non_par_calls(self) -> None:
393+
return self == DatasetType.SV

v03_pipeline/lib/model/definitions.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,10 @@ def mito_contig(self) -> str:
7070
ReferenceGenome.GRCh38: 'chrM',
7171
}[self]
7272

73+
@property
74+
def x_contig(self) -> str:
75+
return 'X' if self == ReferenceGenome.GRCh37 else 'chrX'
76+
7377
def contig_recoding(self, include_mt: bool = False) -> dict[str, str]:
7478
recode = {
7579
ReferenceGenome.GRCh37: {

v03_pipeline/lib/tasks/reference_data/updated_cached_reference_dataset_query.py

Lines changed: 0 additions & 141 deletions
This file was deleted.

0 commit comments

Comments
 (0)