Skip to content

Commit f69e782

Browse files
committed
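working on excluded projects: move the excluded-project filter out of gt_stats and into the lookup table update (PROJECTS_EXCLUDED_FROM_GT_STATS renamed to PROJECTS_EXCLUDED_FROM_LOOKUP)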
1 parent f80a356 commit f69e782

File tree

9 files changed

+107
-88
lines changed

9 files changed

+107
-88
lines changed

v03_pipeline/lib/annotations/mito.py

Lines changed: 10 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22

33
import hail as hl
44

5-
from v03_pipeline.lib.annotations.constants import PROJECTS_EXCLUDED_FROM_GT_STATS
65
from v03_pipeline.lib.annotations.enums import MITOTIP_PATHOGENICITIES
76

87
MITOTIP_PATHOGENICITIES_LOOKUP = hl.dict(
@@ -68,24 +67,18 @@ def rsid(ht: hl.Table, **_: Any) -> hl.Expression:
6867

6968
def gt_stats(
7069
ht: hl.Table,
71-
sample_lookup_ht: hl.Table,
70+
lookup_ht: hl.Table,
7271
**_: Any,
7372
) -> hl.Expression:
74-
row = sample_lookup_ht[ht.key]
75-
AC_het, AC_hom, AN = 0, 0, 0 # noqa: N806
76-
for project_guid in row.ref_samples:
77-
if project_guid in PROJECTS_EXCLUDED_FROM_GT_STATS:
78-
continue
79-
ref_samples_length = row.ref_samples[project_guid].length()
80-
heteroplasmic_samples_length = row.heteroplasmic_samples[project_guid].length()
81-
homoplasmic_samples_length = row.homoplasmic_samples[project_guid].length()
82-
AC_het += heteroplasmic_samples_length # noqa: N806
83-
AC_hom += homoplasmic_samples_length # noqa: N806
84-
AN += ( # noqa: N806
85-
ref_samples_length
86-
+ heteroplasmic_samples_length
87-
+ homoplasmic_samples_length
88-
)
73+
row = lookup_ht[ht.key]
74+
ref_samples = hl.sum(hl.flatten(row.project_stats.ref_samples))
75+
heteroplasmic_samples = hl.sum(hl.flatten(row.project_stats.heteroplasmic_samples))
76+
homoplasmic_samples = hl.sum(hl.flatten(row.project_stats.homoplasmic_samples))
77+
AC_het = heteroplasmic_samples # noqa: N806
78+
AC_hom = homoplasmic_samples # noqa: N806
79+
AN = ( # noqa: N806
80+
ref_samples + heteroplasmic_samples + homoplasmic_samples
81+
)
8982
return hl.Struct(
9083
AC_het=AC_het,
9184
AF_het=hl.float32(AC_het / AN),

v03_pipeline/lib/annotations/mito_test.py

Lines changed: 36 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import hail as hl
44

55
from v03_pipeline.lib.annotations.mito import gt_stats
6+
from v03_pipeline.lib.model import DatasetType
67

78

89
class MITOTest(unittest.TestCase):
@@ -21,51 +22,55 @@ def test_allele_count_annotations(self) -> None:
2122
),
2223
key='id',
2324
)
24-
sample_lookup_ht = hl.Table.parallelize(
25+
lookup_ht = hl.Table.parallelize(
2526
[
2627
{
2728
'id': 0,
28-
'ref_samples': hl.Struct(project_1={'a', 'c'}, project_2=set()),
29-
'heteroplasmic_samples': hl.Struct(
30-
project_1={'b', 'd'},
31-
project_2=set(),
32-
),
33-
'homoplasmic_samples': hl.Struct(
34-
project_1={'e', 'f'},
35-
project_2=set(),
36-
),
29+
'project_stats': [
30+
[
31+
hl.Struct(
32+
ref_samples=2,
33+
heteroplasmic_samples=2,
34+
homoplasmic_samples=2,
35+
),
36+
],
37+
[],
38+
],
3739
},
3840
{
3941
'id': 1,
40-
'ref_samples': hl.Struct(
41-
project_1={'a', 'b', 'c', 'd', 'e', 'f'},
42-
project_2=set(),
43-
),
44-
'heteroplasmic_samples': hl.Struct(
45-
project_1=set(),
46-
project_2=set(),
47-
),
48-
'homoplasmic_samples': hl.Struct(project_1=set(), project_2=set()),
42+
'project_stats': [
43+
[
44+
hl.Struct(
45+
ref_samples=6,
46+
heteroplasmic_samples=0,
47+
homoplasmic_samples=0,
48+
),
49+
],
50+
[],
51+
],
4952
},
5053
],
5154
hl.tstruct(
5255
id=hl.tint32,
53-
ref_samples=hl.tstruct(
54-
project_1=hl.tset(hl.tstr),
55-
project_2=hl.tset(hl.tstr),
56-
),
57-
heteroplasmic_samples=hl.tstruct(
58-
project_1=hl.tset(hl.tstr),
59-
project_2=hl.tset(hl.tstr),
60-
),
61-
homoplasmic_samples=hl.tstruct(
62-
project_1=hl.tset(hl.tstr),
63-
project_2=hl.tset(hl.tstr),
56+
project_stats=hl.tarray(
57+
hl.tarray(
58+
hl.tstruct(
59+
**{
60+
field: hl.tint32
61+
for field in DatasetType.MITO.lookup_table_fields_and_genotype_filter_fns
62+
},
63+
),
64+
),
6465
),
6566
),
6667
key='id',
68+
globals=hl.Struct(
69+
project_guids=['project_1', 'project_2'],
70+
project_families={'project_1': ['a'], 'project_2': []},
71+
),
6772
)
68-
ht = ht.select(gt_stats=gt_stats(ht, sample_lookup_ht))
73+
ht = ht.select(gt_stats=gt_stats(ht, lookup_ht))
6974
self.assertCountEqual(
7075
ht.collect(),
7176
[

v03_pipeline/lib/annotations/snv_indel.py

Lines changed: 8 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,6 @@
33

44
import hail as hl
55

6-
from v03_pipeline.lib.annotations.constants import PROJECTS_EXCLUDED_FROM_GT_STATS
7-
86
N_ALT_REF = 0
97
N_ALT_HET = 1
108
N_ALT_HOM = 2
@@ -33,24 +31,16 @@ def DP(mt: hl.MatrixTable, **_: Any) -> hl.Expression: # noqa: N802
3331

3432
def gt_stats(
3533
ht: hl.Table,
36-
sample_lookup_ht: hl.Table,
34+
lookup_ht: hl.Table,
3735
**_: Any,
3836
) -> hl.Expression:
39-
row = sample_lookup_ht[ht.key]
40-
AC, AN, hom = 0, 0, 0
41-
for project_guid in row.ref_samples:
42-
if project_guid in PROJECTS_EXCLUDED_FROM_GT_STATS:
43-
continue
44-
ref_samples_length = row.ref_samples[project_guid].length()
45-
het_samples_length = row.het_samples[project_guid].length()
46-
hom_samples_length = row.hom_samples[project_guid].length()
47-
AC += (
48-
ref_samples_length * N_ALT_REF
49-
+ het_samples_length * N_ALT_HET
50-
+ hom_samples_length * N_ALT_HOM
51-
)
52-
AN += 2 * (ref_samples_length + het_samples_length + hom_samples_length)
53-
hom += hom_samples_length
37+
row = lookup_ht[ht.key]
38+
ref_samples = hl.sum(hl.flatten(row.project_stats.ref_samples))
39+
het_samples = hl.sum(hl.flatten(row.project_stats.het_samples))
40+
hom_samples = hl.sum(hl.flatten(row.project_stats.hom_samples))
41+
AC = ref_samples * N_ALT_REF + het_samples * N_ALT_HET + hom_samples * N_ALT_HOM
42+
AN = 2 * (ref_samples + het_samples + hom_samples)
43+
hom = hom_samples
5444
return hl.Struct(
5545
AC=AC,
5646
AN=AN,

v03_pipeline/lib/annotations/snv_indel_test.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ def test_allele_count_annotations(self) -> None:
2121
),
2222
key='id',
2323
)
24-
sample_lookup_ht = hl.Table.parallelize(
24+
lookup_ht = hl.Table.parallelize(
2525
[
2626
{
2727
'id': 0,
@@ -80,7 +80,7 @@ def test_allele_count_annotations(self) -> None:
8080
),
8181
key='id',
8282
)
83-
ht = ht.select(gt_stats=gt_stats(ht, sample_lookup_ht))
83+
ht = ht.select(gt_stats=gt_stats(ht, lookup_ht))
8484
self.assertCountEqual(
8585
ht.collect(),
8686
[

v03_pipeline/lib/annotations/constants.py renamed to v03_pipeline/lib/model/constants.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
PROJECTS_EXCLUDED_FROM_GT_STATS = {
1+
PROJECTS_EXCLUDED_FROM_LOOKUP = {
22
'R0555_seqr_demo',
33
'R0607_gregor_training_project_',
44
'R0608_gregor_training_project_',

v03_pipeline/lib/tasks/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
1-
from v03_pipeline.lib.tasks.update_project_table import UpdateProjectTableTask
21
from v03_pipeline.lib.tasks.update_lookup_table import (
32
UpdateLookupTableTask,
43
)
4+
from v03_pipeline.lib.tasks.update_project_table import UpdateProjectTableTask
55
from v03_pipeline.lib.tasks.update_variant_annotations_table_with_new_samples import (
66
UpdateVariantAnnotationsTableWithNewSamplesTask,
77
)

v03_pipeline/lib/tasks/update_lookup_table.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
join_lookup_hts,
77
remove_new_callset_family_guids,
88
)
9+
from v03_pipeline.lib.model.constants import PROJECTS_EXCLUDED_FROM_LOOKUP
910
from v03_pipeline.lib.paths import lookup_table_path
1011
from v03_pipeline.lib.tasks.base.base_update_task import BaseUpdateTask
1112
from v03_pipeline.lib.tasks.files import GCSorLocalTarget
@@ -76,7 +77,7 @@ def initialize_table(self) -> hl.Table:
7677
**{
7778
field: hl.tint32
7879
for field in self.dataset_type.lookup_table_fields_and_genotype_filter_fns
79-
}
80+
},
8081
),
8182
),
8283
),
@@ -90,6 +91,14 @@ def initialize_table(self) -> hl.Table:
9091
)
9192

9293
def update_table(self, ht: hl.Table) -> hl.Table:
94+
if self.project_guid in PROJECTS_EXCLUDED_FROM_LOOKUP:
95+
return ht.annotate_globals(
96+
updates=ht.updates.add(
97+
hl.Struct(
98+
callset=self.callset_path, project_guid=self.project_guid,
99+
),
100+
),
101+
)
93102
callset_mt = hl.read_matrix_table(self.input().path)
94103
ht = remove_new_callset_family_guids(
95104
ht,

v03_pipeline/lib/tasks/update_lookup_table_test.py

Lines changed: 35 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@
1212
TEST_PEDIGREE_3 = 'v03_pipeline/var/test/pedigrees/test_pedigree_3.tsv'
1313

1414

15-
class UpdateSampleLookupTableTest(MockedDatarootTestCase):
16-
def test_update_sample_lookup_table_task(self) -> None:
15+
class UpdateLookupTableTest(MockedDatarootTestCase):
16+
def test_update_lookup_table_task(self) -> None:
1717
worker = luigi.worker.Worker()
1818
uslt_task = UpdateLookupTableTask(
1919
reference_genome=ReferenceGenome.GRCh38,
@@ -52,7 +52,9 @@ def test_update_sample_lookup_table_task(self) -> None:
5252
reference_genome='GRCh38',
5353
),
5454
alleles=['A', 'C'],
55-
project_stats=[[hl.Struct(ref_samples=3, het_samples=0, hom_samples=0)]],
55+
project_stats=[
56+
[hl.Struct(ref_samples=3, het_samples=0, hom_samples=0)],
57+
],
5658
),
5759
hl.Struct(
5860
locus=hl.Locus(
@@ -61,7 +63,9 @@ def test_update_sample_lookup_table_task(self) -> None:
6163
reference_genome='GRCh38',
6264
),
6365
alleles=['C', 'T'],
64-
project_stats=[[hl.Struct(ref_samples=3, het_samples=0, hom_samples=0)]],
66+
project_stats=[
67+
[hl.Struct(ref_samples=3, het_samples=0, hom_samples=0)],
68+
],
6569
),
6670
hl.Struct(
6771
locus=hl.Locus(
@@ -70,7 +74,9 @@ def test_update_sample_lookup_table_task(self) -> None:
7074
reference_genome='GRCh38',
7175
),
7276
alleles=['A', 'G'],
73-
project_stats=[[hl.Struct(ref_samples=0, het_samples=0, hom_samples=3)]],
77+
project_stats=[
78+
[hl.Struct(ref_samples=0, het_samples=0, hom_samples=3)],
79+
],
7480
),
7581
hl.Struct(
7682
locus=hl.Locus(
@@ -79,7 +85,9 @@ def test_update_sample_lookup_table_task(self) -> None:
7985
reference_genome='GRCh38',
8086
),
8187
alleles=['G', 'C'],
82-
project_stats=[[hl.Struct(ref_samples=1, het_samples=2, hom_samples=0)]],
88+
project_stats=[
89+
[hl.Struct(ref_samples=1, het_samples=2, hom_samples=0)],
90+
],
8391
),
8492
hl.Struct(
8593
locus=hl.Locus(
@@ -88,7 +96,9 @@ def test_update_sample_lookup_table_task(self) -> None:
8896
reference_genome='GRCh38',
8997
),
9098
alleles=['C', 'T'],
91-
project_stats=[[hl.Struct(ref_samples=3, het_samples=0, hom_samples=0)]],
99+
project_stats=[
100+
[hl.Struct(ref_samples=3, het_samples=0, hom_samples=0)],
101+
],
92102
),
93103
hl.Struct(
94104
locus=hl.Locus(
@@ -97,7 +107,9 @@ def test_update_sample_lookup_table_task(self) -> None:
97107
reference_genome='GRCh38',
98108
),
99109
alleles=['C', 'T'],
100-
project_stats=[[hl.Struct(ref_samples=2, het_samples=1, hom_samples=0)]],
110+
project_stats=[
111+
[hl.Struct(ref_samples=2, het_samples=1, hom_samples=0)],
112+
],
101113
),
102114
hl.Struct(
103115
locus=hl.Locus(
@@ -106,7 +118,9 @@ def test_update_sample_lookup_table_task(self) -> None:
106118
reference_genome='GRCh38',
107119
),
108120
alleles=['G', 'A'],
109-
project_stats=[[hl.Struct(ref_samples=1, het_samples=1, hom_samples=1)]],
121+
project_stats=[
122+
[hl.Struct(ref_samples=1, het_samples=1, hom_samples=1)],
123+
],
110124
),
111125
hl.Struct(
112126
locus=hl.Locus(
@@ -115,7 +129,9 @@ def test_update_sample_lookup_table_task(self) -> None:
115129
reference_genome='GRCh38',
116130
),
117131
alleles=['G', 'A'],
118-
project_stats=[[hl.Struct(ref_samples=3, het_samples=0, hom_samples=0)]],
132+
project_stats=[
133+
[hl.Struct(ref_samples=3, het_samples=0, hom_samples=0)],
134+
],
119135
),
120136
hl.Struct(
121137
locus=hl.Locus(
@@ -124,7 +140,9 @@ def test_update_sample_lookup_table_task(self) -> None:
124140
reference_genome='GRCh38',
125141
),
126142
alleles=['G', 'A'],
127-
project_stats=[[hl.Struct(ref_samples=3, het_samples=0, hom_samples=0)]],
143+
project_stats=[
144+
[hl.Struct(ref_samples=3, het_samples=0, hom_samples=0)],
145+
],
128146
),
129147
hl.Struct(
130148
locus=hl.Locus(
@@ -133,7 +151,9 @@ def test_update_sample_lookup_table_task(self) -> None:
133151
reference_genome='GRCh38',
134152
),
135153
alleles=['C', 'T'],
136-
project_stats=[[hl.Struct(ref_samples=3, het_samples=0, hom_samples=0)]],
154+
project_stats=[
155+
[hl.Struct(ref_samples=3, het_samples=0, hom_samples=0)],
156+
],
137157
),
138158
hl.Struct(
139159
locus=hl.Locus(
@@ -142,7 +162,9 @@ def test_update_sample_lookup_table_task(self) -> None:
142162
reference_genome='GRCh38',
143163
),
144164
alleles=['A', 'G'],
145-
project_stats=[[hl.Struct(ref_samples=0, het_samples=0, hom_samples=3)]],
165+
project_stats=[
166+
[hl.Struct(ref_samples=0, het_samples=0, hom_samples=3)],
167+
],
146168
),
147169
],
148170
)

v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,8 @@
99
from v03_pipeline.lib.misc.util import callset_project_pairs
1010
from v03_pipeline.lib.model import Env, ReferenceDatasetCollection
1111
from v03_pipeline.lib.paths import (
12-
remapped_and_subsetted_callset_path,
1312
lookup_table_path,
13+
remapped_and_subsetted_callset_path,
1414
)
1515
from v03_pipeline.lib.reference_data.gencode.mapping_gene_ids import load_gencode
1616
from v03_pipeline.lib.tasks.base.base_variant_annotations_table import (
@@ -60,9 +60,9 @@ class UpdateVariantAnnotationsTableWithNewSamplesTask(BaseVariantAnnotationsTabl
6060
@property
6161
def other_annotation_dependencies(self) -> dict[str, hl.Table]:
6262
annotation_dependencies = {}
63-
if self.dataset_type.has_sample_lookup_table:
64-
annotation_dependencies['sample_lookup_ht'] = hl.read_table(
65-
sample_lookup_table_path(
63+
if self.dataset_type.has_lookup_table:
64+
annotation_dependencies['lookup_ht'] = hl.read_table(
65+
lookup_table_path(
6666
self.reference_genome,
6767
self.dataset_type,
6868
),

0 commit comments

Comments
 (0)