Skip to content

Commit ffee48a

Browse files
authored
bugfix: gcnv frequency annotations (#1016)
* Ensure gcnv frequency stats are updated on all variants present in callset * Support annotating from the callset * fix bad test * add test
1 parent 2453532 commit ffee48a

9 files changed

+95
-32
lines changed

v03_pipeline/lib/annotations/fields_test.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -199,7 +199,7 @@ def test_get_lookup_table_fields(
199199
list(
200200
get_fields(
201201
ht,
202-
DatasetType.SNV_INDEL.lookup_table_annotation_fns,
202+
DatasetType.SNV_INDEL.variant_frequency_annotation_fns,
203203
lookup_ht=lookup_ht,
204204
dataset_type=DatasetType.SNV_INDEL,
205205
reference_genome=ReferenceGenome.GRCh38,

v03_pipeline/lib/annotations/gcnv.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -57,11 +57,11 @@ def end_locus(
5757
return hl.locus(ht.chr, ht.end, reference_genome.value)
5858

5959

60-
def gt_stats(ht: hl.Table, **_: Any) -> hl.Expression:
60+
def gt_stats(ht: hl.Table, callset_ht: hl.Table, **_: Any) -> hl.Expression:
6161
return hl.struct(
62-
AF=hl.float32(ht.sf),
63-
AC=ht.sc,
64-
AN=hl.int32(ht.sc / ht.sf),
62+
AF=hl.float32(callset_ht[ht.variant_id].sf),
63+
AC=callset_ht[ht.variant_id].sc,
64+
AN=hl.int32(callset_ht[ht.variant_id].sc / callset_ht[ht.variant_id].sf),
6565
Hom=hl.missing(hl.tint32),
6666
Het=hl.missing(hl.tint32),
6767
)

v03_pipeline/lib/model/dataset_type.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -277,7 +277,6 @@ def formatting_annotation_fns(
277277
],
278278
DatasetType.GCNV: [
279279
gcnv.end_locus,
280-
gcnv.gt_stats,
281280
gcnv.num_exon,
282281
gcnv.sorted_gene_consequences,
283282
gcnv.start_locus,
@@ -354,15 +353,18 @@ def genotype_entry_annotation_fns(self) -> list[Callable[..., hl.Expression]]:
354353
}[self]
355354

356355
@property
357-
def lookup_table_annotation_fns(self) -> list[Callable[..., hl.Expression]]:
356+
def variant_frequency_annotation_fns(self) -> list[Callable[..., hl.Expression]]:
358357
return {
359358
DatasetType.SNV_INDEL: [
360359
snv_indel.gt_stats,
361360
],
362361
DatasetType.MITO: [
363362
mito.gt_stats,
364363
],
365-
}[self]
364+
DatasetType.GCNV: [
365+
gcnv.gt_stats,
366+
],
367+
}.get(self, [])
366368

367369
@property
368370
def should_send_to_allele_registry(self):

v03_pipeline/lib/tasks/update_variant_annotations_table_with_deleted_families.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ def update_table(self, ht: hl.Table) -> hl.Table:
4545
ht = ht.annotate(
4646
**get_fields(
4747
ht,
48-
self.dataset_type.lookup_table_annotation_fns,
48+
self.dataset_type.variant_frequency_annotation_fns,
4949
lookup_ht=lookup_ht,
5050
**self.param_kwargs,
5151
),

v03_pipeline/lib/tasks/update_variant_annotations_table_with_deleted_project.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ def update_table(self, ht: hl.Table) -> hl.Table:
4343
ht = ht.annotate(
4444
**get_fields(
4545
ht,
46-
self.dataset_type.lookup_table_annotation_fns,
46+
self.dataset_type.variant_frequency_annotation_fns,
4747
lookup_ht=lookup_ht,
4848
**self.param_kwargs,
4949
),

v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples.py

Lines changed: 19 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -66,44 +66,42 @@ def update_table(self, ht: hl.Table) -> hl.Table:
6666

6767
# Union with the new variants table and annotate with the lookup table.
6868
ht = ht.union(new_variants_ht, unify=True)
69-
if self.dataset_type.has_lookup_table:
69+
if self.dataset_type.variant_frequency_annotation_fns:
7070
callset_ht = get_callset_ht(
7171
self.reference_genome,
7272
self.dataset_type,
7373
self.callset_path,
7474
self.project_guids,
7575
)
76+
lookup_ht = None
77+
if self.dataset_type.has_lookup_table:
78+
lookup_ht = hl.read_table(
79+
lookup_table_path(
80+
self.reference_genome,
81+
self.dataset_type,
82+
),
83+
)
84+
# Variants may have fallen out of the callset and
85+
# have been removed from the lookup table during modification.
86+
# Ensure we don't proceed with those variants.
87+
ht = ht.semi_join(lookup_ht)
88+
7689
# new_variants_ht consists of variants present in the new callset, fully annotated,
7790
# but NOT present in the existing annotations table.
7891
# callset_variants_ht consists of variants present in the new callset, fully annotated,
7992
# and either present or not present in the existing annotations table.
8093
callset_variants_ht = ht.semi_join(callset_ht)
81-
ht = ht.anti_join(callset_ht)
82-
lookup_ht = hl.read_table(
83-
lookup_table_path(
84-
self.reference_genome,
85-
self.dataset_type,
86-
),
87-
)
94+
non_callset_variants_ht = ht.anti_join(callset_ht)
8895
callset_variants_ht = callset_variants_ht.annotate(
8996
**get_fields(
9097
callset_variants_ht,
91-
self.dataset_type.lookup_table_annotation_fns,
92-
lookup_ht=hl.read_table(
93-
lookup_table_path(
94-
self.reference_genome,
95-
self.dataset_type,
96-
),
97-
),
98+
self.dataset_type.variant_frequency_annotation_fns,
99+
lookup_ht=lookup_ht,
100+
callset_ht=callset_ht,
98101
**self.param_kwargs,
99102
),
100103
)
101-
ht = ht.union(callset_variants_ht, unify=True)
102-
103-
# Variants may have fallen out of the callset and
104-
# have been removed from the lookup table during modification.
105-
# Ensure we don't proceed with those variants.
106-
ht = ht.semi_join(lookup_ht)
104+
ht = non_callset_variants_ht.union(callset_variants_ht, unify=True)
107105

108106
# Fix up the globals and mark the table as updated with these callset/project pairs.
109107
ht = self.annotate_globals(ht)

v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py

Lines changed: 58 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,10 +48,12 @@
4848
TEST_SNV_INDEL_VCF = 'v03_pipeline/var/test/callsets/1kg_30variants.vcf'
4949
TEST_SV_VCF = 'v03_pipeline/var/test/callsets/sv_1.vcf'
5050
TEST_GCNV_BED_FILE = 'v03_pipeline/var/test/callsets/gcnv_1.tsv'
51+
TEST_GCNV_BED_FILE_2 = 'v03_pipeline/var/test/callsets/gcnv_2.tsv'
5152
TEST_REMAP = 'v03_pipeline/var/test/remaps/test_remap_1.tsv'
5253
TEST_PEDIGREE_3 = 'v03_pipeline/var/test/pedigrees/test_pedigree_3.tsv'
5354
TEST_PEDIGREE_4 = 'v03_pipeline/var/test/pedigrees/test_pedigree_4.tsv'
5455
TEST_PEDIGREE_5 = 'v03_pipeline/var/test/pedigrees/test_pedigree_5.tsv'
56+
TEST_PEDIGREE_8 = 'v03_pipeline/var/test/pedigrees/test_pedigree_8.tsv'
5557

5658
GENE_ID_MAPPING = {
5759
'OR4F5': 'ENSG00000186092',
@@ -1478,7 +1480,7 @@ def test_sv_update_vat(
14781480
],
14791481
)
14801482

1481-
def test_gcnv_update_vat(
1483+
def test_gcnv_update_vat_multiple(
14821484
self,
14831485
) -> None:
14841486
worker = luigi.worker.Worker()
@@ -1614,3 +1616,58 @@ def test_gcnv_update_vat(
16141616
),
16151617
],
16161618
)
1619+
update_variant_annotations_task = (
1620+
UpdateVariantAnnotationsTableWithNewSamplesTask(
1621+
reference_genome=ReferenceGenome.GRCh38,
1622+
dataset_type=DatasetType.GCNV,
1623+
sample_type=SampleType.WES,
1624+
callset_path=TEST_GCNV_BED_FILE_2,
1625+
project_guids=['R0115_test_project2'],
1626+
project_remap_paths=['not_a_real_file'],
1627+
project_pedigree_paths=[TEST_PEDIGREE_8],
1628+
skip_validation=True,
1629+
run_id='second_run_id',
1630+
)
1631+
)
1632+
worker.add(update_variant_annotations_task)
1633+
worker.run()
1634+
ht = hl.read_table(update_variant_annotations_task.output().path)
1635+
self.assertEqual(ht.count(), 3)
1636+
self.assertCountEqual(
1637+
ht.select('gt_stats').collect(),
1638+
[
1639+
hl.Struct(
1640+
# Existing variant, gt_stats are preserved
1641+
variant_id='suffix_16456_DEL',
1642+
gt_stats=hl.Struct(
1643+
AF=hl.eval(hl.float32(4.401408e-05)),
1644+
AC=1,
1645+
AN=22720,
1646+
Hom=None,
1647+
Het=None,
1648+
),
1649+
),
1650+
hl.Struct(
1651+
# Exiting variant, gt_stats are updated
1652+
variant_id='suffix_16457_DEL',
1653+
gt_stats=hl.Struct(
1654+
AF=8.111110946629196e-05,
1655+
AC=3,
1656+
AN=36986,
1657+
Hom=None,
1658+
Het=None,
1659+
),
1660+
),
1661+
hl.Struct(
1662+
# New variant.
1663+
variant_id='suffix_99999_DEL',
1664+
gt_stats=hl.Struct(
1665+
AF=8.802817319519818e-05,
1666+
AC=2,
1667+
AN=22719,
1668+
Hom=None,
1669+
Het=None,
1670+
),
1671+
),
1672+
],
1673+
)
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
chr start end name sample sample_fix svtype GT CN NP QA QS QSE QSS ploidy strand variant_name ID rmsstd defragmented sf sc lt100_raw_calls lt10_highQS_rare_calls PASS_SAMPLE PASS_FREQ PASS_QS HIGH_QUALITY genes_any_overlap genes_any_overlap_exonsPerGene genes_any_overlap_totalExons genes_strict_overlap genes_strict_overlap_exonsPerGene genes_strict_overlap_totalExons genes_CG genes_LOF genes_any_overlap_Ensemble_ID genes_LOF_Ensemble_ID genes_CG_Ensemble_ID var_source callset_ovl identical_ovl partial_0_5_ovl any_ovl no_ovl strvctvre_score is_latest sample_cram_basename
2+
chr1 100006937 100007881 COHORT_1_Y_cnv_16326 RP-2037_RGP_999_1_v1_Exome_GCP RGP_999_1 DEL 1 1 1 4 4 4 4 2 - suffix_16456 115884 0 FALSE 4.401408e-05 1 TRUE TRUE TRUE TRUE FALSE FALSE AC118553.2,SLC35A3 1,1 2 None 0 0 NA AC118553.2,SLC35A3 ENSG00000283761.1,ENSG00000117620.15 ENSG00000283761.1,ENSG00000117620.15 round2 round1 cluster_6_CASE_cnv_74577 cluster_6_CASE_cnv_74577 cluster_6_CASE_cnv_74577 FALSE 0.583 FALSE RGP_999_1
3+
chr1 100017586 100023212 CASE_10_X_cnv_38690 C1992_RGP_999_1_v1_Exome_C1992 RGP_999_1 DEL 1 0 2 20 30 30 20 2 - suffix_16457 115887 0 FALSE 8.111111E-05 3 FALSE FALSE FALSE TRUE FALSE FALSE AC118553.2,SLC35A3 1,2 2 None 0 0 NA AC118553.2,SLC35A3 ENSG00000283761.1,ENSG22222222222.15 ENSG00000283761.1,ENSG00000117620.15 round2 round1 cluster_22_COHORT_cnv_56835 cluster_22_COHORT_cnv_56835 cluster_22_COHORT_cnv_56835 FALSE 0.507 FALSE RGP_999_1
4+
chr1 100017585 100023213 CASE_10_X_cnv_38690 C1992_RGP_999_1_v1_Exome_GCP RGP_999_1 DEL 1 0 2 20 30 30 20 2 - suffix_99999 115887 0 FALSE 8.802817e-05 2 FALSE FALSE FALSE TRUE FALSE FALSE AC118553.2,SLC35A3 1,2 3 None 0 0 NA AC118553.2,SLC35A3 ENSG00000283761.1,ENSG00000117620.15 ENSG00000283761.1,ENSG00000117620.15 round2 round1 cluster_22_COHORT_cnv_56835 cluster_22_COHORT_cnv_56835 cluster_22_COHORT_cnv_56835 FALSE 0.507 FALSE RGP_999_1
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Project_GUID Family_GUID Family_ID Individual_ID Paternal_ID Maternal_ID Sex
2+
R0115_test_project2 family_3_1 family_3 RGP_999_1 F

0 commit comments

Comments
 (0)