Skip to content

Commit 586cfed

Browse files
authored
Merge pull request #913 from broadinstitute/dev
main <- dev
2 parents 993356d + 04376e8 commit 586cfed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

46 files changed

+511
-112
lines changed

v03_pipeline/bin/pipeline_worker.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313
project_remap_path,
1414
)
1515
from v03_pipeline.lib.tasks import (
16-
UpdateCachedReferenceDatasetQueries,
1716
UpdateVariantAnnotationsTableWithNewSamplesTask,
1817
WriteProjectFamilyTablesTask,
1918
)
@@ -50,10 +49,6 @@ def main():
5049
k: v for k, v in lpr.model_dump().items() if k != 'projects_to_run'
5150
}
5251
tasks = [
53-
UpdateCachedReferenceDatasetQueries(
54-
reference_genome=lpr.reference_genome,
55-
dataset_type=lpr.dataset_type,
56-
),
5752
UpdateVariantAnnotationsTableWithNewSamplesTask(
5853
project_guids=lpr.projects_to_run,
5954
project_remap_paths=project_remap_paths,

v03_pipeline/lib/misc/io.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@ def compute_hail_n_partitions(file_size_b: int) -> int:
8080
)
8181
def split_multi_hts(
8282
mt: hl.MatrixTable,
83+
skip_validation: bool,
8384
max_samples_split_multi_shuffle=MAX_SAMPLES_SPLIT_MULTI_SHUFFLE,
8485
) -> hl.MatrixTable:
8586
bi = mt.filter_rows(hl.len(mt.alleles) == BIALLELIC)
@@ -94,7 +95,11 @@ def split_multi_hts(
9495
permit_shuffle=mt.count()[1] < max_samples_split_multi_shuffle,
9596
)
9697
mt = split.union_rows(bi)
97-
return mt.distinct_by_row()
98+
# If we've disabled validation (which is expected to throw an exception
99+
# for duplicate variants, we would like to distinc )
100+
if skip_validation:
101+
return mt.distinct_by_row()
102+
return mt
98103

99104

100105
def import_gcnv_bed_file(callset_path: str) -> hl.MatrixTable:

v03_pipeline/lib/misc/io_test.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,5 +170,6 @@ def test_split_multi_failure(self) -> None:
170170
)
171171
.key_rows_by('locus', 'alleles')
172172
.repartition(1),
173+
False,
173174
1,
174175
)

v03_pipeline/lib/misc/validation.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,13 +75,17 @@ def validate_allele_type(
7575

7676
def validate_no_duplicate_variants(
7777
mt: hl.MatrixTable,
78+
reference_genome: ReferenceGenome,
79+
dataset_type: DatasetType,
7880
**_: Any,
7981
) -> None:
8082
ht = mt.rows()
8183
ht = ht.group_by(*ht.key).aggregate(n=hl.agg.count())
8284
ht = ht.filter(ht.n > 1)
85+
ht = ht.select()
8386
if ht.count() > 0:
84-
msg = f'Variants are present multiple times in the callset: {ht.collect()}'
87+
variant_format = dataset_type.table_key_format_fn(reference_genome)
88+
msg = f'Variants are present multiple times in the callset: {[variant_format(v) for v in ht.collect()]}'
8589
raise SeqrValidationError(msg)
8690

8791

@@ -99,7 +103,7 @@ def validate_expected_contig_frequency(
99103
)
100104
if missing_contigs:
101105
msg = 'Missing the following expected contigs:{}'.format(
102-
', '.join(missing_contigs),
106+
', '.join(sorted(missing_contigs)),
103107
)
104108
raise SeqrValidationError(msg)
105109

v03_pipeline/lib/misc/validation_test.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -171,15 +171,22 @@ def test_validate_no_duplicate_variants(self) -> None:
171171
reference_genome='GRCh38',
172172
),
173173
],
174+
'alleles': [
175+
['A', 'C'],
176+
['A', 'C'],
177+
['A', 'C'],
178+
],
174179
},
175180
cols={'s': ['sample_1']},
176181
entries={'HL': [[0.0], [0.0], [0.0]]},
177-
).key_rows_by('locus')
182+
).key_rows_by('locus', 'alleles')
178183
self.assertRaisesRegex(
179184
SeqrValidationError,
180-
'Variants are present multiple times in the callset',
185+
"Variants are present multiple times in the callset: \\['1-2-A-C'\\]",
181186
validate_no_duplicate_variants,
182187
mt,
188+
ReferenceGenome.GRCh38,
189+
DatasetType.SNV_INDEL,
183190
)
184191

185192
def test_validate_expected_contig_frequency(self) -> None:

v03_pipeline/lib/misc/vets_test.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ def test_annotate_vets(self) -> None:
8989
cols={'s': ['sample_1']},
9090
entries={'HL': [[0.0], [0.0], [0.0], [0.0], [0.0], [0.0]]},
9191
).key_rows_by('locus', 'alleles')
92-
dragen_mt = split_multi_hts(dragen_mt)
92+
dragen_mt = split_multi_hts(dragen_mt, False)
9393
dragen_mt = annotate_vets(dragen_mt)
9494
self.assertListEqual(
9595
dragen_mt.filters.collect(),

v03_pipeline/lib/model/dataset_type.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,16 @@ def table_key_type(
2929
DatasetType.SV: hl.tstruct(variant_id=hl.tstr),
3030
}.get(self, default_key)
3131

32+
def table_key_format_fn(
33+
self,
34+
reference_genome: ReferenceGenome,
35+
) -> Callable[[hl.StructExpression], str]:
36+
if self in {DatasetType.GCNV, DatasetType.SV}:
37+
return lambda s: s.variant_id
38+
return (
39+
lambda s: f'{s.locus.contig if reference_genome == ReferenceGenome.GRCh37 else s.locus.contig.replace("chr", "")}-{s.locus.position}-{s.alleles[0]}-{s.alleles[1]}'
40+
)
41+
3242
@property
3343
def col_fields(
3444
self,

v03_pipeline/lib/paths.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,21 @@ def imported_callset_path(
109109
)
110110

111111

112+
def validation_errors_for_run_path(
113+
reference_genome: ReferenceGenome,
114+
dataset_type: DatasetType,
115+
run_id: str,
116+
) -> str:
117+
return os.path.join(
118+
runs_path(
119+
reference_genome,
120+
dataset_type,
121+
),
122+
run_id,
123+
'validation_errors.json',
124+
)
125+
126+
112127
def metadata_for_run_path(
113128
reference_genome: ReferenceGenome,
114129
dataset_type: DatasetType,

v03_pipeline/lib/paths_test.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
sex_check_table_path,
2525
valid_filters_path,
2626
valid_reference_dataset_collection_path,
27+
validation_errors_for_run_path,
2728
variant_annotations_table_path,
2829
)
2930

@@ -141,6 +142,16 @@ def test_relatedness_check_table_path(self) -> None:
141142
'/seqr/seqr-loading-temp/v3.1/GRCh38/SNV_INDEL/relatedness_check/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.ht',
142143
)
143144

145+
def test_validation_errors_for_run_path(self) -> None:
146+
self.assertEqual(
147+
validation_errors_for_run_path(
148+
ReferenceGenome.GRCh38,
149+
DatasetType.SNV_INDEL,
150+
'manual__2023-06-26T18:30:09.349671+00:00',
151+
),
152+
'/seqr/seqr-hail-search-data/v3.1/GRCh38/SNV_INDEL/runs/manual__2023-06-26T18:30:09.349671+00:00/validation_errors.json',
153+
)
154+
144155
def test_metadata_for_run_path(self) -> None:
145156
self.assertEqual(
146157
metadata_for_run_path(

v03_pipeline/lib/tasks/__init__.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,6 @@
88
from v03_pipeline.lib.tasks.migrate_all_variant_annotations_tables import (
99
MigrateAllVariantAnnotationsTablesTask,
1010
)
11-
from v03_pipeline.lib.tasks.reference_data.update_cached_reference_dataset_queries import (
12-
UpdateCachedReferenceDatasetQueries,
13-
)
1411
from v03_pipeline.lib.tasks.update_lookup_table import (
1512
UpdateLookupTableTask,
1613
)
@@ -53,7 +50,6 @@
5350
'UpdateVariantAnnotationsTableWithNewSamplesTask',
5451
'UpdateVariantAnnotationsTableWithDeletedProjectTask',
5552
'UpdateVariantAnnotationsTableWithDeletedFamiliesTask',
56-
'UpdateCachedReferenceDatasetQueries',
5753
'WriteMetadataForRunTask',
5854
'WriteProjectFamilyTablesTask',
5955
]

v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,9 @@
1313
)
1414
from v03_pipeline.lib.tasks.base.base_update import BaseUpdateTask
1515
from v03_pipeline.lib.tasks.files import GCSorLocalTarget
16+
from v03_pipeline.lib.tasks.reference_data.update_cached_reference_dataset_queries import (
17+
UpdateCachedReferenceDatasetQueries,
18+
)
1619
from v03_pipeline.lib.tasks.reference_data.updated_reference_dataset_collection import (
1720
UpdatedReferenceDatasetCollectionTask,
1821
)
@@ -32,19 +35,24 @@ def output(self) -> luigi.Target:
3235
)
3336

3437
def requires(self) -> list[luigi.Task]:
35-
return [
36-
(
37-
UpdatedReferenceDatasetCollectionTask(
38-
self.reference_genome,
39-
self.dataset_type,
40-
rdc,
41-
)
38+
requirements = [
39+
UpdateCachedReferenceDatasetQueries(
40+
reference_genome=self.reference_genome,
41+
dataset_type=self.dataset_type,
42+
),
43+
]
44+
requirements.extend(
45+
UpdatedReferenceDatasetCollectionTask(
46+
self.reference_genome,
47+
self.dataset_type,
48+
rdc,
4249
)
4350
for rdc in ReferenceDatasetCollection.for_reference_genome_dataset_type(
4451
self.reference_genome,
4552
self.dataset_type,
4653
)
47-
]
54+
)
55+
return requirements
4856

4957
def initialize_table(self) -> hl.Table:
5058
key_type = self.dataset_type.table_key_type(self.reference_genome)

v03_pipeline/lib/tasks/base/base_update_variant_annotations_table_test.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,8 +53,16 @@ def setUp(self) -> None:
5353
@patch(
5454
'v03_pipeline.lib.tasks.base.base_update_variant_annotations_table.UpdatedReferenceDatasetCollectionTask',
5555
)
56-
def test_should_create_initialized_table(self, mock_update_rdc_task) -> None:
56+
@patch(
57+
'v03_pipeline.lib.tasks.base.base_update_variant_annotations_table.UpdateCachedReferenceDatasetQueries',
58+
)
59+
def test_should_create_initialized_table(
60+
self,
61+
mock_update_crdqs_task,
62+
mock_update_rdc_task,
63+
) -> None:
5764
mock_update_rdc_task.return_value = MockCompleteTask()
65+
mock_update_crdqs_task.return_value = MockCompleteTask()
5866
vat_task = BaseUpdateVariantAnnotationsTableTask(
5967
reference_genome=ReferenceGenome.GRCh38,
6068
dataset_type=DatasetType.SNV_INDEL,

v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -649,6 +649,9 @@
649649
@mock.patch(
650650
'v03_pipeline.lib.tasks.base.base_update_variant_annotations_table.UpdatedReferenceDatasetCollectionTask',
651651
)
652+
@mock.patch(
653+
'v03_pipeline.lib.tasks.base.base_update_variant_annotations_table.UpdateCachedReferenceDatasetQueries',
654+
)
652655
@mock.patch(
653656
'v03_pipeline.lib.tasks.base.base_update_variant_annotations_table.BaseUpdateVariantAnnotationsTableTask.initialize_table',
654657
)
@@ -719,9 +722,11 @@ def setUp(self) -> None:
719722
def test_update_vat_with_updated_rdc_snv_indel_38(
720723
self,
721724
mock_initialize_table,
725+
mock_update_crdqs_task,
722726
mock_update_rdc_task,
723727
):
724728
mock_update_rdc_task.return_value = MockCompleteTask()
729+
mock_update_crdqs_task.return_value = MockCompleteTask()
725730
mock_initialize_table.return_value = hl.Table.parallelize(
726731
[
727732
hl.Struct(
@@ -927,9 +932,11 @@ def test_update_vat_with_updated_rdc_snv_indel_38(
927932
def test_update_vat_with_updated_rdc_mito_38(
928933
self,
929934
mock_initialize_table,
935+
mock_update_crdqs_task,
930936
mock_update_rdc_task,
931937
):
932938
mock_update_rdc_task.return_value = MockCompleteTask()
939+
mock_update_crdqs_task.return_value = MockCompleteTask()
933940
mock_initialize_table.return_value = hl.Table.parallelize(
934941
[
935942
hl.Struct(
@@ -1075,9 +1082,11 @@ def test_update_vat_with_updated_rdc_mito_38(
10751082
def test_update_vat_with_updated_rdc_snv_indel_37(
10761083
self,
10771084
mock_initialize_table,
1085+
mock_update_crdqs_task,
10781086
mock_update_rdc_task,
10791087
):
10801088
mock_update_rdc_task.return_value = MockCompleteTask()
1089+
mock_update_crdqs_task.return_value = MockCompleteTask()
10811090
mock_initialize_table.return_value = hl.Table.parallelize(
10821091
[
10831092
hl.Struct(

0 commit comments

Comments
 (0)