main <- dev #913

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged: 5 commits merged from dev into main on Sep 30, 2024
5 changes: 0 additions & 5 deletions v03_pipeline/bin/pipeline_worker.py
@@ -13,7 +13,6 @@
project_remap_path,
)
from v03_pipeline.lib.tasks import (
UpdateCachedReferenceDatasetQueries,
UpdateVariantAnnotationsTableWithNewSamplesTask,
WriteProjectFamilyTablesTask,
)
@@ -50,10 +49,6 @@ def main():
k: v for k, v in lpr.model_dump().items() if k != 'projects_to_run'
}
tasks = [
UpdateCachedReferenceDatasetQueries(
reference_genome=lpr.reference_genome,
dataset_type=lpr.dataset_type,
),
UpdateVariantAnnotationsTableWithNewSamplesTask(
project_guids=lpr.projects_to_run,
project_remap_paths=project_remap_paths,
7 changes: 6 additions & 1 deletion v03_pipeline/lib/misc/io.py
@@ -80,6 +80,7 @@ def compute_hail_n_partitions(file_size_b: int) -> int:
)
def split_multi_hts(
mt: hl.MatrixTable,
skip_validation: bool,
max_samples_split_multi_shuffle=MAX_SAMPLES_SPLIT_MULTI_SHUFFLE,
) -> hl.MatrixTable:
bi = mt.filter_rows(hl.len(mt.alleles) == BIALLELIC)
@@ -94,7 +95,11 @@ def split_multi_hts(
permit_shuffle=mt.count()[1] < max_samples_split_multi_shuffle,
)
mt = split.union_rows(bi)
return mt.distinct_by_row()
# If we've disabled validation (which is expected to raise an exception
# for duplicate variants), de-duplicate the rows here instead.
if skip_validation:
return mt.distinct_by_row()
return mt


def import_gcnv_bed_file(callset_path: str) -> hl.MatrixTable:
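
A minimal usage sketch, not part of the diff, showing how callers pass the new positional `skip_validation` flag; the `mt` input is an assumption (a callset MatrixTable keyed by `('locus', 'alleles')`, as elsewhere in this module):

```python
from v03_pipeline.lib.misc.io import split_multi_hts

# mt: an hl.MatrixTable keyed by ('locus', 'alleles') (assumed input).

# Validation enabled: duplicate rows are left in place so that
# validate_no_duplicate_variants can detect and report them downstream.
mt = split_multi_hts(mt, False)

# Validation skipped: duplicates are dropped here via distinct_by_row(),
# since nothing downstream will raise on them.
mt = split_multi_hts(mt, True)
```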
1 change: 1 addition & 0 deletions v03_pipeline/lib/misc/io_test.py
@@ -170,5 +170,6 @@ def test_split_multi_failure(self) -> None:
)
.key_rows_by('locus', 'alleles')
.repartition(1),
False,
1,
)
8 changes: 6 additions & 2 deletions v03_pipeline/lib/misc/validation.py
@@ -75,13 +75,17 @@ def validate_allele_type(

def validate_no_duplicate_variants(
mt: hl.MatrixTable,
reference_genome: ReferenceGenome,
dataset_type: DatasetType,
**_: Any,
) -> None:
ht = mt.rows()
ht = ht.group_by(*ht.key).aggregate(n=hl.agg.count())
ht = ht.filter(ht.n > 1)
ht = ht.select()
if ht.count() > 0:
msg = f'Variants are present multiple times in the callset: {ht.collect()}'
variant_format = dataset_type.table_key_format_fn(reference_genome)
msg = f'Variants are present multiple times in the callset: {[variant_format(v) for v in ht.collect()]}'
raise SeqrValidationError(msg)


@@ -99,7 +103,7 @@ def validate_expected_contig_frequency(
)
if missing_contigs:
msg = 'Missing the following expected contigs:{}'.format(
', '.join(missing_contigs),
', '.join(sorted(missing_contigs)),
)
raise SeqrValidationError(msg)
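
A sketch, not part of the diff, of the updated call signature and the formatted error message, assuming the usual import locations in this repository (`SeqrValidationError` from `v03_pipeline.lib.misc.validation`, the enums from `v03_pipeline.lib.model`):

```python
from v03_pipeline.lib.misc.validation import (
    SeqrValidationError,
    validate_no_duplicate_variants,
)
from v03_pipeline.lib.model import DatasetType, ReferenceGenome

# mt: an hl.MatrixTable keyed by ('locus', 'alleles') that contains the
# same variant more than once (assumed input).
try:
    validate_no_duplicate_variants(
        mt,
        ReferenceGenome.GRCh38,
        DatasetType.SNV_INDEL,
    )
except SeqrValidationError as e:
    # e.g. "Variants are present multiple times in the callset: ['1-2-A-C']"
    print(e)
```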

11 changes: 9 additions & 2 deletions v03_pipeline/lib/misc/validation_test.py
@@ -171,15 +171,22 @@ def test_validate_no_duplicate_variants(self) -> None:
reference_genome='GRCh38',
),
],
'alleles': [
['A', 'C'],
['A', 'C'],
['A', 'C'],
],
},
cols={'s': ['sample_1']},
entries={'HL': [[0.0], [0.0], [0.0]]},
).key_rows_by('locus')
).key_rows_by('locus', 'alleles')
self.assertRaisesRegex(
SeqrValidationError,
'Variants are present multiple times in the callset',
"Variants are present multiple times in the callset: \\['1-2-A-C'\\]",
validate_no_duplicate_variants,
mt,
ReferenceGenome.GRCh38,
DatasetType.SNV_INDEL,
)

def test_validate_expected_contig_frequency(self) -> None:
2 changes: 1 addition & 1 deletion v03_pipeline/lib/misc/vets_test.py
@@ -89,7 +89,7 @@ def test_annotate_vets(self) -> None:
cols={'s': ['sample_1']},
entries={'HL': [[0.0], [0.0], [0.0], [0.0], [0.0], [0.0]]},
).key_rows_by('locus', 'alleles')
dragen_mt = split_multi_hts(dragen_mt)
dragen_mt = split_multi_hts(dragen_mt, False)
dragen_mt = annotate_vets(dragen_mt)
self.assertListEqual(
dragen_mt.filters.collect(),
10 changes: 10 additions & 0 deletions v03_pipeline/lib/model/dataset_type.py
@@ -29,6 +29,16 @@ def table_key_type(
DatasetType.SV: hl.tstruct(variant_id=hl.tstr),
}.get(self, default_key)

def table_key_format_fn(
self,
reference_genome: ReferenceGenome,
) -> Callable[[hl.StructExpression], str]:
if self in {DatasetType.GCNV, DatasetType.SV}:
return lambda s: s.variant_id
return (
lambda s: f'{s.locus.contig if reference_genome == ReferenceGenome.GRCh37 else s.locus.contig.replace("chr", "")}-{s.locus.position}-{s.alleles[0]}-{s.alleles[1]}'
)

@property
def col_fields(
self,
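
A sketch, not part of the diff, of what the new `table_key_format_fn` returns for each family of dataset types; the SV `variant_id` value below is hypothetical:

```python
import hail as hl

from v03_pipeline.lib.model import DatasetType, ReferenceGenome

# Dataset types keyed by (locus, alleles): the 'chr' prefix is stripped
# unless the reference genome is GRCh37, whose contigs carry no prefix.
format_variant = DatasetType.SNV_INDEL.table_key_format_fn(ReferenceGenome.GRCh38)
format_variant(
    hl.Struct(
        locus=hl.Locus('chr1', 2, reference_genome='GRCh38'),
        alleles=['A', 'C'],
    ),
)  # -> '1-2-A-C'

# SV/GCNV rows are keyed by variant_id, which is returned as-is.
format_sv = DatasetType.SV.table_key_format_fn(ReferenceGenome.GRCh38)
format_sv(hl.Struct(variant_id='DEL_example_1'))  # -> 'DEL_example_1' (hypothetical id)
```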
15 changes: 15 additions & 0 deletions v03_pipeline/lib/paths.py
@@ -109,6 +109,21 @@ def imported_callset_path(
)


def validation_errors_for_run_path(
reference_genome: ReferenceGenome,
dataset_type: DatasetType,
run_id: str,
) -> str:
return os.path.join(
runs_path(
reference_genome,
dataset_type,
),
run_id,
'validation_errors.json',
)


def metadata_for_run_path(
reference_genome: ReferenceGenome,
dataset_type: DatasetType,
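
A sketch, not part of the diff, of the new path helper: it joins `runs_path(...)`, the run id, and `validation_errors.json`, as pinned down by the test added below (the `run_id` here is hypothetical):

```python
from v03_pipeline.lib.model import DatasetType, ReferenceGenome
from v03_pipeline.lib.paths import validation_errors_for_run_path

validation_errors_for_run_path(
    ReferenceGenome.GRCh38,
    DatasetType.SNV_INDEL,
    'manual__2024-09-30T00:00:00.000000+00:00',  # hypothetical run_id
)
# -> <runs_path(...)>/manual__2024-09-30T00:00:00.000000+00:00/validation_errors.json,
#    e.g. .../GRCh38/SNV_INDEL/runs/<run_id>/validation_errors.json
```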
11 changes: 11 additions & 0 deletions v03_pipeline/lib/paths_test.py
@@ -24,6 +24,7 @@
sex_check_table_path,
valid_filters_path,
valid_reference_dataset_collection_path,
validation_errors_for_run_path,
variant_annotations_table_path,
)

@@ -141,6 +142,16 @@ def test_relatedness_check_table_path(self) -> None:
'/seqr/seqr-loading-temp/v3.1/GRCh38/SNV_INDEL/relatedness_check/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.ht',
)

def test_validation_errors_for_run_path(self) -> None:
self.assertEqual(
validation_errors_for_run_path(
ReferenceGenome.GRCh38,
DatasetType.SNV_INDEL,
'manual__2023-06-26T18:30:09.349671+00:00',
),
'/seqr/seqr-hail-search-data/v3.1/GRCh38/SNV_INDEL/runs/manual__2023-06-26T18:30:09.349671+00:00/validation_errors.json',
)

def test_metadata_for_run_path(self) -> None:
self.assertEqual(
metadata_for_run_path(
4 changes: 0 additions & 4 deletions v03_pipeline/lib/tasks/__init__.py
@@ -8,9 +8,6 @@
from v03_pipeline.lib.tasks.migrate_all_variant_annotations_tables import (
MigrateAllVariantAnnotationsTablesTask,
)
from v03_pipeline.lib.tasks.reference_data.update_cached_reference_dataset_queries import (
UpdateCachedReferenceDatasetQueries,
)
from v03_pipeline.lib.tasks.update_lookup_table import (
UpdateLookupTableTask,
)
@@ -53,7 +50,6 @@
'UpdateVariantAnnotationsTableWithNewSamplesTask',
'UpdateVariantAnnotationsTableWithDeletedProjectTask',
'UpdateVariantAnnotationsTableWithDeletedFamiliesTask',
'UpdateCachedReferenceDatasetQueries',
'WriteMetadataForRunTask',
'WriteProjectFamilyTablesTask',
]
v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py
@@ -13,6 +13,9 @@
)
from v03_pipeline.lib.tasks.base.base_update import BaseUpdateTask
from v03_pipeline.lib.tasks.files import GCSorLocalTarget
from v03_pipeline.lib.tasks.reference_data.update_cached_reference_dataset_queries import (
UpdateCachedReferenceDatasetQueries,
)
from v03_pipeline.lib.tasks.reference_data.updated_reference_dataset_collection import (
UpdatedReferenceDatasetCollectionTask,
)
@@ -32,19 +35,24 @@ def output(self) -> luigi.Target:
)

def requires(self) -> list[luigi.Task]:
return [
(
UpdatedReferenceDatasetCollectionTask(
self.reference_genome,
self.dataset_type,
rdc,
)
requirements = [
UpdateCachedReferenceDatasetQueries(
reference_genome=self.reference_genome,
dataset_type=self.dataset_type,
),
]
requirements.extend(
UpdatedReferenceDatasetCollectionTask(
self.reference_genome,
self.dataset_type,
rdc,
)
for rdc in ReferenceDatasetCollection.for_reference_genome_dataset_type(
self.reference_genome,
self.dataset_type,
)
]
)
return requirements

def initialize_table(self) -> hl.Table:
key_type = self.dataset_type.table_key_type(self.reference_genome)
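
A sketch, not part of the diff, of the effect of the new `requires()`: constructing the base task (as the test below does) and inspecting its requirements now yields the cached-reference-dataset query update first, followed by one `UpdatedReferenceDatasetCollectionTask` per collection, so Luigi schedules the cached query update before the annotations table is touched. The import path is assumed from the patch targets used in the tests:

```python
from v03_pipeline.lib.model import DatasetType, ReferenceGenome
from v03_pipeline.lib.tasks.base.base_update_variant_annotations_table import (
    BaseUpdateVariantAnnotationsTableTask,
)

task = BaseUpdateVariantAnnotationsTableTask(
    reference_genome=ReferenceGenome.GRCh38,
    dataset_type=DatasetType.SNV_INDEL,
)
requirements = task.requires()
# requirements[0] is an UpdateCachedReferenceDatasetQueries task; the rest
# are UpdatedReferenceDatasetCollectionTask instances.
```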
v03_pipeline/lib/tasks/base/base_update_variant_annotations_table_test.py
@@ -53,8 +53,16 @@ def setUp(self) -> None:
@patch(
'v03_pipeline.lib.tasks.base.base_update_variant_annotations_table.UpdatedReferenceDatasetCollectionTask',
)
def test_should_create_initialized_table(self, mock_update_rdc_task) -> None:
@patch(
'v03_pipeline.lib.tasks.base.base_update_variant_annotations_table.UpdateCachedReferenceDatasetQueries',
)
def test_should_create_initialized_table(
self,
mock_update_crdqs_task,
mock_update_rdc_task,
) -> None:
mock_update_rdc_task.return_value = MockCompleteTask()
mock_update_crdqs_task.return_value = MockCompleteTask()
vat_task = BaseUpdateVariantAnnotationsTableTask(
reference_genome=ReferenceGenome.GRCh38,
dataset_type=DatasetType.SNV_INDEL,
@@ -649,6 +649,9 @@
@mock.patch(
'v03_pipeline.lib.tasks.base.base_update_variant_annotations_table.UpdatedReferenceDatasetCollectionTask',
)
@mock.patch(
'v03_pipeline.lib.tasks.base.base_update_variant_annotations_table.UpdateCachedReferenceDatasetQueries',
)
@mock.patch(
'v03_pipeline.lib.tasks.base.base_update_variant_annotations_table.BaseUpdateVariantAnnotationsTableTask.initialize_table',
)
@@ -719,9 +722,11 @@ def setUp(self) -> None:
def test_update_vat_with_updated_rdc_snv_indel_38(
self,
mock_initialize_table,
mock_update_crdqs_task,
mock_update_rdc_task,
):
mock_update_rdc_task.return_value = MockCompleteTask()
mock_update_crdqs_task.return_value = MockCompleteTask()
mock_initialize_table.return_value = hl.Table.parallelize(
[
hl.Struct(
@@ -927,9 +932,11 @@ def test_update_vat_with_updated_rdc_snv_indel_38(
def test_update_vat_with_updated_rdc_mito_38(
self,
mock_initialize_table,
mock_update_crdqs_task,
mock_update_rdc_task,
):
mock_update_rdc_task.return_value = MockCompleteTask()
mock_update_crdqs_task.return_value = MockCompleteTask()
mock_initialize_table.return_value = hl.Table.parallelize(
[
hl.Struct(
@@ -1075,9 +1082,11 @@ def test_update_vat_with_updated_rdc_mito_38(
def test_update_vat_with_updated_rdc_snv_indel_37(
self,
mock_initialize_table,
mock_update_crdqs_task,
mock_update_rdc_task,
):
mock_update_rdc_task.return_value = MockCompleteTask()
mock_update_crdqs_task.return_value = MockCompleteTask()
mock_initialize_table.return_value = hl.Table.parallelize(
[
hl.Struct(