Skip to content

Commit 5ffa3e3

Browse files
authored
Merge pull request #910 from broadinstitute/dev
Dev
2 parents 029bba7 + 2bef2ff commit 5ffa3e3

18 files changed

+128
-62
lines changed

.github/workflows/unit-tests.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,6 @@ jobs:
3636
run: ruff . --output-format github
3737
- name: Unit Tests
3838
run: |
39-
export HAIL_TMP_DIR=/tmp
4039
export GRCH37_TO_GRCH38_LIFTOVER_REF_PATH=v03_pipeline/var/test/liftover/grch37_to_grch38.over.chain.gz
4140
export GRCH38_TO_GRCH37_LIFTOVER_REF_PATH=v03_pipeline/var/test/liftover/grch38_to_grch37.over.chain.gz
4241
export ACCESS_PRIVATE_REFERENCE_DATASETS=1

v03_pipeline/lib/misc/callsets.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ def get_callset_ht(
3030
return callset_ht.distinct()
3131

3232

33-
def additional_row_fields(
33+
def get_additional_row_fields(
3434
mt: hl.MatrixTable,
3535
dataset_type: DatasetType,
3636
skip_check_sex_and_relatedness: bool,

v03_pipeline/lib/misc/validation.py

Lines changed: 54 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,19 @@
1+
from typing import Any
2+
13
import hail as hl
24

3-
from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType, Sex
5+
from v03_pipeline.lib.model import (
6+
CachedReferenceDatasetQuery,
7+
DatasetType,
8+
Env,
9+
ReferenceGenome,
10+
SampleType,
11+
Sex,
12+
)
13+
from v03_pipeline.lib.paths import (
14+
cached_reference_dataset_query_path,
15+
sex_check_table_path,
16+
)
417

518
AMBIGUOUS_THRESHOLD_PERC: float = 0.01 # Fraction of samples identified as "ambiguous_sex" above which an error will be thrown.
619
MIN_ROWS_PER_CONTIG = 100
@@ -11,9 +24,40 @@ class SeqrValidationError(Exception):
1124
pass
1225

1326

27+
def get_validation_dependencies(
28+
dataset_type: DatasetType,
29+
reference_genome: ReferenceGenome,
30+
callset_path: str,
31+
skip_check_sex_and_relatedness: bool,
32+
**_: Any,
33+
) -> dict[str, hl.Table]:
34+
deps = {}
35+
deps['coding_and_noncoding_variants_ht'] = hl.read_table(
36+
cached_reference_dataset_query_path(
37+
reference_genome,
38+
dataset_type,
39+
CachedReferenceDatasetQuery.GNOMAD_CODING_AND_NONCODING_VARIANTS,
40+
),
41+
)
42+
if (
43+
Env.CHECK_SEX_AND_RELATEDNESS
44+
and dataset_type.check_sex_and_relatedness
45+
and not skip_check_sex_and_relatedness
46+
):
47+
deps['sex_check_ht'] = hl.read_table(
48+
sex_check_table_path(
49+
reference_genome,
50+
dataset_type,
51+
callset_path,
52+
),
53+
)
54+
return deps
55+
56+
1457
def validate_allele_type(
1558
mt: hl.MatrixTable,
1659
dataset_type: DatasetType,
60+
**_: Any,
1761
) -> None:
1862
ht = mt.rows()
1963
ht = ht.filter(
@@ -31,6 +75,7 @@ def validate_allele_type(
3175

3276
def validate_no_duplicate_variants(
3377
mt: hl.MatrixTable,
78+
**_: Any,
3479
) -> None:
3580
ht = mt.rows()
3681
ht = ht.group_by(*ht.key).aggregate(n=hl.agg.count())
@@ -44,6 +89,7 @@ def validate_expected_contig_frequency(
4489
mt: hl.MatrixTable,
4590
reference_genome: ReferenceGenome,
4691
min_rows_per_contig: int = MIN_ROWS_PER_CONTIG,
92+
**_: Any,
4793
) -> None:
4894
rows_per_contig = mt.aggregate_rows(hl.agg.counter(mt.locus.contig))
4995
missing_contigs = (
@@ -69,6 +115,7 @@ def validate_imported_field_types(
69115
mt: hl.MatrixTable,
70116
dataset_type: DatasetType,
71117
additional_row_fields: dict[str, hl.expr.types.HailType | set],
118+
**_: Any,
72119
) -> None:
73120
def _validate_field(
74121
mt_schema: hl.StructExpression,
@@ -104,8 +151,12 @@ def _validate_field(
104151

105152
def validate_imputed_sex_ploidy(
106153
mt: hl.MatrixTable,
107-
sex_check_ht: hl.Table,
154+
# NB: sex_check_ht will be undefined if sex checking is disabled for the run
155+
sex_check_ht: hl.Table | None = None,
156+
**_: Any,
108157
) -> None:
158+
if not sex_check_ht:
159+
return
109160
mt = mt.select_cols(
110161
discrepant=(
111162
(
@@ -132,6 +183,7 @@ def validate_sample_type(
132183
reference_genome: ReferenceGenome,
133184
sample_type: SampleType,
134185
sample_type_match_threshold: float = SAMPLE_TYPE_MATCH_THRESHOLD,
186+
**_: Any,
135187
) -> None:
136188
coding_variants_ht = coding_and_noncoding_variants_ht.filter(
137189
coding_and_noncoding_variants_ht.coding,

v03_pipeline/lib/misc/validation_test.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import unittest
2+
from unittest.mock import Mock, patch
23

34
import hail as hl
45

@@ -80,7 +81,9 @@ def test_validate_allele_type(self) -> None:
8081
DatasetType.SNV_INDEL,
8182
)
8283

83-
def test_validate_imputed_sex_ploidy(self) -> None:
84+
@patch('v03_pipeline.lib.misc.validation.Env')
85+
def test_validate_imputed_sex_ploidy(self, mock_env: Mock) -> None:
86+
mock_env.CHECK_SEX_AND_RELATEDNESS = True
8487
sex_check_ht = hl.read_table(TEST_SEX_CHECK_1)
8588
mt = hl.MatrixTable.from_parts(
8689
rows={

v03_pipeline/lib/tasks/base/base_loading_run_params.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ class BaseLoadingRunParams(luigi.Task):
1616
# - The "Loading Pipeline" params are shared with
1717
# tasks that may remove data from or change the
1818
# structure of the persisted Hail Tables.
19+
run_id = luigi.Parameter()
1920
sample_type = luigi.EnumParameter(enum=SampleType)
2021
callset_path = luigi.Parameter()
2122
ignore_missing_samples_when_remapping = luigi.BoolParameter(

v03_pipeline/lib/tasks/update_lookup_table.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@ class UpdateLookupTableTask(BaseUpdateLookupTableTask):
2424
project_guids = luigi.ListParameter()
2525
project_remap_paths = luigi.ListParameter()
2626
project_pedigree_paths = luigi.ListParameter()
27-
run_id = luigi.Parameter()
2827

2928
def complete(self) -> bool:
3029
return super().complete() and hl.eval(

v03_pipeline/lib/tasks/update_project_table_test.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,16 @@
1313
'v03_pipeline/var/test/pedigrees/test_pedigree_3_different_families.tsv'
1414
)
1515

16+
TEST_RUN_ID = 'manual__2024-04-03'
17+
1618

1719
class UpdateProjectTableTaskTest(MockedDatarootTestCase):
1820
def test_update_project_table_task(self) -> None:
1921
worker = luigi.worker.Worker()
2022
upt_task = UpdateProjectTableTask(
2123
reference_genome=ReferenceGenome.GRCh38,
2224
dataset_type=DatasetType.SNV_INDEL,
25+
run_id=TEST_RUN_ID,
2326
sample_type=SampleType.WGS,
2427
callset_path=TEST_VCF,
2528
project_guid='R0113_test_project',
@@ -128,6 +131,7 @@ def test_update_project_table_task_different_pedigree(self) -> None:
128131
upt_task = UpdateProjectTableTask(
129132
reference_genome=ReferenceGenome.GRCh38,
130133
dataset_type=DatasetType.SNV_INDEL,
134+
run_id=TEST_RUN_ID,
131135
sample_type=SampleType.WGS,
132136
callset_path=TEST_VCF,
133137
project_guid='R0113_test_project',
@@ -140,6 +144,7 @@ def test_update_project_table_task_different_pedigree(self) -> None:
140144
upt_task = UpdateProjectTableTask(
141145
reference_genome=ReferenceGenome.GRCh38,
142146
dataset_type=DatasetType.SNV_INDEL,
147+
run_id=TEST_RUN_ID,
143148
sample_type=SampleType.WGS,
144149
callset_path=TEST_VCF,
145150
project_guid='R0113_test_project',

v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@ class UpdateVariantAnnotationsTableWithNewSamplesTask(
2323
project_guids = luigi.ListParameter()
2424
project_remap_paths = luigi.ListParameter()
2525
project_pedigree_paths = luigi.ListParameter()
26-
run_id = luigi.Parameter()
2726

2827
def requires(self) -> list[luigi.Task]:
2928
return [

v03_pipeline/lib/tasks/validate_callset.py

Lines changed: 26 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -2,21 +2,18 @@
22
import luigi
33
import luigi.util
44

5-
from v03_pipeline.lib.misc.callsets import additional_row_fields
65
from v03_pipeline.lib.misc.validation import (
6+
get_validation_dependencies,
77
validate_allele_type,
88
validate_expected_contig_frequency,
9-
validate_imported_field_types,
109
validate_imputed_sex_ploidy,
1110
validate_no_duplicate_variants,
1211
validate_sample_type,
1312
)
1413
from v03_pipeline.lib.model import CachedReferenceDatasetQuery
1514
from v03_pipeline.lib.model.environment import Env
1615
from v03_pipeline.lib.paths import (
17-
cached_reference_dataset_query_path,
1816
imported_callset_path,
19-
sex_check_table_path,
2017
)
2118
from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams
2219
from v03_pipeline.lib.tasks.base.base_update import BaseUpdateTask
@@ -63,8 +60,8 @@ def requires(self) -> list[luigi.Task]:
6360
]
6461
if (
6562
Env.CHECK_SEX_AND_RELATEDNESS
66-
and not self.skip_check_sex_and_relatedness
6763
and self.dataset_type.check_sex_and_relatedness
64+
and not self.skip_check_sex_and_relatedness
6865
):
6966
requirements = [
7067
*requirements,
@@ -83,17 +80,6 @@ def update_table(self, mt: hl.MatrixTable) -> hl.MatrixTable:
8380
self.callset_path,
8481
),
8582
)
86-
# This validation isn't override-able. If a field is the wrong
87-
# type, the pipeline will likely hard-fail downstream.
88-
validate_imported_field_types(
89-
mt,
90-
self.dataset_type,
91-
additional_row_fields(
92-
mt,
93-
self.dataset_type,
94-
self.skip_check_sex_and_relatedness,
95-
),
96-
)
9783
if self.dataset_type.can_run_validation:
9884
# Rather than throwing an error, we silently remove invalid contigs.
9985
# This happens fairly often for AnVIL requests.
@@ -104,38 +90,34 @@ def update_table(self, mt: hl.MatrixTable) -> hl.MatrixTable:
10490
)
10591

10692
if not self.skip_validation and self.dataset_type.can_run_validation:
107-
validate_allele_type(mt, self.dataset_type)
108-
validate_no_duplicate_variants(mt)
109-
validate_expected_contig_frequency(mt, self.reference_genome)
110-
coding_and_noncoding_ht = hl.read_table(
111-
cached_reference_dataset_query_path(
112-
self.reference_genome,
113-
self.dataset_type,
114-
CachedReferenceDatasetQuery.GNOMAD_CODING_AND_NONCODING_VARIANTS,
115-
),
93+
validation_dependencies = get_validation_dependencies(
94+
**self.param_kwargs,
95+
)
96+
validate_allele_type(
97+
mt,
98+
**self.param_kwargs,
99+
**validation_dependencies,
100+
)
101+
validate_no_duplicate_variants(
102+
mt,
103+
**self.param_kwargs,
104+
**validation_dependencies,
105+
)
106+
validate_expected_contig_frequency(
107+
mt,
108+
**self.param_kwargs,
109+
**validation_dependencies,
116110
)
117111
validate_sample_type(
118112
mt,
119-
coding_and_noncoding_ht,
120-
self.reference_genome,
121-
self.sample_type,
113+
**self.param_kwargs,
114+
**validation_dependencies,
115+
)
116+
validate_imputed_sex_ploidy(
117+
mt,
118+
**self.param_kwargs,
119+
**validation_dependencies,
122120
)
123-
if (
124-
Env.CHECK_SEX_AND_RELATEDNESS
125-
and not self.skip_check_sex_and_relatedness
126-
and self.dataset_type.check_sex_and_relatedness
127-
):
128-
sex_check_ht = hl.read_table(
129-
sex_check_table_path(
130-
self.reference_genome,
131-
self.dataset_type,
132-
self.callset_path,
133-
),
134-
)
135-
validate_imputed_sex_ploidy(
136-
mt,
137-
sex_check_ht,
138-
)
139121
return mt.select_globals(
140122
callset_path=self.callset_path,
141123
validated_sample_type=self.sample_type.value,

v03_pipeline/lib/tasks/write_family_table_test.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,13 +12,16 @@
1212
TEST_PEDIGREE_3 = 'v03_pipeline/var/test/pedigrees/test_pedigree_3.tsv'
1313
TEST_PEDIGREE_5 = 'v03_pipeline/var/test/pedigrees/test_pedigree_5.tsv'
1414

15+
TEST_RUN_ID = 'manual__2024-04-03'
16+
1517

1618
class WriteFamilyTableTaskTest(MockedDatarootTestCase):
1719
def test_snv_write_family_table_task(self) -> None:
1820
worker = luigi.worker.Worker()
1921
wft_task = WriteFamilyTableTask(
2022
reference_genome=ReferenceGenome.GRCh38,
2123
dataset_type=DatasetType.SNV_INDEL,
24+
run_id=TEST_RUN_ID,
2225
sample_type=SampleType.WGS,
2326
callset_path=TEST_SNV_INDEL_VCF,
2427
project_guid='R0113_test_project',
@@ -156,6 +159,7 @@ def test_sv_write_family_table_task(self) -> None:
156159
write_family_table_task = WriteFamilyTableTask(
157160
reference_genome=ReferenceGenome.GRCh38,
158161
dataset_type=DatasetType.SV,
162+
run_id=TEST_RUN_ID,
159163
sample_type=SampleType.WGS,
160164
callset_path=TEST_SV_VCF,
161165
project_guid='R0115_test_project2',
@@ -408,6 +412,7 @@ def test_gcnv_write_family_table_task(self) -> None:
408412
write_family_table_task = WriteFamilyTableTask(
409413
reference_genome=ReferenceGenome.GRCh38,
410414
dataset_type=DatasetType.GCNV,
415+
run_id=TEST_RUN_ID,
411416
sample_type=SampleType.WES,
412417
callset_path=TEST_GCNV_BED_FILE,
413418
project_guid='R0115_test_project2',

v03_pipeline/lib/tasks/write_imported_callset.py

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,16 @@
22
import luigi
33
import luigi.util
44

5-
from v03_pipeline.lib.misc.callsets import additional_row_fields
5+
from v03_pipeline.lib.misc.callsets import get_additional_row_fields
66
from v03_pipeline.lib.misc.io import (
77
import_callset,
88
import_vcf,
99
select_relevant_fields,
1010
split_multi_hts,
1111
)
12+
from v03_pipeline.lib.misc.validation import (
13+
validate_imported_field_types,
14+
)
1215
from v03_pipeline.lib.misc.vets import annotate_vets
1316
from v03_pipeline.lib.model.environment import Env
1417
from v03_pipeline.lib.paths import (
@@ -79,14 +82,22 @@ def create_table(self) -> hl.MatrixTable:
7982
)
8083
filters_ht = import_vcf(filters_path, self.reference_genome).rows()
8184
mt = mt.annotate_rows(filters=filters_ht[mt.row_key].filters)
85+
additional_row_fields = get_additional_row_fields(
86+
mt,
87+
self.dataset_type,
88+
self.skip_check_sex_and_relatedness,
89+
)
8290
mt = select_relevant_fields(
8391
mt,
8492
self.dataset_type,
85-
additional_row_fields(
86-
mt,
87-
self.dataset_type,
88-
self.skip_check_sex_and_relatedness,
89-
),
93+
additional_row_fields,
94+
)
95+
# This validation isn't override-able by the skip option.
96+
# If a field is the wrong type, the pipeline will likely hard-fail downstream.
97+
validate_imported_field_types(
98+
mt,
99+
self.dataset_type,
100+
additional_row_fields,
90101
)
91102
if self.dataset_type.has_multi_allelic_variants:
92103
mt = split_multi_hts(mt)

v03_pipeline/lib/tasks/write_metadata_for_run.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@ class WriteMetadataForRunTask(luigi.Task):
1717
project_guids = luigi.ListParameter()
1818
project_remap_paths = luigi.ListParameter()
1919
project_pedigree_paths = luigi.ListParameter()
20-
run_id = luigi.Parameter()
2120

2221
def output(self) -> luigi.Target:
2322
return GCSorLocalTarget(

0 commit comments

Comments
 (0)