2
2
import luigi
3
3
import luigi .util
4
4
5
+ from v03_pipeline .lib .misc .callsets import additional_row_fields
5
6
from v03_pipeline .lib .misc .io import (
6
7
import_callset ,
7
8
import_vcf ,
8
9
select_relevant_fields ,
9
10
split_multi_hts ,
10
11
)
11
- from v03_pipeline .lib .misc .validation import (
12
- validate_allele_type ,
13
- validate_expected_contig_frequency ,
14
- validate_imported_field_types ,
15
- validate_imputed_sex_ploidy ,
16
- validate_no_duplicate_variants ,
17
- validate_sample_type ,
18
- )
19
12
from v03_pipeline .lib .misc .vets import annotate_vets
20
- from v03_pipeline .lib .model import CachedReferenceDatasetQuery
21
13
from v03_pipeline .lib .model .environment import Env
22
14
from v03_pipeline .lib .paths import (
23
- cached_reference_dataset_query_path ,
24
15
imported_callset_path ,
25
- sex_check_table_path ,
26
16
valid_filters_path ,
27
17
)
28
18
from v03_pipeline .lib .tasks .base .base_loading_run_params import BaseLoadingRunParams
29
19
from v03_pipeline .lib .tasks .base .base_write import BaseWriteTask
30
- from v03_pipeline .lib .tasks .files import CallsetTask , GCSorLocalTarget , HailTableTask
31
- from v03_pipeline .lib .tasks .reference_data .updated_cached_reference_dataset_query import (
32
- UpdatedCachedReferenceDatasetQuery ,
33
- )
34
- from v03_pipeline .lib .tasks .write_sex_check_table import WriteSexCheckTableTask
20
+ from v03_pipeline .lib .tasks .files import CallsetTask , GCSorLocalTarget
35
21
36
22
37
23
@luigi .util .inherits (BaseLoadingRunParams )
38
24
class WriteImportedCallsetTask (BaseWriteTask ):
39
25
def complete(self) -> bool:
    """Report whether this task should be treated as already finished.

    Returns:
        ``False`` whenever ``self.force`` is truthy, so a forced run is
        never reported as complete; otherwise whatever the parent class's
        ``complete()`` reports.
    """
    # ``force`` is checked first: on a forced run the expression
    # short-circuits and the parent's completeness check is not evaluated.
    return not self.force and super().complete()
46
27
47
28
def output (self ) -> luigi .Target :
48
29
return GCSorLocalTarget (
@@ -72,56 +53,11 @@ def requires(self) -> list[luigi.Task]:
72
53
),
73
54
),
74
55
]
75
- if not self .skip_validation and self .dataset_type .can_run_validation :
76
- requirements = [
77
- * requirements ,
78
- (
79
- self .clone (
80
- UpdatedCachedReferenceDatasetQuery ,
81
- crdq = CachedReferenceDatasetQuery .GNOMAD_CODING_AND_NONCODING_VARIANTS ,
82
- )
83
- if Env .REFERENCE_DATA_AUTO_UPDATE
84
- else HailTableTask (
85
- cached_reference_dataset_query_path (
86
- self .reference_genome ,
87
- self .dataset_type ,
88
- CachedReferenceDatasetQuery .GNOMAD_CODING_AND_NONCODING_VARIANTS ,
89
- ),
90
- ),
91
- ),
92
- ]
93
- if (
94
- Env .CHECK_SEX_AND_RELATEDNESS
95
- and not self .skip_check_sex_and_relatedness
96
- and self .dataset_type .check_sex_and_relatedness
97
- ):
98
- requirements = [
99
- * requirements ,
100
- self .clone (WriteSexCheckTableTask ),
101
- ]
102
56
return [
103
57
* requirements ,
104
58
CallsetTask (self .callset_path ),
105
59
]
106
60
107
- def additional_row_fields (self , mt ):
108
- return {
109
- ** (
110
- {'info.AF' : hl .tarray (hl .tfloat64 )}
111
- if not self .skip_check_sex_and_relatedness
112
- and self .dataset_type .check_sex_and_relatedness
113
- else {}
114
- ),
115
- # this field is never required, the pipeline
116
- # will run smoothly even in its absence, but
117
- # will trigger special handling if it is present.
118
- ** (
119
- {'info.CALIBRATION_SENSITIVITY' : hl .tarray (hl .tstr )}
120
- if hasattr (mt , 'info' ) and hasattr (mt .info , 'CALIBRATION_SENSITIVITY' )
121
- else {}
122
- ),
123
- }
124
-
125
61
def create_table (self ) -> hl .MatrixTable :
126
62
mt = import_callset (
127
63
self .callset_path ,
@@ -146,64 +82,19 @@ def create_table(self) -> hl.MatrixTable:
146
82
mt = select_relevant_fields (
147
83
mt ,
148
84
self .dataset_type ,
149
- self .additional_row_fields (mt ),
150
- )
151
- # This validation isn't override-able. If a field is the wrong
152
- # type, the pipeline will likely hard-fail downstream.
153
- validate_imported_field_types (
154
- mt ,
155
- self .dataset_type ,
156
- self .additional_row_fields (mt ),
85
+ additional_row_fields (
86
+ mt ,
87
+ self .dataset_type ,
88
+ self .skip_check_sex_and_relatedness ,
89
+ ),
157
90
)
158
91
if self .dataset_type .has_multi_allelic_variants :
159
92
mt = split_multi_hts (mt )
160
93
# Special handling of variant-level filter annotation for VETs filters.
161
94
# The annotations are present on the sample-level FT field but are
162
95
# expected upstream on "filters".
163
96
mt = annotate_vets (mt )
164
- if self .dataset_type .can_run_validation :
165
- # Rather than throwing an error, we silently remove invalid contigs.
166
- # This happens fairly often for AnVIL requests.
167
- mt = mt .filter_rows (
168
- hl .set (self .reference_genome .standard_contigs ).contains (
169
- mt .locus .contig ,
170
- ),
171
- )
172
- if not self .skip_validation and self .dataset_type .can_run_validation :
173
- validate_allele_type (mt )
174
- validate_no_duplicate_variants (mt )
175
- validate_expected_contig_frequency (mt , self .reference_genome )
176
- coding_and_noncoding_ht = hl .read_table (
177
- cached_reference_dataset_query_path (
178
- self .reference_genome ,
179
- self .dataset_type ,
180
- CachedReferenceDatasetQuery .GNOMAD_CODING_AND_NONCODING_VARIANTS ,
181
- ),
182
- )
183
- validate_sample_type (
184
- mt ,
185
- coding_and_noncoding_ht ,
186
- self .reference_genome ,
187
- self .sample_type ,
188
- )
189
- if (
190
- Env .CHECK_SEX_AND_RELATEDNESS
191
- and not self .skip_check_sex_and_relatedness
192
- and self .dataset_type .check_sex_and_relatedness
193
- ):
194
- sex_check_ht = hl .read_table (
195
- sex_check_table_path (
196
- self .reference_genome ,
197
- self .dataset_type ,
198
- self .callset_path ,
199
- ),
200
- )
201
- validate_imputed_sex_ploidy (
202
- mt ,
203
- sex_check_ht ,
204
- )
205
- return mt .annotate_globals (
97
+ return mt .select_globals (
206
98
callset_path = self .callset_path ,
207
99
filters_path = filters_path or hl .missing (hl .tstr ),
208
- sample_type = self .sample_type .value ,
209
100
)
0 commit comments