@@ -23,6 +23,8 @@ workflow ImputationPipeline {
     Array[ReferencePanelContig] referencePanelContigs
     String genetic_maps_eagle = "/genetic_map_hg19_withX.txt.gz" # this is for Eagle, it is in the docker image
     String output_callset_name = "broad_imputation" # the output callset name
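+    # when true, also split the imputed multi-sample VCF into per-sample VCFs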
+    Boolean split_output_to_single_sample = false
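+    # haplotype map used by CrosscheckFingerprints to verify sample identity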
+    File haplotype_database
   }

   if (defined(single_sample_vcfs)) {
@@ -230,14 +232,45 @@ workflow ImputationPipeline {
       basename = output_callset_name
   }

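+  # check that sample identities in the imputed multi-sample VCF match the input VCF(s)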
+  call CrosscheckFingerprints {
+    input:
+      firstInputs = if (defined(multi_sample_vcf)) then select_all([multi_sample_vcf]) else select_first([single_sample_vcfs]),
+      firstInputIndices = if (defined(multi_sample_vcf)) then select_all([multi_sample_vcf_index]) else select_first([single_sample_vcf_indices]),
+      secondInputs = [InterleaveVariants.output_vcf],
+      secondInputIndices = [InterleaveVariants.output_vcf_index],
+      haplotypeDatabase = haplotype_database,
+      basename = output_callset_name
+  }
+
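+  # optionally split the imputed multi-sample VCF into single-sample VCFs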
+  if (split_output_to_single_sample) {
+    call SplitMultiSampleVcf {
+      input:
+        multiSampleVcf = InterleaveVariants.output_vcf
+    }
+
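+    # repeat the fingerprint check against the split per-sample VCFs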
+    call CrosscheckFingerprints as CrosscheckFingerprintsSplit {
+      input:
+        firstInputs = if (defined(multi_sample_vcf)) then select_all([multi_sample_vcf]) else select_first([single_sample_vcfs]),
+        firstInputIndices = if (defined(multi_sample_vcf)) then select_all([multi_sample_vcf_index]) else select_first([single_sample_vcf_indices]),
+        secondInputs = SplitMultiSampleVcf.single_sample_vcfs,
+        secondInputIndices = SplitMultiSampleVcf.single_sample_vcf_indices,
+        haplotypeDatabase = haplotype_database,
+        basename = output_callset_name + ".split"
+    }
+  }
+

   output {
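+    # populated only when split_output_to_single_sample is true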
+    Array[File]? imputed_single_sample_vcfs = SplitMultiSampleVcf.single_sample_vcfs
+    Array[File]? imputed_single_sample_vcf_indices = SplitMultiSampleVcf.single_sample_vcf_indices
     File imputed_multisample_vcf = InterleaveVariants.output_vcf
     File imputed_multisample_vcf_index = InterleaveVariants.output_vcf_index
     File aggregated_imputation_metrics = MergeImputationQCMetrics.aggregated_metrics
     File chunks_info = StoreChunksInfo.chunks_info
     File failed_chunks = StoreChunksInfo.failed_chunks
     Int n_failed_chunks = StoreChunksInfo.n_failed_chunks
+    File crosscheck = CrosscheckFingerprints.crosscheck
+    File? crosscheck_split = CrosscheckFingerprintsSplit.crosscheck
   }
 }

@@ -916,4 +949,73 @@ task FindSitesFileTwoOnly {
   output {
     File missing_sites = "missing_sites.ids"
   }
+}
+
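+# Splits a multi-sample VCF into one bgzipped, indexed VCF per sample with bcftools +split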
+task SplitMultiSampleVcf {
+  input {
+    File multiSampleVcf
+    Int mem = 8
+  }
+
+  Int disk_size = ceil(3*size(multiSampleVcf, "GB")) + 100
+
+  command <<<
+    mkdir out_dir
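+    # write one compressed VCF per sample into out_dir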
+    bcftools +split ~{multiSampleVcf} -Oz -o out_dir
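+    # tabix-index each per-sample VCF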
+    for vcf in out_dir/*.vcf.gz; do
+      bcftools index -t $vcf
+    done
+  >>>
+
+  runtime {
+    docker: "biocontainers/bcftools:v1.9-1-deb_cv1"
+    disks: "local-disk " + disk_size + " SSD"
+    memory: mem + " GB"
+  }
+
+  output {
+    Array[File] single_sample_vcfs = glob("out_dir/*.vcf.gz")
+    Array[File] single_sample_vcf_indices = glob("out_dir/*.vcf.gz.tbi")
+  }
+}
+
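+# Runs Picard CrosscheckFingerprints (via GATK) to compare fingerprints between two sets of VCFs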
+task CrosscheckFingerprints {
+  input {
+    Array[File] firstInputs
+    Array[File] secondInputs
+    Array[File] firstInputIndices
+    Array[File] secondInputIndices
+    File haplotypeDatabase
+    String basename
+    Int mem = 8
+  }
+
+  Int disk_size = ceil(1.2*(size(firstInputs, "GB") + size(secondInputs, "GB") + size(haplotypeDatabase, "GB"))) + 100
+
+  command <<<
+    # add links to ensure correctly located indices
+    array_vcfs=( ~{sep=" " firstInputs} )
+    array_indices=( ~{sep=" " firstInputIndices} )
+    for i in ${!array_vcfs[@]}; do
+      ln -s ${array_indices[i]} $(dirname ${array_vcfs[i]})
+    done
+
+    array_vcfs2=( ~{sep=" " secondInputs} )
+    array_indices2=( ~{sep=" " secondInputIndices} )
+    for i in ${!array_vcfs2[@]}; do
+      ln -s ${array_indices2[i]} $(dirname ${array_vcfs2[i]})
+    done
+
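+    # compare fingerprints of the first inputs against the second inputs using the haplotype map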
+    gatk CrosscheckFingerprints -I ~{sep=" -I " firstInputs} -SI ~{sep=" -SI " secondInputs} -H ~{haplotypeDatabase} -O ~{basename}.crosscheck
+  >>>
+
+  runtime {
+    docker: "us.gcr.io/broad-gatk/gatk:4.2.0.0"
+    disks: "local-disk " + disk_size + " HDD"
+    memory: "16 GB"
+  }
+
+  output {
+    File crosscheck = "~{basename}.crosscheck"
+  }
 }