Skip to content

Commit 599847f

Browse files
kachulistmelman
andauthored
split multisample vcf into single sample vcfs at end of imputation pipeline (#33)
* split multisample to single sample and end of imputation pipeline * add crosscheck * getting optionals to work * typo * update validation to pass hap_db Co-authored-by: tmelman <22672518+tmelman@users.noreply.github.com>
1 parent 4611ad0 commit 599847f

File tree

3 files changed

+108
-2
lines changed

3 files changed

+108
-2
lines changed

ImputationPipeline/Imputation.wdl

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@ workflow ImputationPipeline {
2323
Array[ReferencePanelContig] referencePanelContigs
2424
String genetic_maps_eagle = "/genetic_map_hg19_withX.txt.gz" # this is for Eagle, it is in the docker image
2525
String output_callset_name = "broad_imputation" # the output callset name
26+
Boolean split_output_to_single_sample = false
27+
File haplotype_database
2628
}
2729

2830
if (defined(single_sample_vcfs)) {
@@ -230,14 +232,45 @@ workflow ImputationPipeline {
230232
basename = output_callset_name
231233
}
232234

235+
call CrosscheckFingerprints {
236+
input:
237+
firstInputs = if (defined(multi_sample_vcf)) then select_all([multi_sample_vcf]) else select_first([single_sample_vcfs]),
238+
firstInputIndices = if (defined(multi_sample_vcf)) then select_all([multi_sample_vcf_index]) else select_first([single_sample_vcf_indices]),
239+
secondInputs = [InterleaveVariants.output_vcf],
240+
secondInputIndices = [InterleaveVariants.output_vcf_index],
241+
haplotypeDatabase = haplotype_database,
242+
basename = output_callset_name
243+
}
244+
245+
if (split_output_to_single_sample) {
246+
call SplitMultiSampleVcf {
247+
input:
248+
multiSampleVcf = InterleaveVariants.output_vcf
249+
}
250+
251+
call CrosscheckFingerprints as CrosscheckFingerprintsSplit {
252+
input:
253+
firstInputs = if (defined(multi_sample_vcf)) then select_all([multi_sample_vcf]) else select_first([single_sample_vcfs]),
254+
firstInputIndices = if (defined(multi_sample_vcf)) then select_all([multi_sample_vcf_index]) else select_first([single_sample_vcf_indices]),
255+
secondInputs = SplitMultiSampleVcf.single_sample_vcfs,
256+
secondInputIndices = SplitMultiSampleVcf.single_sample_vcf_indices,
257+
haplotypeDatabase = haplotype_database,
258+
basename = output_callset_name + ".split"
259+
}
260+
}
261+
233262

234263
output {
264+
Array[File]? imputed_single_sample_vcfs = SplitMultiSampleVcf.single_sample_vcfs
265+
Array[File]? imputed_single_sample_vcf_indices = SplitMultiSampleVcf.single_sample_vcf_indices
235266
File imputed_multisample_vcf = InterleaveVariants.output_vcf
236267
File imputed_multisample_vcf_index = InterleaveVariants.output_vcf_index
237268
File aggregated_imputation_metrics = MergeImputationQCMetrics.aggregated_metrics
238269
File chunks_info = StoreChunksInfo.chunks_info
239270
File failed_chunks = StoreChunksInfo.failed_chunks
240271
Int n_failed_chunks = StoreChunksInfo.n_failed_chunks
272+
File crosscheck = CrosscheckFingerprints.crosscheck
273+
File? crosscheck_split = CrosscheckFingerprintsSplit.crosscheck
241274
}
242275
}
243276

@@ -916,4 +949,73 @@ task FindSitesFileTwoOnly {
916949
output {
917950
File missing_sites = "missing_sites.ids"
918951
}
952+
}
953+
954+
task SplitMultiSampleVcf {
955+
input {
956+
File multiSampleVcf
957+
Int mem = 8
958+
}
959+
960+
Int disk_size = ceil(3*size(multiSampleVcf, "GB")) + 100
961+
962+
command <<<
963+
mkdir out_dir
964+
bcftools +split ~{multiSampleVcf} -Oz -o out_dir
965+
for vcf in out_dir/*.vcf.gz; do
966+
bcftools index -t $vcf
967+
done
968+
>>>
969+
970+
runtime {
971+
docker: "biocontainers/bcftools:v1.9-1-deb_cv1"
972+
disks: "local-disk " + disk_size + " SSD"
973+
memory: mem + " GB"
974+
}
975+
976+
output {
977+
Array[File] single_sample_vcfs = glob("out_dir/*.vcf.gz")
978+
Array[File] single_sample_vcf_indices = glob("out_dir/*.vcf.gz.tbi")
979+
}
980+
}
981+
982+
task CrosscheckFingerprints {
983+
input {
984+
Array[File] firstInputs
985+
Array[File] secondInputs
986+
Array[File] firstInputIndices
987+
Array[File] secondInputIndices
988+
File haplotypeDatabase
989+
String basename
990+
Int mem = 8
991+
}
992+
993+
Int disk_size = ceil(1.2*(size(firstInputs, "GB") + size(secondInputs, "GB") + size(haplotypeDatabase, "GB"))) + 100
994+
995+
command <<<
996+
# add links to ensure correctly located indices
997+
array_vcfs=( ~{sep=" " firstInputs} )
998+
array_indices=( ~{sep=" " firstInputIndices} )
999+
for i in ${!array_vcfs[@]}; do
1000+
ln -s ${array_indices[i]} $(dirname ${array_vcfs[i]})
1001+
done
1002+
1003+
array_vcfs2=( ~{sep=" " secondInputs} )
1004+
array_indices2=( ~{sep=" " secondInputIndices} )
1005+
for i in ${!array_vcfs2[@]}; do
1006+
ln -s ${array_indices2[i]} $(dirname ${array_vcfs2[i]})
1007+
done
1008+
1009+
gatk CrosscheckFingerprints -I ~{sep=" -I " firstInputs} -SI ~{sep=" -SI " secondInputs} -H ~{haplotypeDatabase} -O ~{basename}.crosscheck
1010+
>>>
1011+
1012+
runtime {
1013+
docker: "us.gcr.io/broad-gatk/gatk:4.2.0.0"
1014+
disks: "local-disk " + disk_size + " HDD"
1015+
memory: "16 GB"
1016+
}
1017+
1018+
output {
1019+
File crosscheck = "~{basename}.crosscheck"
1020+
}
9191021
}

ImputationPipeline/Validation/FullImputationPRSValidation.wdl

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ workflow FullImputationPRSValidation {
3737
String branch
3838

3939
Int wgs_vcf_to_plink_mem = 8
40+
File haplotype_database
4041
}
4142

4243
call ValidateImputation.validateImputation {
@@ -54,7 +55,8 @@ workflow FullImputationPRSValidation {
5455
subpopulation_af_expression = subpopulation_af_expression,
5556
sample_map = sample_map,
5657
referencePanelContigs = referencePanelContigs,
57-
branch = branch
58+
branch = branch,
59+
haplotype_database = haplotype_database
5860
}
5961

6062
call ValidateScoring.ValidateScoring {

ImputationPipeline/Validation/ValidateImputation.wdl

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ workflow validateImputation {
2525
File? sample_map #File which maps sample names in array to sample names in wgs. ":" used as separator
2626
2727
Array[ReferencePanelContig] referencePanelContigs
28+
File haplotype_database
2829
}
2930

3031
#run imputation on this branch
@@ -33,7 +34,8 @@ workflow validateImputation {
3334
multi_sample_vcf = validationArrays,
3435
multi_sample_vcf_index = validationArraysIndex,
3536
referencePanelContigs = referencePanelContigs,
36-
perform_extra_qc_steps = false
37+
perform_extra_qc_steps = false,
38+
haplotype_database = haplotype_database
3739
}
3840

3941
#run imputation on main branch

0 commit comments

Comments
 (0)