split multisample vcf into single sample vcfs at end of imputation pipeline (#33)

kachulis · tmelman · web-flow · commit 599847f30ccf · 2021-04-09T13:48:10.000-04:00
* split multisample to single sample at end of imputation pipeline

* add crosscheck

* getting optionals to work

* typo

* update validation to pass hap_db

Co-authored-by: tmelman <22672518+tmelman@users.noreply.github.com>
diff --git a/ImputationPipeline/Imputation.wdl b/ImputationPipeline/Imputation.wdl
@@ -23,6 +23,8 @@ workflow ImputationPipeline {
     Array[ReferencePanelContig] referencePanelContigs
     String genetic_maps_eagle = "/genetic_map_hg19_withX.txt.gz" # this is for Eagle, it is in the docker image 
     String output_callset_name = "broad_imputation" # the output callset name
+	Boolean split_output_to_single_sample = false
+	File haplotype_database
   }
 
   if (defined(single_sample_vcfs)) {
@@ -230,14 +232,45 @@ workflow ImputationPipeline {
   		basename = output_callset_name
   }
 
+  call CrosscheckFingerprints {
+  	input:
+  		firstInputs = if (defined(multi_sample_vcf)) then select_all([multi_sample_vcf]) else select_first([single_sample_vcfs]),
+  		firstInputIndices = if (defined(multi_sample_vcf)) then select_all([multi_sample_vcf_index]) else select_first([single_sample_vcf_indices]),
+  		secondInputs = [InterleaveVariants.output_vcf],
+  		secondInputIndices = [InterleaveVariants.output_vcf_index],
+  		haplotypeDatabase = haplotype_database,
+  		basename = output_callset_name
+  }
+
+  if (split_output_to_single_sample) {
+  	call SplitMultiSampleVcf {
+  		input:
+  			multiSampleVcf = InterleaveVariants.output_vcf
+  	}
+
+  	call CrosscheckFingerprints as CrosscheckFingerprintsSplit {
+      	input:
+      		firstInputs = if (defined(multi_sample_vcf)) then select_all([multi_sample_vcf]) else select_first([single_sample_vcfs]),
+      		firstInputIndices = if (defined(multi_sample_vcf)) then select_all([multi_sample_vcf_index]) else select_first([single_sample_vcf_indices]),
+      		secondInputs = SplitMultiSampleVcf.single_sample_vcfs,
+      		secondInputIndices = SplitMultiSampleVcf.single_sample_vcf_indices,
+      		haplotypeDatabase = haplotype_database,
+      		basename = output_callset_name + ".split"
+      }
+  }
+
 
   output {
+    Array[File]? imputed_single_sample_vcfs = SplitMultiSampleVcf.single_sample_vcfs
+    Array[File]? imputed_single_sample_vcf_indices = SplitMultiSampleVcf.single_sample_vcf_indices
     File imputed_multisample_vcf = InterleaveVariants.output_vcf
     File imputed_multisample_vcf_index = InterleaveVariants.output_vcf_index
     File aggregated_imputation_metrics = MergeImputationQCMetrics.aggregated_metrics
     File chunks_info = StoreChunksInfo.chunks_info
     File failed_chunks = StoreChunksInfo.failed_chunks
     Int n_failed_chunks = StoreChunksInfo.n_failed_chunks
+    File crosscheck = CrosscheckFingerprints.crosscheck
+    File? crosscheck_split = CrosscheckFingerprintsSplit.crosscheck
   }
 }
 
@@ -916,4 +949,73 @@ task FindSitesFileTwoOnly {
 	output {
 		File missing_sites = "missing_sites.ids"
 	}
+}
+
+task SplitMultiSampleVcf { # splits one multi-sample VCF into per-sample bgzipped VCFs, each with a .tbi index
+	input {
+		File multiSampleVcf # multi-sample VCF handed to bcftools +split
+		Int mem = 8 # memory requested for the task, in GB
+	}
+
+	Int disk_size = ceil(3*size(multiSampleVcf, "GB")) + 100 # 3x the input size plus 100 GB of headroom
+
+	command <<<
+		set -euo pipefail # stop at the first failing command so a partial split is never reported as success
+		mkdir out_dir
+		bcftools +split ~{multiSampleVcf} -Oz -o out_dir
+		# write a .tbi index (-t) next to every VCF in out_dir; the path is quoted so it always stays one argument
+		for vcf in out_dir/*.vcf.gz; do bcftools index -t "$vcf"; done
+	>>>
+
+	runtime {
+		docker: "biocontainers/bcftools:v1.9-1-deb_cv1"
+		disks: "local-disk " + disk_size + " SSD"
+		memory: mem + " GB"
+	}
+
+	output {
+		Array[File] single_sample_vcfs = glob("out_dir/*.vcf.gz") # the per-sample VCFs written by the split
+		Array[File] single_sample_vcf_indices = glob("out_dir/*.vcf.gz.tbi") # the indices created in the loop above
+	}
+}
+
+task CrosscheckFingerprints { # runs GATK CrosscheckFingerprints with firstInputs as -I and secondInputs as -SI
+	input {
+		Array[File] firstInputs # VCFs passed to the tool as -I
+		Array[File] secondInputs # VCFs passed to the tool as -SI
+		Array[File] firstInputIndices # indices for firstInputs, matched to them by position
+		Array[File] secondInputIndices # indices for secondInputs, matched to them by position
+		File haplotypeDatabase # passed to the tool as -H
+		String basename # the result is written to <basename>.crosscheck
+		Int mem = 16 # memory in GB; previously declared as 8 but ignored (runtime was fixed at 16 GB), so 16 keeps the effective default
+	}
+
+	Int disk_size = ceil(1.2*(size(firstInputs, "GB") + size(secondInputs, "GB") + size(haplotypeDatabase, "GB"))) + 100
+
+	command <<<
+		# add links to ensure correctly located indices (best effort: a failed ln does not stop the script)
+		array_vcfs=( ~{sep=" " firstInputs} )
+		array_indices=( ~{sep=" " firstInputIndices} )
+		for i in "${!array_vcfs[@]}"; do
+			ln -s "${array_indices[i]}" "$(dirname "${array_vcfs[i]}")"
+		done
+
+		array_vcfs2=( ~{sep=" " secondInputs} )
+		array_indices2=( ~{sep=" " secondInputIndices} )
+		for i in "${!array_vcfs2[@]}"; do
+			ln -s "${array_indices2[i]}" "$(dirname "${array_vcfs2[i]}")"
+		done
+
+		gatk CrosscheckFingerprints -I ~{sep=" -I " firstInputs} -SI ~{sep=" -SI " secondInputs} -H ~{haplotypeDatabase} -O ~{basename}.crosscheck
+	>>>
+
+	runtime {
+		docker: "us.gcr.io/broad-gatk/gatk:4.2.0.0"
+		disks: "local-disk " + disk_size + " HDD"
+		memory: mem + " GB"
+	}
+
+	output {
+		File crosscheck = "~{basename}.crosscheck"
+	}
 }
diff --git a/ImputationPipeline/Validation/FullImputationPRSValidation.wdl b/ImputationPipeline/Validation/FullImputationPRSValidation.wdl
@@ -37,6 +37,7 @@ workflow FullImputationPRSValidation {
 		String branch
 
 		Int wgs_vcf_to_plink_mem = 8
+		File haplotype_database
 	}
 
 	call ValidateImputation.validateImputation {
@@ -54,7 +55,8 @@ workflow FullImputationPRSValidation {
 			subpopulation_af_expression = subpopulation_af_expression,
 			sample_map = sample_map,
 			referencePanelContigs = referencePanelContigs,
-			branch = branch
+			branch = branch,
+			haplotype_database = haplotype_database
 	}
 
 	call ValidateScoring.ValidateScoring {
diff --git a/ImputationPipeline/Validation/ValidateImputation.wdl b/ImputationPipeline/Validation/ValidateImputation.wdl
@@ -25,6 +25,7 @@ workflow validateImputation {
 		File? sample_map #File which maps sample names in array to sample names in wgs.  ":" used as separator
 
 		Array[ReferencePanelContig] referencePanelContigs
+		File haplotype_database
 	}
 
 	#run imputation on this branch
@@ -33,7 +34,8 @@ workflow validateImputation {
 			multi_sample_vcf = validationArrays,
 			multi_sample_vcf_index = validationArraysIndex,
 			referencePanelContigs = referencePanelContigs,
-			perform_extra_qc_steps = false
+			perform_extra_qc_steps = false,
+			haplotype_database = haplotype_database
 	}
 
 	#run imputation on main branch

Original file line number	Diff line number	Diff line change
`@@ -37,6 +37,7 @@ workflow FullImputationPRSValidation {`
`37`	`37`	`String branch`
`38`	`38`
`39`	`39`	`Int wgs_vcf_to_plink_mem = 8`
	`40`	`+ File haplotype_database`
`40`	`41`	`}`
`41`	`42`
`42`	`43`	`call ValidateImputation.validateImputation {`
`@@ -54,7 +55,8 @@ workflow FullImputationPRSValidation {`
`54`	`55`	`subpopulation_af_expression = subpopulation_af_expression,`
`55`	`56`	`sample_map = sample_map,`
`56`	`57`	`referencePanelContigs = referencePanelContigs,`
`57`		`- branch = branch`
	`58`	`+ branch = branch,`
	`59`	`+ haplotype_database = haplotype_database`
`58`	`60`	`}`
`59`	`61`
`60`	`62`	`call ValidateScoring.ValidateScoring {`
Original file line number	Diff line number	Diff line change
`@@ -25,6 +25,7 @@ workflow validateImputation {`
`25`	`25`	`File? sample_map #File which maps sample names in array to sample names in wgs. ":" used as separator`
`26`	`26`
`27`	`27`	`Array[ReferencePanelContig] referencePanelContigs`
	`28`	`+ File haplotype_database`
`28`	`29`	`}`
`29`	`30`
`30`	`31`	`#run imputation on this branch`
`@@ -33,7 +34,8 @@ workflow validateImputation {`
`33`	`34`	`multi_sample_vcf = validationArrays,`
`34`	`35`	`multi_sample_vcf_index = validationArraysIndex,`
`35`	`36`	`referencePanelContigs = referencePanelContigs,`
`36`		`- perform_extra_qc_steps = false`
	`37`	`+ perform_extra_qc_steps = false,`
	`38`	`+ haplotype_database = haplotype_database`
`37`	`39`	`}`
`38`	`40`
`39`	`41`	`#run imputation on main branch`