Skip to content

Commit 077ee67

Browse files
Add Glimpse2Imputation pipeline (#123)
1 parent bf22ae3 commit 077ee67

13 files changed

+1219
-11
lines changed

.dockstore.yml

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,27 @@ workflows:
6363
- name: TrainAncestryAdjustmentModel
6464
subclass: WDL
6565
primaryDescriptorPath: /ImputationPipeline/TrainAncestryAdjustmentModel.wdl
66+
- name: Glimpse1Imputation
67+
subclass: WDL
68+
primaryDescriptorPath: /GlimpseImputationPipeline/Glimpse1Imputation.wdl
69+
- name: Glimpse2Imputation
70+
subclass: WDL
71+
primaryDescriptorPath: /GlimpseImputationPipeline/Glimpse2Imputation.wdl
72+
- name: Glimpse2SplitReference
73+
subclass: WDL
74+
primaryDescriptorPath: /GlimpseImputationPipeline/Glimpse2SplitReference.wdl
75+
- name: Glimpse2MergeBatches
76+
subclass: WDL
77+
primaryDescriptorPath: /GlimpseImputationPipeline/Glimpse2MergeBatches.wdl
78+
- name: ReduceAndMergeForGlimpse
79+
subclass: WDL
80+
primaryDescriptorPath: /GlimpseImputationPipeline/ReduceAndMergeForGlimpse.wdl
81+
- name: CollectBGEImputationMetrics
82+
subclass: WDL
83+
primaryDescriptorPath: /GlimpseImputationPipeline/CollectBGEImputationMetrics.wdl
84+
- name: Glimpse2ImputationInBatches
85+
subclass: WDL
86+
primaryDescriptorPath: /GlimpseImputationPipeline/Glimpse2ImputationInBatches.wdl
6687
- name: RNAMetrics
6788
subclass: WDL
6889
primaryDescriptorPath: /Utilities/WDLs/RNAMetrics.wdl
Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
version 1.0
2+
3+
workflow GlimpseImputation {
    # Runs the Glimpse task once per (chunks file, reference contig name,
    # genetic-map contig name) triple and gathers the per-shard VCFs into a
    # single VCF named after the input VCF with an ".imputed" suffix.
    input {
        # Three parallel arrays: element i of each describes the same shard.
        Array[File] chunks
        Array[String] reference_panel_contig_names
        Array[String] genetic_map_contig_names

        # Reference panel path = prefix + reference contig name + suffix;
        # its index path = reference panel path + index suffix.
        String reference_panel_prefix
        String reference_panel_suffix
        String reference_panel_index_suffix

        # Genetic map path = prefix + genetic-map contig name + suffix.
        String genetic_map_path_prefix
        String genetic_map_path_suffix

        File input_vcf
        File input_vcf_index
        File ref_dict

        # Resources forwarded to the Glimpse task (preemptible also to GatherVcfs).
        Int mem_gb = 4
        Int cpu = 4
        Int preemptible = 1

        # Optional script run in the background of each task; its stdout is
        # captured as monitoring.log.
        File? monitoring_script
    }

    # zip() pairs the arrays element-wise: shard.left is the chunks file,
    # shard.right.left the reference-panel contig name and
    # shard.right.right the genetic-map contig name.
    scatter (shard in zip(chunks, zip(reference_panel_contig_names, genetic_map_contig_names))) {
        String panel_contig = shard.right.left
        String map_contig = shard.right.right

        String panel_path = reference_panel_prefix + panel_contig + reference_panel_suffix
        String map_path = genetic_map_path_prefix + map_contig + genetic_map_path_suffix

        call Glimpse {
            input:
                input_vcf = input_vcf,
                input_vcf_index = input_vcf_index,
                chunks = shard.left,
                # The genetic-map contig name is what labels the shard's output files.
                contig_name = map_contig,
                reference_panel = panel_path,
                reference_panel_index = panel_path + reference_panel_index_suffix,
                genetic_map = map_path,
                ref_dict = ref_dict,
                cpu = cpu,
                mem_gb = mem_gb,
                preemptible = preemptible,
                monitoring_script = monitoring_script
        }
    }

    String gathered_basename = basename(input_vcf, ".vcf.gz") + ".imputed"

    call GatherVcfs {
        input:
            input_vcfs = Glimpse.imputed_vcf,
            input_vcf_indices = Glimpse.imputed_vcf_index,
            output_vcf_basename = gathered_basename,
            preemptible = preemptible,
            monitoring_script = monitoring_script
    }

    output {
        File imputed_vcf = GatherVcfs.output_vcf
        File imputed_vcf_index = GatherVcfs.output_vcf_index
        Array[File?] glimpse_monitoring = Glimpse.monitoring
        File? gather_monitoring = GatherVcfs.monitoring
    }
}
60+
61+
task Glimpse {
    # Phases every chunk listed in `chunks` with GLIMPSE_phase, ligates the
    # phased chunks, samples haplotypes with GLIMPSE_sample --solve, and finally
    # rewrites the VCF sequence dictionary from `ref_dict`.
    input {
        # Only used to name the intermediate and output files of this shard.
        String contig_name
        File input_vcf
        File input_vcf_index
        File reference_panel
        File reference_panel_index
        # Sequence dictionary written into the final VCF header.
        File ref_dict
        File genetic_map

        # One chunk per line; after whitespace splitting, field 1 is the chunk
        # id, field 3 the input region and field 4 the output region.
        File chunks

        Int mem_gb = 4
        Int cpu = 4
        Int disk_size_gb = ceil(2 * size(input_vcf, "GiB") + size(reference_panel, "GiB") + size(genetic_map, "GiB") + 100)
        Int preemptible = 1

        # Optional script run in the background; its stdout becomes monitoring.log.
        File? monitoring_script
    }

    command <<<
        set -xeuo pipefail

        ~{"bash " + monitoring_script + " > monitoring.log &"}

        # The ligate input list is built in the order the chunks are phased
        # (i.e. the order of the chunks file). Globbing with `ls` sorts
        # lexicographically, which puts chunk100 before chunk11 once there are
        # 100 or more chunks and hands GLIMPSE_ligate an unordered list.
        LST=list.txt
        : > ${LST}

        # `|| [ -n "$LINE" ]` keeps a final line that lacks a trailing newline.
        while IFS="" read -r LINE || [ -n "$LINE" ];
        do
            # Skip blank lines instead of invoking GLIMPSE_phase with empty regions.
            if [ -z "$LINE" ]; then
                continue
            fi

            # $LINE is deliberately unquoted so tabs collapse to single spaces
            # before `cut -d" "` picks the fields.
            printf -v ID "%02d" $(echo $LINE | cut -d" " -f1)
            IRG=$(echo $LINE | cut -d" " -f3)
            ORG=$(echo $LINE | cut -d" " -f4)
            PHASE_OUT=phased.chunk${ID}.vcf.gz
            /glimpse/phase/bin/GLIMPSE_phase --input ~{input_vcf} --reference ~{reference_panel} --map ~{genetic_map} --input-region ${IRG} --output-region ${ORG} --output ${PHASE_OUT} --thread ~{cpu}
            tabix ${PHASE_OUT}
            echo ${PHASE_OUT} >> ${LST}
        done < ~{chunks}

        LIGATE_OUT=ligated.~{contig_name}.vcf.gz
        /glimpse/ligate/bin/GLIMPSE_ligate --input ${LST} --output $LIGATE_OUT --thread ~{cpu}
        tabix ${LIGATE_OUT}

        SAMPLE_OUT=ligated.sampled.~{contig_name}.vcf.gz
        /glimpse/sample/bin/GLIMPSE_sample --input ${LIGATE_OUT} --solve --output ${SAMPLE_OUT} --thread ~{cpu}
        tabix ${SAMPLE_OUT}

        UPDATE_OUT=ligated.sampled.dict_updated.~{contig_name}.vcf.gz
        java -jar /picard.jar UpdateVcfSequenceDictionary -I ${SAMPLE_OUT} --SD ~{ref_dict} -O ${UPDATE_OUT}
        tabix ${UPDATE_OUT}
    >>>

    runtime {
        docker: "us.gcr.io/broad-dsde-methods/glimpse:1.1.1"
        disks: "local-disk " + disk_size_gb + " HDD"
        memory: mem_gb + " GiB"
        cpu: cpu
        preemptible: preemptible
    }

    output {
        File imputed_vcf = "ligated.sampled.dict_updated." + contig_name + ".vcf.gz"
        File imputed_vcf_index = "ligated.sampled.dict_updated." + contig_name + ".vcf.gz.tbi"
        # Only present when monitoring_script was supplied.
        File? monitoring = "monitoring.log"
    }
}
125+
126+
task GatherVcfs {
    # Concatenates the given VCFs, in array order, into
    # <output_vcf_basename>.vcf.gz with GATK GatherVcfs and then indexes the
    # result with GATK IndexFeatureFile.
    input {
        Array[File] input_vcfs
        # Not referenced in the command; declared so the indices are localized
        # with the VCFs.
        Array[File] input_vcf_indices
        String output_vcf_basename

        String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.1.9.0"
        Int cpu = 1
        Int memory_mb = 16000
        Int disk_size_gb = ceil(3*size(input_vcfs, "GiB"))
        Int preemptible = 1

        # Optional script run in the background; its stdout becomes monitoring.log.
        File? monitoring_script
    }

    # JVM heap bounds, kept below the memory requested from the runtime.
    Int initial_heap_mb = memory_mb - 1000
    Int max_heap_mb = memory_mb - 500

    # Single definition of the gathered file name, shared by command and outputs.
    String gathered_vcf = output_vcf_basename + ".vcf.gz"

    command <<<
        set -xeuo pipefail

        ~{"bash " + monitoring_script + " > monitoring.log &"}

        gatk --java-options "-Xms~{initial_heap_mb}m -Xmx~{max_heap_mb}m" \
            GatherVcfs \
            -I ~{sep=' -I ' input_vcfs} \
            -O ~{gathered_vcf}

        gatk --java-options "-Xms~{initial_heap_mb}m -Xmx~{max_heap_mb}m" \
            IndexFeatureFile -I ~{gathered_vcf}

    >>>

    runtime {
        docker: gatk_docker
        disks: "local-disk " + disk_size_gb + " HDD"
        memory: memory_mb + " MiB"
        cpu: cpu
        preemptible: preemptible
    }

    output {
        File output_vcf = gathered_vcf
        File output_vcf_index = gathered_vcf + ".tbi"
        # Only present when monitoring_script was supplied.
        File? monitoring = "monitoring.log"
    }
}

0 commit comments

Comments
 (0)