@@ -21,17 +21,48 @@ workflow Glimpse2Imputation {
        Int? n_burnin
        Int? n_main
        Int? effective_population_size
+
+        Boolean collect_qc_metrics = true

-        Int preemptible = 1
-        String docker = "us.gcr.io/broad-dsde-methods/glimpse:palantir-workflows_20c9de0"
-        Int cpu_phase = 4
-        Int mem_gb_phase = 16
+        Int preemptible = 9
+        String docker = "us.gcr.io/broad-dsde-methods/glimpse:odelaneau_f310862"
+        String docker_extract_num_sites_from_reference_chunk = "us.gcr.io/broad-dsde-methods/glimpse_extract_num_sites_from_reference_chunks:michaelgatzen_edc7f3a"
        Int cpu_ligate = 4
        Int mem_gb_ligate = 4
        File? monitoring_script
    }

    scatter (reference_chunk in read_lines(reference_chunks)) {
+        call GetNumberOfSitesInChunk {
+            input:
+                reference_chunk = reference_chunk,
+                docker = docker_extract_num_sites_from_reference_chunk
+        }
+
+        Int n_rare = GetNumberOfSitesInChunk.n_rare
+        Int n_common = GetNumberOfSitesInChunk.n_common
+
+        if (defined(input_vcf)) {
+            call CountSamples {
+                input:
+                    vcf = select_first([input_vcf])
+            }
+        }
+
+        Int n_samples = select_first([CountSamples.nSamples, length(select_first([crams]))])
+
+        call SelectResourceParameters {
+            input:
+                n_rare = n_rare,
+                n_common = n_common,
+                n_samples = n_samples
+        }
+
+        if (SelectResourceParameters.memory_gb > 256 || SelectResourceParameters.request_n_cpus > 32) {
+            # force failure if we're accidentally going to request too many resources and spend too much money
+            Int safety_check_memory_gb = -1
+            Int safety_check_n_cpu = -1
+        }
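+
+        # The -1 placeholders above are picked up by select_first in the GlimpsePhase call's cpu/mem_gb arguments,
+        # so an oversized estimate fails fast instead of provisioning an expensive VM.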
        call GlimpsePhase {
            input:
                reference_chunk = reference_chunk,
@@ -49,8 +80,8 @@ workflow Glimpse2Imputation {
                fasta_index = fasta_index,
                preemptible = preemptible,
                docker = docker,
-                cpu = cpu_phase,
-                mem_gb = mem_gb_phase,
+                cpu = select_first([safety_check_n_cpu, SelectResourceParameters.request_n_cpus]),
+                mem_gb = select_first([safety_check_memory_gb, SelectResourceParameters.memory_gb]),
                monitoring_script = monitoring_script
        }
    }
@@ -68,9 +99,21 @@ workflow Glimpse2Imputation {
                monitoring_script = monitoring_script
    }

+    if (collect_qc_metrics) {
+        call CollectQCMetrics {
+            input:
+                imputed_vcf = GlimpseLigate.imputed_vcf,
+                output_basename = output_basename,
+                monitoring_script = monitoring_script
+        }
+    }
+
    output {
        File imputed_vcf = GlimpseLigate.imputed_vcf
        File imputed_vcf_index = GlimpseLigate.imputed_vcf_index
+
+        File? qc_metrics = CollectQCMetrics.qc_metrics
+
        Array[File?] glimpse_phase_monitoring = GlimpsePhase.monitoring
        File? glimpse_ligate_monitoring = GlimpseLigate.monitoring
    }
@@ -95,8 +138,8 @@ task GlimpsePhase {

        Int mem_gb = 4
        Int cpu = 4
-        Int disk_size_gb = ceil(2.2 * size(input_vcf, "GiB") + size(reference_chunk, "GiB") + 100)
-        Int preemptible = 1
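+        # Rough disk sizing: scaled input VCF plus the reference chunk, ~3 MiB (0.003 GiB) per subsetted CRAM, and 10 GiB of headroom.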
+        Int disk_size_gb = ceil(2.2 * size(input_vcf, "GiB") + size(reference_chunk, "GiB") + 0.003 * length(select_first([crams, []])) + 10)
+        Int preemptible = 9
        Int max_retries = 3
        String docker
        File? monitoring_script
@@ -117,16 +160,27 @@ task GlimpsePhase {

        export GCS_OAUTH_TOKEN=$(/root/google-cloud-sdk/bin/gcloud auth application-default print-access-token)

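+        # Build a local reference sequence cache so samtools and other htslib-based tools can resolve CRAM reference MD5s without network lookups.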
+        seq_cache_populate.pl -root ./ref/cache ~{fasta}
+        export REF_PATH=:
+        export REF_CACHE=./ref/cache/%2s/%2s/%s
+
        ~{"bash " + monitoring_script + " > monitoring.log &"}

        cram_paths=( ~{sep=" " crams} )
+        cram_index_paths=( ~{sep=" " cram_indices} )
        sample_ids=( ~{sep=" " sample_ids} )

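+        # Derive the genomic region for CRAM subsetting from the reference chunk file name
+        # (assuming GLIMPSE2-style names, e.g. ..._chr20_1000000_2000000.bin -> chr20:1000000-2000000).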
+        chunk_region=$(echo "~{reference_chunk}" | sed 's/^.*chr/chr/' | sed 's/\.bin//' | sed 's/_/:/1' | sed 's/_/-/1')
+
+        echo "Region for CRAM extraction: ${chunk_region}"
        for i in "${!cram_paths[@]}"; do
-            echo -e "${cram_paths[$i]} ${sample_ids[$i]}" >> crams.list
+            samtools view -h -C -X -T ~{fasta} -o cram${i}.cram "${cram_paths[$i]}" "${cram_index_paths[$i]}" ${chunk_region}
+            samtools index cram${i}.cram
+            echo -e "cram${i}.cram ${sample_ids[$i]}" >> crams.list
+            echo "Processed CRAM ${i}: ${cram_paths[$i]} -> cram${i}.cram"
        done

-        /bin/GLIMPSE2_phase \
+        cmd="/bin/GLIMPSE2_phase \
            ~{"--input-gl " + input_vcf} \
            --reference ~{reference_chunk} \
            --output phase_output.bcf \
@@ -135,7 +189,14 @@ task GlimpsePhase {
            ~{"--burnin " + n_burnin} ~{"--main " + n_main} \
            ~{"--ne " + effective_population_size} \
            ~{bam_file_list_input} \
-            ~{"--fasta " + fasta}
+            ~{"--fasta " + fasta} \
+            --checkpoint-file-out checkpoint.bin"
+
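+        # If a checkpoint from a previous (e.g. preempted) attempt exists, resume from it instead of starting over.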
+        if [ -s "checkpoint.bin" ]; then
+            cmd="$cmd --checkpoint-file-in checkpoint.bin"
+        fi
+
+        eval $cmd
    >>>

    runtime {
@@ -145,6 +206,7 @@ task GlimpsePhase {
        cpu: cpu
        preemptible: preemptible
        maxRetries: max_retries
+        checkpointFile: "checkpoint.bin"
    }

    output {
@@ -202,3 +264,155 @@ task GlimpseLigate {
        File? monitoring = "monitoring.log"
    }
}
+
+task CollectQCMetrics {
+    input {
+        File imputed_vcf
+        String output_basename
+
+        Int preemptible = 1
+        String docker = "hailgenetics/hail:0.2.126-py3.11"
+        Int cpu = 4
+        Int mem_gb = 16
+        File? monitoring_script
+    }
+
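+    # Hail reads the imputed VCF directly from cloud storage, so the file does not need to be localized to the VM.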
+    parameter_meta {
+        imputed_vcf: {
+            localization_optional: true
+        }
+    }
+
+    Int disk_size_gb = 100
+
+    command <<<
+        set -euo pipefail
+
+        ~{"bash " + monitoring_script + " > monitoring.log &"}
+
+        cat << 'EOF' > script.py
+import hail as hl
+import pandas as pd
+
+# Calculate metrics
+hl.init(default_reference='GRCh38', idempotent=True)
+vcf = hl.import_vcf('~{imputed_vcf}', force_bgz=True)
+qc = hl.sample_qc(vcf)
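+# Flatten the per-sample QC struct, strip the 'sample_qc.' prefix from column names, and export one row per sample as TSV.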
+qc.cols().flatten().rename({'sample_qc.' + col: col for col in list(qc['sample_qc'])}).export('~{output_basename}.qc_metrics.tsv')
+EOF
+        python3 script.py
+    >>>
+
+    runtime {
+        docker: docker
+        disks: "local-disk " + disk_size_gb + " HDD"
+        memory: mem_gb + " GiB"
+        cpu: cpu
+        preemptible: preemptible
+    }
+
+    output {
+        File qc_metrics = "~{output_basename}.qc_metrics.tsv"
+        File? monitoring = "monitoring.log"
+    }
+}
+
+task GetNumberOfSitesInChunk {
+    input {
+        File reference_chunk
+
+        String docker
+        Int mem_gb = 4
+        Int cpu = 4
+        Int disk_size_gb = ceil(size(reference_chunk, "GiB") + 10)
+        Int preemptible = 1
+        Int max_retries = 3
+    }
+
+    command <<<
+        set -xeuo pipefail
+        /bin/GLIMPSE2_extract_num_sites_from_reference_chunk ~{reference_chunk} > n_sites.txt
+        cat n_sites.txt
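+        # The extractor prints lines of the form "Lrare=<n>" and "Lcommon=<n>"; pull out the two counts.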
+        grep "Lrare" n_sites.txt | sed 's/Lrare=//' > n_rare.txt
+        grep "Lcommon" n_sites.txt | sed 's/Lcommon=//' > n_common.txt
+    >>>
+
+    runtime {
+        docker: docker
+        disks: "local-disk " + disk_size_gb + " HDD"
+        memory: mem_gb + " GiB"
+        cpu: cpu
+        preemptible: preemptible
+        maxRetries: max_retries
+    }
+
+    output {
+        Int n_rare = read_int("n_rare.txt")
+        Int n_common = read_int("n_common.txt")
+    }
+}
+
+task CountSamples {
+    input {
+        File vcf
+
+        String bcftools_docker = "us.gcr.io/broad-gotc-prod/imputation-bcf-vcf:1.0.7-1.10.2-0.1.16-1669908889"
+        Int cpu = 1
+        Int memory_mb = 3000
+        Int disk_size_gb = 10 + ceil(size(vcf, "GiB"))
+    }
+
+    command <<<
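+        # List the sample names in the VCF header and count them.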
+        bcftools query -l ~{vcf} | wc -l
+    >>>
+
+    runtime {
+        docker: bcftools_docker
+        disks: "local-disk ${disk_size_gb} HDD"
+        memory: "${memory_mb} MiB"
+        cpu: cpu
+    }
+    output {
+        Int nSamples = read_int(stdout())
+    }
+}
+
+task SelectResourceParameters {
+    input {
+        Int n_rare
+        Int n_common
+        Int n_samples
+    }
+
+    command <<<
+        python3 << EOF
+import math
+n_rare = ~{n_rare}
+n_common = ~{n_common}
+n_samples = ~{n_samples}
+n_sites = n_common + n_rare
+
+# try to keep expected runtime under 4 hours, but don't ask for more than 32 CPUs or 256 GB of memory
+estimated_needed_threads = min(math.ceil(5e-6*n_sites*n_samples/240), 32)
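+# Empirical memory model: ~0.8 GB base, per-thread per-site costs for rare and common sites,
+# plus terms that grow with the number of samples.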
+estimated_needed_memory_gb = min(math.ceil((800e-3 + 0.97e-6 * n_rare * estimated_needed_threads + 14.6e-6 * n_common * estimated_needed_threads + 6.5e-9 * (n_rare + n_common) * n_samples + 13.7e-3 * n_samples + 1.8e-6*(n_rare + n_common)*math.log(n_samples))), 256)
+# recalculate allowable threads; rounding the memory up may leave room for a few more threads
+threads_to_use = max(math.floor((estimated_needed_memory_gb - (800e-3 + 6.5e-9 * (n_rare + n_common) * n_samples + 13.7e-3 * n_samples + 1.8e-6*(n_rare + n_common)*math.log(n_samples)))/(0.97e-6 * n_rare + 14.6e-6 * n_common)), 1)
+#estimated_needed_memory_gb = math.ceil(1.2 * estimated_needed_memory_gb)
+
+with open("n_cpus_request.txt", "w") as f_cpus_request:
+    f_cpus_request.write(f'{int(threads_to_use)}')
+
+with open("memory_gb.txt", "w") as f_mem:
+    f_mem.write(f'{int(estimated_needed_memory_gb)}')
+EOF
+    >>>
+
+    runtime {
+        docker: "us.gcr.io/broad-dsde-methods/python-data-slim:1.0"
+    }
+
+    output {
+        Int memory_gb = read_int("memory_gb.txt")
+        Int request_n_cpus = read_int("n_cpus_request.txt")
+    }
+}