Skip to content

Commit da3e318

Browse files
authored
Added Dipcall WDL and reformat README for utilities (#130)
Added Dipcall WDL and reformat README for utilities
1 parent 5653fb5 commit da3e318

File tree

4 files changed

+332
-15
lines changed

4 files changed

+332
-15
lines changed

Utilities/WDLs/CollectBenchmarkSucceeded_README.md

Lines changed: 0 additions & 14 deletions
This file was deleted.

Utilities/WDLs/Dipcall.wdl

Lines changed: 281 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,281 @@
1+
version 1.0
2+
3+
# Modified from "official" Dockstore version to allow custom PAR regions for male samples
4+
# Allows for use with non-hg37/38 references
5+
# Runs Dipcall on a paternal/maternal diploid assembly pair against a chosen
# reference, optionally cleans missing genotypes in expected-haploid (non-PAR
# X/Y) regions for male samples, then reheaders and indexes the final VCF.
workflow runDipcall {
    input {
        File assemblyFastaPat               # paternal haplotype assembly fasta (.fa/.fasta, optionally .gz)
        File assemblyFastaMat               # maternal haplotype assembly fasta (.fa/.fasta, optionally .gz)
        File referenceFasta                 # reference to call variants against
        Boolean isMaleSample                # true -> exclude PAR so X/Y calls are haploid
        Boolean referenceIsHS38 = true      # set false for non-hg38 references; passed through to dipcall
        File? custom_PAR_bed                # PAR regions for custom references; also enables CleanHaploidCalls
        String sample_name = "dipcall"      # sample name written into the final VCF header
        String output_file_name = "cleaned-indexed"   # basename of the final VCF outputs
    }

    call dipcall {
        input:
            assemblyFastaPat=assemblyFastaPat,
            assemblyFastaMat=assemblyFastaMat,
            referenceFasta=referenceFasta,
            isMaleSample=isMaleSample,
            referenceIsHS38=referenceIsHS38,
            custom_PAR_bed=custom_PAR_bed
    }

    # Note: Highly recommended to include custom PAR bed to clean missing genotypes in haploid regions for male samples
    # This allows easier downstream analysis for comparison tools
    if ((isMaleSample) && (defined(custom_PAR_bed))) {
        call CleanHaploidCalls {
            input:
                dipcall_vcf=dipcall.outputVCF,
                PAR_bed=select_first([custom_PAR_bed])
        }
    }

    # Reheader with the requested sample name and tabix-index; falls back to the
    # raw dipcall VCF when CleanHaploidCalls did not run.
    call IndexVCF {
        input:
            input_vcf=select_first([CleanHaploidCalls.cleaned_vcf, dipcall.outputVCF]),
            sample_name=sample_name,
            output_file_name=output_file_name
    }

    output {
        File outputTarball = dipcall.outputTarball
        File outputVCF = IndexVCF.vcf
        File outputVCF_index = IndexVCF.vcf_index
        File outputBED = dipcall.outputBED
    }
}
49+
50+
# Runs the dipcall kit on a paternal/maternal assembly pair, producing a
# tarball of the full working directory plus the top-level dip.bed/dip.vcf.gz.
task dipcall {
    input {
        File assemblyFastaPat
        File assemblyFastaMat
        File referenceFasta
        Boolean isMaleSample
        Boolean referenceIsHS38 = true
        File? custom_PAR_bed
        Int memSizeGB = 64
        Int threadCount = 16
        Int diskSizeGB = 64
        String dockerImage = "humanpangenomics/hpp_dipcall_v0.3:latest"
    }

    command <<<
        # Set the exit code of a pipeline to that of the rightmost command
        # to exit with a non-zero status, or zero if all commands of the pipeline exit
        set -o pipefail
        # cause a bash script to exit immediately when a command fails
        set -e
        # cause the bash shell to treat unset variables as an error and exit immediately
        set -u
        # echo each line of the script to stdout so we can see what is happening
        # to turn off echo do 'set +o xtrace'
        set -o xtrace
        PATH="/root/bin/samtools_1.9:$PATH"

        # get output base: strip .gz, .fa/.fasta, and a [._]pat/mat(ernal) suffix
        PREFIX=$(basename "~{assemblyFastaPat}" | sed 's/.gz$//' | sed 's/.fa\(sta\)*$//' | sed 's/[._][pm]at\(ernal\)*//')
        mkdir "$PREFIX.dipcall"

        # prep paternal: decompress a .gz input, otherwise symlink it into the cwd
        PAT_FILENAME=$(basename -- "~{assemblyFastaPat}")
        if [[ $PAT_FILENAME =~ \.gz$ ]]; then
            cp "~{assemblyFastaPat}" .
            gunzip "$PAT_FILENAME"
            PAT_FILENAME="${PAT_FILENAME%.gz}"
        else
            ln -s "~{assemblyFastaPat}" "$PAT_FILENAME"
        fi

        # prep maternal
        MAT_FILENAME=$(basename -- "~{assemblyFastaMat}")
        if [[ $MAT_FILENAME =~ \.gz$ ]]; then
            cp "~{assemblyFastaMat}" .
            gunzip "$MAT_FILENAME"
            MAT_FILENAME="${MAT_FILENAME%.gz}"
        else
            ln -s "~{assemblyFastaMat}" "$MAT_FILENAME"
        fi

        # prep reference (quoted for paths with spaces, like the pat/mat cases)
        REF_FILENAME=$(basename -- "~{referenceFasta}")
        if [[ $REF_FILENAME =~ \.gz$ ]]; then
            cp "~{referenceFasta}" .
            gunzip "$REF_FILENAME"
            REF_FILENAME="${REF_FILENAME%.gz}"
        else
            ln -s "~{referenceFasta}" "$REF_FILENAME"
        fi
        samtools faidx "$REF_FILENAME"

        # initialize script
        cmd=( /opt/dipcall/dipcall.kit/run-dipcall )

        # male samples need PAR region excluded so X/Y calls are haploid
        if [[ ~{isMaleSample} == true ]]; then
            # BUGFIX: the original tested `[[ "~{referenceIsHS38}" ]]`, which is
            # true for any non-empty string (including "false"), so the custom
            # and hg37 branches were unreachable. A user-supplied PAR bed now
            # takes precedence over the built-in beds; `default=""` keeps the
            # placeholder valid when custom_PAR_bed is unset.
            if [ -f "~{default="" custom_PAR_bed}" ]; then
                cmd+=( -x "~{default="" custom_PAR_bed}" )
            elif [[ "~{referenceIsHS38}" == true ]]; then
                cmd+=( -x /opt/dipcall/dipcall.kit/hs38.PAR.bed )
            else
                echo "WARNING: Assuming reference is hg37 because input is not hg38 and no custom PAR provided."
                echo "Using built-in PAR regions for hg37 to exclude from calling..."
                cmd+=( -x /opt/dipcall/dipcall.kit/hs37d5.PAR.bed )
            fi

        fi

        # finalize script
        cmd+=( "${PREFIX}".dipcall/"${PREFIX}" )
        cmd+=( "$REF_FILENAME" )
        cmd+=( "$PAT_FILENAME" )
        cmd+=( "$MAT_FILENAME" )

        # generate makefile
        "${cmd[@]}" > "${PREFIX}.mak"

        # run dipcall (the generated makefile drives the per-haplotype work)
        make -j 2 -f "${PREFIX}.mak"

        # finalize: drop bulky SAMs before tarring, then expose bed/vcf at top level
        rm "${PREFIX}".dipcall/*sam.gz
        tar czvf "${PREFIX}.dipcall.tar.gz" "${PREFIX}".dipcall/
        cp "${PREFIX}.dipcall/${PREFIX}.dip.bed" "${PREFIX}.dipcall.bed"
        cp "${PREFIX}.dipcall/${PREFIX}.dip.vcf.gz" "${PREFIX}.dipcall.vcf.gz"

    >>>
    output {
        File outputTarball = glob("*.dipcall.tar.gz")[0]
        File outputVCF = glob("*.dipcall.vcf.gz")[0]
        File outputBED = glob("*.dipcall.bed")[0]
    }
    runtime {
        memory: memSizeGB + " GB"
        cpu: threadCount
        disks: "local-disk " + diskSizeGB + " SSD"
        docker: dockerImage
        preemptible: 1
    }
}
161+
162+
# Task assumes contig names in PAR bed match the allosome names in Dipcall VCF, but otherwise name agnostic
163+
# Also assumes that PAR are on the edges of X/Y with two connected components and nowhere else, as in humans
164+
# Rewrites variants in the non-PAR X/Y regions (expected haploid in male
# samples) so diploid genotypes with a missing allele (e.g. "1|.") become
# haploid calls, easing downstream comparison.
task CleanHaploidCalls {
    input {
        File dipcall_vcf   # dipcall output VCF (single-sample, diploid GT/AD)
        File PAR_bed       # PAR bed; contig names must match the VCF allosomes

        # Runtime arguments
        Int disk_size = 2*ceil(size(dipcall_vcf, "GB")) + 50
        Int cpu = 4
        Int memory = 8
    }

    command <<<
        set -xueo pipefail

        python << CODE
# NOTE(review): dropped an unused `import pysam`; only csv and cyvcf2 are used.
import csv
from cyvcf2 import VCF, Writer

input_file = "~{dipcall_vcf}"
bed_file = "~{PAR_bed}"

vcf = VCF(input_file)
vcf_out = Writer('cleaned.vcf.gz', vcf)

# Function for formatting variants with missing genotypes in haploid regions
def make_haploid_var(variant):
    chrom_string = variant.CHROM
    pos_string = str(variant.POS)
    # BUGFIX: cyvcf2 returns None for a missing ('.') ID; str(None) would have
    # written the literal string "None" into the ID column.
    id_string = variant.ID if variant.ID is not None else '.'
    ref_string = variant.REF
    alt_string = ','.join(variant.ALT)
    # BUGFIX: guard missing QUAL; f"{None:g}" raises a TypeError.
    qual_string = f"{variant.QUAL:g}" if variant.QUAL is not None else '.'
    filter_string = variant.FILTER if variant.FILTER is not None else '.'
    # INFO is intentionally dropped for rewritten records.
    info_string = '.'
    format_string = ':'.join(variant.FORMAT)

    gt_string = [x for x in variant.genotypes[0][:2] if (x != -1)]  # Grab non-missing entries; uses diploid assumption
    if len(gt_string) == 0:
        gt_string = '.'
    else:
        gt_string = gt_string[0]
    ad_string = ','.join([str(x) for x in variant.format('AD').tolist()[0]])

    var_array = [chrom_string, pos_string, id_string, ref_string, alt_string, qual_string, filter_string,
                 info_string, format_string, f"{gt_string}:{ad_string}"]
    var_string = '\t'.join(var_array)

    return vcf_out.variant_from_string(var_string)

# Process PAR bed file
regions = []
with open(bed_file) as bed:
    bed_rows = csv.reader(bed, delimiter='\t')
    for row in bed_rows:
        regions += [row]

# Computes complement using assumption on PAR region structure:
# exactly two PAR intervals per allosome, sorted, at the contig edges.
non_PAR = {}
for i in range(0, len(regions), 2):
    non_PAR[regions[i][0]] = (regions[i][2], regions[i+1][1])

# Iterate through variants and modify missing values in haploid regions
for variant in vcf:
    # Check if variant in non-PAR X/Y region, i.e. expected haploid
    # Use > since bed coordinates are 0-based but VCF are 1-based
    if (variant.CHROM in set(non_PAR.keys())) and (variant.POS > int(non_PAR[variant.CHROM][0])) and (variant.POS <= int(non_PAR[variant.CHROM][1])):
        # Check if any missing genotypes
        if -1 in variant.genotypes[0]:
            variant = make_haploid_var(variant)
    vcf_out.write_record(variant)

vcf_out.close()

CODE
    >>>

    runtime {
        docker: "us.gcr.io/broad-dsde-methods/pysam:v1.1"
        disks: "local-disk " + disk_size + " HDD"
        cpu: cpu
        memory: memory + " GB"
    }

    output {
        File cleaned_vcf = "cleaned.vcf.gz"
    }
}
254+
255+
# Reheaders a bgzipped VCF with the requested sample name and builds a
# tabix (.tbi) index for it.
task IndexVCF {
    input {
        File input_vcf                                # bgzipped VCF to reheader and index
        String sample_name = "dipcall"                # sample name written into the VCF header
        String output_file_name = "cleaned-indexed"   # basename for the output VCF/index
    }

    command <<<
        set -xe

        echo "~{sample_name}" > samples.txt
        # BUGFIX: quote the localized input path so paths containing spaces work
        bcftools reheader -s samples.txt -o "~{output_file_name}.vcf.gz" "~{input_vcf}"
        bcftools index -t -f "~{output_file_name}.vcf.gz"
    >>>

    runtime {
        docker: "us.gcr.io/broad-dsde-methods/bcftools:v1.0"
        cpu: 2
        memory: "8GB"
        disks: "local-disk " + 100 + " HDD"
    }

    output {
        File vcf = "~{output_file_name}.vcf.gz"
        File vcf_index = "~{output_file_name}.vcf.gz.tbi"
    }
}

Utilities/WDLs/IndexCramOrBam_README.md

Lines changed: 0 additions & 1 deletion
This file was deleted.

Utilities/WDLs/README.md

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
# Utility WDLs
2+
3+
This directory contains a collection of miscellaneous WDLs useful for some small tasks. Check below for documentation
4+
on each.
5+
6+
* [CollectBenchmarkSucceeded](#collectbenchmarksucceeded)
7+
* [Dipcall](#dipcall)
8+
* [IndexCramOrBam](#indexcramorbam)
9+
10+
## CollectBenchmarkSucceeded
11+
12+
### Summary
13+
14+
When running the `FindSamplesAndBenchmark` wdl with many samples, it sometimes happens that a few fail in Terra while most succeed.
15+
Unfortunately, this means that the outputs of benchmarking the successful ones don't get compiled into one convenient .csv file to use for data analysis.
16+
If you don't mind sacrificing the few that failed, or want to get started analyzing the successful ones ASAP, this wdl will automatically collect
17+
the successful outputs and aggregate them into one .csv, similar to the last task of the benchmarking wdl.
18+
19+
### Inputs
20+
21+
* `namespace`: the first personalized part of your workspace URL; e.g. if you see `<my_project>/<my_workspace>` at the top
22+
in Terra, then this should be `<my_project>` as a string.
23+
* `workspace_name`: specific name for your workspace, e.g. `<my_workspace>` in the last example.
24+
* `submission_id`: the submission id for the `FindSamplesAndBenchmark` run, found from the "Job History" tab.
25+
26+
## Dipcall
27+
28+
### Summary
29+
30+
This WDL is a modified version of the [Dipcall](https://github.com/human-pangenomics/hpp_production_workflows/blob/master/QC/wdl/tasks/dipcall.wdl)
pipeline published on Dockstore. This workflow takes in a diploid assembly and calls variants, creating a VCF against your chosen reference.
32+
This modified version allows for you to specify custom PAR regions for your reference so you can call haploid variants
33+
when appropriate. Some data cleaning and indexing of the output VCF is also performed.
34+
35+
### Inputs
36+
37+
* `assemblyFastaPat`: the haploid assembly fasta for paternally inherited chromosomes
38+
* `assemblyFastaMat`: the haploid assembly fasta for maternally inherited chromosomes
39+
* `referenceFasta`: the reference to call variants against
40+
* `isMaleSample`: set true if you would like to make haploid calls on X/Y outside the PAR region
41+
* `custom_PAR_bed`: bed file denoting pseudoautosomal (PAR) regions for your reference
42+
* `sample_name`: name to put for your sample in final output VCF
* `output_file_name`: base name (without extension) for the final cleaned and indexed VCF
* `referenceIsHS38`: set true (default) if using hg38 reference
44+
45+
46+
## IndexCramOrBam
47+
48+
### Summary
49+
50+
Use this WDL to index a CRAM or BAM file, using `samtools`. The type is inferred using the file extension (either `.cram` or `.bam`).
51+

0 commit comments

Comments
 (0)