|
| 1 | +version 1.0 |
| 2 | + |
| 3 | +# Modified from "official" Dockstore version to allow custom PAR regions for male samples |
| 4 | +# Allows for use with non-hg37/38 references |
| 5 | +workflow runDipcall { |
| 6 | + input { |
| 7 | + File assemblyFastaPat |
| 8 | + File assemblyFastaMat |
| 9 | + File referenceFasta |
| 10 | + Boolean isMaleSample |
| 11 | + File? custom_PAR_bed |
| 12 | + String sample_name = "dipcall" |
| 13 | + String output_file_name = "cleaned-indexed" |
| 14 | + } |
| 15 | + |
| 16 | + call dipcall { |
| 17 | + input: |
| 18 | + assemblyFastaPat=assemblyFastaPat, |
| 19 | + assemblyFastaMat=assemblyFastaMat, |
| 20 | + referenceFasta=referenceFasta, |
| 21 | + isMaleSample=isMaleSample, |
| 22 | + custom_PAR_bed=custom_PAR_bed |
| 23 | + } |
| 24 | + |
| 25 | + # Note: Highly recommended to include custom PAR bed to clean missing genotypes in haploid regions for male samples |
| 26 | + # This allows easier downstream analysis for comparison tools |
| 27 | + if ((isMaleSample) && (defined(custom_PAR_bed))) { |
| 28 | + call CleanHaploidCalls { |
| 29 | + input: |
| 30 | + dipcall_vcf=dipcall.outputVCF, |
| 31 | + PAR_bed=select_first([custom_PAR_bed]) |
| 32 | + } |
| 33 | + } |
| 34 | + |
| 35 | + call IndexVCF { |
| 36 | + input: |
| 37 | + input_vcf=select_first([CleanHaploidCalls.cleaned_vcf, dipcall.outputVCF]), |
| 38 | + sample_name=sample_name, |
| 39 | + output_file_name=output_file_name |
| 40 | + } |
| 41 | + |
| 42 | + output{ |
| 43 | + File outputTarball = dipcall.outputTarball |
| 44 | + File outputVCF = IndexVCF.vcf |
| 45 | + File outputVCF_index = IndexVCF.vcf_index |
| 46 | + File outputBED = dipcall.outputBED |
| 47 | + } |
| 48 | +} |
| 49 | + |
| 50 | +task dipcall { |
| 51 | + input { |
| 52 | + File assemblyFastaPat |
| 53 | + File assemblyFastaMat |
| 54 | + File referenceFasta |
| 55 | + Boolean isMaleSample |
| 56 | + Boolean referenceIsHS38 = true |
| 57 | + File? custom_PAR_bed |
| 58 | + Int memSizeGB = 64 |
| 59 | + Int threadCount = 16 |
| 60 | + Int diskSizeGB = 64 |
| 61 | + String dockerImage = "humanpangenomics/hpp_dipcall_v0.3:latest" |
| 62 | + } |
| 63 | + |
| 64 | + command <<< |
| 65 | + # Set the exit code of a pipeline to that of the rightmost command |
| 66 | + # to exit with a non-zero status, or zero if all commands of the pipeline exit |
| 67 | + set -o pipefail |
| 68 | + # cause a bash script to exit immediately when a command fails |
| 69 | + set -e |
| 70 | + # cause the bash shell to treat unset variables as an error and exit immediately |
| 71 | + set -u |
| 72 | + # echo each line of the script to stdout so we can see what is happening |
| 73 | + # to turn off echo do 'set +o xtrace' |
| 74 | + set -o xtrace |
| 75 | + PATH="/root/bin/samtools_1.9:$PATH" |
| 76 | + |
| 77 | + # get output base |
| 78 | + PREFIX=$(basename "~{assemblyFastaPat}" | sed 's/.gz$//' | sed 's/.fa\(sta\)*$//' | sed 's/[._][pm]at\(ernal\)*//') |
| 79 | + mkdir "$PREFIX.dipcall" |
| 80 | + |
| 81 | + # prep paternal |
| 82 | + PAT_FILENAME=$(basename -- "~{assemblyFastaPat}") |
| 83 | + if [[ $PAT_FILENAME =~ \.gz$ ]]; then |
| 84 | + cp "~{assemblyFastaPat}" . |
| 85 | + gunzip "$PAT_FILENAME" |
| 86 | + PAT_FILENAME="${PAT_FILENAME%.gz}" |
| 87 | + else |
| 88 | + ln -s "~{assemblyFastaPat}" "$PAT_FILENAME" |
| 89 | + fi |
| 90 | + |
| 91 | + # prep maternal |
| 92 | + MAT_FILENAME=$(basename -- "~{assemblyFastaMat}") |
| 93 | + if [[ $MAT_FILENAME =~ \.gz$ ]]; then |
| 94 | + cp "~{assemblyFastaMat}" . |
| 95 | + gunzip "$MAT_FILENAME" |
| 96 | + MAT_FILENAME="${MAT_FILENAME%.gz}" |
| 97 | + else |
| 98 | + ln -s "~{assemblyFastaMat}" "$MAT_FILENAME" |
| 99 | + fi |
| 100 | + |
| 101 | + # prep reference |
| 102 | + REF_FILENAME=$(basename -- "~{referenceFasta}") |
| 103 | + if [[ $REF_FILENAME =~ \.gz$ ]]; then |
| 104 | + cp ~{referenceFasta} . |
| 105 | + gunzip "$REF_FILENAME" |
| 106 | + REF_FILENAME="${REF_FILENAME%.gz}" |
| 107 | + else |
| 108 | + ln -s "~{referenceFasta}" "$REF_FILENAME" |
| 109 | + fi |
| 110 | + samtools faidx "$REF_FILENAME" |
| 111 | + |
| 112 | + # initialize script |
| 113 | + cmd=( /opt/dipcall/dipcall.kit/run-dipcall ) |
| 114 | + |
| 115 | + # male samples need PAR region excluded |
| 116 | + if [[ ~{isMaleSample} == true ]]; then |
| 117 | + if [[ "~{referenceIsHS38}" ]]; then |
| 118 | + cmd+=( -x /opt/dipcall/dipcall.kit/hs38.PAR.bed ) |
| 119 | + elif [ -f "~{custom_PAR_bed}" ]; then |
| 120 | + cmd+=( -x "~{custom_PAR_bed}" ) |
| 121 | + else |
| 122 | + echo "WARNING: Assuming reference is hg37 because input is not hg38 and no custom PAR provided." |
| 123 | + echo "Using built-in PAR regions for hg37 to exclude from calling..." |
| 124 | + cmd+=( -x /opt/dipcall/dipcall.kit/hs37d5.PAR.bed ) |
| 125 | + fi |
| 126 | + |
| 127 | + fi |
| 128 | + |
| 129 | + # finalize script |
| 130 | + cmd+=( "${PREFIX}".dipcall/"${PREFIX}" ) |
| 131 | + cmd+=( "$REF_FILENAME" ) |
| 132 | + cmd+=( "$PAT_FILENAME" ) |
| 133 | + cmd+=( "$MAT_FILENAME" ) |
| 134 | + |
| 135 | + # generate makefile |
| 136 | + "${cmd[@]}" > "${PREFIX}.mak" |
| 137 | + |
| 138 | + # run dipcall |
| 139 | + make -j 2 -f "${PREFIX}.mak" |
| 140 | + |
| 141 | + # finalize |
| 142 | + rm "${PREFIX}".dipcall/*sam.gz |
| 143 | + tar czvf "${PREFIX}.dipcall.tar.gz" "${PREFIX}".dipcall/ |
| 144 | + cp "${PREFIX}.dipcall/${PREFIX}.dip.bed" "${PREFIX}.dipcall.bed" |
| 145 | + cp "${PREFIX}.dipcall/${PREFIX}.dip.vcf.gz" "${PREFIX}.dipcall.vcf.gz" |
| 146 | + |
| 147 | + >>> |
| 148 | + output { |
| 149 | + File outputTarball = glob("*.dipcall.tar.gz")[0] |
| 150 | + File outputVCF = glob("*.dipcall.vcf.gz")[0] |
| 151 | + File outputBED = glob("*.dipcall.bed")[0] |
| 152 | + } |
| 153 | + runtime { |
| 154 | + memory: memSizeGB + " GB" |
| 155 | + cpu: threadCount |
| 156 | + disks: "local-disk " + diskSizeGB + " SSD" |
| 157 | + docker: dockerImage |
| 158 | + preemptible: 1 |
| 159 | + } |
| 160 | +} |
| 161 | + |
| 162 | +# Task assumes contig names in PAR bed match the allosome names in Dipcall VCF, but otherwise name agnostic |
| 163 | +# Also assumes that PAR are on the edges of X/Y with two connected components and nowhere else, as in humans |
| 164 | +task CleanHaploidCalls { |
| 165 | + input { |
| 166 | + File dipcall_vcf |
| 167 | + File PAR_bed |
| 168 | + |
| 169 | + # Runtime arguments |
| 170 | + Int disk_size = 2*ceil(size(dipcall_vcf, "GB")) + 50 |
| 171 | + Int cpu = 4 |
| 172 | + Int memory = 8 |
| 173 | + } |
| 174 | + |
| 175 | + command <<< |
| 176 | + set -xueo pipefail |
| 177 | + |
| 178 | + python << CODE |
| 179 | +
|
| 180 | + import pysam |
| 181 | + import csv |
| 182 | + from cyvcf2 import VCF, Writer |
| 183 | +
|
| 184 | +
|
| 185 | + input_file = "~{dipcall_vcf}" |
| 186 | + bed_file = "~{PAR_bed}" |
| 187 | +
|
| 188 | + vcf = VCF(input_file) |
| 189 | + vcf_out = Writer('cleaned.vcf.gz', vcf) |
| 190 | +
|
| 191 | + # Function for formatting variants with missing genotypes in haploid regions |
| 192 | + def make_haploid_var(variant): |
| 193 | + chrom_string = variant.CHROM |
| 194 | + pos_string = str(variant.POS) |
| 195 | + id_string = str(variant.ID) |
| 196 | + ref_string = variant.REF |
| 197 | + alt_string = ','.join(variant.ALT) |
| 198 | + qual_string = f"{variant.QUAL:g}" |
| 199 | + filter_string = variant.FILTER if variant.FILTER is not None else '.' |
| 200 | + info_string = '.' |
| 201 | + format_string = ':'.join(variant.FORMAT) |
| 202 | +
|
| 203 | + gt_string = [x for x in variant.genotypes[0][:2] if (x != -1)] # Grab non-missing entries; uses diploid assumption |
| 204 | + if len(gt_string) == 0: |
| 205 | + gt_string = '.' |
| 206 | + else: |
| 207 | + gt_string = gt_string[0] |
| 208 | + ad_string = ','.join([str(x) for x in variant.format('AD').tolist()[0]]) |
| 209 | +
|
| 210 | + var_array = [chrom_string, pos_string, id_string, ref_string, alt_string, qual_string, filter_string, |
| 211 | + info_string, format_string, f"{gt_string}:{ad_string}"] |
| 212 | + var_string = '\t'.join(var_array) |
| 213 | +
|
| 214 | + return vcf_out.variant_from_string(var_string) |
| 215 | +
|
| 216 | + # Process PAR bed file |
| 217 | + regions = [] |
| 218 | + with open(bed_file) as bed: |
| 219 | + bed_rows = csv.reader(bed, delimiter='\t') |
| 220 | + for row in bed_rows: |
| 221 | + regions += [row] |
| 222 | +
|
| 223 | + # Computes complement using assumption on PAR region structure |
| 224 | + non_PAR = {} |
| 225 | + for i in range(0, len(regions), 2): |
| 226 | + non_PAR[regions[i][0]] = (regions[i][2], regions[i+1][1]) |
| 227 | +
|
| 228 | + # Iterate through variants and modify missing values in haploid regions |
| 229 | + for variant in vcf: |
| 230 | + # Check if variant in non-PAR X/Y region, i.e. expected haploid |
| 231 | + # Use > since bed coordinates are 0-based but VCF are 1-based |
| 232 | + if (variant.CHROM in set(non_PAR.keys())) and (variant.POS > int(non_PAR[variant.CHROM][0])) and (variant.POS <= int(non_PAR[variant.CHROM][1])): |
| 233 | + # Check if any missing genotypes |
| 234 | + if -1 in variant.genotypes[0]: |
| 235 | + variant = make_haploid_var(variant) |
| 236 | + vcf_out.write_record(variant) |
| 237 | +
|
| 238 | + vcf_out.close() |
| 239 | +
|
| 240 | + CODE |
| 241 | + >>> |
| 242 | +
|
| 243 | + runtime { |
| 244 | + docker: "us.gcr.io/broad-dsde-methods/pysam:v1.1" |
| 245 | + disks: "local-disk " + disk_size + " HDD" |
| 246 | + cpu: cpu |
| 247 | + memory: memory + "GB" |
| 248 | + } |
| 249 | +
|
| 250 | + output { |
| 251 | + File cleaned_vcf = "cleaned.vcf.gz" |
| 252 | + } |
| 253 | +} |
| 254 | +
|
| 255 | +task IndexVCF { |
| 256 | + input { |
| 257 | + File input_vcf |
| 258 | + String sample_name = "dipcall" |
| 259 | + String output_file_name = "cleaned-indexed" |
| 260 | + } |
| 261 | +
|
| 262 | + command <<< |
| 263 | + set -xe |
| 264 | +
|
| 265 | + echo "~{sample_name}" > samples.txt |
| 266 | + bcftools reheader -s samples.txt -o "~{output_file_name}.vcf.gz" ~{input_vcf} |
| 267 | + bcftools index -t -f "~{output_file_name}.vcf.gz" |
| 268 | + >>> |
| 269 | +
|
| 270 | + runtime { |
| 271 | + docker: "us.gcr.io/broad-dsde-methods/bcftools:v1.0" |
| 272 | + cpu: 2 |
| 273 | + memory: "8GB" |
| 274 | + disks: "local-disk " + 100 + " HDD" |
| 275 | + } |
| 276 | +
|
| 277 | + output { |
| 278 | + File vcf = "~{output_file_name}.vcf.gz" |
| 279 | + File vcf_index = "~{output_file_name}.vcf.gz.tbi" |
| 280 | + } |
| 281 | +} |
0 commit comments