Skip to content

Commit 5422d1e

Browse files
authored
Merge pull request #12 from aofarrel/mickey-and-minnie
Dot Product Scatter
2 parents dc4194f + 90a4f67 commit 5422d1e

File tree

3 files changed

+137
-0
lines changed

3 files changed

+137
-0
lines changed

README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,9 @@ Checks if two files are equivalent, as opposed to arraycheck_* iterating through
2222

2323
## workflow-level
2424

25+
### dot_product_scatter
26+
Example of how to use the `Pair` type along with `zip()` to do a dot product scatter. This can be used to take a previously scattered task's inputs and outputs and scatter over them again, so that each input stays linked with the output it generated.
27+
2528
### metamouse
2629
Checker/Debugger for Stuart tasks. The test files are derived from the [WDL translation](https://github.com/DataBiosphere/analysis_pipeline_WDL) of the [UWGAC TOPMed Pipeline](https://github.com/UW-GAC/analysis_pipeline).
2730

dot_product_scatter.json

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
{
2+
"dot_product_scatter.gds_files": [
3+
"gs://topmed_workflow_testing/UWGAC_WDL/checker/a_vcf2gds/1KG_phase3_subset_chr5.gds",
4+
"gs://topmed_workflow_testing/UWGAC_WDL/checker/a_vcf2gds/1KG_phase3_subset_chr9.gds",
5+
"gs://topmed_workflow_testing/UWGAC_WDL/checker/a_vcf2gds/1KG_phase3_subset_chr11.gds",
6+
"gs://topmed_workflow_testing/UWGAC_WDL/checker/a_vcf2gds/1KG_phase3_subset_chr12.gds",
7+
"gs://topmed_workflow_testing/UWGAC_WDL/checker/a_vcf2gds/1KG_phase3_subset_chr19.gds",
8+
"gs://topmed_workflow_testing/UWGAC_WDL/checker/a_vcf2gds/1KG_phase3_subset_chr21.gds",
9+
"gs://topmed_workflow_testing/UWGAC_WDL/checker/a_vcf2gds/1KG_phase3_subset_chr22.gds"
10+
]
11+
}

dot_product_scatter.wdl

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
version 1.0
2+
3+
# This workflow is an example of the Pair[X, Y] type used with zip(). It is based upon
4+
# an LD pruning workflow, but should not be used for actual scientific
5+
# analysis -- use this instead:
6+
# https://dockstore.org/workflows/github.com/DataBiosphere/analysis_pipeline_WDL/ld-pruning-wdl
7+
8+
task ld_pruning {
  # Writes a small config file for ld_pruning.R, runs that R script on a single
  # GDS file, and exposes the first *.RData file found in the working directory.
  #
  # Inputs:
  #   gds_file  GDS file handed to the R script via the config file
  #   addldisk  extra disk (GB) added on top of the rounded-up size of gds_file
  #   cpu       value passed to the `cpu` runtime attribute
  #   memory    memory in GB
  #   preempt   value passed to the `preemptible` runtime attribute
  # Output:
  #   ld_pruning_output  first file matching *.RData after the script finishes
  input {
    File gds_file

    # runtime attributes
    Int addldisk = 5
    Int cpu = 2
    Int memory = 4
    Int preempt = 3
  }

  # Estimate disk size required: input size rounded up to whole GB, plus headroom
  Int gds_size = ceil(size(gds_file, "GB"))
  Int final_disk_size = gds_size + addldisk

  command {
    set -eux -o pipefail

    # Generate a configuration file -- this is specific to the R script that this
    # task uses; generally, you wouldn't do this for most workflows.
    python << CODE
    import os

    # The R script expects the GDS files to contain "chr*" where * is chr number/X/Y,
    # so use some string manipulation to determine the output file name. Again, this
    # is one of those tricks that is specific to this particular R script.
    #
    # Only the file name is inspected, never the directories above it: the
    # localized path may contain "chr" in a directory name even when the file
    # name does not, and splitting such a file name would leave no second part.
    gds_basename = os.path.splitext(os.path.basename("~{gds_file}"))[0]
    if "chr" in gds_basename:
        parts = gds_basename.split("chr")
        outfile_temp = "pruned_variants_chr" + parts[1] + ".RData"
    else:
        outfile_temp = "pruned_variants.RData"

    # The context manager closes the config file even if a write fails.
    with open("ld_pruning.config", "a") as f:
        f.write('gds_file "~{gds_file}"\n')
        f.write('genome_build hg38\n')
        f.write('out_file "' + outfile_temp + '"\n')
    CODE

    echo "Calling R script ld_pruning.R"
    Rscript /usr/local/analysis_pipeline/R/ld_pruning.R ld_pruning.config
  }

  runtime {
    cpu: cpu
    docker: "uwgac/topmed-master@sha256:0bb7f98d6b9182d4e4a6b82c98c04a244d766707875ddfd8a48005a9f5c5481e"
    disks: "local-disk " + final_disk_size + " HDD"
    memory: "${memory} GB"
    preemptible: "${preempt}"
  }

  output {
    # glob() returns an array; only its first element is exposed.
    File ld_pruning_output = glob("*.RData")[0]
  }

}
62+
63+
task echo_pairs {
  # Prints both members of a Pair[File, File] to stdout. It declares no outputs;
  # it only demonstrates that the two files arrive in the task linked together.
  #
  # Inputs:
  #   gds_n_varinc  pair of files; .left is printed as the GDS file and .right
  #                 as the variant file produced from it
  #   addldisk      extra disk (GB) added on top of the rounded-up size of .left
  #   cpu           value passed to the `cpu` runtime attribute
  #   memory        memory in GB
  #   preempt       value passed to the `preemptible` runtime attribute
  input {
    Pair[File, File] gds_n_varinc  # [gds, variants to prune]

    # runtime attributes
    Int addldisk = 5
    Int cpu = 2
    Int memory = 4
    Int preempt = 3
  }

  # Estimate disk size required -- only the left (GDS) file is measured
  Int gds_size = ceil(size(gds_n_varinc.left, "GB"))
  Int final_disk_size = gds_size + addldisk

  command {

    printf "GDS file: ~{gds_n_varinc.left}\n\n"
    printf "Resulting variant file it output: ~{gds_n_varinc.right}\n\n"
    printf "We can now call another R script to subset each GDS file via the variants file..."
    printf "...but we won't, because I want to encourage you to use this workflow instead: "
    printf "https://dockstore.org/workflows/github.com/DataBiosphere/analysis_pipeline_WDL/ld-pruning-wdl"

  }

  runtime {
    cpu: cpu
    docker: "uwgac/topmed-master@sha256:0bb7f98d6b9182d4e4a6b82c98c04a244d766707875ddfd8a48005a9f5c5481e"
    disks: "local-disk " + final_disk_size + " HDD"
    memory: "${memory} GB"
    # The key is `preemptible`, matching the ld_pruning task in this file; the
    # previous spelling `preemptibles` is not the key that task uses, so the
    # `preempt` input likely had no effect here.
    preemptible: "${preempt}"
  }
}
96+
97+
98+
99+
workflow dot_product_scatter {
  meta {
    author: "Ash O'Farrell"
    email: "aofarrel@ucsc.edu"
  }

  input {
    Array[File] gds_files
  }

  # First scatter: run ld_pruning once per GDS file. Outside the scatter,
  # ld_pruning.ld_pruning_output is an array with one entry per call.
  scatter(one_gds in gds_files) {
    call ld_pruning {
      input:
        gds_file = one_gds
    }
  }

  # CWL uses a dotproduct scatter; this is the closest WDL equivalent.
  # zip() pairs the i-th input file with the i-th ld_pruning output, so each
  # echo_pairs call receives a GDS file together with the file made from it.
  scatter(gds_and_output in zip(gds_files, ld_pruning.ld_pruning_output)) {
    call echo_pairs {
      input:
        gds_n_varinc = gds_and_output
    }
  }
}

0 commit comments

Comments
 (0)