@@ -7,8 +7,9 @@ workflow PerformPopulationPCA {
7
7
File population_vcf # Like Thousand Genomes
8
8
File population_vcf_index # Like Thousand Genomes
9
9
String basename # what the outputs will be named
10
- File imputed_array_vcf # limit to TYPED and TYPED_ONLY sites before LD pruning. Also will limit population to sites in imputed vcf for scoring correction
11
- File imputed_array_vcf_index
10
+ Array [File ] imputed_array_vcfs # limit to TYPED and TYPED_ONLY sites before LD pruning. Also will limit population to sites in imputed vcf for scoring correction
11
+ Array [File ] original_array_vcfs
12
+ Array [File ]? subset_to_sites
12
13
}
13
14
14
15
# this task separates multiallelics and changes variant IDs to chr:pos:ref:alt1 (because there are no multiallelics now, alt1=alt)
@@ -19,43 +20,41 @@ workflow PerformPopulationPCA {
19
20
output_basename = basename + ".no_multiallelics"
20
21
}
21
22
22
- call UpdateVariantIds {
23
- input :
24
- vcf = imputed_array_vcf ,
25
- basename = basename + ".original_array.updated_ids."
26
- }
23
+ # we use sorted variant IDs so this step makes sure the variant IDs are in the format of chr:pos:allele1:allele2 where allele1
24
+ # and allele2 are sorted
25
+ call SortVariantIds {
26
+ input :
27
+ vcf = SeparateMultiallelics .output_vcf ,
28
+ basename = basename + ".sorted_ids"
29
+ }
27
30
28
- # we use sorted variant IDs so this step makes sure the variant IDs are in the format of chr:pos:allele1:allele2 where allele1
29
- # and allele2 are sorted
30
- call SortVariantIds {
31
- input :
32
- vcf = SeparateMultiallelics .output_vcf ,
33
- basename = basename + ".sorted_ids"
34
- }
31
+ scatter (imputed_array_vcf in imputed_array_vcfs ) {
32
+ call UpdateVariantIds as UpdateVariantIdsImputed {
33
+ input :
34
+ vcf = imputed_array_vcf ,
35
+ basename = basename + ".imputed_array.updated_ids."
36
+ }
35
37
36
- call SortVariantIds as SortVariantIdsImputedArray {
37
- input :
38
- vcf = UpdateVariantIds .output_vcf ,
39
- basename = basename + ".orginal_array.sorted_ids"
40
- }
41
38
42
- call ExtractIDs as ExtractIDsAll {
43
- input :
44
- vcf = SortVariantIdsImputedArray .output_vcf ,
45
- output_basename = basename
46
- }
47
39
48
- call SelectTypedSites {
49
- input :
50
- vcf = SortVariantIdsImputedArray .output_vcf ,
51
- basename = basename
52
- }
40
+ call SortVariantIds as SortVariantIdsImputedArray {
41
+ input :
42
+ vcf = UpdateVariantIdsImputed .output_vcf ,
43
+ basename = basename + ".imputed_array.sorted_ids"
44
+ }
53
45
54
- call ExtractIDs as ExtractIDsTyped {
55
- input :
56
- vcf = SelectTypedSites .output_vcf ,
57
- output_basename = basename
58
- }
46
+ call SelectTypedSites {
47
+ input :
48
+ vcf = SortVariantIdsImputedArray .output_vcf ,
49
+ basename = basename
50
+ }
51
+
52
+ call ExtractIDs as ExtractIDsTyped {
53
+ input :
54
+ vcf = SelectTypedSites .output_vcf ,
55
+ output_basename = basename
56
+ }
57
+ }
59
58
60
59
call SubsetToArrayVCF {
61
60
input :
@@ -65,6 +64,14 @@ workflow PerformPopulationPCA {
65
64
intervals_index = SelectTypedSites .output_vcf_index ,
66
65
basename = basename + ".sorted_ids.subsetted"
67
66
}
67
+
68
+ scatter (original_array_vcf in original_array_vcfs ) {
69
+ call SelectSitesOriginalArray {
70
+ input :
71
+ vcf = original_array_vcf ,
72
+ basename = basename
73
+ }
74
+ }
68
75
69
76
# this performs some basic QC steps (filtering by MAF, HWE, etc.), as well as
70
77
# generating a plink-style bim,bed,fam format that has been limited to LD pruned
@@ -74,7 +81,9 @@ workflow PerformPopulationPCA {
74
81
input :
75
82
vcf = SubsetToArrayVCF .output_vcf ,
76
83
basename = basename ,
77
- original_array_sites = ExtractIDsTyped .ids
84
+ imputed_typed_sites = ExtractIDsTyped .ids ,
85
+ original_array_sites = SelectSitesOriginalArray .ids ,
86
+ selected_sites = subset_to_sites
78
87
}
79
88
80
89
# perform PCA using flashPCA
@@ -110,6 +119,37 @@ workflow PerformPopulationPCA {
110
119
}
111
120
}
112
121
122
# Runs plink2 over one original-array VCF and writes the list of variant IDs
# that remain after the filters given on the command line below.
#
# Inputs:
#   vcf      - VCF to select sites from
#   basename - prefix for the plink2 output files ("<basename>_selected.*")
#   mem      - memory to request, in GB (default 8)
# Output:
#   ids      - "<basename>_selected.snplist", produced by --write-snplist
task SelectSitesOriginalArray {
  input {
    File vcf
    String basename
    Int mem = 8
  }

  # Size-based disk estimate: the input VCF plus 50 GB of headroom.
  Int disk_size = ceil(size(vcf, "GB")) + 50
  # Never request less than the 400 GB this task has always asked for, but grow
  # past it when the input is large enough that the estimate exceeds 400 GB.
  Int disk_gb = if disk_size > 400 then disk_size else 400

  command <<<
    /plink2 --vcf ~{vcf} \
    --set-all-var-ids @:#:\$1:\$2 \
    --rm-dup force-first \
    --geno 0.001 \
    --maf 0.01 \
    --snps-only \
    --write-snplist \
    --out ~{basename}_selected
  >>>

  runtime {
    docker: "skwalker/plink2:first"
    disks: "local-disk " + disk_gb + " HDD"
    memory: mem + " GB"
  }

  output {
    # plink2 names the list after the --out prefix used in the command above.
    File ids = "~{basename}_selected.snplist"
  }
}
152
+
113
153
task SelectTypedSites {
114
154
input {
115
155
File vcf
@@ -144,7 +184,9 @@ task SelectTypedSites {
144
184
task LDPruning {
145
185
input {
146
186
File vcf
147
- File original_array_sites
187
+ Array [File ] original_array_sites
188
+ Array [File ] imputed_typed_sites
189
+ Array [File ]? selected_sites
148
190
Int mem = 8
149
191
String basename
150
192
}
@@ -156,7 +198,7 @@ task LDPruning {
156
198
--rm-dup force-first \
157
199
--geno 0.05 \
158
200
--hwe 1e-10 \
159
- --extract ~ {original_array_sites} \
201
+ --extract-intersect ~ {sep= " " original_array_sites} ~ {sep = " " imputed_typed_sites} ~ {sep= " " selected_sites } \
160
202
--indep-pairwise 1000 50 0.2 \
161
203
--maf 0.01 \
162
204
--allow-extra-chr \
@@ -387,14 +429,14 @@ task SubsetToArrayVCF {
387
429
input {
388
430
File vcf
389
431
File vcf_index
390
- File intervals
391
- File ? intervals_index
432
+ Array [ File ] intervals
433
+ Array [ File ] intervals_index
392
434
String basename
393
435
Int disk_size = 3 *ceil (size ([vcf , intervals , vcf_index ], "GB" )) + 20
394
436
}
395
437
396
438
command {
397
- gatk SelectVariants -V ~ {vcf} -L ~ {intervals} -O ~ {basename}.vcf.gz
439
+ gatk SelectVariants -V ~ {vcf} -L ~ {sep = " -L " intervals} --interval-set-rule INTERSECTION -O ~ {basename}.vcf.gz
398
440
}
399
441
400
442
runtime {
0 commit comments