Skip to content

Commit 07c6eca

Browse files
authored
Constrain PCA to high call rate sites, and enable intersection of "good" sites from multiple arrays (#20)
* subset pca to sites with very high call rate * pca snps only * typo * typo * extract-intersect typo * typo * maf orig site selection * remove unused call to ExtractIDs * initial sketch * intersection for PopulationPCA * subset_to_sites option in PCA
1 parent ad65411 commit 07c6eca

File tree

2 files changed

+84
-42
lines changed

2 files changed

+84
-42
lines changed

ImputationPipeline/EndToEndPipeline.wdl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,8 +65,8 @@ workflow EndToEndPipeline {
6565
population_vcf = population_vcf,
6666
population_vcf_index = population_vcf_index,
6767
basename = population_basename,
68-
imputed_array_vcf = ImputationSteps.imputed_multisample_vcf,
69-
imputed_array_vcf_index = ImputationSteps.imputed_multisample_vcf_index,
68+
imputed_array_vcfs = [ImputationSteps.imputed_multisample_vcf],
69+
original_array_vcfs = [multi_sample_vcf]
7070
}
7171
}
7272

ImputationPipeline/PerformPopulationPCA.wdl

Lines changed: 82 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,9 @@ workflow PerformPopulationPCA {
77
File population_vcf # Like Thousand Genomes
88
File population_vcf_index # Like Thousand Genomes
99
String basename # what the outputs will be named
10-
File imputed_array_vcf # limit to TYPED and TYPED_ONLY sites before LD pruning. Also will limit population to sites in imputed vcf for scoring correction
11-
File imputed_array_vcf_index
10+
Array[File] imputed_array_vcfs # limit to TYPED and TYPED_ONLY sites before LD pruning. Also will limit population to sites in imputed vcf for scoring correction
11+
Array[File] original_array_vcfs
12+
Array[File]? subset_to_sites
1213
}
1314

1415
# this task seaparates multiallelics and changes variant IDs to chr:pos:ref:alt1 (bc there are no multiallelics now, alt1=alt)
@@ -19,43 +20,41 @@ workflow PerformPopulationPCA {
1920
output_basename = basename + ".no_multiallelics"
2021
}
2122

22-
call UpdateVariantIds {
23-
input:
24-
vcf = imputed_array_vcf,
25-
basename = basename + ".original_array.updated_ids."
26-
}
23+
# we use sorted variant IDs so this step makes sure the variant IDs are in the format of chr:pos:allele1:allele2 where allele1
24+
# and allele2 are sorted
25+
call SortVariantIds {
26+
input:
27+
vcf = SeparateMultiallelics.output_vcf,
28+
basename = basename + ".sorted_ids"
29+
}
2730

28-
# we use sorted variant IDs so this step makes sure the variant IDs are in the format of chr:pos:allele1:allele2 where allele1
29-
# and allele2 are sorted
30-
call SortVariantIds {
31-
input:
32-
vcf = SeparateMultiallelics.output_vcf,
33-
basename = basename + ".sorted_ids"
34-
}
31+
scatter (imputed_array_vcf in imputed_array_vcfs) {
32+
call UpdateVariantIds as UpdateVariantIdsImputed {
33+
input:
34+
vcf = imputed_array_vcf,
35+
basename = basename + ".imputed_array.updated_ids."
36+
}
3537

36-
call SortVariantIds as SortVariantIdsImputedArray {
37-
input:
38-
vcf = UpdateVariantIds.output_vcf,
39-
basename = basename + ".orginal_array.sorted_ids"
40-
}
4138

42-
call ExtractIDs as ExtractIDsAll {
43-
input:
44-
vcf = SortVariantIdsImputedArray.output_vcf,
45-
output_basename = basename
46-
}
4739

48-
call SelectTypedSites {
49-
input:
50-
vcf = SortVariantIdsImputedArray.output_vcf,
51-
basename = basename
52-
}
40+
call SortVariantIds as SortVariantIdsImputedArray {
41+
input:
42+
vcf = UpdateVariantIdsImputed.output_vcf,
43+
basename = basename + ".imputed_array.sorted_ids"
44+
}
5345

54-
call ExtractIDs as ExtractIDsTyped {
55-
input:
56-
vcf = SelectTypedSites.output_vcf,
57-
output_basename = basename
58-
}
46+
call SelectTypedSites {
47+
input:
48+
vcf = SortVariantIdsImputedArray.output_vcf,
49+
basename = basename
50+
}
51+
52+
call ExtractIDs as ExtractIDsTyped {
53+
input:
54+
vcf = SelectTypedSites.output_vcf,
55+
output_basename = basename
56+
}
57+
}
5958

6059
call SubsetToArrayVCF {
6160
input:
@@ -65,6 +64,14 @@ workflow PerformPopulationPCA {
6564
intervals_index = SelectTypedSites.output_vcf_index,
6665
basename = basename + ".sorted_ids.subsetted"
6766
}
67+
68+
scatter (original_array_vcf in original_array_vcfs) {
69+
call SelectSitesOriginalArray {
70+
input:
71+
vcf = original_array_vcf,
72+
basename = basename
73+
}
74+
}
6875

6976
# this performs some basic QC steps (filtering by MAF, HWE, etc.), as well as
7077
# generating a plink-style bim,bed,fam format that has been limited to LD pruned
@@ -74,7 +81,9 @@ workflow PerformPopulationPCA {
7481
input:
7582
vcf = SubsetToArrayVCF.output_vcf,
7683
basename = basename,
77-
original_array_sites = ExtractIDsTyped.ids
84+
imputed_typed_sites = ExtractIDsTyped.ids,
85+
original_array_sites = SelectSitesOriginalArray.ids,
86+
selected_sites = subset_to_sites
7887
}
7988

8089
# perform PCA using flashPCA
@@ -110,6 +119,37 @@ workflow PerformPopulationPCA {
110119
}
111120
}
112121

122+
task SelectSitesOriginalArray {
123+
input {
124+
File vcf
125+
String basename
126+
Int mem = 8
127+
}
128+
129+
Int disk_size = ceil(size(vcf, "GB")) + 50
130+
131+
command <<<
132+
/plink2 --vcf ~{vcf} \
133+
--set-all-var-ids @:#:\$1:\$2 \
134+
--rm-dup force-first \
135+
--geno 0.001 \
136+
--maf 0.01 \
137+
--snps-only \
138+
--write-snplist \
139+
--out ~{basename}_selected
140+
>>>
141+
142+
runtime {
143+
docker: "skwalker/plink2:first"
144+
disks: "local-disk 400 HDD"
145+
memory: mem + " GB"
146+
}
147+
148+
output {
149+
File ids = "~{basename}_selected.snplist"
150+
}
151+
}
152+
113153
task SelectTypedSites {
114154
input {
115155
File vcf
@@ -144,7 +184,9 @@ task SelectTypedSites {
144184
task LDPruning {
145185
input {
146186
File vcf
147-
File original_array_sites
187+
Array[File] original_array_sites
188+
Array[File] imputed_typed_sites
189+
Array[File]? selected_sites
148190
Int mem = 8
149191
String basename
150192
}
@@ -156,7 +198,7 @@ task LDPruning {
156198
--rm-dup force-first \
157199
--geno 0.05 \
158200
--hwe 1e-10 \
159-
--extract ~{original_array_sites} \
201+
--extract-intersect ~{sep=" " original_array_sites} ~{sep = " " imputed_typed_sites} ~{sep=" " selected_sites} \
160202
--indep-pairwise 1000 50 0.2 \
161203
--maf 0.01 \
162204
--allow-extra-chr \
@@ -387,14 +429,14 @@ task SubsetToArrayVCF {
387429
input {
388430
File vcf
389431
File vcf_index
390-
File intervals
391-
File? intervals_index
432+
Array[File] intervals
433+
Array[File] intervals_index
392434
String basename
393435
Int disk_size = 3*ceil(size([vcf, intervals, vcf_index], "GB")) + 20
394436
}
395437

396438
command {
397-
gatk SelectVariants -V ~{vcf} -L ~{intervals} -O ~{basename}.vcf.gz
439+
gatk SelectVariants -V ~{vcf} -L ~{sep =" -L " intervals} --interval-set-rule INTERSECTION -O ~{basename}.vcf.gz
398440
}
399441

400442
runtime {

0 commit comments

Comments
 (0)