Commit 9dd1f4e

skwalker authored

Imputation and PRS scoring pipeline (#7)

Adds Imputation and PRS scoring pipeline

Co-authored-by: Christopher Kachulis <ckachuli@broadinstitute.org>
Co-authored-by: edytamalolepsza <54959060+edytamalolepsza@users.noreply.github.com>
1 parent d620e86 commit 9dd1f4e

File tree

7 files changed: +1391 -0 lines changed


.dockstore.yml

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
version: 1.2
workflows:
  - name: ImputationWorkflow
    subclass: WDL
    primaryDescriptorPath: /ImputationPipeline/Imputation.wdl
  - name: PRScoringWorkflow
    subclass: WDL
    primaryDescriptorPath: /ImputationPipeline/ScoringPart.wdl
  - name: EndToEndPipeline
    subclass: WDL
    primaryDescriptorPath: /ImputationPipeline/EndToEndPipeline.wdl
  - name: PerformPopulationPCA
    subclass: WDL
    primaryDescriptorPath: /ImputationPipeline/PerformPopulationPCA.wdl
ImputationPipeline/EndToEndPipeline.wdl

Lines changed: 111 additions & 0 deletions
@@ -0,0 +1,111 @@
version 1.0

import "Imputation.wdl" as imputation_pipeline
import "ScoringPart.wdl" as scoring_pipeline
import "PerformPopulationPCA.wdl" as population_pipeline # for a population like Thousand Genomes

workflow EndToEndPipeline {
  input {
    File multi_sample_vcf       ## the dataset you want to impute; used both for imputation and
                                # for the scoring adjustment steps, and may be used to select sites
                                # before LD pruning if generating new population PCs
    File multi_sample_vcf_index ## index for the dataset you want to impute, used for imputation

    Boolean perform_extra_qc_steps # optional additional QC steps to perform on the `multi_sample_vcf`
                                   # before imputing -- only recommended for large and diverse sample
                                   # sets (they further restrict to sites with at least a 95% call rate
                                   # and apply a Hardy-Weinberg equilibrium filter)

    ## Set to true to generate new PCs for your population, either because you have a new population
    ## (i.e. you don't want to use Thousand Genomes) or because you have a new array and want to limit
    ## the projection to sites called in that array
    Boolean generate_population_pcs = false

    # the population vcf to use for the scoring adjustment and, if `generate_population_pcs` is set to
    # true, for generating the population PCs; this defaults to Thousand Genomes
    File population_vcf = "gs://fc-6413177b-e99c-4476-b085-3da80d320081/RiskScoreAdjustmentFiles/thousand_genomes_sorted_variant_ids.vcf.gz"
    File population_vcf_index = "gs://fc-6413177b-e99c-4476-b085-3da80d320081/RiskScoreAdjustmentFiles/thousand_genomes_sorted_variant_ids.vcf.gz.tbi"

    ## if generate_population_pcs is false, you need to provide the PC files used to project the
    ## array data onto that population -- these default to the Thousand Genomes files built from
    ## the sites Wallace generated after LD pruning
    File? population_loadings = "gs://fc-6413177b-e99c-4476-b085-3da80d320081/RiskScoreAdjustmentFiles/WallacesPCASites/sorted_thousand_genomes_wallace_sites.pc.loadings"
    File? population_meansd = "gs://fc-6413177b-e99c-4476-b085-3da80d320081/RiskScoreAdjustmentFiles/WallacesPCASites/sorted_thousand_genomes_wallace_sites.pc.meansd"
    File? population_pcs = "gs://fc-6413177b-e99c-4476-b085-3da80d320081/RiskScoreAdjustmentFiles/WallacesPCASites/sorted_thousand_genomes_wallace_sites.pc"
    File? pruning_sites_for_pca = "gs://fc-6413177b-e99c-4476-b085-3da80d320081/RiskScoreAdjustmentFiles/WallacesPCASites/wallace_pruning_sites_sorted_ids.txt"

    ## The following are inputs for scoring and performing the adjustment

    File disease_weights # disease weights file. Because we use variant IDs with sorted alleles,
                         # there is a task at the bottom of this workflow
    String? columns_for_scoring # Plink expects the first 3 columns in your weights file to be
                                # variant ID, effect allele, effect weight

    Int scoring_mem = 16            # memory for scoring the imputed array
    Int population_scoring_mem = 16 # memory for scoring the population VCF

    # output names (what the files will be named)
    String output_callset_name # the name for the imputed callset
    String population_basename # the basename for the output PCs if `generate_population_pcs` is true
  }

  call imputation_pipeline.ImputationPipeline as ImputationSteps {
    input:
      multi_sample_vcf = multi_sample_vcf,
      multi_sample_vcf_index = multi_sample_vcf_index,
      perform_extra_qc_steps = perform_extra_qc_steps,
      output_callset_name = output_callset_name
  }

  if (generate_population_pcs) {
    call population_pipeline.PerformPopulationPCA as PopulationPCASteps {
      input:
        population_vcf = population_vcf,
        population_vcf_index = population_vcf_index,
        basename = population_basename,
        original_array_vcf = multi_sample_vcf,
        original_array_vcf_index = multi_sample_vcf_index,
        bad_variant_id_format = true # updates the variant ids to the format we use: chr:pos:allele1:allele2
    }
  }

  call scoring_pipeline.ScoringImputedDataset as ScoringSteps {
    input:
      weights = disease_weights,
      columns_for_scoring = columns_for_scoring,
      imputed_array_vcf = ImputationSteps.imputed_multisample_vcf,
      scoring_mem = scoring_mem,
      population_scoring_mem = population_scoring_mem,
      population_vcf = select_first([PopulationPCASteps.sorted_variant_id_dataset, population_vcf]),
      population_basename = population_basename,
      basename = output_callset_name,
      # either use the newly generated PC files or the input loadings/meansd/pcs/pruning sites
      population_loadings = select_first([PopulationPCASteps.population_loadings, population_loadings]),
      population_meansd = select_first([PopulationPCASteps.population_meansd, population_meansd]),
      population_pcs = select_first([PopulationPCASteps.population_pcs, population_pcs]),
      pruning_sites_for_pca = select_first([PopulationPCASteps.pruning_sites_for_pca, pruning_sites_for_pca])
  }

  output {
    # the final imputed VCF + index
    File imputed_multisample_vcf = ImputationSteps.imputed_multisample_vcf
    File imputed_multisample_vcf_index = ImputationSteps.imputed_multisample_vcf_index

    # the following files will only be generated if `generate_population_pcs` is set to true;
    # they can be used in future runs to calculate just the scores and the scoring adjustment
    File? new_population_loadings = PopulationPCASteps.population_loadings
    File? new_population_meansd = PopulationPCASteps.population_meansd
    File? new_population_pcs = PopulationPCASteps.population_pcs
    File? new_pruning_sites_for_pca = PopulationPCASteps.pruning_sites_for_pca
    File? new_population_dataset_with_sorted_variant_ids = PopulationPCASteps.sorted_variant_id_dataset
    File? new_population_dataset_index_with_sorted_variant_ids = PopulationPCASteps.sorted_variant_id_dataset_index

    # results from the scoring part
    File pc_plot = ScoringSteps.pc_plot
    File adjusted_population_scores = ScoringSteps.adjusted_population_scores
    File adjusted_array_scores = ScoringSteps.adjusted_array_scores
  }
}
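For orientation (not part of this commit): a WDL workflow like EndToEndPipeline is usually launched with an inputs JSON whose keys take the form `<workflow name>.<input name>`. The sketch below is illustrative only; it fills just the inputs that have no default (plus `generate_population_pcs`, shown explicitly for clarity), and every path and value is a placeholder. The optional inputs (population VCF, PC files, pruning sites, memory) are omitted so the Thousand Genomes defaults declared above apply.

{
  "EndToEndPipeline.multi_sample_vcf": "gs://my-bucket/my_array_samples.vcf.gz",
  "EndToEndPipeline.multi_sample_vcf_index": "gs://my-bucket/my_array_samples.vcf.gz.tbi",
  "EndToEndPipeline.perform_extra_qc_steps": false,
  "EndToEndPipeline.generate_population_pcs": false,
  "EndToEndPipeline.disease_weights": "gs://my-bucket/prs_weights.txt",
  "EndToEndPipeline.output_callset_name": "my_imputed_callset",
  "EndToEndPipeline.population_basename": "thousand_genomes"
}

With `generate_population_pcs` left at false, no PopulationPCASteps outputs exist, so the `select_first` calls in ScoringSteps fall back to the default loadings/meansd/pcs/pruning-sites files.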
