1
+ version 1.0
2
+
3
+ import "Imputation.wdl" as imputation_pipeline
4
+ import "ScoringPart.wdl" as scoring_pipeline
5
+ import "PerformPopulationPCA.wdl" as population_pipeline # Like Thousand Genomes
6
+
7
# End-to-end workflow: imputes a multi-sample VCF, optionally generates new
# population PCs, then scores the imputed dataset and adjusts the scores
# against the population.
#
# Steps:
#   1. ImputationSteps     - runs the imported imputation pipeline on `multi_sample_vcf`.
#   2. PopulationPCASteps  - only when `generate_population_pcs` is true; runs the imported
#                            population PCA workflow on `population_vcf`.
#   3. ScoringSteps        - scores the imputed VCF and the population VCF, using either the
#                            freshly generated PC files or the ones supplied as inputs.
workflow EndToEndPipeline {
  input {
    # The dataset you want to impute. Used for imputation and for the scoring
    # adjustment steps, and may be used to select sites before LD pruning when
    # generating new population PCs.
    File multi_sample_vcf
    # Index for the dataset you want to impute; used for imputation.
    File multi_sample_vcf_index

    # Optional additional QC steps that can be performed on `multi_sample_vcf`
    # before imputing. These should only be run for large and disparate sample
    # sets, especially a diverse set of samples (they further limit sites to
    # those called at 95% and filter by HWE).
    Boolean perform_extra_qc_steps

    # Set to true to generate new PCs for your population, either because:
    #   - you have a new population (you don't want to use Thousand Genomes), or
    #   - you have a new array and want to limit the projection to sites called in that array.
    Boolean generate_population_pcs = false

    # The population VCF used for the scoring adjustment and, when
    # `generate_population_pcs` is true, for generating the population PCs.
    # Defaults to Thousand Genomes.
    File population_vcf = "gs://fc-6413177b-e99c-4476-b085-3da80d320081/RiskScoreAdjustmentFiles/thousand_genomes_sorted_variant_ids.vcf.gz"
    File population_vcf_index = "gs://fc-6413177b-e99c-4476-b085-3da80d320081/RiskScoreAdjustmentFiles/thousand_genomes_sorted_variant_ids.vcf.gz.tbi"

    # When `generate_population_pcs` is not true, these PC files are needed to
    # project the array data onto the population. They default to Thousand
    # Genomes with the sites Wallace generated after LD pruning.
    File? population_loadings = "gs://fc-6413177b-e99c-4476-b085-3da80d320081/RiskScoreAdjustmentFiles/WallacesPCASites/sorted_thousand_genomes_wallace_sites.pc.loadings"
    File? population_meansd = "gs://fc-6413177b-e99c-4476-b085-3da80d320081/RiskScoreAdjustmentFiles/WallacesPCASites/sorted_thousand_genomes_wallace_sites.pc.meansd"
    File? population_pcs = "gs://fc-6413177b-e99c-4476-b085-3da80d320081/RiskScoreAdjustmentFiles/WallacesPCASites/sorted_thousand_genomes_wallace_sites.pc"
    File? pruning_sites_for_pca = "gs://fc-6413177b-e99c-4476-b085-3da80d320081/RiskScoreAdjustmentFiles/WallacesPCASites/wallace_pruning_sites_sorted_ids.txt"

    ## Inputs for scoring and performing the adjustment

    # Disease weights file (required; no default). This pipeline uses variant
    # IDs with sorted alleles.
    # NOTE(review): the original comment mentions a helper task "at the bottom
    # of this workflow" for the weights file, but no such task is visible in
    # this file -- confirm where it lives.
    File disease_weights
    # Plink expects the first 3 columns in your weights file to be
    # variant ID, effect allele, effect weight.
    String? columns_for_scoring

    Int scoring_mem = 16            # memory for scoring the imputed array
    Int population_scoring_mem = 16 # memory for scoring the population VCF

    # Output names (what the files will be named)
    String output_callset_name # name for the imputed callset
    String population_basename # basename for the output PCs if `generate_population_pcs` is true
  }

  call imputation_pipeline.ImputationPipeline as ImputationSteps {
    input:
      multi_sample_vcf = multi_sample_vcf,
      multi_sample_vcf_index = multi_sample_vcf_index,
      perform_extra_qc_steps = perform_extra_qc_steps,
      output_callset_name = output_callset_name
  }

  # Outputs of a call inside a conditional are optional outside of it, which is
  # why the scoring call below falls back to the workflow inputs via select_first.
  if (generate_population_pcs) {
    call population_pipeline.PerformPopulationPCA as PopulationPCASteps {
      input:
        population_vcf = population_vcf,
        population_vcf_index = population_vcf_index,
        basename = population_basename,
        original_array_vcf = multi_sample_vcf,
        original_array_vcf_index = multi_sample_vcf_index,
        # it will update the variant ids into the format we use: chr:pos:allele1:allele2
        bad_variant_id_format = true
    }
  }

  call scoring_pipeline.ScoringImputedDataset as ScoringSteps {
    input:
      weights = disease_weights,
      columns_for_scoring = columns_for_scoring,
      imputed_array_vcf = ImputationSteps.imputed_multisample_vcf,
      scoring_mem = scoring_mem,
      population_scoring_mem = population_scoring_mem,
      population_vcf = select_first([PopulationPCASteps.sorted_variant_id_dataset, population_vcf]),
      population_basename = population_basename,
      basename = output_callset_name,
      # Either use the newly generated PC files or the input
      # loadings/meansd/pcs/pruning sites.
      population_loadings = select_first([PopulationPCASteps.population_loadings, population_loadings]),
      population_meansd = select_first([PopulationPCASteps.population_meansd, population_meansd]),
      population_pcs = select_first([PopulationPCASteps.population_pcs, population_pcs]),
      pruning_sites_for_pca = select_first([PopulationPCASteps.pruning_sites_for_pca, pruning_sites_for_pca])
  }

  output {
    # The final imputed VCF + index
    File imputed_multisample_vcf = ImputationSteps.imputed_multisample_vcf
    File imputed_multisample_vcf_index = ImputationSteps.imputed_multisample_vcf_index

    # The following files are only generated if `generate_population_pcs` is
    # set to true. They can be used in future runs to just calculate the scores
    # and the scoring adjustment. Each output is bound to its matching call
    # output (previously all four PC outputs pointed at `population_loadings`).
    File? new_population_loadings = PopulationPCASteps.population_loadings
    File? new_population_meansd = PopulationPCASteps.population_meansd
    File? new_population_pcs = PopulationPCASteps.population_pcs
    File? new_pruning_sites_for_pca = PopulationPCASteps.pruning_sites_for_pca
    File? new_population_dataset_with_sorted_variant_ids = PopulationPCASteps.sorted_variant_id_dataset
    File? new_population_dataset_index_with_sorted_variant_ids = PopulationPCASteps.sorted_variant_id_dataset_index

    # Results from the scoring part
    File pc_plot = ScoringSteps.pc_plot
    File adjusted_population_scores = ScoringSteps.adjusted_population_scores
    File adjusted_array_scores = ScoringSteps.adjusted_array_scores
  }
}
0 commit comments