Aggregate PCs (#116)

kachulis · web-flow · commit e6ffb674b2c3 · 2022-11-21T15:24:07.000-05:00
* Aggregate PCs and include in batch results output
* Add tests for Aggregation workflow
diff --git a/ImputationPipeline/AggregatePRSResults.wdl b/ImputationPipeline/AggregatePRSResults.wdl
@@ -15,28 +15,32 @@ workflow AggregatePRSResults {
 
   call AggregateResults {
     input:
+      group_n = group_n,
       results = results,
       missing_sites_shifts = missing_sites_shifts,
-      lab_batch = lab_batch
+      lab_batch = lab_batch,
+      target_pc_projections = target_pc_projections
   }
 
   call PlotPCA {
     input:
+      group_n = group_n,
       lab_batch = lab_batch,
       population_name = population_name,
-      target_pc_projections = target_pc_projections,
+      batch_pcs = AggregateResults.batch_pcs,
       population_pc_projections = population_pc_projections
   }
 
   call BuildHTMLReport {
     input:
+      group_n = group_n,
       lab_batch = lab_batch,
       batch_control_results = AggregateResults.batch_control_results,
       batch_missing_sites_shifts = AggregateResults.batch_missing_sites_shifts,
       expected_control_results = expected_control_results,
       batch_summarised_results = AggregateResults.batch_summarised_results,
       batch_pivoted_results = AggregateResults.batch_pivoted_results,
-      target_pc_projections = target_pc_projections,
+      batch_pcs = AggregateResults.batch_pcs,
       population_pc_projections = population_pc_projections,
       population_name = population_name,
       high_risk_thresholds = high_risk_thresholds
@@ -50,13 +54,15 @@ workflow AggregatePRSResults {
     File score_distribution = AggregateResults.batch_score_distribution
     File pc_plot = PlotPCA.pc_plot
     File report = BuildHTMLReport.report
+    File batch_pcs = AggregateResults.batch_pcs
   }
 }
 
 task AggregateResults {
   input {
     Array[File] results
     Array[File] missing_sites_shifts
+    Array[File] target_pc_projections
     String lab_batch
     Int group_n
   }
@@ -72,6 +78,9 @@ task AggregateResults {
     library(ggplot2)
 
     results <- c("~{sep='","' results}") %>% map(read_csv, col_types=cols(is_control_sample='l', .default='c')) %>% reduce(bind_rows)
+    target_pcs <- c("~{sep='","' target_pc_projections}") %>% map(read_tsv) %>% reduce(bind_rows) %>% select(-FID) %>% rename(sample_id = IID)
+
+    results <- inner_join(results, target_pcs)
 
     lab_batch <- results %>% pull(lab_batch) %>% unique()
 
@@ -94,7 +103,7 @@ task AggregateResults {
 
     write_tsv(results %>% filter(is_control_sample), "~{output_prefix}_control_results.tsv")
 
-    results_pivoted <- results %>% filter(!is_control_sample) %>% pivot_longer(!c(sample_id, lab_batch, is_control_sample), names_to=c("condition",".value"), names_pattern="([^_]+)_(.+)")
+    results_pivoted <- results %>% select(-starts_with("PC")) %>% filter(!is_control_sample) %>% pivot_longer(!c(sample_id, lab_batch, is_control_sample), names_to=c("condition",".value"), names_pattern="(.+)(?<!not)(?<!reason)_(.+)$")
     results_pivoted <- results_pivoted %T>% {options(warn=-1)} %>% mutate(adjusted = as.numeric(adjusted),
                                                                           raw = as.numeric(raw),
                                                                           percentile = as.numeric(percentile)) %T>% {options(warn=0)}
@@ -121,6 +130,7 @@ task AggregateResults {
 
     missing_sites_shifts <-  c("~{sep='","' missing_sites_shifts}") %>% map(read_tsv) %>% reduce(bind_rows)
     write_tsv(missing_sites_shifts, "~{output_prefix}_missing_sites_shifts.tsv")
+    write_tsv(target_pcs, "~{output_prefix}_pcs.tsv")
 
     EOF
   >>>
@@ -138,12 +148,13 @@ task AggregateResults {
     File batch_pivoted_results = "~{output_prefix}_pivoted_results.tsv"
     File batch_score_distribution = "~{output_prefix}_score_distribution.png"
     File batch_missing_sites_shifts = "~{output_prefix}_missing_sites_shifts.tsv"
+    File batch_pcs = "~{output_prefix}_pcs.tsv"
   }
 }
 
 task PlotPCA {
   input {
-    Array[File] target_pc_projections
+    File batch_pcs
     File population_pc_projections
     String lab_batch
     Int group_n
@@ -158,7 +169,7 @@ task PlotPCA {
     library(purrr)
     library(ggplot2)
 
-    target_pcs <- c("~{sep='","' target_pc_projections}") %>% map(read_tsv) %>% reduce(bind_rows)
+    target_pcs <- read_tsv("~{batch_pcs}")
     population_pcs <- read_tsv("~{population_pc_projections}")
 
     ggplot(population_pcs, aes(x=PC1, y=PC2, color="~{population_name}")) +
@@ -191,7 +202,7 @@ task BuildHTMLReport {
     File batch_summarised_results
     File batch_pivoted_results
     File high_risk_thresholds
-    Array[File] target_pc_projections
+    File batch_pcs
     File population_pc_projections
     String population_name
     String lab_batch
@@ -295,23 +306,30 @@ task BuildHTMLReport {
     \`\`\`{r score distributions, echo=FALSE, message=FALSE, warning=FALSE, results="asis", fig.align='center'}
     normal_dist <- tibble(x=seq(-5,5,0.01)) %>% mutate(y=dnorm(x)) # needed because plotly doesn't work with geom_function
     conditions_with_more_than_4_samples <- batch_pivoted_results %>% group_by(condition) %>% filter(!is.na(adjusted)) %>% count() %>% filter(n>4) %>% pull(condition)
-    p_dist <- ggplot(batch_pivoted_results %>% filter(condition %in% conditions_with_more_than_4_samples), aes(x=adjusted)) +
-      stat_density(aes(color=condition, text=condition), geom="line", position = "identity") +
-      xlim(-5,5) + theme_bw() + xlab("z-score") + geom_line(data=normal_dist, aes(x=x, y=y), color="black") +
-      geom_point(data = batch_pivoted_results %>% filter(!(condition %in% conditions_with_more_than_4_samples)), aes(color=condition, x = adjusted, text=condition), y=0) +
-      ylab("density")
+    n_density <- batch_pivoted_results %>% filter(condition %in% conditions_with_more_than_4_samples) %>% nrow()
+    n_point <- batch_pivoted_results %>% filter(!(condition %in% conditions_with_more_than_4_samples)) %>% nrow()
+    p_dist <- ggplot()
+    if (n_density > 0) {
+      p_dist <- p_dist + stat_density(data = batch_pivoted_results %>% filter(condition %in% conditions_with_more_than_4_samples),
+      aes(color=condition, text=condition, x = adjusted), geom="line", position = "identity")
+    }
+    if (n_point > 0) {
+      p_dist <- p_dist + geom_point(data = batch_pivoted_results %>% filter(!(condition %in% conditions_with_more_than_4_samples)), aes(color=condition, x = adjusted, text=condition), y=0)
+    }
+    p_dist <- p_dist + xlim(-5,5) + theme_bw() + geom_line(data=normal_dist, aes(x=x, y=y), color="black") + xlab("z-score") + ylab("density")
+
     ggplotly(p_dist, tooltip="text")
     \`\`\`
 
     ## PCA
     #### Hover for sample ID
     \`\`\`{r pca plot, echo=FALSE, message=FALSE, warning=FALSE, results="asis", fig.align='center'}
-    target_pcs <- c("~{sep='","' target_pc_projections}") %>% map(read_tsv) %>% reduce(bind_rows)
+    target_pcs <- read_tsv("~{batch_pcs}")
     population_pcs <- read_tsv("~{population_pc_projections}")
 
     p <- ggplot(population_pcs, aes(x=PC1, y=PC2, color="~{population_name}")) +
       geom_point() +
-      geom_point(data=target_pcs, aes(color="~{lab_batch}", text=paste0("Sample ID: ", IID))) +
+      geom_point(data=target_pcs, aes(color="~{lab_batch}", text=paste0("Sample ID: ", sample_id))) +
       theme_bw()
     ggplotly(p, tooltip="text")
     \`\`\`
diff --git a/ImputationPipeline/ManualQCPRS/tests/ManualQCPRSTests.py b/ImputationPipeline/ManualQCPRS/tests/ManualQCPRSTests.py
@@ -870,6 +870,7 @@ def assert_samples_failed_all_conditions(self, failed_samples_with_reasons):
         data["is_control_sample"] = False
         expected_results = pd.DataFrame(data).set_index("sample_id")
         expected_results["notes"] = expected_results.index.map(failed_samples_with_reasons)
+        expected_results = expected_results.join(self.results[['PC1', 'PC2']])
         for i in range(1, 4):
             expected_results[f'condition_{i}_reason_not_resulted'] = \
                 expected_results.index.map(failed_samples_with_reasons)
diff --git a/ImputationPipeline/ManualQCPRS/tests/resources/test_results.tsv b/ImputationPipeline/ManualQCPRS/tests/resources/test_results.tsv
@@ -1,7 +1,7 @@
-sample_id	lab_batch	is_control_sample	condition_1_raw	condition_1_adjusted	condition_1_percentile	condition_1_risk	condition_1_reason_not_resulted	condition_2_raw	condition_2_adjusted	condition_2_percentile	condition_2_risk	condition_2_reason_not_resulted	condition_3_raw	condition_3_adjusted	condition_3_percentile	condition_3_risk	condition_3_reason_not_resulted
-sample_1	BATCH_12345	FALSE	37.2	0.78	0.78	NOT_HIGH	NA	11.1	1.9	0.97	HIGH	NA	4.3	0.93	0.82	NOT_HIGH	NA
-sample_2	BATCH_12345	FALSE	39.4	0.29	0.62	NOT_HIGH	NA	12.2	0.71	0.76	NOT_HIGH	NA	4.4	1.58	0.94	NOT_HIGH	NA
-sample_3	BATCH_12345	TRUE	34.1	2.1	0.98	HIGH	NA	12.4	-1.13	0.13	NOT_HIGH	NA	3.9	-0.61	0.27	NOT_HIGH	NA
-sample_4	BATCH_12345	FALSE	36.3	-1.78	0.04	NOT_HIGH	NA	13.1	1.72	0.96	HIGH	NA	NA	NA	NA	NA
-sample_5	BATCH_12345	FALSE	38.5	0.43	0.66	NOT_HIGH	NA	10.8	-0.09	0.46	NOT_HIGH	NA	NA	NA	NA	NA
-sample_6	BATCH_12345	FALSE	36.2	-0.99	0.16	NOT_HIGH	NA	NA	NA	NA	NA	NA	2.4	-0.28	0.39	NOT_HIGH	NA
+sample_id	lab_batch	is_control_sample	condition_1_raw	condition_1_adjusted	condition_1_percentile	condition_1_risk	condition_1_reason_not_resulted	condition_2_raw	condition_2_adjusted	condition_2_percentile	condition_2_risk	condition_2_reason_not_resulted	condition_3_raw	condition_3_adjusted	condition_3_percentile	condition_3_risk	condition_3_reason_not_resulted	PC1	PC2
+sample_1	BATCH_12345	FALSE	37.2	0.78	0.78	NOT_HIGH	NA	11.1	1.9	0.97	HIGH	NA	4.3	0.93	0.82	NOT_HIGH	NA	1.2	0.7
+sample_2	BATCH_12345	FALSE	39.4	0.29	0.62	NOT_HIGH	NA	12.2	0.71	0.76	NOT_HIGH	NA	4.4	1.58	0.94	NOT_HIGH	NA	1.4	0.6
+sample_3	BATCH_12345	TRUE	34.1	2.1	0.98	HIGH	NA	12.4	-1.13	0.13	NOT_HIGH	NA	3.9	-0.61	0.27	NOT_HIGH	NA	1.5	1.9
+sample_4	BATCH_12345	FALSE	36.3	-1.78	0.04	NOT_HIGH	NA	13.1	1.72	0.96	HIGH	NA	NA	NA	NA	NA	NA	1.3	1.4
+sample_5	BATCH_12345	FALSE	38.5	0.43	0.66	NOT_HIGH	NA	10.8	-0.09	0.46	NOT_HIGH	NA	NA	NA	NA	NA	NA	0.8	-0.7
+sample_6	BATCH_12345	FALSE	36.2	-0.99	0.16	NOT_HIGH	NA	NA	NA	NA	NA	NA	2.4	-0.28	0.39	NOT_HIGH	NA	1.2	-1.1
diff --git a/ImputationPipeline/ManualQCPRS/tests/resources/test_results_2.tsv b/ImputationPipeline/ManualQCPRS/tests/resources/test_results_2.tsv
@@ -1,7 +1,7 @@
-sample_id	lab_batch	is_control_sample	condition_1_raw	condition_1_adjusted	condition_1_percentile	condition_1_risk	condition_1_reason_not_resulted	condition_2_raw	condition_2_adjusted	condition_2_percentile	condition_2_risk	condition_2_reason_not_resulted	condition_3_raw	condition_3_adjusted	condition_3_percentile	condition_3_risk	condition_3_reason_not_resulted
-sample_7	BATCH_12345	FALSE	37.2	0.78	0.78	NOT_HIGH	NA	11.1	1.9	0.97	HIGH	NA	4.3	0.93	0.82	NOT_HIGH	NA
-sample_8	BATCH_12345	FALSE	39.4	0.29	0.62	NOT_HIGH	NA	12.2	0.71	0.76	NOT_HIGH	NA	4.4	1.58	0.94	NOT_HIGH	NA
-sample_3	BATCH_12345	TRUE	34.1	2.1	0.98	HIGH	NA	12.4	-1.13	0.13	NOT_HIGH	NA	3.9	-0.61	0.27	NOT_HIGH	NA
-sample_9	BATCH_12345	FALSE	36.3	-1.78	0.04	NOT_HIGH	NA	13.1	1.72	0.96	HIGH	NA	NA	NA	NA	NA
-sample_10	BATCH_12345	FALSE	38.5	0.43	0.66	NOT_HIGH	NA	10.8	-0.09	0.46	NOT_HIGH	NA	NA	NA	NA	NA
-sample_11	BATCH_12345	FALSE	36.2	-0.99	0.16	NOT_HIGH	NA	NA	NA	NA	NA	NA	2.4	-0.28	0.39	NOT_HIGH	NA
+sample_id	lab_batch	is_control_sample	condition_1_raw	condition_1_adjusted	condition_1_percentile	condition_1_risk	condition_1_reason_not_resulted	condition_2_raw	condition_2_adjusted	condition_2_percentile	condition_2_risk	condition_2_reason_not_resulted	condition_3_raw	condition_3_adjusted	condition_3_percentile	condition_3_risk	condition_3_reason_not_resulted	PC1	PC2
+sample_7	BATCH_12345	FALSE	37.2	0.78	0.78	NOT_HIGH	NA	11.1	1.9	0.97	HIGH	NA	4.3	0.93	0.82	NOT_HIGH	NA	1.3	1.1
+sample_8	BATCH_12345	FALSE	39.4	0.29	0.62	NOT_HIGH	NA	12.2	0.71	0.76	NOT_HIGH	NA	4.4	1.58	0.94	NOT_HIGH	NA	0.9	1.4
+sample_3	BATCH_12345	TRUE	34.1	2.1	0.98	HIGH	NA	12.4	-1.13	0.13	NOT_HIGH	NA	3.9	-0.61	0.27	NOT_HIGH	NA	1.4	0.2
+sample_9	BATCH_12345	FALSE	36.3	-1.78	0.04	NOT_HIGH	NA	13.1	1.72	0.96	HIGH	NA	NA	NA	NA	NA	NA	1.1	0.9
+sample_10	BATCH_12345	FALSE	38.5	0.43	0.66	NOT_HIGH	NA	10.8	-0.09	0.46	NOT_HIGH	NA	NA	NA	NA	NA	NA	0.7	0.1
+sample_11	BATCH_12345	FALSE	36.2	-0.99	0.16	NOT_HIGH	NA	NA	NA	NA	NA	NA	2.4	-0.28	0.39	NOT_HIGH	NA	1.5	1.2
diff --git a/ImputationPipeline/ManualQCPRS/tests/resources/test_results_3.tsv b/ImputationPipeline/ManualQCPRS/tests/resources/test_results_3.tsv
@@ -1,7 +1,7 @@
-sample_id	lab_batch	is_control_sample	condition_1_raw	condition_1_adjusted	condition_1_percentile	condition_1_risk	condition_1_reason_not_resulted	condition_2_raw	condition_2_adjusted	condition_2_percentile	condition_2_risk	condition_2_reason_not_resulted	condition_3_raw	condition_3_adjusted	condition_3_percentile	condition_3_risk	condition_3_reason_not_resulted
-sample_1	BATCH_123456	FALSE	37.2	0.78	0.78	NOT_HIGH	NA	11.1	1.9	0.97	HIGH	NA	4.3	0.93	0.82	NOT_HIGH	NA
-sample_2	BATCH_123456	FALSE	39.4	0.29	0.62	NOT_HIGH	NA	12.2	0.71	0.76	NOT_HIGH	NA	4.4	1.58	0.94	NOT_HIGH	NA
-sample_3	BATCH_123456	TRUE	34.1	2.1	0.98	HIGH	NA	12.4	-1.13	0.13	NOT_HIGH	NA	3.9	-0.61	0.27	NOT_HIGH	NA
-sample_4	BATCH_123456	FALSE	36.3	-1.78	0.04	NOT_HIGH	NA	13.1	1.72	0.96	HIGH	NA	NA	NA	NA	NA
-sample_5	BATCH_123456	FALSE	38.5	0.43	0.66	NOT_HIGH	NA	10.8	-0.09	0.46	NOT_HIGH	NA	NA	NA	NA	NA
-sample_6	BATCH_123456	FALSE	36.2	-0.99	0.16	NOT_HIGH	NA	NA	NA	NA	NA	NA	2.4	-0.28	0.39	NOT_HIGH	NA
+sample_id	lab_batch	is_control_sample	condition_1_raw	condition_1_adjusted	condition_1_percentile	condition_1_risk	condition_1_reason_not_resulted	condition_2_raw	condition_2_adjusted	condition_2_percentile	condition_2_risk	condition_2_reason_not_resulted	condition_3_raw	condition_3_adjusted	condition_3_percentile	condition_3_risk	condition_3_reason_not_resulted	PC1	PC2
+sample_1	BATCH_123456	FALSE	37.2	0.78	0.78	NOT_HIGH	NA	11.1	1.9	0.97	HIGH	NA	4.3	0.93	0.82	NOT_HIGH	NA	-1.1	1.3
+sample_2	BATCH_123456	FALSE	39.4	0.29	0.62	NOT_HIGH	NA	12.2	0.71	0.76	NOT_HIGH	NA	4.4	1.58	0.94	NOT_HIGH	NA	-0.9	1.25
+sample_3	BATCH_123456	TRUE	34.1	2.1	0.98	HIGH	NA	12.4	-1.13	0.13	NOT_HIGH	NA	3.9	-0.61	0.27	NOT_HIGH	NA	-1.4	0.7
+sample_4	BATCH_123456	FALSE	36.3	-1.78	0.04	NOT_HIGH	NA	13.1	1.72	0.96	HIGH	NA	NA	NA	NA	NA	NA	1.1	1.2
+sample_5	BATCH_123456	FALSE	38.5	0.43	0.66	NOT_HIGH	NA	10.8	-0.09	0.46	NOT_HIGH	NA	NA	NA	NA	NA	NA	0.8	-1.4
+sample_6	BATCH_123456	FALSE	36.2	-0.99	0.16	NOT_HIGH	NA	NA	NA	NA	NA	NA	2.4	-0.28	0.39	NOT_HIGH	NA	-1.2	1.3
diff --git a/Makefile b/Makefile
@@ -1,3 +1,5 @@
+SHELL=/bin/bash -o pipefail
+
 TEST_JSON= $(shell find test -name '*.json')
 
 VALIDATE_WDL= $(shell find . -name '*.wdl' ! -path './test/*')
diff --git a/test/AggregatePRSResults/plumbing_data/expected_batch_all_results.tsv b/test/AggregatePRSResults/plumbing_data/expected_batch_all_results.tsv
@@ -0,0 +1,4 @@
+sample_id	lab_batch	is_control_sample	condition_1_raw	condition_1_adjusted	condition_1_percentile	condition_1_risk	condition_1_reason_not_resulted	condition_2_raw	condition_2_adjusted	condition_2_percentile	condition_2_risk	condition_2_reason_not_resulted	condition_3_raw	condition_3_adjusted	condition_3_percentile	condition_3_risk	condition_3_reason_not_resulted	condition_4_raw	condition_4_adjusted	condition_4_percentile	condition_4_risk	condition_4_reason_not_resulted	PC1	PC2	PC3	PC4
+sample_1	test_batch	FALSE	49.8	-0.5	0.308	NOT_HIGH	NA	-0.09	-1.8	0.036	NOT_HIGH	NA	-0.3	-0.7	0.24	NOT_HIGH	NA	NA	NA	NA	NA	NA	0.13	-0.15	-0.05	-0.02
+sample_2	test_batch	TRUE	51.2	2.1	0.98	HIGH	NA	1.97	0.9	0.82	NOT_HIGH	NA	-0.2	-0.3	0.38	NOT_HIGH	NA	1.23	0.32	0.63	NOT_HIGH	NA	0.12	-0.15	-0.05	-0.03
+sample_3	test_batch	FALSE	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	1.2	-0.03	0.48	NOT_HIGH	NA	0.11	-0.15	-0.05	-0.03
diff --git a/test/AggregatePRSResults/plumbing_data/expected_batch_control_results.tsv b/test/AggregatePRSResults/plumbing_data/expected_batch_control_results.tsv
@@ -0,0 +1,2 @@
+sample_id	lab_batch	is_control_sample	condition_1_raw	condition_1_adjusted	condition_1_percentile	condition_1_risk	condition_1_reason_not_resulted	condition_2_raw	condition_2_adjusted	condition_2_percentile	condition_2_risk	condition_2_reason_not_resulted	condition_3_raw	condition_3_adjusted	condition_3_percentile	condition_3_risk	condition_3_reason_not_resulted	condition_4_raw	condition_4_adjusted	condition_4_percentile	condition_4_risk	condition_4_reason_not_resulted	PC1	PC2	PC3	PC4
+sample_2	test_batch	TRUE	51.2	2.1	0.98	HIGH	NA	1.97	0.9	0.82	NOT_HIGH	NA	-0.2	-0.3	0.38	NOT_HIGH	NA	1.23	0.32	0.63	NOT_HIGH	NA	0.12	-0.15	-0.05	-0.03
diff --git a/test/AggregatePRSResults/plumbing_data/expected_batch_pcs.tsv b/test/AggregatePRSResults/plumbing_data/expected_batch_pcs.tsv
@@ -0,0 +1,4 @@
+sample_id	PC1	PC2	PC3	PC4
+sample_1	0.13	-0.15	-0.05	-0.02
+sample_2	0.12	-0.15	-0.05	-0.03
+sample_3	0.11	-0.15	-0.05	-0.03
diff --git a/test/AggregatePRSResults/plumbing_data/expected_batch_summarised_results.tsv b/test/AggregatePRSResults/plumbing_data/expected_batch_summarised_results.tsv
@@ -0,0 +1,5 @@
+condition	mean_adjusted	mean_percentile	num_samples	num_scored	num_high	num_not_high	num_not_resulted
+condition_1	-0.5	0.308	2	1	0	1	0
+condition_2	-1.8	0.036	2	1	0	1	0
+condition_3	-0.7	0.24	2	1	0	1	0
+condition_4	-0.03	0.48	2	1	0	1	0
diff --git a/test/AggregatePRSResults/plumbing_data/sample_1_missing_sites_shifts.tsv b/test/AggregatePRSResults/plumbing_data/sample_1_missing_sites_shifts.tsv
@@ -0,0 +1,4 @@
+sample_id	condition	n_missing_sites	adjusted_score	percentile	potential_high_adjusted_score	potential_high_percentile	potential_low_adjusted_score	potential_low_percentile
+sample_1	condition_1	0	-0.5	0.308	-0.5	0.308	-0.5	0.308
+sample_1	condition_2	0	-1.8	0.036	-1.8	0.036	-1.8	0.036
+sample_1	condition_3	0	-0.7	0.24	-0.7	0.24	-0.7	0.24
diff --git a/test/AggregatePRSResults/plumbing_data/sample_1_projections.tsv b/test/AggregatePRSResults/plumbing_data/sample_1_projections.tsv
@@ -0,0 +1,2 @@
+FID	IID	PC1	PC2	PC3	PC4
+0	sample_1	0.13	-0.15	-0.05	-0.02
diff --git a/test/AggregatePRSResults/plumbing_data/sample_1_results.csv b/test/AggregatePRSResults/plumbing_data/sample_1_results.csv
@@ -0,0 +1,2 @@
+sample_id,lab_batch,is_control_sample,condition_1_raw,condition_1_adjusted,condition_1_percentile,condition_1_risk,condition_1_reason_not_resulted,condition_2_raw,condition_2_adjusted,condition_2_percentile,condition_2_risk,condition_2_reason_not_resulted,condition_3_raw,condition_3_adjusted,condition_3_percentile,condition_3_risk,condition_3_reason_not_resulted,condition_4_raw,condition_4_adjusted,condition_4_percentile,condition_4_risk,condition_4_reason_not_resulted
+sample_1,test_batch,false,49.8,-0.5,0.308,NOT_HIGH,NA,-0.09,-1.8,0.036,NOT_HIGH,NA,-0.3,-0.7,0.24,NOT_HIGH,NA,NA,NA,NA,NA,NA
diff --git a/test/AggregatePRSResults/plumbing_data/sample_2_missing_sites_shifts.tsv b/test/AggregatePRSResults/plumbing_data/sample_2_missing_sites_shifts.tsv
@@ -0,0 +1,5 @@
+sample_id	condition	n_missing_sites	adjusted_score	percentile	potential_high_adjusted_score	potential_high_percentile	potential_low_adjusted_score	potential_low_percentile
+sample_2	condition_1	0	2.1	0.98	2.1	0.98	2.1	0.98
+sample_2	condition_2	0	0.9	0.82	0.9	0.82	0.9	0.82
+sample_2	condition_3	0	-0.3	0.38	-0.3	0.38	-0.3	0.38
+sample_2	condition_4	0	0.32	0.63	0.32	0.63	0.32	0.63
diff --git a/test/AggregatePRSResults/plumbing_data/sample_2_projections.tsv b/test/AggregatePRSResults/plumbing_data/sample_2_projections.tsv
@@ -0,0 +1,2 @@
+FID	IID	PC1	PC2	PC3	PC4
+0	sample_2	0.12	-0.15	-0.05	-0.03
diff --git a/test/AggregatePRSResults/plumbing_data/sample_2_results.csv b/test/AggregatePRSResults/plumbing_data/sample_2_results.csv
@@ -0,0 +1,2 @@
+sample_id,lab_batch,is_control_sample,condition_1_raw,condition_1_adjusted,condition_1_percentile,condition_1_risk,condition_1_reason_not_resulted,condition_2_raw,condition_2_adjusted,condition_2_percentile,condition_2_risk,condition_2_reason_not_resulted,condition_3_raw,condition_3_adjusted,condition_3_percentile,condition_3_risk,condition_3_reason_not_resulted,condition_4_raw,condition_4_adjusted,condition_4_percentile,condition_4_risk,condition_4_reason_not_resulted
+sample_2,test_batch,true,51.2,2.1,0.98,HIGH,NA,1.97,0.9,0.82,NOT_HIGH,NA,-0.2,-0.3,0.38,NOT_HIGH,NA,1.23,0.32,0.63,NOT_HIGH,NA
diff --git a/test/AggregatePRSResults/plumbing_data/sample_3_missing_sites_shifts.tsv b/test/AggregatePRSResults/plumbing_data/sample_3_missing_sites_shifts.tsv
@@ -0,0 +1,2 @@
+sample_id	condition	n_missing_sites	adjusted_score	percentile	potential_high_adjusted_score	potential_high_percentile	potential_low_adjusted_score	potential_low_percentile
+sample_3	condition_4	0	-0.03	0.48	-0.03	0.48	-0.03	0.48
diff --git a/test/AggregatePRSResults/plumbing_data/sample_3_projections.tsv b/test/AggregatePRSResults/plumbing_data/sample_3_projections.tsv
@@ -0,0 +1,2 @@
+FID	IID	PC1	PC2	PC3	PC4
+0	sample_3	0.11	-0.15	-0.05	-0.03
diff --git a/test/AggregatePRSResults/plumbing_data/sample_3_results.csv b/test/AggregatePRSResults/plumbing_data/sample_3_results.csv
@@ -0,0 +1,2 @@
+sample_id,lab_batch,is_control_sample,condition_1_raw,condition_1_adjusted,condition_1_percentile,condition_1_risk,condition_1_reason_not_resulted,condition_2_raw,condition_2_adjusted,condition_2_percentile,condition_2_risk,condition_2_reason_not_resulted,condition_3_raw,condition_3_adjusted,condition_3_percentile,condition_3_risk,condition_3_reason_not_resulted,condition_4_raw,condition_4_adjusted,condition_4_percentile,condition_4_risk,condition_4_reason_not_resulted
+sample_3,test_batch,false,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,1.2,-0.03,0.48,NOT_HIGH,NA
diff --git a/test/AggregatePRSResults/plumbing_data/test_expected_control_results.csv b/test/AggregatePRSResults/plumbing_data/test_expected_control_results.csv
@@ -0,0 +1,2 @@
+condition_1_adjusted, condition_2_adjusted, condition_3_adjusted, condition_4_adjusted
+2.2, 0.87, -0.25, 0.3
diff --git a/test/AggregatePRSResults/plumbing_data/test_population_projections.tsv b/test/AggregatePRSResults/plumbing_data/test_population_projections.tsv
@@ -0,0 +1,10 @@
+PC1	PC2	PC3	PC4
+0.132828	-0.158744	-0.05737172	-0.03349135
+0.1305322	-0.1609648	-0.05699385	-0.03228922
+0.131498	-0.1578435	-0.0559588	-0.0330277
+0.1303931	-0.1595069	-0.05761969	-0.03046467
+0.1320887	-0.1578632	-0.05805177	-0.03590859
+0.1324791	-0.1561622	-0.05606031	-0.03278937
+0.1309225	-0.1581563	-0.05817697	-0.03207536
+0.1315213	-0.1590344	-0.05657792	-0.03416705
+0.1340385	-0.1596115	-0.05519977	-0.02799103
diff --git a/test/AggregatePRSResults/plumbing_data/test_thresholds.tsv b/test/AggregatePRSResults/plumbing_data/test_thresholds.tsv
@@ -0,0 +1,5 @@
+condition	threshold
+condition_1	0.97
+condition_2	0.97
+condition_3	0.98
+condition_4	0.97
diff --git a/test/AggregatePRSResults/test_AggregatePRSResults.wdl b/test/AggregatePRSResults/test_AggregatePRSResults.wdl
diff --git a/test/AggregatePRSResults/test_AggregatePRSResults_json/test_plumbing_inputs.json b/test/AggregatePRSResults/test_AggregatePRSResults_json/test_plumbing_inputs.json

Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,5 @@`
	`1`	`+SHELL=/bin/bash -o pipefail`
	`2`	`+`
`1`	`3`	`TEST_JSON= $(shell find test -name '*.json')`
`2`	`4`
`3`	`5`	`VALIDATE_WDL= $(shell find . -name '*.wdl' ! -path './test/*')`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+sample_id lab_batch is_control_sample condition_1_raw condition_1_adjusted condition_1_percentile condition_1_risk condition_1_reason_not_resulted condition_2_raw condition_2_adjusted condition_2_percentile condition_2_risk condition_2_reason_not_resulted condition_3_raw condition_3_adjusted condition_3_percentile condition_3_risk condition_3_reason_not_resulted condition_4_raw condition_4_adjusted condition_4_percentile condition_4_risk condition_4_reason_not_resulted PC1 PC2 PC3 PC4`
	`2`	`+sample_2 test_batch TRUE 51.2 2.1 0.98 HIGH NA 1.97 0.9 0.82 NOT_HIGH NA -0.2 -0.3 0.38 NOT_HIGH NA 1.23 0.32 0.63 NOT_HIGH NA 0.12 -0.15 -0.05 -0.03`