@@ -5,6 +5,7 @@ workflow AggregatePRSResults {
5
5
Array [File ] results
6
6
Array [File ] target_pc_projections
7
7
Array [File ] missing_sites_shifts
8
+ File high_risk_thresholds
8
9
File population_pc_projections
9
10
String population_name = "Reference Population"
10
11
File expected_control_results
@@ -36,7 +37,8 @@ workflow AggregatePRSResults {
36
37
batch_pivoted_results = AggregateResults .batch_pivoted_results ,
37
38
target_pc_projections = target_pc_projections ,
38
39
population_pc_projections = population_pc_projections ,
39
- population_name = population_name
40
+ population_name = population_name ,
41
+ high_risk_thresholds = high_risk_thresholds
40
42
}
41
43
42
44
output {
@@ -67,7 +69,7 @@ task AggregateResults {
67
69
library(ggplot2)
68
70
69
71
results <- c("~{sep='","' results}") %>% map(read_csv, col_types=cols(is_control_sample='l', .default='c')) %>% reduce(bind_rows)
70
-
72
+
71
73
lab_batch <- results %>% pull(lab_batch) %>% unique()
72
74
73
75
if (length(lab_batch) != 1) {
@@ -89,7 +91,7 @@ task AggregateResults {
89
91
90
92
write_tsv(results %>% filter(is_control_sample), paste0(lab_batch, "_control_results.tsv"))
91
93
92
- results_pivoted <- results %>% pivot_longer(!c(sample_id, lab_batch, is_control_sample), names_to=c("condition",".value"), names_pattern="([^_]+)_(.+)")
94
+ results_pivoted <- results %>% filter(!is_control_sample) %>% pivot_longer(!c(sample_id, lab_batch, is_control_sample), names_to=c("condition",".value"), names_pattern="([^_]+)_(.+)")
93
95
results_pivoted <- results_pivoted %T>% {options(warn=-1)} %>% mutate(adjusted = as.numeric(adjusted),
94
96
raw = as.numeric(raw),
95
97
percentile = as.numeric(percentile)) %T>% {options(warn=0)}
@@ -183,6 +185,7 @@ task BuildHTMLReport {
183
185
File expected_control_results
184
186
File batch_summarised_results
185
187
File batch_pivoted_results
188
+ File high_risk_thresholds
186
189
Array[File] target_pc_projections
187
190
File population_pc_projections
188
191
String population_name
@@ -219,6 +222,32 @@ task BuildHTMLReport {
219
222
batch_pivoted_results <- read_tsv("~{batch_pivoted_results}")
220
223
batch_summary <- read_tsv("~{batch_summarised_results}")
221
224
batch_summary <- batch_summary %>% rename_with(.cols = -condition, ~ str_to_title(gsub("_"," ", .x)))
225
+ condition_thresholds <- read_tsv("~{high_risk_thresholds}")
226
+ get_probs_n_high_per_sample_distribution <- function(thresholds_list) {
227
+ probs_n_high <- tibble(n_high = seq(0,length(thresholds_list)), prob=c(1,rep(0,length(thresholds_list - 1))))
228
+ for (threshold in thresholds_list) {
229
+ new_probs_n_high <- probs_n_high %>% mutate(prob=prob*(threshold) + lag(prob, default=0)*(1-threshold))
230
+ probs_n_high <- new_probs_n_high
231
+ }
232
+
233
+ probs_n_high <- probs_n_high %>% pivot_wider(names_from = n_high, names_prefix = "prob_high_", values_from = prob) %>%
234
+ mutate(thresholds = paste0(thresholds_list, collapse = ","))
235
+ }
236
+
237
+ thresholds_sets <- batch_pivoted_results %>% filter(risk == "HIGH" | risk == "NOT_HIGH") %>% group_by(sample_id) %>% inner_join(condition_thresholds) %>%
238
+ summarise(thresholds = list(sort(threshold))) %>% pull(thresholds) %>% unique() %>% map(get_probs_n_high_per_sample_distribution) %>%
239
+ reduce(bind_rows) %>% mutate(across(-thresholds, ~ifelse(is.na(.), 0, .))) %>% pivot_longer(-thresholds, names_to = "n_high",
240
+ names_prefix = "prob_high_",
241
+ values_to="prob") %>%
242
+ mutate(n_high = as.integer(n_high))
243
+
244
+ threshold_set_per_sample <- batch_pivoted_results %>% filter(risk == "HIGH" | risk == "NOT_HIGH") %>% group_by(sample_id) %>% inner_join(condition_thresholds) %>%
245
+ summarise(thresholds = paste0(sort(threshold), collapse=",")) %>% inner_join(thresholds_sets)
246
+
247
+ multi_high_samples <- batch_pivoted_results %>% filter(risk=="HIGH") %>% group_by(sample_id) %>%
248
+ summarise(\`high risk conditions\` = paste(condition, collapse = ","), n=n()) %>%
249
+ filter(n>1) %>% inner_join(threshold_set_per_sample) %>% group_by(sample_id, \`high risk conditions\`, n, thresholds) %>% filter(n_high >= n) %>%
250
+ summarise(significance=paste0(signif(qnorm(1-sum(prob)),2), "\\U03C3")) %>% select(-n,-thresholds)
222
251
\`\`\`
223
252
224
253
\`\`\`{css, echo=FALSE}
@@ -247,14 +276,23 @@ task BuildHTMLReport {
247
276
kable(batch_summary, digits = 2, escape = FALSE, format = "pandoc")
248
277
\`\`\`
249
278
250
-
279
+ ## Samples High Risk for Multiple Conditions
280
+ \`r if (multi_high_samples %>% nrow() == 0) {"No Samples were high risk for multiple conditions."} else {"The following samples were high risk for multiple conditions. Significance represents the likelihood that a sample scored for the same conditions as this sample would be high for at least as many conditions, assuming all conditions are uncorrelated."}\`
281
+ \`\`\`{r multi high samples table, echo = FALSE, results = "asis", warning = FALSE }
282
+ if (multi_high_samples %>% nrow() > 0) {
283
+ kable(multi_high_samples, digits = 2, escape = FALSE, format = "pandoc") }
284
+ \`\`\`
251
285
252
286
## Batch Score distribution
253
287
\`\`\`{r score distributions, echo=FALSE, message=FALSE, warning=FALSE, results="asis", fig.align='center'}
254
- ggplot(batch_pivoted_results, aes(x=adjusted)) +
255
- geom_density(aes(color=condition), fill=NA, position = "identity") +
256
- xlim(-5,5) + theme_bw() + xlab("z-score") + geom_function(fun=dnorm) +
288
+ normal_dist <- tibble(x=seq(-5,5,0.01)) %>% mutate(y=dnorm(x)) # needed because plotly doesn't work with geom_function
289
+ conditions_with_more_than_4_samples <- batch_pivoted_results %>% group_by(condition) %>% filter(!is.na(adjusted)) %>% count() %>% filter(n>4) %>% pull(condition)
290
+ p_dist <- ggplot(batch_pivoted_results %>% filter(condition %in% conditions_with_more_than_4_samples), aes(x=adjusted)) +
291
+ stat_density(aes(color=condition, text=condition), geom="line", position = "identity") +
292
+ xlim(-5,5) + theme_bw() + xlab("z-score") + geom_line(data=normal_dist, aes(x=x, y=y), color="black") +
293
+ geom_point(data = batch_pivoted_results %>% filter(!(condition %in% conditions_with_more_than_4_samples)), aes(color=condition, x = adjusted, text=condition), y=0) +
257
294
ylab("density")
295
+ ggplotly(p_dist, tooltip="text")
258
296
\`\`\`
259
297
260
298
## PCA
@@ -271,22 +309,24 @@ task BuildHTMLReport {
271
309
\`\`\`
272
310
273
311
## Individual Sample Results (without control sample)
274
- \`\`\`{r sample results , echo = FALSE, results = "asis", warning = FALSE}
312
+ \`\`\`{r sample results , echo = FALSE, results = "asis", warning = FALSE, message = FALSE}
313
+ batch_high_counts_per_sample <- batch_pivoted_results %>% group_by(sample_id) %>% summarise(n_high_risk = sum(ifelse(!is.na(risk) & risk =="HIGH", 1, 0)))
275
314
batch_results_table <- batch_pivoted_results %>% filter(!is_control_sample) %>% select(!is_control_sample) %>%
276
315
mutate(across(!c(sample_id, lab_batch, reason_not_resulted, condition), ~kableExtra::cell_spec(gsub("_", " ", ifelse(is.na(as.numeric(.x)), ifelse(is.na(.x), 'SCORE NOT REQUESTED', .x), round(as.numeric(.x), 2))), color=ifelse(is.na(risk), "lightgrey", ifelse(risk=="NOT_RESULTED", "red", ifelse(risk == "HIGH", "orange", "green")))))) %>% # round numbers, color all by risk
277
316
mutate(reason_not_resulted = ifelse(is.na(reason_not_resulted), reason_not_resulted, kableExtra::cell_spec(reason_not_resulted, color="red"))) %>% # reason not resulted always red if exists
278
- pivot_wider(id_cols = c(sample_id, lab_batch), names_from = condition, names_glue = "{condition}_{.value}", values_from = c(raw, adjusted, percentile, risk, reason_not_resulted)) # pivot to wide format
317
+ pivot_wider(id_cols = c(sample_id, lab_batch), names_from = condition, names_glue = "{condition}_{.value}", values_from = c(raw, adjusted, percentile, risk, reason_not_resulted)) %>% # pivot to wide format
318
+ inner_join(batch_high_counts_per_sample) # add number of high risk conditions for each sample
279
319
280
320
#order columns as desired
281
- cols <- batch_results_table %>% select(-sample_id, -lab_batch) %>% colnames()
321
+ cols <- batch_results_table %>% select(-sample_id, -lab_batch, -n_high_risk ) %>% colnames()
282
322
desired_order_values <- c("raw", "adjusted", "percentile", "risk", "reason_not_resulted")
283
- col_order <- c("sample_id", "lab_batch", cols[order(sapply(stri_split_fixed(cols, "_", n=2), "[",1), match(sapply(stri_split_fixed(cols, "_", n=2), "[",2), desired_order_values))])
323
+ col_order <- c("sample_id", "lab_batch", "n_high_risk", cols[order(sapply(stri_split_fixed(cols, "_", n=2), "[",1), match(sapply(stri_split_fixed(cols, "_", n=2), "[",2), desired_order_values))])
284
324
batch_results_table <- batch_results_table %>% select(all_of(col_order)) %>%
285
325
rename_with(.cols = ends_with("percentile"), .fn = ~gsub("_percentile", " %", .x,fixed=TRUE)) %>%
286
326
rename_with(.cols = ends_with("adjusted"), .fn = ~gsub("_adjusted", "_adj", .x,fixed=TRUE))
287
327
288
328
all_cols = batch_results_table %>% colnames()
289
- risk_cols = which(endsWith(all_cols, "risk"))
329
+ risk_cols = which(endsWith(all_cols, "risk") & all_cols != "n_high_risk" )
290
330
raw_cols = which(endsWith(all_cols, "raw"))
291
331
adjusted_cols = which(endsWith(all_cols, "adj"))
292
332
percentile_cols = which(endsWith(all_cols, "%"))
0 commit comments