Skip to content

Commit 72e00b6

Browse files
authored
PRS QC Report updates (#113)
1 parent 2dd3848 commit 72e00b6

File tree

1 file changed

+52
-12
lines changed

1 file changed

+52
-12
lines changed

ImputationPipeline/AggregatePRSResults.wdl

Lines changed: 52 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ workflow AggregatePRSResults {
55
Array[File] results
66
Array[File] target_pc_projections
77
Array[File] missing_sites_shifts
8+
File high_risk_thresholds
89
File population_pc_projections
910
String population_name = "Reference Population"
1011
File expected_control_results
@@ -36,7 +37,8 @@ workflow AggregatePRSResults {
3637
batch_pivoted_results = AggregateResults.batch_pivoted_results,
3738
target_pc_projections = target_pc_projections,
3839
population_pc_projections = population_pc_projections,
39-
population_name = population_name
40+
population_name = population_name,
41+
high_risk_thresholds = high_risk_thresholds
4042
}
4143

4244
output {
@@ -67,7 +69,7 @@ task AggregateResults {
6769
library(ggplot2)
6870
6971
results <- c("~{sep='","' results}") %>% map(read_csv, col_types=cols(is_control_sample='l', .default='c')) %>% reduce(bind_rows)
70-
72+
7173
lab_batch <- results %>% pull(lab_batch) %>% unique()
7274
7375
if (length(lab_batch) != 1) {
@@ -89,7 +91,7 @@ task AggregateResults {
8991
9092
write_tsv(results %>% filter(is_control_sample), paste0(lab_batch, "_control_results.tsv"))
9193
92-
results_pivoted <- results %>% pivot_longer(!c(sample_id, lab_batch, is_control_sample), names_to=c("condition",".value"), names_pattern="([^_]+)_(.+)")
94+
results_pivoted <- results %>% filter(!is_control_sample) %>% pivot_longer(!c(sample_id, lab_batch, is_control_sample), names_to=c("condition",".value"), names_pattern="([^_]+)_(.+)")
9395
results_pivoted <- results_pivoted %T>% {options(warn=-1)} %>% mutate(adjusted = as.numeric(adjusted),
9496
raw = as.numeric(raw),
9597
percentile = as.numeric(percentile)) %T>% {options(warn=0)}
@@ -183,6 +185,7 @@ task BuildHTMLReport {
183185
File expected_control_results
184186
File batch_summarised_results
185187
File batch_pivoted_results
188+
File high_risk_thresholds
186189
Array[File] target_pc_projections
187190
File population_pc_projections
188191
String population_name
@@ -219,6 +222,32 @@ task BuildHTMLReport {
219222
batch_pivoted_results <- read_tsv("~{batch_pivoted_results}")
220223
batch_summary <- read_tsv("~{batch_summarised_results}")
221224
batch_summary <- batch_summary %>% rename_with(.cols = -condition, ~ str_to_title(gsub("_"," ", .x)))
225+
condition_thresholds <- read_tsv("~{high_risk_thresholds}")
226+
get_probs_n_high_per_sample_distribution <- function(thresholds_list) {
227+
probs_n_high <- tibble(n_high = seq(0,length(thresholds_list)), prob=c(1,rep(0,length(thresholds_list - 1))))
228+
for (threshold in thresholds_list) {
229+
new_probs_n_high <- probs_n_high %>% mutate(prob=prob*(threshold) + lag(prob, default=0)*(1-threshold))
230+
probs_n_high <- new_probs_n_high
231+
}
232+
233+
probs_n_high <- probs_n_high %>% pivot_wider(names_from = n_high, names_prefix = "prob_high_", values_from = prob) %>%
234+
mutate(thresholds = paste0(thresholds_list, collapse = ","))
235+
}
236+
237+
thresholds_sets <- batch_pivoted_results %>% filter(risk == "HIGH" | risk == "NOT_HIGH") %>% group_by(sample_id) %>% inner_join(condition_thresholds) %>%
238+
summarise(thresholds = list(sort(threshold))) %>% pull(thresholds) %>% unique() %>% map(get_probs_n_high_per_sample_distribution) %>%
239+
reduce(bind_rows) %>% mutate(across(-thresholds, ~ifelse(is.na(.), 0, .))) %>% pivot_longer(-thresholds, names_to = "n_high",
240+
names_prefix = "prob_high_",
241+
values_to="prob") %>%
242+
mutate(n_high = as.integer(n_high))
243+
244+
threshold_set_per_sample <- batch_pivoted_results %>% filter(risk == "HIGH" | risk == "NOT_HIGH") %>% group_by(sample_id) %>% inner_join(condition_thresholds) %>%
245+
summarise(thresholds = paste0(sort(threshold), collapse=",")) %>% inner_join(thresholds_sets)
246+
247+
multi_high_samples <- batch_pivoted_results %>% filter(risk=="HIGH") %>% group_by(sample_id) %>%
248+
summarise(\`high risk conditions\` = paste(condition, collapse = ","), n=n()) %>%
249+
filter(n>1) %>% inner_join(threshold_set_per_sample) %>% group_by(sample_id, \`high risk conditions\`, n, thresholds) %>% filter(n_high >= n) %>%
250+
summarise(significance=paste0(signif(qnorm(1-sum(prob)),2), "\\U03C3")) %>% select(-n,-thresholds)
222251
\`\`\`
223252
224253
\`\`\`{css, echo=FALSE}
@@ -247,14 +276,23 @@ task BuildHTMLReport {
247276
kable(batch_summary, digits = 2, escape = FALSE, format = "pandoc")
248277
\`\`\`
249278
250-
279+
## Samples High Risk for Multiple Conditions
280+
\`r if (multi_high_samples %>% nrow() == 0) {"No Samples were high risk for multiple conditions."} else {"The following samples were high risk for multiple conditions. Significance represents the likelihood that a sample scored for the same conditions as this sample would be high for at least as many conditions, assuming all conditions are uncorrelated."}\`
281+
\`\`\`{r multi high samples table, echo = FALSE, results = "asis", warning = FALSE }
282+
if (multi_high_samples %>% nrow() > 0) {
283+
kable(multi_high_samples, digits = 2, escape = FALSE, format = "pandoc") }
284+
\`\`\`
251285
252286
## Batch Score distribution
253287
\`\`\`{r score distributions, echo=FALSE, message=FALSE, warning=FALSE, results="asis", fig.align='center'}
254-
ggplot(batch_pivoted_results, aes(x=adjusted)) +
255-
geom_density(aes(color=condition), fill=NA, position = "identity") +
256-
xlim(-5,5) + theme_bw() + xlab("z-score") + geom_function(fun=dnorm) +
288+
normal_dist <- tibble(x=seq(-5,5,0.01)) %>% mutate(y=dnorm(x)) # needed because plotly doesn't work with geom_function
289+
conditions_with_more_than_4_samples <- batch_pivoted_results %>% group_by(condition) %>% filter(!is.na(adjusted)) %>% count() %>% filter(n>4) %>% pull(condition)
290+
p_dist <- ggplot(batch_pivoted_results %>% filter(condition %in% conditions_with_more_than_4_samples), aes(x=adjusted)) +
291+
stat_density(aes(color=condition, text=condition), geom="line", position = "identity") +
292+
xlim(-5,5) + theme_bw() + xlab("z-score") + geom_line(data=normal_dist, aes(x=x, y=y), color="black") +
293+
geom_point(data = batch_pivoted_results %>% filter(!(condition %in% conditions_with_more_than_4_samples)), aes(color=condition, x = adjusted, text=condition), y=0) +
257294
ylab("density")
295+
ggplotly(p_dist, tooltip="text")
258296
\`\`\`
259297
260298
## PCA
@@ -271,22 +309,24 @@ task BuildHTMLReport {
271309
\`\`\`
272310
273311
## Individual Sample Results (without control sample)
274-
\`\`\`{r sample results , echo = FALSE, results = "asis", warning = FALSE}
312+
\`\`\`{r sample results , echo = FALSE, results = "asis", warning = FALSE, message = FALSE}
313+
batch_high_counts_per_sample <- batch_pivoted_results %>% group_by(sample_id) %>% summarise(n_high_risk = sum(ifelse(!is.na(risk) & risk =="HIGH", 1, 0)))
275314
batch_results_table <- batch_pivoted_results %>% filter(!is_control_sample) %>% select(!is_control_sample) %>%
276315
mutate(across(!c(sample_id, lab_batch, reason_not_resulted, condition), ~kableExtra::cell_spec(gsub("_", " ", ifelse(is.na(as.numeric(.x)), ifelse(is.na(.x), 'SCORE NOT REQUESTED', .x), round(as.numeric(.x), 2))), color=ifelse(is.na(risk), "lightgrey", ifelse(risk=="NOT_RESULTED", "red", ifelse(risk == "HIGH", "orange", "green")))))) %>% # round numbers, color all by risk
277316
mutate(reason_not_resulted = ifelse(is.na(reason_not_resulted), reason_not_resulted, kableExtra::cell_spec(reason_not_resulted, color="red"))) %>% # reason not resulted always red if exists
278-
pivot_wider(id_cols = c(sample_id, lab_batch), names_from = condition, names_glue = "{condition}_{.value}", values_from = c(raw, adjusted, percentile, risk, reason_not_resulted)) # pivot to wide format
317+
pivot_wider(id_cols = c(sample_id, lab_batch), names_from = condition, names_glue = "{condition}_{.value}", values_from = c(raw, adjusted, percentile, risk, reason_not_resulted)) %>% # pivot to wide format
318+
inner_join(batch_high_counts_per_sample) # add number of high risk conditions for each sample
279319
280320
#order columns as desired
281-
cols <- batch_results_table %>% select(-sample_id, -lab_batch) %>% colnames()
321+
cols <- batch_results_table %>% select(-sample_id, -lab_batch, -n_high_risk) %>% colnames()
282322
desired_order_values <- c("raw", "adjusted", "percentile", "risk", "reason_not_resulted")
283-
col_order <- c("sample_id", "lab_batch", cols[order(sapply(stri_split_fixed(cols, "_", n=2), "[",1), match(sapply(stri_split_fixed(cols, "_", n=2), "[",2), desired_order_values))])
323+
col_order <- c("sample_id", "lab_batch", "n_high_risk", cols[order(sapply(stri_split_fixed(cols, "_", n=2), "[",1), match(sapply(stri_split_fixed(cols, "_", n=2), "[",2), desired_order_values))])
284324
batch_results_table <- batch_results_table %>% select(all_of(col_order)) %>%
285325
rename_with(.cols = ends_with("percentile"), .fn = ~gsub("_percentile", " %", .x,fixed=TRUE)) %>%
286326
rename_with(.cols = ends_with("adjusted"), .fn = ~gsub("_adjusted", "_adj", .x,fixed=TRUE))
287327
288328
all_cols = batch_results_table %>% colnames()
289-
risk_cols = which(endsWith(all_cols, "risk"))
329+
risk_cols = which(endsWith(all_cols, "risk") & all_cols != "n_high_risk")
290330
raw_cols = which(endsWith(all_cols, "raw"))
291331
adjusted_cols = which(endsWith(all_cols, "adj"))
292332
percentile_cols = which(endsWith(all_cols, "%"))

0 commit comments

Comments
 (0)