PRS check z-score before APOL1 correction (#131)

kachulis · web-flow · commit 8a3d78f28176 · 2023-07-25T16:08:06.000-04:00
diff --git a/ImputationPipeline/AggregatePRSResults.wdl b/ImputationPipeline/AggregatePRSResults.wdl
@@ -108,12 +108,9 @@ task AggregateResults {
     write_tsv(results %>% filter(is_control_sample), "~{output_prefix}_control_results.tsv")
 
     results_pivoted <- results %>% select(-starts_with("pc")) %>% filter(!is_control_sample) %>% pivot_longer(!c(sample_id, lab_batch, is_control_sample), names_to=c("condition",".value"), names_pattern="(.+)(?<!not)(?<!reason)_(.+)$")
-    results_pivoted <- results_pivoted %T>% {options(warn=-1)} %>% mutate(adjusted = as.numeric(adjusted),
-                                                                          raw = as.numeric(raw),
-                                                                          percentile = as.numeric(percentile)) %T>% {options(warn=0)}
 
     results_summarised <- results_pivoted %>% group_by(condition) %>%
-                                              summarise(across(c(adjusted,percentile), ~mean(.x, na.rm=TRUE), .names = "mean_{.col}"),
+                                              summarise(across(c(adjusted,percentile), ~mean(as.numeric(.x), na.rm=TRUE), .names = "mean_{.col}"),
                                                         num_samples=n(),
                                                         num_scored = sum(!is.na(risk)),
                                                         num_high = sum(risk=="HIGH", na.rm=TRUE),
@@ -122,7 +119,7 @@ task AggregateResults {
 
     write_tsv(results_summarised, "~{output_prefix}_summarised_results.tsv")
 
-    ggplot(results_pivoted, aes(x=adjusted)) +
+    ggplot(results_pivoted, aes(x=as.numeric(adjusted))) +
       geom_density(aes(color=condition), fill=NA, position = "identity") +
       xlim(-5,5) + theme_bw() + xlab("z-score") + geom_function(fun=dnorm) +
       ylab("density")
@@ -333,10 +330,10 @@ task BuildHTMLReport {
     p_dist <- ggplot()
     if (n_density > 0) {
       p_dist <- p_dist + stat_density(data = batch_pivoted_results %>% filter(condition %in% conditions_with_more_than_4_samples),
-      aes(color=condition, text=condition, x = adjusted), geom="line", position = "identity")
+      aes(color=condition, text=condition, x = as.numeric(adjusted)), geom="line", position = "identity")
     }
     if (n_point > 0) {
-      p_dist <- p_dist + geom_point(data = batch_pivoted_results %>% filter(!(condition %in% conditions_with_more_than_4_samples)), aes(color=condition, x = adjusted, text=condition), y=0)
+      p_dist <- p_dist + geom_point(data = batch_pivoted_results %>% filter(!(condition %in% conditions_with_more_than_4_samples)), aes(color=condition, x = as.numeric(adjusted), text=condition), y=0)
     }
     p_dist <- p_dist + xlim(-5,5) + theme_bw() + geom_line(data=normal_dist, aes(x=x, y=y), color="black") + xlab("z-score") + ylab("density")
 
diff --git a/ImputationPipeline/PRSWrapper.wdl b/ImputationPipeline/PRSWrapper.wdl
@@ -1,6 +1,6 @@
 version 1.0
 import "ScoringPart.wdl" as Score
-import "CKDRiskAdjustment.wdl" as CKDRiskAdjustment
+import "CKDRiskAdjustment.wdl" as CKDRiskAdjustmentWF
 import "Structs.wdl"
 
 workflow PRSWrapper {
@@ -37,22 +37,28 @@ workflow PRSWrapper {
       }
 
       if (condition_resource.named_weight_set.condition_name == "ckd") {
-        call CKDRiskAdjustment.CKDRiskAdjustment {
+        call CKDRiskAdjustmentWF.CKDRiskAdjustment {
           input:
             adjustedScores = select_first([ScoringImputedDataset.adjusted_array_scores]),
             vcf = vcf,
             risk_alleles = ckd_risk_alleles
         }
       }
 
+      call CheckZScoreAgainstReportableRange {
+        input:
+          score_result = select_first([ScoringImputedDataset.adjusted_array_scores]),
+          z_score_reportable_range = z_score_reportable_range
+      }
 
       call SelectValuesOfInterest {
         input:
           score_result = select_first([CKDRiskAdjustment.adjusted_scores_with_apol1, ScoringImputedDataset.adjusted_array_scores]),
           sample_id = sample_id,
           condition_name = condition_resource.named_weight_set.condition_name,
           threshold = condition_resource.percentile_threshold,
-          z_score_reportable_range = z_score_reportable_range
+          z_score_reportable_range = z_score_reportable_range,
+          out_of_reportable_range = CheckZScoreAgainstReportableRange.out_of_reportable_range
       }
     }
 
@@ -94,12 +100,44 @@ workflow PRSWrapper {
 }
 
 
+task CheckZScoreAgainstReportableRange {  # Records whether abs(adjusted_score) of a one-row score file exceeds the reportable range.
+  input {
+    File score_result  # TSV parsed with read_tsv; must hold exactly one row and an adjusted_score column
+    Float z_score_reportable_range  # threshold that abs(adjusted_score) is compared against
+  }
+
+  command <<<
+    Rscript - <<- "EOF"
+    library(dplyr)
+    library(readr)
+    score <- read_tsv("~{score_result}")
+    if (nrow(score) != 1) {
+      stop("score_result must contain exactly one row, found ", nrow(score), call. = FALSE)  # still a non-zero exit, now with a reason on stderr
+    }
+    adjusted_score <- (score %>% pull(adjusted_score))[[1]]
+    if (is.na(adjusted_score)) stop("adjusted_score is NA; cannot check the reportable range", call. = FALSE)  # avoid writing NA into the boolean file
+    write(abs(adjusted_score) > ~{z_score_reportable_range}, "out_of_reportable_range.bool")  # file content is TRUE or FALSE
+    EOF
+  >>>
+
+  runtime {
+    docker: "rocker/tidyverse@sha256:aaace6c41a258e13da76881f0b282932377680618fcd5d121583f9455305e727"
+    disks: "local-disk 100 HDD"
+    memory: "4 GB"
+  }
+
+  output {
+    Boolean out_of_reportable_range = read_boolean("out_of_reportable_range.bool")  # value written by the R script above
+  }
+}
+
 task SelectValuesOfInterest {
   input {
     File score_result
     String sample_id
     String condition_name
     Float threshold
+    Boolean out_of_reportable_range
     Float z_score_reportable_range
   }
 
@@ -120,24 +158,21 @@ task SelectValuesOfInterest {
     percentile <- (score %>% pull(percentile))[[1]]
     risk <- ifelse(percentile > ~{threshold}, "HIGH", "NOT_HIGH")
 
-    raw_score_output <- ifelse(abs(adjusted_score) > ~{z_score_reportable_range}, "NOT_RESULTED", raw_score)
-    adjusted_score_output <- ifelse(abs(adjusted_score) > ~{z_score_reportable_range}, "NOT_RESULTED", adjusted_score)
-    percentile_output <- ifelse(abs(adjusted_score) > ~{z_score_reportable_range}, "NOT_RESULTED", percentile)
-    risk_output <- ifelse(abs(adjusted_score) > ~{z_score_reportable_range}, "NOT_RESULTED", risk)
-    reason_not_resulted <- ifelse(abs(adjusted_score) > ~{z_score_reportable_range},
-                                ifelse(adjusted_score > 0, paste("Z-SCORE ABOVE + ", ~{z_score_reportable_range}),
-                                                           paste("Z-SCORE BELOW - ", ~{z_score_reportable_range})
-                                      ),
+    raw_score_output <- ~{if out_of_reportable_range then '"NOT_RESULTED"' else 'raw_score'}
+    adjusted_score_output <- ~{if out_of_reportable_range then '"NOT_RESULTED"' else 'adjusted_score'}
+    percentile_output <- ~{if out_of_reportable_range then '"NOT_RESULTED"' else 'percentile'}
+    risk_output <- ~{if out_of_reportable_range then '"NOT_RESULTED"' else 'risk'}
+    reason_not_resulted <- ~{if out_of_reportable_range then
+                                'ifelse(adjusted_score > 0, "Z-SCORE ABOVE + ' + z_score_reportable_range +'", "Z-SCORE BELOW - ' + z_score_reportable_range +'")' else
                                 "NA"
-                                )
+                           }
 
     result <- tibble(sample_id = "~{sample_id}", ~{condition_name}_raw = raw_score_output,
                                                  ~{condition_name}_adjusted = adjusted_score_output,
                                                  ~{condition_name}_percentile = percentile_output,
                                                  ~{condition_name}_risk = risk_output,
                                                  ~{condition_name}_reason_not_resulted = reason_not_resulted)
     write_csv(result, "results.csv")
-
     EOF
   >>>