Skip to content

Commit fd1b744

Browse files
authored
Partial Aggregation Set handling, refactor, and testing (#114)
1 parent 3adc0f1 commit fd1b744

32 files changed

+2994
-1110
lines changed

ImputationPipeline/AggregatePRSResults.wdl

Lines changed: 26 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ workflow AggregatePRSResults {
1010
String population_name = "Reference Population"
1111
File expected_control_results
1212
String lab_batch
13+
Int group_n
1314
}
1415

1516
call AggregateResults {
@@ -57,8 +58,10 @@ task AggregateResults {
5758
Array[File] results
5859
Array[File] missing_sites_shifts
5960
String lab_batch
61+
Int group_n
6062
}
6163

64+
String output_prefix = lab_batch + if group_n > 1 then "_group_" + group_n else ""
6265
command <<<
6366
Rscript - <<- "EOF"
6467
library(dplyr)
@@ -87,9 +90,9 @@ task AggregateResults {
8790
stop(paste0("There are ", num_control_samples, " control samples in the input tables, however, only 1 is expected."))
8891
}
8992
90-
write_tsv(results, paste0(lab_batch, "_all_results.tsv"))
93+
write_tsv(results, "~{output_prefix}_all_results.tsv")
9194
92-
write_tsv(results %>% filter(is_control_sample), paste0(lab_batch, "_control_results.tsv"))
95+
write_tsv(results %>% filter(is_control_sample), "~{output_prefix}_control_results.tsv")
9396
9497
results_pivoted <- results %>% filter(!is_control_sample) %>% pivot_longer(!c(sample_id, lab_batch, is_control_sample), names_to=c("condition",".value"), names_pattern="([^_]+)_(.+)")
9598
results_pivoted <- results_pivoted %T>% {options(warn=-1)} %>% mutate(adjusted = as.numeric(adjusted),
@@ -104,20 +107,20 @@ task AggregateResults {
104107
num_not_high = sum(risk=="NOT_HIGH", na.rm=TRUE),
105108
num_not_resulted = sum(risk=="NOT_RESULTED", na.rm = TRUE))
106109
107-
write_tsv(results_summarised, paste0(lab_batch, "_summarised_results.tsv"))
110+
write_tsv(results_summarised, "~{output_prefix}_summarised_results.tsv")
108111
109112
ggplot(results_pivoted, aes(x=adjusted)) +
110113
geom_density(aes(color=condition), fill=NA, position = "identity") +
111114
xlim(-5,5) + theme_bw() + xlab("z-score") + geom_function(fun=dnorm) +
112115
ylab("density")
113-
ggsave(filename = paste0(lab_batch, "_score_distribution.png"), dpi=300, width = 6, height = 6)
116+
ggsave(filename = "~{output_prefix}_score_distribution.png", dpi=300, width = 6, height = 6)
114117
115-
write_tsv(results_pivoted, paste0(lab_batch, "_pivoted_results.tsv"))
118+
write_tsv(results_pivoted, "~{output_prefix}_pivoted_results.tsv")
116119
117120
writeLines(lab_batch, "lab_batch.txt")
118121
119122
missing_sites_shifts <- c("~{sep='","' missing_sites_shifts}") %>% map(read_tsv) %>% reduce(bind_rows)
120-
write_tsv(missing_sites_shifts, paste0(lab_batch, "_missing_sites_shifts.tsv"))
123+
write_tsv(missing_sites_shifts, "~{output_prefix}_missing_sites_shifts.tsv")
121124
122125
EOF
123126
>>>
@@ -129,12 +132,12 @@ task AggregateResults {
129132
}
130133
131134
output {
132-
File batch_all_results = "~{lab_batch}_all_results.tsv"
133-
File batch_control_results = "~{lab_batch}_control_results.tsv"
134-
File batch_summarised_results = "~{lab_batch}_summarised_results.tsv"
135-
File batch_pivoted_results = "~{lab_batch}_pivoted_results.tsv"
136-
File batch_score_distribution = "~{lab_batch}_score_distribution.png"
137-
File batch_missing_sites_shifts = "~{lab_batch}_missing_sites_shifts.tsv"
135+
File batch_all_results = "~{output_prefix}_all_results.tsv"
136+
File batch_control_results = "~{output_prefix}_control_results.tsv"
137+
File batch_summarised_results = "~{output_prefix}_summarised_results.tsv"
138+
File batch_pivoted_results = "~{output_prefix}_pivoted_results.tsv"
139+
File batch_score_distribution = "~{output_prefix}_score_distribution.png"
140+
File batch_missing_sites_shifts = "~{output_prefix}_missing_sites_shifts.tsv"
138141
}
139142
}
140143
@@ -143,9 +146,11 @@ task PlotPCA {
143146
Array[File] target_pc_projections
144147
File population_pc_projections
145148
String lab_batch
149+
Int group_n
146150
String population_name
147151
}
148152
153+
String output_prefix = lab_batch + if group_n > 1 then "_group_" + group_n else ""
149154
command <<<
150155
Rscript - <<- "EOF"
151156
library(dplyr)
@@ -161,7 +166,7 @@ task PlotPCA {
161166
geom_point(data=target_pcs, aes(color="~{lab_batch}")) +
162167
theme_bw()
163168
164-
ggsave(filename = "~{lab_batch}_PCA_plot.png", dpi=300, width = 6, height = 6)
169+
ggsave(filename = "~{output_prefix}_PCA_plot.png", dpi=300, width = 6, height = 6)
165170
166171
EOF
167172
@@ -174,7 +179,7 @@ task PlotPCA {
174179
}
175180
176181
output {
177-
File pc_plot = "~{lab_batch}_PCA_plot.png"
182+
File pc_plot = "~{output_prefix}_PCA_plot.png"
178183
}
179184
}
180185
@@ -190,14 +195,17 @@ task BuildHTMLReport {
190195
File population_pc_projections
191196
String population_name
192197
String lab_batch
198+
Int group_n
193199
}
194200
201+
String output_prefix = lab_batch + if group_n > 1 then "_group_" + group_n else ""
202+
String title_batch = lab_batch + if group_n > 1 then " (group " + group_n + ")" else ""
195203
command <<<
196204
set -xeo pipefail
197205
198-
cat << EOF > ~{lab_batch}_report.Rmd
206+
cat << EOF > ~{output_prefix}_report.Rmd
199207
---
200-
title: "Batch ~{lab_batch} PRS Summary"
208+
title: "Batch ~{title_batch} PRS Summary"
201209
output:
202210
html_document:
203211
df_print: paged
@@ -386,7 +394,7 @@ task BuildHTMLReport {
386394
\`\`\`
387395
EOF
388396
389-
Rscript -e "library(rmarkdown); rmarkdown::render('~{lab_batch}_report.Rmd', 'html_document')"
397+
Rscript -e "library(rmarkdown); rmarkdown::render('~{output_prefix}_report.Rmd', 'html_document')"
390398
>>>
391399
392400
runtime {
@@ -396,6 +404,6 @@ task BuildHTMLReport {
396404
}
397405
398406
output {
399-
File report = "~{lab_batch}_report.html"
407+
File report = "~{output_prefix}_report.html"
400408
}
401409
}
Lines changed: 230 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,230 @@
1+
import firecloud.api as fapi
2+
import argparse
3+
from datetime import datetime
4+
import pytz
5+
from dataclasses import dataclass, field
6+
from io import StringIO
7+
8+
9+
@dataclass
class AggregationSet:
    """A lab_batch's aggregation set within a Terra workspace.

    Attributes:
        lab_batch: Name of the lab batch the set belongs to.
        group: 1-based group number within the lab batch; group 1 is the
            original (unsuffixed) set.
        delivered: Whether this set's results have already been delivered.
        contains_control: Whether the set already includes a control sample.
        set_id: Derived identifier; computed in __post_init__, never passed in.
    """
    lab_batch: str
    group: int = 1
    delivered: bool = False
    contains_control: bool = False
    set_id: str = field(init=False)

    def __post_init__(self):
        # Reject nonsensical group numbers up front.
        if self.group < 1:
            raise RuntimeError(
                f'Group of aggregation set for lab_batch {self.lab_batch} is {self.group}, should not be less than 1')
        # Group 1 keeps the bare lab_batch name; later groups get a suffix.
        suffix = f'_group_{self.group}' if self.group > 1 else ''
        self.set_id = self.lab_batch + suffix
25+
26+
27+
def pre_existing_aggregation_set(lab_batch, group, delivered):
    """Build an AggregationSet representing a set already present in the workspace.

    Pre-existing sets always contain their control sample, so
    contains_control is fixed to True.
    """
    return AggregationSet(lab_batch, group, delivered, contains_control=True)
29+
30+
31+
def next_aggregation_set(agg_set):
    """Return a fresh, undelivered AggregationSet for the group after agg_set's."""
    return AggregationSet(agg_set.lab_batch, agg_set.group + 1)
34+
35+
36+
class GroupBuilder:
    """Groups ungrouped (or reworked) samples in a Terra workspace into
    per-lab_batch aggregation sets.

    Talks to the Terra/FireCloud API (via ``fapi``); network errors surface
    as RuntimeError carrying the API response text.
    """

    def __init__(self, workspace_namespace, workspace_name):
        """Discover the entity tables available in the workspace.

        Raises:
            RuntimeError: if the list-entity-types API call fails.
        """
        self.workspace_namespace = workspace_namespace
        self.workspace_name = workspace_name

        print('Finding tables to group by lab_batch')
        entity_types_response = fapi.list_entity_types(self.workspace_namespace, self.workspace_name)
        if not entity_types_response.ok:
            raise RuntimeError(f'ERROR: {entity_types_response.text}')
        self.entity_types_dict = entity_types_response.json()
        self.available_tables = self.entity_types_dict.keys()

    def build_groups(self):
        """Run grouping for every table that has the columns grouping needs."""
        for table_name, description in self.entity_types_dict.items():
            # Only tables recording both control status and lab batch can be grouped.
            if all(x in description['attributeNames'] for x in ['is_control_sample', 'lab_batch']):
                self.group_samples_into_batches(table_name)

    def group_samples_into_batches(self, table_name):
        """Assign new/reworked samples of one table into aggregation sets.

        Reads the existing ``{table_name}_set`` table (if any) to learn which
        samples are already aggregated and what each lab_batch's latest set
        is, then builds membership and rework-reset TSVs in memory and
        uploads them. Sets lacking a control sample are deferred.

        Raises:
            RuntimeError: on any failed API call, on duplicate control
                samples within a lab_batch, or if an undelivered set has a
                later set after it.
        """
        samples_already_in_aggregation_sets = set()  # samples already in any aggregation set
        lab_batch_sample_sets_dict = dict()  # lab_batch -> highest-group aggregation set seen

        if f'{table_name}_set' in self.available_tables:
            # Download the current set table to learn existing memberships.
            print(f'Downloading {table_name}_set table...')
            sample_set_response = fapi.get_entities(self.workspace_namespace, self.workspace_name, f'{table_name}_set')
            if not sample_set_response.ok:
                raise RuntimeError(f'ERROR: {sample_set_response.text}')
            sample_sets_dict = sample_set_response.json()

            for sample_set in sample_sets_dict:
                samples = [e['entityName'] for e in sample_set['attributes'][f'{table_name}s']['items']]
                samples_already_in_aggregation_sets.update(samples)

            for sample_set in sample_sets_dict:
                attributes = sample_set['attributes']
                lab_batch = attributes['lab_batch']
                # Pre-existing sets always contain their control sample.
                this_aggregation_set = AggregationSet(lab_batch, attributes['group'], attributes['delivered'], True)
                if lab_batch in lab_batch_sample_sets_dict:
                    if lab_batch_sample_sets_dict[lab_batch].group < this_aggregation_set.group:
                        # A later group existing implies the earlier one was delivered.
                        if not lab_batch_sample_sets_dict[lab_batch].delivered:
                            raise RuntimeError(
                                f'Aggregation set {lab_batch_sample_sets_dict[lab_batch].set_id}'
                                f' has not been delivered, '
                                f'but later set {this_aggregation_set.set_id} also exists')
                        lab_batch_sample_sets_dict[lab_batch] = this_aggregation_set
                else:
                    lab_batch_sample_sets_dict[lab_batch] = this_aggregation_set

        # Read samples from samples table
        print(f'Reading {table_name} table...')
        sample_response = fapi.get_entities(self.workspace_namespace, self.workspace_name, f'{table_name}')
        if not sample_response.ok:
            raise RuntimeError(f'ERROR: {sample_response.text}')

        samples = sample_response.json()
        # Build the new membership TSV in memory.
        added_sample_sets_dict = dict()  # lab_batch -> aggregation set receiving new samples
        control_samples_dict = dict()  # lab_batch -> sample id of its control sample
        added_samples_dict = dict()  # set_id -> list of samples to add to that set
        with StringIO() as new_membership_io, \
                StringIO() as samples_updated_io:
            # Headers for the membership and rework-reset TSVs.
            new_membership_io.write(f'membership:{table_name}_set_id\t{table_name}\n')
            samples_updated_io.write(f'entity:{table_name}_id\trework\n')
            for sample in samples:
                if 'lab_batch' not in sample['attributes']:
                    continue  # sample not yet assigned to a lab batch; skip
                sample_name = sample['name']
                lab_batch = sample['attributes']['lab_batch']
                is_control_sample = sample['attributes']['is_control_sample']
                rework = sample['attributes'].get('rework', False)
                if is_control_sample:
                    # More than one control per lab batch is an error.
                    if lab_batch in control_samples_dict:
                        raise RuntimeError(
                            f'Multiple control samples for lab_batch {lab_batch}: {sample_name}, '
                            f'{control_samples_dict[lab_batch]}')
                    control_samples_dict[lab_batch] = sample_name
                    # Aggregation sets are not created for the control alone;
                    # controls are appended to newly built sets later.
                    continue
                if rework or sample_name not in samples_already_in_aggregation_sets:
                    # This (non-control) sample needs an aggregation set.
                    # Find (or create) the set it should join.
                    if lab_batch not in added_sample_sets_dict:
                        if lab_batch in lab_batch_sample_sets_dict:
                            previous_aggregation_set = lab_batch_sample_sets_dict[lab_batch]
                            if previous_aggregation_set.delivered:
                                # Delivered sets are immutable; start the next group.
                                added_sample_sets_dict[lab_batch] = next_aggregation_set(previous_aggregation_set)
                            else:
                                # Undelivered set can still accept samples.
                                added_sample_sets_dict[lab_batch] = previous_aggregation_set
                        else:
                            # First aggregation set for this lab batch.
                            added_sample_sets_dict[lab_batch] = AggregationSet(lab_batch)
                    set_id = added_sample_sets_dict[lab_batch].set_id
                    added_samples_dict.setdefault(set_id, []).append(sample_name)
            # Write membership rows only for sets that have a control sample.
            lab_batches_without_controls = list()
            for lab_batch, agg_set in added_sample_sets_dict.items():
                set_id = agg_set.set_id
                if agg_set.contains_control:
                    # Set already contains its control; just add the new samples.
                    for sample in added_samples_dict[set_id]:
                        new_membership_io.write(f'{set_id}\t{sample}\n')
                        samples_updated_io.write(f'{sample}\tfalse\n')
                elif lab_batch in control_samples_dict:
                    # Found a control sample, so the set can be created;
                    # add the control to the set alongside the new samples.
                    added_samples_dict[set_id].append(control_samples_dict[lab_batch])
                    for sample in added_samples_dict[set_id]:
                        new_membership_io.write(f'{set_id}\t{sample}\n')
                        samples_updated_io.write(f'{sample}\tfalse\n')
                else:
                    # No control sample yet; defer aggregating this set.
                    del added_samples_dict[set_id]
                    lab_batches_without_controls.append(lab_batch)
            for lab_batch in lab_batches_without_controls:
                del added_sample_sets_dict[lab_batch]
            if len(added_samples_dict) == 0:
                print(f'No new {table_name}_sets to be added.')
            else:
                if f'{table_name}_set' not in self.available_tables:
                    print(f'Creating new table {table_name}_set')
                    # A set table must exist before membership rows can be uploaded.
                    with StringIO() as new_sample_sets_io:
                        new_sample_sets_io.write(f'entity:{table_name}_set_id\n')
                        for set_id in added_samples_dict:
                            new_sample_sets_io.write(f'{set_id}\n')
                        upload_new_table_response = fapi.upload_entities_tsv(self.workspace_namespace,
                                                                             self.workspace_name,
                                                                             new_sample_sets_io,
                                                                             "flexible")
                        if not upload_new_table_response.ok:
                            raise RuntimeError(f'ERROR: {upload_new_table_response.text}')
                print(f'Uploading new {table_name}_set table... ')
                upload_response = fapi.upload_entities_tsv(self.workspace_namespace, self.workspace_name,
                                                           new_membership_io,
                                                           "flexible")
                if not upload_response.ok:
                    raise RuntimeError(f'ERROR: {upload_response.text}')
                # Stamp creation metadata onto each newly touched set.
                print(f'Adding date and time to newly created {table_name}_sets...')

                now = str(datetime.now(pytz.timezone('US/Eastern')))
                for i, this_aggregation_set in enumerate(added_sample_sets_dict.values()):
                    update_response = fapi.update_entity(self.workspace_namespace, self.workspace_name,
                                                         f'{table_name}_set', this_aggregation_set.set_id,
                                                         [{"op": "AddUpdateAttribute",
                                                           "attributeName": "time_sample_set_updated",
                                                           "addUpdateAttribute": now},
                                                          {"op": "AddUpdateAttribute", "attributeName": "delivered",
                                                           "addUpdateAttribute": False},
                                                          {"op": "AddUpdateAttribute", "attributeName": "redeliver",
                                                           "addUpdateAttribute": False},
                                                          {"op": "AddUpdateAttribute", "attributeName": "group",
                                                           "addUpdateAttribute": this_aggregation_set.group},
                                                          {"op": "AddUpdateAttribute", "attributeName": "lab_batch",
                                                           "addUpdateAttribute": this_aggregation_set.lab_batch}
                                                          ])
                    if not update_response.ok:
                        raise RuntimeError(f'ERROR: {update_response.text}')
                    print(f' Completed {i + 1}/{len(added_samples_dict)}')

                # Clear the rework flag on every sample just (re)aggregated.
                print(f'Updating rework field in {table_name} table')
                upload_sample_rework_response = fapi.upload_entities_tsv(self.workspace_namespace, self.workspace_name,
                                                                         samples_updated_io,
                                                                         "flexible")
                if not upload_sample_rework_response.ok:
                    raise RuntimeError(f'ERROR: {upload_sample_rework_response.text}')
                print('SUCCESS')
                print(f'Printing update {table_name}_set_membership.tsv:')
                print(new_membership_io.getvalue())
218+
219+
220+
def run(workspace_namespace, workspace_name):
    """Build aggregation sets for every eligible table in the given workspace."""
    GroupBuilder(workspace_namespace, workspace_name).build_groups()
223+
224+
225+
if __name__ == '__main__':
    # Command-line entry point: both workspace coordinates are required.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--workspace_namespace", dest="workspace_namespace", required=True)
    arg_parser.add_argument("--workspace_name", dest="workspace_name", required=True)
    parsed_args = arg_parser.parse_args()
    run(parsed_args.workspace_namespace, parsed_args.workspace_name)

ImputationPipeline/CreateAggregationSets/__init__.py

Whitespace-only changes.
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
[build-system]
2+
requires = ["setuptools>=61.0", "setuptools_scm[toml]>=6.2"]
3+
build-backend = "setuptools.build_meta"
4+
5+
[tool.setuptools_scm]
6+
write_to = "ImputationPipeline/CreateAggregationSets/_version.py"
7+
root = "../../"
8+
9+
[project]
10+
name = "CreateAggregationSets"
11+
dynamic = ["version"]
12+
dependencies = ['firecloud >= 0.16.33', 'pytz >= 2022.2.1']
13+
requires-python = ">=3.7"

0 commit comments

Comments
 (0)