nasa
diff --git a/‎Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114.md
Lines changed: 10 additions & 31 deletions b/‎Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114.md
Lines changed: 10 additions & 31 deletions
diff --git a/‎Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/CHANGELOG.md
Lines changed: 21 additions & 0 deletions b/‎Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/CHANGELOG.md
Lines changed: 21 additions & 0 deletions
diff --git a/‎Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/README.md
Lines changed: 23 additions & 10 deletions b/‎Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/README.md
Lines changed: 23 additions & 10 deletions
diff --git a/‎Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/Affymetrix.qmd
Lines changed: 5 additions & 5 deletions b/‎Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/Affymetrix.qmd
Lines changed: 5 additions & 5 deletions
diff --git a/‎Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/dp_tools__affymetrix/checks.py
Lines changed: 2 additions & 2 deletions b/‎Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/dp_tools__affymetrix/checks.py
Lines changed: 2 additions & 2 deletions
diff --git a/‎Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/config/default.config
Lines changed: 24 additions & 5 deletions b/‎Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/config/default.config
Lines changed: 24 additions & 5 deletions
@@ -66,7 +66,7 @@ Lauren Sanders (acting GeneLab Project Scientist)
 |biomaRt|2.50.0|[https://bioconductor.org/packages/3.14/bioc/html/biomaRt.html](https://bioconductor.org/packages/3.14/bioc/html/biomaRt.html)|
 |matrixStats|0.63.0|[https://github.com/HenrikBengtsson/matrixStats](https://github.com/HenrikBengtsson/matrixStats)|
 |statmod|1.5.0|[https://github.com/cran/statmod](https://github.com/cran/statmod)|
-|dp_tools|1.3.2|[https://github.com/J-81/dp_tools](https://github.com/J-81/dp_tools)|
+|dp_tools|1.3.4|[https://github.com/J-81/dp_tools](https://github.com/J-81/dp_tools)|
 |singularity|3.9|[https://sylabs.io](https://sylabs.io)|
 |Quarto|1.1.251|[https://quarto.org](https://quarto.org)|
 
@@ -295,7 +295,7 @@ legend("topright", legend = colnames(raw_data@assayData$exprs),
         lty = c(1,2,3,4,5), # Seems like oligo::hist cycles through these first five line types
         col = oligo::darkColors(n = ncol(raw_data)), # Ensure legend color is in sync with plot
         ncol = number_of_sets, # Set number of columns by number of sets
-        cex = 1 + 0.2 - (number_of_sets*0.2) # Reduce scale by 20% for each column beyond 1
+        cex = max(0.35, 1 + 0.2 - (number_of_sets*0.2)) # Reduce scale by 20% for each column beyond 1 with minimum of 35%
       )
 
 # Reset par
@@ -478,7 +478,7 @@ legend("topright", legend = colnames(norm_data@assayData$exprs),
         lty = c(1,2,3,4,5), # Seems like oligo::hist cycles through these first five line types
         col = oligo::darkColors(n = ncol(norm_data)), # Ensure legend color is in sync with plot
         ncol = number_of_sets, # Set number of columns by number of sets
-        cex = 1 + 0.2 - (number_of_sets*0.2) # Reduce scale by 20% for each column beyond 1
+        cex = max(0.35, 1 + 0.2 - (number_of_sets*0.2)) # Reduce scale by 20% for each column beyond 1 with minimum of 35%
       )
 
 # Reset par
@@ -618,21 +618,6 @@ shortenedOrganismName <- function(long_name) {
   return(short_name)
 }
 
-
-# locate dataset
-expected_dataset_name <- shortenedOrganismName(unique(df_rs$organism)) %>% stringr::str_c("_gene_ensembl")
-print(paste0("Expected dataset name: '", expected_dataset_name, "'"))
-
-
-# Specify Ensembl version used in current GeneLab reference annotations
-ENSEMBL_VERSION <- '107'
-
-ensembl <- biomaRt::useEnsembl(biomart = "genes", 
-                               dataset = expected_dataset_name,
-                               version = ENSEMBL_VERSION)
-print(ensembl)
-
-
 getBioMartAttribute <- function(df_rs) {
   #' Returns resolved biomart attribute source from runsheet
 
@@ -724,14 +709,6 @@ if (organism %in% c("athaliana")) {
 
   probe_ids <- rownames(probeset_level_data)
 
-  # DEBUG:START
-  if ( is.integer(params$DEBUG_limit_biomart_query) ) {
-    warning(paste("DEBUG MODE: Limiting query to", params$DEBUG_limit_biomart_query, "entries"))
-    message(paste("DEBUG MODE: Limiting query to", params$DEBUG_limit_biomart_query, "entries"))
-    probe_ids <- probe_ids[1:params$DEBUG_limit_biomart_query]
-  }
-  # DEBUG:END
-
   # Create probe map
   # Run Biomart Queries in chunks to prevent request timeouts
   #   Note: If timeout is occurring (possibly due to larger load on biomart), reduce chunk size
@@ -764,7 +741,6 @@ listToUniquePipedString <- function(str_list) {
 }
 
 unique_probe_ids <- df_mapping %>% 
-                      # note: '!!sym(VAR)' syntax allows usage of variable 'VAR' in dplyr functions due to NSE. ref: https://dplyr.tidyverse.org/articles/programming.html # NON_DPPD
                       dplyr::mutate(dplyr::across(!!sym(expected_attribute_name), as.character)) %>% # Ensure probeset ids treated as character type
                       dplyr::group_by(!!sym(expected_attribute_name)) %>% 
                       dplyr::summarise(
@@ -1217,14 +1193,17 @@ norm_data_matrix_annotated <- oligo::exprs(norm_data) %>%
   dplyr::rename( ProbesetID = man_fsetid ) %>% # Rename from getProbeInfo name to ProbesetID
   dplyr::rename( ProbeID = fid ) %>% # Rename from getProbeInfo name to ProbeID
   dplyr::left_join(unique_probe_ids, by = c("ProbesetID" = expected_attribute_name ) ) %>%
-  dplyr::left_join(annot, by = "ENSEMBL") %>% # Join with GeneLab Reference Annotation Table
-  dplyr::mutate( count_ENSEMBL_mappings = ifelse(is.na(ENSEMBL), 0, count_ENSEMBL_mappings) ) # Convert NA mapping to 0
+  dplyr::left_join(annot, by = c("ENSEMBL" = map_primary_keytypes[[unique(df_rs$organism)]])) %>% # Join with GeneLab Reference Annotation Table using key name expected in organism specific annotation table
+  dplyr::mutate( count_ENSEMBL_mappings = ifelse(is.na(ENSEMBL), 0, count_ENSEMBL_mappings) ) %>% # Convert NA mapping to 0
+  dplyr::rename( !!map_primary_keytypes[[unique(df_rs$organism)]] := ENSEMBL ) 
+
 
 
 norm_data_matrix_annotated <- norm_data_matrix_annotated %>% 
   dplyr::relocate(dplyr::all_of(FINAL_COLUMN_ORDER))
 
 write.csv(norm_data_matrix_annotated, file.path(DIR_NORMALIZED_EXPRESSION, "normalized_intensities_probe.csv"), row.names = FALSE)
+
 ```
 
 **Input Data:**
@@ -1237,8 +1216,8 @@ write.csv(norm_data_matrix_annotated, file.path(DIR_NORMALIZED_EXPRESSION, "norm
 
 **Output Data:**
 
-- **differential_expression.csv** (table containing normalized probeset expression values for each sample, group statistics, Limma probe DE results for each pairwise comparison, and gene annotations. The ProbesetID is the unique index column.)
+- **differential_expression.csv** (table containing normalized probeset expression values for each sample, group statistics, Limma probeset DE results for each pairwise comparison, and gene annotations. The ProbesetID is the unique index column.)
 - **normalized_expression_probeset.csv** (table containing the background corrected, normalized probeset expression values for each sample. The ProbesetID is the unique index column.)
 - visualization_PCA_table.csv (file used to generate GeneLab PCA plots)
 - **raw_intensities_probe.csv** (table containing the background corrected, unnormalized probe intensity values for each sample including gene annotations. The ProbeID is the unique index column.)
-- **normalized_intensities_probe.csv** (table containing the background corrected, normalized probe intensity values for each sample including gene annotations.  The ProbeID is the unique index column.)
+- **normalized_intensities_probe.csv** (table containing the background corrected, normalized probe intensity values for each sample including gene annotations.  The ProbeID is the unique index column.)
@@ -5,6 +5,27 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [1.0.2](https://github.com/asaravia-butler/GeneLab_Data_Processing/tree/NF_MAAffymetrix_1.0.2/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix)
+
+### Added
+
+- Workflow now produces a file called meta.sh (in the 'GeneLab' sub-directory) that contains information about the workflow run. This file is used by the post processing workflow to generate a protocol description. (5a8a255)
+- POST_PROCESSING will now generate a protocol description using the contents of meta.sh and text templates. (801e2ad)
+- Workflow can now be run using an ISA archive by supplying parameter: 'isaArchivePath' (as either a local path or public web uri) (8822069)
+
+### Changed
+
+- Update dp_tools from 1.3.2 to 1.3.4 (158ce5e)
+  - This updates the POST_PROCESSING workflow assay table to join multiple files by ',' instead of ',<SPACE>' and enables max flag code setting.
+- Slightly reduced stringency in V&V check for log2fc computation to account for rounding errors, specifically from 99.9% of rows within tolerance to 99.5%. (9fd2c11)
+- Publish directory behavior reworked to use the OSD and GLDS accessions as part of the default name. The parameter for setting the published files directory is now `resultsDir` instead of `outputDir`. (97cba72)
+
+### Fixed
+
+- Halt level flags now properly trigger workflow halt. (0885175)
+- Boxplots now show all y-axis labels when working with many samples. (7ec10d4)
+- Density plot legend cex (character expansion) now has a minimum of 0.35 (rather than raising an exception for very large numbers of samples) (9a54fdc)
+
 ## [1.0.1](https://github.com/asaravia-butler/GeneLab_Data_Processing/tree/NF_MAAffymetrix_1.0.1/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix) - 2023-04-28
 
 ### Added
 
@@ -36,7 +36,7 @@ document](../../Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114.md):
 
        |Flag Codes|Flag Name|Interpretation|
        |:---------|:--------|:-------------|
-       | 2    | MANUAL   | Special flag that indicates a manual check that is advised. Often used to advise what assess in QA plots. |
+       | 2    | MANUAL   | Special flag that indicates a manual check that is advised. Often used to advise what should be visually assessed in QA plots. |
        | 20    | GREEN   | Indicates the check passed all validation conditions |
        | 30    | YELLOW  | Indicates the check was flagged for minor issues (e.g. slight outliers) |
        | 50    | RED     | Indicates the check was flagged for moderate issues (e.g. major outliers) |
@@ -54,6 +54,7 @@ document](../../Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114.md):
 - [3. Run the Workflow](#3-run-the-workflow)
   - [3a. Approach 1: Run the workflow on a GeneLab Affymetrix Microarray dataset](#3a-approach-1-run-the-workflow-on-a-genelab-affymetrix-microarray-dataset)
   - [3b. Approach 2: Run the workflow on a non-GLDS dataset using a user-created runsheet](#3b-approach-2-run-the-workflow-on-a-non-glds-dataset-using-a-user-created-runsheet)
+  - [3c. Approach 3: Run the workflow using an ISA Archive](#3c-approach-3-run-the-workflow-using-an-isa-archive)
 - [4. Additional Output Files](#4-additional-output-files)
 
 
@@ -96,9 +97,9 @@ All files required for utilizing the NF_MAAffymetrix GeneLab workflow for proces
 copy of latest NF_MAAffymetrix version on to your system, the code can be downloaded as a zip file from the release page then unzipped after downloading by running the following commands: 
 
 ```bash
-wget https://github.com/asaravia-butler/GeneLab_Data_Processing/releases/download/NF_MAAffymetrix_1.0.1/NF_MAAffymetrix_1.0.1.zip
+wget https://github.com/asaravia-butler/GeneLab_Data_Processing/releases/download/NF_MAAffymetrix_1.0.2/NF_MAAffymetrix_1.0.2.zip
 
-unzip NF_MAAffymetrix_1.0.1.zip
+unzip NF_MAAffymetrix_1.0.2.zip
 ```
 
 <br>
@@ -107,15 +108,15 @@ unzip NF_MAAffymetrix_1.0.1.zip
 
 ### 3. Run the Workflow
 
-While in the location containing the `NF_MAAffymetrix_1.0.1` directory that was downloaded in [step 2](#2-download-the-workflow-files), you are now able to run the workflow. Below are three examples of how to run the NF_MAAffymetrix workflow:
+While in the location containing the `NF_MAAffymetrix_1.0.2` directory that was downloaded in [step 2](#2-download-the-workflow-files), you are now able to run the workflow. Below are three examples of how to run the NF_MAAffymetrix workflow:
 > Note: Nextflow commands use both single hyphen arguments (e.g. -help) that denote general nextflow arguments and double hyphen arguments (e.g. --ensemblVersion) that denote workflow specific parameters.  Take care to use the proper number of hyphens for each argument.
 
 <br>
 
 #### 3a. Approach 1: Run the workflow on a GeneLab Affymetrix Microarray dataset
 
 ```bash
-nextflow run NF_MAAffymetrix_1.0.1/main.nf \ 
+nextflow run NF_MAAffymetrix_1.0.2/main.nf \
    -profile singularity \
    --osdAccession OSD-266 \
    --gldsAccession GLDS-266 
@@ -128,16 +129,28 @@ nextflow run NF_MAAffymetrix_1.0.1/main.nf \
 > Note: Specifications for creating a runsheet manually are described [here](examples/runsheet/README.md).
 
 ```bash
-nextflow run NF_MAAffymetrix_1.0.1/main.nf \ 
+nextflow run NF_MAAffymetrix_1.0.2/main.nf \
    -profile singularity \
    --runsheetPath </path/to/runsheet> 
 ```
 
 <br>
 
+#### 3c. Approach 3: Run the workflow using an ISA Archive
+
+> Note: Specifications for the ISA Tab Archive format can be found [here](https://isa-specs.readthedocs.io/en/latest/isatab.html).
+
+```bash
+nextflow run NF_MAAffymetrix_1.0.2/main.nf \
+   -profile singularity \
+   --isaArchivePath </path/to/isaArchive> 
+```
+
+<br>
+
 **Required Parameters For All Approaches:**
 
-* `NF_MAAffymetrix_1.0.1/main.nf` - Instructs Nextflow to run the NF_MAAffymetrix workflow 
+* `NF_MAAffymetrix_1.0.2/main.nf` - Instructs Nextflow to run the NF_MAAffymetrix workflow 
 
 * `-profile` - Specifies the configuration profile(s) to load, `singularity` instructs Nextflow to setup and use singularity for all software called in the workflow
 
@@ -162,14 +175,14 @@ nextflow run NF_MAAffymetrix_1.0.1/main.nf \
 
 * `--skipVV` - skip the automated V&V processes (Default: the automated V&V processes are active) 
 
-* `--outputDir` - specifies the directory to save the raw and processed data files (Default: files are saved in the launch directory)  
+* `--resultsDir` - specifies the output directory for all files produced by the workflow (Default: <OSD-NNN_GLDS-NNN> if OSD and GLDS accessions are specified.  Otherwise, the workflow launch directory.) 
 
 <br>
 
 All parameters listed above and additional optional arguments for the NF_MAAffymetrix workflow, including debug related options that may not be immediately useful for most users, can be viewed by running the following command:
 
 ```bash
-nextflow run NF_MAAffymetrix_1.0.1/main.nf --help
+nextflow run NF_MAAffymetrix_1.0.2/main.nf --help
 ```
 
 See `nextflow run -h` and [Nextflow's CLI run command documentation](https://nextflow.io/docs/latest/cli.html#run) for more options and details common to all nextflow workflows.
@@ -183,7 +196,7 @@ See `nextflow run -h` and [Nextflow's CLI run command documentation](https://nex
 All R code steps and output are rendered within a Quarto document yielding the following:
 
    - Output:
-     - NF_MAAffymetrix_1.0.1.html (html report containing executed code and output including QA plots)
+     - NF_MAAffymetrix_1.0.2.html (html report containing executed code and output including QA plots)
   
 
 The outputs from the Analysis Staging and V&V Pipeline Subworkflows are described below:
 
@@ -1,6 +1,6 @@
 ---
 title: "Affymetrix Processing"
-subtitle: "Workflow Version: NF_MAAffymetrix_1.0.1"
+subtitle: "Workflow Version: NF_MAAffymetrix_1.0.2"
 date: now
 title-block-banner: true
 format:
@@ -210,7 +210,7 @@ legend("topright", legend = colnames(raw_data@assayData$exprs),
         lty = c(1,2,3,4,5), # Seems like oligo::hist cycles through these first five line types
         col = oligo::darkColors(n = ncol(raw_data)), # Ensure legend color is in sync with plot
         ncol = number_of_sets, # Set number of columns by number of sets
-        cex = 1 + 0.2 - (number_of_sets*0.2) # Reduce scale by 20% for each column beyond 1
+        cex = max(0.35, 1 + 0.2 - (number_of_sets*0.2)) # Reduce scale by 20% for each column beyond 1 with minimum of 35%
       )
 
 # Reset par
@@ -277,7 +277,7 @@ if (inherits(raw_data, "GeneFeatureSet")) {
 #| warning: false # NAN can be produced due to log transformations
 #| column: screen-inset-right # Allow images to flow all the way to the right
 #| fig-width: 14
-#| fig-height: !expr max(8, dim(raw_data)[2] * 0.2)
+#| fig-height: !expr max(8, 2 + dim(raw_data)[2] * 0.2)
 #| fig-align: left
 max_samplename_length <- max(nchar(colnames(raw_data)))
 dynamic_lefthand_margin <- max(max_samplename_length * 0.7, 10)
@@ -360,7 +360,7 @@ legend("topright", legend = colnames(norm_data@assayData$exprs),
         lty = c(1,2,3,4,5), # Seems like oligo::hist cycles through these first five line types
         col = oligo::darkColors(n = ncol(norm_data)), # Ensure legend color is in sync with plot
         ncol = number_of_sets, # Set number of columns by number of sets
-        cex = 1 + 0.2 - (number_of_sets*0.2) # Reduce scale by 20% for each column beyond 1
+        cex = max(0.35, 1 + 0.2 - (number_of_sets*0.2)) # Reduce scale by 20% for each column beyond 1 with minimum of 35%
       )
 
 # Reset par
@@ -409,7 +409,7 @@ MA_plot <- oligo::MAplot(
 #| warning: false # NAN can be produced due to log transformations
 #| column: screen-inset-right # Allow images to flow all the way to the right
 #| fig-width: 14
-#| fig-height: !expr max(8, dim(norm_data)[2] * 0.2)
+#| fig-height: !expr max(8, 2 + dim(norm_data)[2] * 0.2)
 #| fig-align: left
 max_samplename_length <- max(nchar(colnames(norm_data)))
 dynamic_lefthand_margin <- max(max_samplename_length * 0.7, 10)
 
@@ -347,7 +347,7 @@ def check_dge_table_log2fc_within_reason(
     """ Note: This function assumes the normalized expression values are log2 transformed
     """
     LOG2FC_CROSS_METHOD_PERCENT_DIFFERENCE_THRESHOLD = 1  # Percent
-    PERCENT_ROWS_WITHIN_TOLERANCE = 99.9  # Percent
+    PERCENT_ROWS_WITHIN_TOLERANCE = 99.5  # Percent
 
     # data specific preprocess
     expected_groups = utils_runsheet_to_expected_groups(runsheet, formatting = GroupFormatting.ampersand_join_and_remove_non_ascii, map_to_lists=True)
@@ -380,7 +380,7 @@ def check_dge_table_log2fc_within_reason(
         )
 
         if percent_within_tolerance < PERCENT_ROWS_WITHIN_TOLERANCE: # add current query column to error list
-            error_list.append((query_column,percent_within_tolerance))
+            error_list.append((query_column,percent_within_tolerance,f"First index out of tolerance: {abs_percent_difference.idxmin()}"))
 
     # inplace sort error list for deterministic order
     error_list.sort()
 
@@ -1,19 +1,38 @@
 nextflow.enable.moduleBinaries = true
 
 params {
-  /*
-  Parameters that MUST be supplied
+
+  /* Here GLDS and OSD accession are defined.  
+  Default behaviour is as follows:
+  - If accessions are not set, then either runsheet or an ISA Archive MUST be supplied
+  - If both accessions are set:
+    - If runsheet and ISA archive are left unset, then the ISA archive will be fetched from the GeneLab API and the runsheet generated from that ISA archive.
+    - If either runsheet or ISA archive is set, it will be used but the output directory and tags will reflect the appropriate accessions.  This is useful when processing an OSDR dataset whose OSDR metadata is not yet ready to be used as is.
+    - If both runsheet and ISA archive are set, the workflow will halt.
+  - If only one accession is set, then the workflow will halt.
+  
   */
-  gldsAccession = null // GeneLab Data Accession Number, e.g. GLDS-104
-  osdAccession = null // OSD Data Accession Number, e.g. OSD-367
+  gldsAccession = "NOT_OSDR" // GeneLab Data Accession Number, e.g. GLDS-104
+  osdAccession = "NOT_OSDR" // OSD Data Accession Number, e.g. OSD-367
+
+  // Catch case where only one is set
+  if (params.gldsAccession != "NOT_OSDR" && params.osdAccession == "NOT_OSDR") {
+    println "ERROR: GLDS accession set but OSD accession is not set.  Please set both or neither."
+    System.exit(1)
+  }
+  if (params.gldsAccession == "NOT_OSDR" && params.osdAccession != "NOT_OSDR") {
+    println "ERROR: OSD accession set but GLDS accession is not set.  Please set both or neither."
+    System.exit(1)
+  }
 
+  resultsDir = (params.gldsAccession != "NOT_OSDR" && params.osdAccession != "NOT_OSDR") ? "./${params.osdAccession}_${params.gldsAccession}" : "." // the location for the output from the pipeline (also includes raw data and metadata)
 
   /*
   Parameters that CAN be overwritten
   */
   runsheetPath = false
   biomart_attribute = false // Must be supplied if runsheet 'Array design REF' column doesn't indicate it
-  outputDir = "." // the location for the output from the pipeline (also includes raw data and metadata)
+  isaArchivePath = false // Alternative to fetching the ISA archive for an associated OSD/GLDS dataset
   publish_dir_mode = "link" // method for creating publish directory.  Default here for hardlink
   help = false // display help menu and exit workflow program