
Commit 9038454

Merge remote-tracking branch 'origin' into DEV_NF_MAAffymetrix
2 parents: 0a3ba8e + 7d3ba37

File tree

19 files changed: +463 additions, -163 deletions


Microarray/Agilent_1-channel/Pipeline_GL-DPPD-7112_Versions/GL-DPPD-7112.md

Lines changed: 125 additions & 48 deletions
@@ -64,7 +64,7 @@ Lauren Sanders (acting GeneLab Project Scientist)
 |biomaRt|2.50.0|[https://bioconductor.org/packages/3.14/bioc/html/biomaRt.html](https://bioconductor.org/packages/3.14/bioc/html/biomaRt.html)|
 |matrixStats|0.63.0|[https://github.com/HenrikBengtsson/matrixStats](https://github.com/HenrikBengtsson/matrixStats)|
 |statmod|1.5.0|[https://github.com/cran/statmod](https://github.com/cran/statmod)|
-|dp_tools|1.3.0|[https://github.com/J-81/dp_tools](https://github.com/J-81/dp_tools)|
+|dp_tools|1.3.1|[https://github.com/J-81/dp_tools](https://github.com/J-81/dp_tools)|
 |singularity|3.9|[https://sylabs.io](https://sylabs.io)|
 |Quarto|1.2.313|[https://quarto.org](https://quarto.org)|

@@ -255,6 +255,20 @@ raw_data <- limma::read.maimages(df_local_paths$`Local Paths`,
                                  names = df_local_paths$`Sample Name` # Map column names as Sample Names (instead of default filenames)
                                  )

+# Handle raw data that lacks certain replaceable column data
+
+## This likely arises because Agilent Feature Extraction (the process that generates the raw data files on OSDR)
+## gives the user some flexibility in which probe columns to output
+
+## Missing ProbeUID: "Unique integer for each unique probe in a design"
+### Source: https://www.agilent.com/cs/library/usermanuals/public/GEN-MAN-G4460-90057.pdf Page 178
+### Remedy: Assign unique integers for each probe
+
+if ( !("ProbeUID" %in% colnames(raw_data$genes)) ) {
+  # Assign unique integers for each probe
+  print("Assigning `ProbeUID` as the original files did not include it")
+  raw_data$genes$ProbeUID <- seq_len(nrow(raw_data$genes))
+}

 # Summarize raw data
 print(paste0("Number of Arrays: ", dim(raw_data)[2]))
@@ -568,20 +582,6 @@ shortenedOrganismName <- function(long_name) {
 }


-# locate dataset
-expected_dataset_name <- shortenedOrganismName(unique(df_rs$organism)) %>% stringr::str_c("_gene_ensembl")
-print(paste0("Expected dataset name: '", expected_dataset_name, "'"))
-
-
-# Specify Ensembl version used in current GeneLab reference annotations
-ENSEMBL_VERSION <- '107'
-
-ensembl <- biomaRt::useEnsembl(biomart = "genes",
-                               dataset = expected_dataset_name,
-                               version = ENSEMBL_VERSION)
-print(ensembl)
-
-
 getBioMartAttribute <- function(df_rs) {
   #' Returns resolved biomart attribute source from runsheet

@@ -598,34 +598,107 @@ getBioMartAttribute <- function(df_rs) {
   }
 }

-expected_attribute_name <- getBioMartAttribute(df_rs)
-print(paste0("Expected attribute name: '", expected_attribute_name, "'"))
-
-probe_ids <- unique(norm_data$genes$ProbeName)
-
-
-# Create probe map
-# Run Biomart Queries in chunks to prevent request timeouts
-# Note: If timeout is occuring (possibly due to larger load on biomart), reduce chunk size
-CHUNK_SIZE= 8000
-probe_id_chunks <- split(probe_ids, ceiling(seq_along(probe_ids) / CHUNK_SIZE))
-df_mapping <- data.frame()
-for (i in seq_along(probe_id_chunks)) {
-  probe_id_chunk <- probe_id_chunks[[i]]
-  print(glue::glue("Running biomart query chunk {i} of {length(probe_id_chunks)}. Total probes IDS in query ({length(probe_id_chunk)})"))
-  chunk_results <- biomaRt::getBM(
-    attributes = c(
-      expected_attribute_name,
-      "ensembl_gene_id"
-    ),
-    filters = expected_attribute_name,
-    values = probe_id_chunk,
-    mart = ensembl)
-
-  df_mapping <- df_mapping %>% dplyr::bind_rows(chunk_results)
-  Sys.sleep(10) # Slight break between requests to prevent back-to-back requests
+get_ensembl_genomes_mappings_from_ftp <- function(organism, ensembl_genomes_portal, ensembl_genomes_version, biomart_attribute) {
+  #' Obtain the mapping table directly from the Ensembl Genomes FTP site.
+  #' Useful when the biomart live service no longer hosts the desired version.
+
+  request_url <- glue::glue("https://ftp.ebi.ac.uk/ensemblgenomes/pub/{ensembl_genomes_portal}/release-{ensembl_genomes_version}/mysql/{ensembl_genomes_portal}_mart_{ensembl_genomes_version}/{organism}_eg_gene__efg_{biomart_attribute}__dm.txt.gz")
+
+  print(glue::glue("Mappings file URL: {request_url}"))
+
+  # Create a temporary file name
+  temp_file <- tempfile(fileext = ".gz")
+
+  # Download the gzipped table file; use 'libcurl' to support ftps
+  download.file(url = request_url, destfile = temp_file, method = "libcurl")
+
+  # Uncompress the file
+  uncompressed_temp_file <- tempfile()
+  gzcon <- gzfile(temp_file, "rt")
+  content <- readLines(gzcon)
+  writeLines(content, uncompressed_temp_file)
+  close(gzcon)
+
+  # Load the data into a dataframe
+  mapping <- read.table(uncompressed_temp_file, # Read the uncompressed file
+                        col.names = c("MAPID", "ensembl_gene_id", biomart_attribute),
+                        header = FALSE, # No header in original table
+                        sep = "\t")     # Tab separated
+
+  # Clean up temporary files
+  unlink(temp_file)
+  unlink(uncompressed_temp_file)
+
+  return(mapping)
+}
+
+
+organism <- shortenedOrganismName(unique(df_rs$organism))
+
+if (organism %in% c("athaliana")) {
+  ensembl_genomes_version = "54"
+  ensembl_genomes_portal = "plants"
+  print(glue::glue("Using the Ensembl Genomes FTP site to get a specific version of the probe ID mapping table. Ensembl Genomes portal: {ensembl_genomes_portal}, version: {ensembl_genomes_version}"))
+  expected_attribute_name <- getBioMartAttribute(df_rs)
+  df_mapping <- get_ensembl_genomes_mappings_from_ftp(
+    organism = organism,
+    ensembl_genomes_portal = ensembl_genomes_portal,
+    ensembl_genomes_version = ensembl_genomes_version,
+    biomart_attribute = expected_attribute_name
+  )
+
+  # TAIR IDs in the mapping tables tend to be in the format 'AT1G01010.1', but the raw data has 'AT1G01010',
+  # so here we remove the trailing '.N' suffix (where N is any number) from the mapping table
+  df_mapping$ensembl_gene_id <- stringr::str_replace_all(df_mapping$ensembl_gene_id, "\\.\\d+$", "")
+} else {
+  # Use biomart from the main Ensembl website, which keeps each archived release on the live service
+  # locate dataset
+  expected_dataset_name <- shortenedOrganismName(unique(df_rs$organism)) %>% stringr::str_c("_gene_ensembl")
+  print(paste0("Expected dataset name: '", expected_dataset_name, "'"))
+
+  # Specify Ensembl version used in current GeneLab reference annotations
+  ENSEMBL_VERSION <- '107'
+
+  print(glue::glue("Using Ensembl biomart to get a specific version of the mapping table. Ensembl version: {ENSEMBL_VERSION}"))
+
+  ensembl <- biomaRt::useEnsembl(biomart = "genes",
+                                 dataset = expected_dataset_name,
+                                 version = ENSEMBL_VERSION)
+  print(ensembl)
+
+  expected_attribute_name <- getBioMartAttribute(df_rs)
+  print(paste0("Expected attribute name: '", expected_attribute_name, "'"))
+
+  probe_ids <- unique(norm_data$genes$ProbeName)
+
+  # Create probe map
+  # Run biomart queries in chunks to prevent request timeouts
+  # Note: If timeouts are occurring (possibly due to heavier load on biomart), reduce the chunk size
+  CHUNK_SIZE <- 1500
+  probe_id_chunks <- split(probe_ids, ceiling(seq_along(probe_ids) / CHUNK_SIZE))
+  df_mapping <- data.frame()
+  for (i in seq_along(probe_id_chunks)) {
+    probe_id_chunk <- probe_id_chunks[[i]]
+    print(glue::glue("Running biomart query chunk {i} of {length(probe_id_chunks)}. Total probe IDs in query: {length(probe_id_chunk)}"))
+    chunk_results <- biomaRt::getBM(
+      attributes = c(
+        expected_attribute_name,
+        "ensembl_gene_id"
+      ),
+      filters = expected_attribute_name,
+      values = probe_id_chunk,
+      mart = ensembl)
+
+    df_mapping <- df_mapping %>% dplyr::bind_rows(chunk_results)
+    Sys.sleep(10) # Slight break between requests to prevent back-to-back requests
+  }
 }

+# At this point, df_mapping has been populated either by the biomart live service or by the Ensembl Genomes FTP archive, depending on the organism
+
 # Convert list of multi-mapped genes to string
 listToUniquePipedString <- function(str_list) {
   #! convert lists into strings denoting unique elements separated by '|' characters
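
> Editor's note: two idioms in the added code above are worth a quick illustration. A minimal, self-contained R sketch with toy values (not part of the commit) showing how the fixed-size chunking and the version-suffix stripping behave:

```r
library(stringr)

# 1) Fixed-size chunking: ceiling(seq_along(x) / CHUNK_SIZE) labels elements
#    1,1,1,2,2,2,3,... and split() groups them by that label.
probe_ids <- paste0("probe_", 1:7)   # hypothetical IDs
CHUNK_SIZE <- 3
probe_id_chunks <- split(probe_ids, ceiling(seq_along(probe_ids) / CHUNK_SIZE))
lengths(probe_id_chunks)  # 3 3 1 -> three chunks, the last one partial

# 2) Version-suffix stripping: "\\.\\d+$" matches a literal dot followed by
#    digits at the end of the string, so versioned TAIR IDs lose the suffix
#    while unversioned IDs pass through unchanged.
str_replace_all(c("AT1G01010.1", "AT1G01020.12", "AT1G01030"), "\\.\\d+$", "")
# -> "AT1G01010" "AT1G01020" "AT1G01030"
```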
@@ -827,9 +900,7 @@ reformat_names <- function(colname, group_name_mapping) {
     stringr::str_replace(pattern = stringr::fixed("Genes.ProbeName"), replacement = "ProbeName") %>%
     stringr::str_replace(pattern = stringr::fixed("Genes.count_ENSEMBL_mappings"), replacement = "count_ENSEMBL_mappings") %>%
     stringr::str_replace(pattern = stringr::fixed("Genes.ProbeUID"), replacement = "ProbeUID") %>%
-    stringr::str_replace(pattern = stringr::fixed("Genes.SYMBOL"), replacement = "SYMBOL") %>%
     stringr::str_replace(pattern = stringr::fixed("Genes.ENSEMBL"), replacement = "ENSEMBL") %>%
-    stringr::str_replace(pattern = stringr::fixed("Genes.GOSLIM_IDS"), replacement = "GOSLIM_IDS") %>%
     stringr::str_replace(pattern = ".condition", replacement = "v")

 # remap to group names before make.names was applied
@@ -934,7 +1005,8 @@ map_primary_keytypes <- c(
 df_interim <- merge(
   annot,
   df_interim,
-  by = map_primary_keytypes[[unique(df_rs$organism)]],
+  by.x = map_primary_keytypes[[unique(df_rs$organism)]],
+  by.y = "ENSEMBL",
   # ensure all original dge rows are kept.
   # If unmatched in the annotation database, then fill missing with NAN
   all.y = TRUE
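
> Editor's note: this hunk (and the two matching `merge()` hunks further down) switches from `by` to `by.x`/`by.y` because the annotation table keys on the organism's primary keytype column while the data tables key on a column named `ENSEMBL`. A minimal sketch with hypothetical toy tables (not from the commit) of how that call behaves:

```r
# Toy stand-ins for 'annot' and 'df_interim'; the column names are hypothetical.
annot <- data.frame(TAIR = c("AT1G01010", "AT1G01020"), SYMBOL = c("NAC001", "ARV1"))
data  <- data.frame(ENSEMBL = c("AT1G01010", "AT1G99999"), logFC = c(1.2, -0.4))

merged <- merge(
  annot,
  data,
  by.x = "TAIR",    # key column name in the annotation table
  by.y = "ENSEMBL", # key column name in the data table
  all.y = TRUE      # keep every data row; unmatched annotation fields become NA
)
print(merged)
# The key column keeps the by.x name ("TAIR"); AT1G99999 gets SYMBOL = NA.
```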
@@ -1013,10 +1085,13 @@ FINAL_COLUMN_ORDER <- c(
 
 ## Assert final column order includes all columns from original table
 if (!setequal(FINAL_COLUMN_ORDER, colnames(df_interim))) {
-  FINAL_COLUMN_ORDER_STRING <- paste(FINAL_COLUMN_ORDER, collapse = ":::::")
-  stop(glue::glue("Column reordering attempt resulted in different sets of columns than orignal. Order attempted: {FINAL_COLUMN_ORDER_STRING}"))
+  write.csv(FINAL_COLUMN_ORDER, "FINAL_COLUMN_ORDER.csv")
+  NOT_IN_DF_INTERIM <- paste(setdiff(FINAL_COLUMN_ORDER, colnames(df_interim)), collapse = ":::")
+  NOT_IN_FINAL_COLUMN_ORDER <- paste(setdiff(colnames(df_interim), FINAL_COLUMN_ORDER), collapse = ":::")
+  stop(glue::glue("Column reordering attempt resulted in different sets of columns than original. Names unique to 'df_interim': {NOT_IN_FINAL_COLUMN_ORDER}. Names unique to 'FINAL_COLUMN_ORDER': {NOT_IN_DF_INTERIM}."))
 }

+
 ## Perform reordering
 df_interim <- df_interim %>% dplyr::relocate(dplyr::all_of(FINAL_COLUMN_ORDER))
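
> Editor's note: a minimal sketch (toy column names, not from the commit) of the `setequal()` check and the `setdiff()` diagnostics the improved error message is built from:

```r
FINAL_COLUMN_ORDER <- c("ENSEMBL", "SYMBOL", "ProbeUID")   # hypothetical expected order
df_columns         <- c("ProbeUID", "ENSEMBL", "GOSLIM_IDS") # hypothetical actual columns

setequal(FINAL_COLUMN_ORDER, df_columns) # FALSE -> the stop() branch runs
setdiff(FINAL_COLUMN_ORDER, df_columns)  # "SYMBOL"     -> expected but absent
setdiff(df_columns, FINAL_COLUMN_ORDER)  # "GOSLIM_IDS" -> present but unexpected
```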

@@ -1042,7 +1117,8 @@ raw_data_matrix <- background_corrected_data$genes %>%
 raw_data_matrix_annotated <- merge(
   annot,
   raw_data_matrix,
-  by = map_primary_keytypes[[unique(df_rs$organism)]],
+  by.x = map_primary_keytypes[[unique(df_rs$organism)]],
+  by.y = "ENSEMBL",
   # ensure all original dge rows are kept.
   # If unmatched in the annotation database, then fill missing with NAN
   all.y = TRUE
@@ -1071,7 +1147,8 @@ norm_data_matrix <- norm_data$genes %>%
 norm_data_matrix_annotated <- merge(
   annot,
   norm_data_matrix,
-  by = map_primary_keytypes[[unique(df_rs$organism)]],
+  by.x = map_primary_keytypes[[unique(df_rs$organism)]],
+  by.y = "ENSEMBL",
   # ensure all original dge rows are kept.
   # If unmatched in the annotation database, then fill missing with NAN
   all.y = TRUE
Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/CHANGELOG.md

Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
+# Changelog
+
+All notable changes to this project will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## [1.0.2](https://github.com/asaravia-butler/GeneLab_Data_Processing/tree/NF_MAAgilent1ch_1.0.2/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch) - 2023-04-28
+
+### Added
+
+- Support for Arabidopsis thaliana datasets using the Ensembl Plants FTP server.
+
+### Changed
+
+- When encountering an error about column reordering, the expected order is saved for debugging purposes.
+- Post Processing Workflow: Assay Table Update now adds the '_array_' prefix to processed files instead of the '_microarray_' prefix.
+
+## [1.0.1](https://github.com/asaravia-butler/GeneLab_Data_Processing/tree/NF_MAAgilent1ch_1.0.1/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch) - 2023-03-31
+
+### Removed
+
+- Deprecated column renaming code (abcd380)
+
+### Fixed
+
+- Bumped dp_tools from 1.3.0 to 1.3.1 to address 'ISO-8859-1' encoded ISA archive files (example: OSD-271-v2) (d518f40)
+- Added handling for raw data that lacks the ProbeUID column (example: OSD-271-v2) (efbc237)
+
+### Changed
+
+- Reordering error message is now more informative (007e36c)
+
+## [1.0.0](https://github.com/asaravia-butler/GeneLab_Data_Processing/tree/NF_MAAgilent1ch_1.0.0/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch) - 2023-03-22
+
+### Added
+
+- First internal production-ready release of the Agilent 1 Channel Microarray Processing Workflow

Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/README.md

Lines changed: 17 additions & 8 deletions
@@ -95,9 +95,12 @@ All files required for utilizing the NF_MAAgilent1ch GeneLab workflow for proces
 copy of latest NF_MAAgilent1ch version on to your system, the code can be downloaded as a zip file from the release page then unzipped after downloading by running the following commands:

 ```bash
-wget https://github.com/asaravia-butler/GeneLab_Data_Processing/releases/download/NF_MAAgilent1ch_1.0.0/NF_MAAgilent1ch_1.0.0.zip
+wget https://github.com/asaravia-butler/GeneLab_Data_Processing/releases/download/NF_MAAgilent1ch_1.0.2/NF_MAAgilent1ch_1.0.2.zip

-unzip NF_MAAgilent1ch_1.0.0.zip
+unzip NF_MAAgilent1ch_1.0.2.zip
 ```

 <br>
@@ -106,15 +109,17 @@ unzip NF_MAAgilent1ch_1.0.0.zip
 
 ### 3. Run the Workflow

-While in the location containing the `NF_MAAgilent1ch_1.0.0` directory that was downloaded in [step 2](#2-download-the-workflow-files), you are now able to run the workflow. Below are three examples of how to run the NF_MAAgilent1ch workflow:
+While in the location containing the `NF_MAAgilent1ch_1.0.2` directory that was downloaded in [step 2](#2-download-the-workflow-files), you are now able to run the workflow. Below are three examples of how to run the NF_MAAgilent1ch workflow:
 > Note: Nextflow commands use both single hyphen arguments (e.g. -help) that denote general nextflow arguments and double hyphen arguments (e.g. --ensemblVersion) that denote workflow specific parameters. Take care to use the proper number of hyphens for each argument.

 <br>

 #### 3a. Approach 1: Run the workflow on a GeneLab Agilent 1 Channel Microarray dataset

 ```bash
-nextflow run NF_MAAgilent1ch_1.0.0/main.nf \
+nextflow run NF_MAAgilent1ch_1.0.2/main.nf \
   -profile singularity \
   --osdAccession OSD-548 \
   --gldsAccession GLDS-548
@@ -127,7 +132,8 @@ nextflow run NF_MAAgilent1ch_1.0.0/main.nf \
 > Note: Specifications for creating a runsheet manually are described [here](examples/runsheet/README.md).

 ```bash
-nextflow run NF_MAAgilent1ch_1.0.0/main.nf \
+nextflow run NF_MAAgilent1ch_1.0.2/main.nf \
   -profile singularity \
   --runsheetPath </path/to/runsheet>
 ```
@@ -136,7 +142,8 @@ nextflow run NF_MAAgilent1ch_1.0.0/main.nf \
 
 **Required Parameters For All Approaches:**

-* `NF_MAAgilent1ch_1.0.0/main.nf` - Instructs Nextflow to run the NF_MAAgilent1ch workflow
+* `NF_MAAgilent1ch_1.0.2/main.nf` - Instructs Nextflow to run the NF_MAAgilent1ch workflow

 * `-profile` - Specifies the configuration profile(s) to load, `singularity` instructs Nextflow to setup and use singularity for all software called in the workflow

@@ -168,7 +175,8 @@ nextflow run NF_MAAgilent1ch_1.0.0/main.nf \
 All parameters listed above and additional optional arguments for the NF_MAAgilent1ch workflow, including debug related options that may not be immediately useful for most users, can be viewed by running the following command:

 ```bash
-nextflow run NF_MAAgilent1ch_1.0.0/main.nf --help
+nextflow run NF_MAAgilent1ch_1.0.2/main.nf --help
 ```

 See `nextflow run -h` and [Nextflow's CLI run command documentation](https://nextflow.io/docs/latest/cli.html#run) for more options and details common to all nextflow workflows.
@@ -182,7 +190,8 @@ See `nextflow run -h` and [Nextflow's CLI run command documentation](https://nex
 All R code steps and output are rendered within a Quarto document yielding the following:

 - Output:
-  - NF_MAAgilent1ch_1.0.0.html (html report containing executed code and output including QA plots)
+  - NF_MAAgilent1ch_1.0.2.html (html report containing executed code and output including QA plots)


 The outputs from the Analysis Staging and V&V Pipeline Subworkflows are described below:
