Skip to content

Commit c05a745

Browse files
Merge pull request #64 from nasa/microarray-workflows-integration
Microarray workflows integration
2 parents a3d64e6 + bc85e15 commit c05a745

File tree

107 files changed

+9034
-49
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

107 files changed

+9034
-49
lines changed

Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114.md

Lines changed: 67 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,36 @@ dir.create(DIR_DGE)
164164
original_par <- par()
165165
options(preferRaster=TRUE) # use Raster when possible to avoid antialiasing artifacts in images
166166

167+
# Utility function to improve robustness of function calls
168+
# Used to remedy intermittent internet issues during runtime
169+
retry_with_delay <- function(func, ...) {
170+
max_attempts = 5
171+
initial_delay = 10
172+
delay_increase = 30
173+
attempt <- 1
174+
current_delay <- initial_delay
175+
while (attempt <= max_attempts) {
176+
result <- tryCatch(
177+
expr = func(...),
178+
error = function(e) e
179+
)
180+
181+
if (!inherits(result, "error")) {
182+
return(result)
183+
} else {
184+
if (attempt < max_attempts) {
185+
message(paste("Retry attempt", attempt, "failed for function with name <", deparse(substitute(func)) ,">. Retrying in", current_delay, "second(s)..."))
186+
Sys.sleep(current_delay)
187+
current_delay <- current_delay + delay_increase
188+
} else {
189+
stop(paste("Max retry attempts reached. Last error:", result$message))
190+
}
191+
}
192+
193+
attempt <- attempt + 1
194+
}
195+
}
196+
167197
df_rs <- read.csv(runsheet, check.names = FALSE) %>%
168198
dplyr::mutate_all(function(x) iconv(x, "latin1", "ASCII", sub="")) # Convert all characters to ascii, when not possible, remove the character
169199
## Determines the organism specific annotation file to use based on the organism in the runsheet
@@ -184,7 +214,7 @@ fetch_organism_specific_annotation_file_path <- function(organism) {
184214

185215
return(annotation_file_path)
186216
}
187-
annotation_file_path <- fetch_organism_specific_annotation_file_path(unique(df_rs$organism))
217+
annotation_file_path <- retry_with_delay(fetch_organism_specific_annotation_file_path, unique(df_rs$organism))
188218

189219
allTrue <- function(i_vector) {
190220
if ( length(i_vector) == 0 ) {
@@ -218,7 +248,7 @@ downloadFilesFromRunsheet <- function(df_runsheet) {
218248

219249
if ( runsheetPathsAreURIs(df_rs) ) {
220250
print("Determined Raw Data Locations are URIS")
221-
local_paths <- downloadFilesFromRunsheet(df_rs)
251+
local_paths <- retry_with_delay(downloadFilesFromRunsheet, df_rs)
222252
} else {
223253
print("Or Determined Raw Data Locations are local paths")
224254
local_paths <- df_rs$`Array Data File Path`
@@ -244,9 +274,12 @@ df_local_paths <- data.frame(`Sample Name` = df_rs$`Sample Name`, `Local Paths`
244274

245275

246276
# Load raw data into R object
247-
raw_data <- oligo::read.celfiles(df_local_paths$`Local Paths`,
248-
sampleNames = df_local_paths$`Sample Name`# Map column names as Sample Names (instead of default filenames)
249-
)
277+
# Retry with delay here to accomodate oligo's automatic loading of annotation packages and occasional internet related failures to load
278+
raw_data <- retry_with_delay(
279+
oligo::read.celfiles,
280+
df_local_paths$`Local Paths`,
281+
sampleNames = df_local_paths$`Sample Name`# Map column names as Sample Names (instead of default filenames)
282+
)
250283

251284

252285
# Summarize raw data
@@ -352,6 +385,14 @@ if (inherits(raw_data, "GeneFeatureSet")) {
352385
ylim=c(-2, 4),
353386
main="" # This function uses 'main' as a suffix to the sample name. Here we want just the sample name, thus here main is an empty string
354387
)
388+
} else if (inherits(raw_data, "ExpressionFeatureSet")) {
389+
MA_plot <- oligo::MAplot(
390+
raw_data,
391+
ylim=c(-2, 4),
392+
main="" # This function uses 'main' as a suffix to the sample name. Here we want just the sample name, thus here main is an empty string
393+
)
394+
} else {
395+
stop(glue::glue("No strategy for MA plots for {raw_data}"))
355396
}
356397
```
357398

@@ -674,11 +715,12 @@ if (organism %in% c("athaliana")) {
674715
ensembl_genomes_portal = "plants"
675716
print(glue::glue("Using ensembl genomes ftp to get specific version of probeset id mapping table. Ensembl genomes portal: {ensembl_genomes_portal}, version: {ensembl_genomes_version}"))
676717
expected_attribute_name <- getBioMartAttribute(df_rs)
677-
df_mapping <- get_ensembl_genomes_mappings_from_ftp(
678-
organism = organism,
679-
ensembl_genomes_portal = ensembl_genomes_portal,
680-
ensembl_genomes_version = ensembl_genomes_version,
681-
biomart_attribute = expected_attribute_name
718+
df_mapping <- retry_with_delay(
719+
get_ensembl_genomes_mappings_from_ftp,
720+
organism = organism,
721+
ensembl_genomes_portal = ensembl_genomes_portal,
722+
ensembl_genomes_version = ensembl_genomes_version,
723+
biomart_attribute = expected_attribute_name
682724
)
683725

684726
# TAIR from the mapping tables tend to be in the format 'AT1G01010.1' but the raw data has 'AT1G01010'
@@ -853,8 +895,8 @@ design_data <- runsheetToDesignMatrix(runsheet)
853895
design <- design_data$matrix
854896

855897
# Write SampleTable.csv and contrasts.csv file
856-
write.csv(design_data$groups, file.path(DIR_DGE, "SampleTable.csv"), row.names = FALSE)
857-
write.csv(design_data$contrasts, file.path(DIR_DGE, "contrasts.csv"))
898+
write.csv(design_data$groups, file.path(DIR_DGE, "SampleTable_GLmicroarray.csv"), row.names = FALSE)
899+
write.csv(design_data$contrasts, file.path(DIR_DGE, "contrasts_GLmicroarray.csv"))
858900
```
859901

860902
**Input Data:**
@@ -864,8 +906,8 @@ write.csv(design_data$contrasts, file.path(DIR_DGE, "contrasts.csv"))
864906
**Output Data:**
865907

866908
- `design` (R object containing the limma study design matrix, indicating the group that each sample belongs to)
867-
- **SampleTable.csv** (table containing samples and their respective groups)
868-
- **contrasts.csv** (table containing all pairwise comparisons)
909+
- **SampleTable_GLmicroarray.csv** (table containing samples and their respective groups)
910+
- **contrasts_GLmicroarray.csv** (table containing all pairwise comparisons)
869911

870912
<br>
871913

@@ -1116,7 +1158,7 @@ if (!setequal(FINAL_COLUMN_ORDER, colnames(df_interim))) {
11161158
df_interim <- df_interim %>% dplyr::relocate(dplyr::all_of(FINAL_COLUMN_ORDER))
11171159

11181160
# Save to file
1119-
write.csv(df_interim, file.path(DIR_DGE, "differential_expression.csv"), row.names = FALSE)
1161+
write.csv(df_interim, file.path(DIR_DGE, "differential_expression_GLmicroarray.csv"), row.names = FALSE)
11201162

11211163
## Output column subset file with just normalized probeset level expression values
11221164
write.csv(
@@ -1125,12 +1167,12 @@ write.csv(
11251167
"ProbesetID",
11261168
"count_ENSEMBL_mappings",
11271169
all_samples)
1128-
], file.path(DIR_NORMALIZED_EXPRESSION, "normalized_expression_probeset.csv"), row.names = FALSE)
1170+
], file.path(DIR_NORMALIZED_EXPRESSION, "normalized_expression_probeset_GLmicroarray.csv"), row.names = FALSE)
11291171

11301172
### Generate and export PCA table for GeneLab visualization plots
11311173
PCA_raw <- prcomp(t(exprs(probeset_level_data)), scale = FALSE) # Note: expression at the Probeset level is already log2 transformed
11321174
write.csv(PCA_raw$x,
1133-
file.path(DIR_DGE, "visualization_PCA_table.csv")
1175+
file.path(DIR_DGE, "visualization_PCA_table_GLmicroarray.csv")
11341176
)
11351177

11361178
## Generate raw intensity matrix that includes annotations
@@ -1179,7 +1221,7 @@ background_corrected_data_annotated <- oligo::exprs(background_corrected_data) %
11791221
background_corrected_data_annotated <- background_corrected_data_annotated %>%
11801222
dplyr::relocate(dplyr::all_of(FINAL_COLUMN_ORDER))
11811223

1182-
write.csv(background_corrected_data_annotated, file.path(DIR_RAW_DATA, "raw_intensities_probe.csv"), row.names = FALSE)
1224+
write.csv(background_corrected_data_annotated, file.path(DIR_RAW_DATA, "raw_intensities_probe_GLmicroarray.csv"), row.names = FALSE)
11831225

11841226
## Generate normalized expression matrix that includes annotations
11851227
norm_data_matrix_annotated <- oligo::exprs(norm_data) %>%
@@ -1199,7 +1241,7 @@ norm_data_matrix_annotated <- oligo::exprs(norm_data) %>%
11991241
norm_data_matrix_annotated <- norm_data_matrix_annotated %>%
12001242
dplyr::relocate(dplyr::all_of(FINAL_COLUMN_ORDER))
12011243

1202-
write.csv(norm_data_matrix_annotated, file.path(DIR_NORMALIZED_EXPRESSION, "normalized_intensities_probe.csv"), row.names = FALSE)
1244+
write.csv(norm_data_matrix_annotated, file.path(DIR_NORMALIZED_EXPRESSION, "normalized_intensities_probe_GLmicroarray.csv"), row.names = FALSE)
12031245

12041246
```
12051247

@@ -1213,10 +1255,10 @@ write.csv(norm_data_matrix_annotated, file.path(DIR_NORMALIZED_EXPRESSION, "norm
12131255

12141256
**Output Data:**
12151257

1216-
- **differential_expression.csv** (table containing normalized probeset expression values for each sample, group statistics, Limma probeset DE results for each pairwise comparison, and gene annotations. The ProbesetID is the unique index column.)
1217-
- **normalized_expression_probeset.csv** (table containing the background corrected, normalized probeset expression values for each sample. The ProbesetID is the unique index column.)
1218-
- visualization_PCA_table.csv (file used to generate GeneLab PCA plots)
1219-
- **raw_intensities_probe.csv** (table containing the background corrected, unnormalized probe intensity values for each sample including gene annotations. The ProbeID is the unique index column.)
1220-
- **normalized_intensities_probe.csv** (table containing the background corrected, normalized probe intensity values for each sample including gene annotations. The ProbeID is the unique index column.)
1258+
- **differential_expression_GLmicroarray.csv** (table containing normalized probeset expression values for each sample, group statistics, Limma probeset DE results for each pairwise comparison, and gene annotations. The ProbesetID is the unique index column.)
1259+
- **normalized_expression_probeset_GLmicroarray.csv** (table containing the background corrected, normalized probeset expression values for each sample. The ProbesetID is the unique index column.)
1260+
- visualization_PCA_table_GLmicroarray.csv (file used to generate GeneLab PCA plots)
1261+
- **raw_intensities_probe_GLmicroarray.csv** (table containing the background corrected, unnormalized probe intensity values for each sample including gene annotations. The ProbeID is the unique index column.)
1262+
- **normalized_intensities_probe_GLmicroarray.csv** (table containing the background corrected, normalized probe intensity values for each sample including gene annotations. The ProbeID is the unique index column.)
12211263

1222-
> All steps of the Microarray pipeline are performed using R markdown and the completed R markdown is rendered (via Quarto) as an html file (**NF_MAAffymetrix_\*.html**) and published in the [Open Science Data Repository (OSDR)](https://osdr.nasa.gov/bio/repo/) for the respective dataset.
1264+
> All steps of the Microarray pipeline are performed using R markdown and the completed R markdown is rendered (via Quarto) as an html file (**NF_MAAffymetrix_v\*_GLmicroarray.html**) and published in the [Open Science Data Repository (OSDR)](https://osdr.nasa.gov/bio/repo/) for the respective dataset.

Microarray/Affymetrix/README.md

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# GeneLab bioinformatics processing pipeline for Affymetrix microarray data
22

33

4-
> **The document [`GL-DPPD-7114.md`](Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114.md) holds an overview and example commands for how GeneLab processes Affymetrix microarray datasets. See the [Repository Links](#repository-links) descriptions below for more information. Processed data output files and processing code is provided for each GLDS dataset along with the processed data in the [Open Science Data Repository (OSDR)](https://osdr.nasa.gov/bio/repo/).**
4+
> **The document [`GL-DPPD-7114.md`](Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114.md) holds an overview and example commands for how GeneLab processes Affymetrix microarray datasets. See the [Repository Links](#repository-links) descriptions below for more information. Processed data output files and processing code is provided for each GLDS dataset along with the processed data in the [Open Science Data Repository (OSDR)](https://osdr.nasa.gov/bio/repo/).**
55
66
---
77

@@ -14,13 +14,14 @@
1414

1515
* [**Pipeline_GL-DPPD-7114_Versions**](Pipeline_GL-DPPD-7114_Versions)
1616

17-
- Contains the current and previous GeneLab Affymetrix microarray data processing pipeline (NF_MAAffy) versions documentation
17+
- Contains the current and previous GeneLab Affymetrix microarray data processing pipeline (NF_MAAffymetrix) versions documentation
1818

1919
* [**Workflow_Documentation**](Workflow_Documentation)
2020

21-
- Contains instructions for installing and running the GeneLab NF_MAAffy workflow
22-
> Note: The NF_MAAffy workflow is currently in development and not yet available
21+
- Contains instructions for installing and running the GeneLab NF_MAAffymetrix workflow
2322

2423
---
25-
**Developed and maintained by:**
26-
Jonathan Oribello
24+
**Developed by:**
25+
Jonathan Oribello
26+
**Maintained by:**
27+
Alexis Torres (alexis.torres@nasa.gov)
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
# Changelog
2+
3+
All notable changes to this project will be documented in this file.
4+
5+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7+
8+
## [1.0.3](https://github.com/asaravia-butler/GeneLab_Data_Processing/tree/NF_MAAffymetrix_1.0.3/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix) - 2024-02-26
9+
10+
### Added
11+
12+
- Retry wrapper for functions that utilize internet resources. This is aimed to reduce failures due solely due to intermittent network issues. (ceb6d9a3)
13+
14+
### Fixed
15+
16+
- Missing Raw Data MA Plots when handling designs that loaded as `ExpressionFeatureSet` objects. (7af7192e)
17+
- Additionally, future unhandled raw data classes will raise an exception rather than fail to plot silently.
18+
19+
## [1.0.2](https://github.com/asaravia-butler/GeneLab_Data_Processing/tree/NF_MAAffymetrix_1.0.2/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix) - 2023-05-24
20+
21+
### Added
22+
23+
- Workflow now produces a file called meta.sh (in the 'GeneLab' sub-directory) that contains information about the workflow run. This file is used by the post processing workflow to generate a protocol description. (5a8a255)
24+
- POST_PROCESSING will now generate a protocol description using the contents of meta.sh and text templates. (801e2ad)
25+
- Workflow can now be run using an ISA archive by supplying parameter: 'isaArchivePath' (as either a local path or public web uri) (8822069)
26+
27+
### Changed
28+
29+
- Update dp_tools from 1.3.2 to 1.3.4 (158ce5e)
30+
- This updates the POST_PROCESSING workflow assay table to join multiple files by ',' instead of ',<SPACE>' and enables max flag code setting.
31+
- Slightly reduced stringency in V&V check for log2fc computation to account for rounding errors, specifically from 99.9% of rows within tolerance to 99.5%. (9fd2c11)
32+
- Publish directory behavior reworked to use the OSD accession as part of the default name. Now uses `resultsDir` instead of `outputDir` as the parameter name when a user does control the published files directory. (97cba72)
33+
34+
### Fixed
35+
36+
- Halt level flags now properly trigger workflow halt. (0885175)
37+
- Boxplots now show all y-axis labels when working with many samples. (7ec10d4s)
38+
- Density plot legend cex (character expansion) now has a minimum of 0.35 (rather than raising an exception for very large numbers of samples) (9a54fdc)
39+
40+
## [1.0.1](https://github.com/asaravia-butler/GeneLab_Data_Processing/tree/NF_MAAffymetrix_1.0.1/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix) - 2023-04-28
41+
42+
### Added
43+
44+
- Support for Arabidposis Thaliana datasets using the plants ensembl FTP server.
45+
- Support for raw data FeatureSets (building on existing support for ExpressionSets)
46+
- Better support for non-ascii characters in the runsheet, usually caused by such characters in the original ISA archive the runsheet is generated from.
47+
48+
### Fixed
49+
50+
- Typos related to shared code with Agilent 1 Channel platform.
51+
52+
### Changed
53+
54+
- Error message when encountering unique columns when reordering tables is now clearer about what unique columns were found.
55+
- Post Processing Workflow: Assay Table Update now added '_array_' prefix to processed files instead of '_microarray_' prefix.
56+
57+
## [1.0.0](https://github.com/asaravia-butler/GeneLab_Data_Processing/tree/NF_MAAffymetrix_1.0.0/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix) - 2023-04-24
58+
59+
### Added
60+
61+
- First internal production ready release of the Affymetrix Microarray Pipeline Nextflow Workflow

0 commit comments

Comments
 (0)