Skip to content

Commit 3e4d624

Browse files
Merge pull request #152 from asaravia-butler/DEV_NF_MAAffymetrix
NF_MAAffymetrix_1.0.2 Release Validated
2 parents 9c643de + 6a6d0db commit 3e4d624

39 files changed

+390
-149
lines changed

Microarray/Affymetrix/Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114.md

Lines changed: 10 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ Lauren Sanders (acting GeneLab Project Scientist)
6666
|biomaRt|2.50.0|[https://bioconductor.org/packages/3.14/bioc/html/biomaRt.html](https://bioconductor.org/packages/3.14/bioc/html/biomaRt.html)|
6767
|matrixStats|0.63.0|[https://github.com/HenrikBengtsson/matrixStats](https://github.com/HenrikBengtsson/matrixStats)|
6868
|statmod|1.5.0|[https://github.com/cran/statmod](https://github.com/cran/statmod)|
69-
|dp_tools|1.3.2|[https://github.com/J-81/dp_tools](https://github.com/J-81/dp_tools)|
69+
|dp_tools|1.3.4|[https://github.com/J-81/dp_tools](https://github.com/J-81/dp_tools)|
7070
|singularity|3.9|[https://sylabs.io](https://sylabs.io)|
7171
|Quarto|1.1.251|[https://quarto.org](https://quarto.org)|
7272

@@ -295,7 +295,7 @@ legend("topright", legend = colnames(raw_data@assayData$exprs),
295295
lty = c(1,2,3,4,5), # Seems like oligo::hist cycles through these first five line types
296296
col = oligo::darkColors(n = ncol(raw_data)), # Ensure legend color is in sync with plot
297297
ncol = number_of_sets, # Set number of columns by number of sets
298-
cex = 1 + 0.2 - (number_of_sets*0.2) # Reduce scale by 20% for each column beyond 1
298+
cex = max(0.35, 1 + 0.2 - (number_of_sets*0.2)) # Reduce scale by 20% for each column beyond 1 with minimum of 35%
299299
)
300300

301301
# Reset par
@@ -478,7 +478,7 @@ legend("topright", legend = colnames(norm_data@assayData$exprs),
478478
lty = c(1,2,3,4,5), # Seems like oligo::hist cycles through these first five line types
479479
col = oligo::darkColors(n = ncol(norm_data)), # Ensure legend color is in sync with plot
480480
ncol = number_of_sets, # Set number of columns by number of sets
481-
cex = 1 + 0.2 - (number_of_sets*0.2) # Reduce scale by 20% for each column beyond 1
481+
cex = max(0.35, 1 + 0.2 - (number_of_sets*0.2)) # Reduce scale by 20% for each column beyond 1 with minimum of 35%
482482
)
483483

484484
# Reset par
@@ -618,21 +618,6 @@ shortenedOrganismName <- function(long_name) {
618618
return(short_name)
619619
}
620620

621-
622-
# locate dataset
623-
expected_dataset_name <- shortenedOrganismName(unique(df_rs$organism)) %>% stringr::str_c("_gene_ensembl")
624-
print(paste0("Expected dataset name: '", expected_dataset_name, "'"))
625-
626-
627-
# Specify Ensembl version used in current GeneLab reference annotations
628-
ENSEMBL_VERSION <- '107'
629-
630-
ensembl <- biomaRt::useEnsembl(biomart = "genes",
631-
dataset = expected_dataset_name,
632-
version = ENSEMBL_VERSION)
633-
print(ensembl)
634-
635-
636621
getBioMartAttribute <- function(df_rs) {
637622
#' Returns resolved biomart attribute source from runsheet
638623

@@ -724,14 +709,6 @@ if (organism %in% c("athaliana")) {
724709

725710
probe_ids <- rownames(probeset_level_data)
726711

727-
# DEBUG:START
728-
if ( is.integer(params$DEBUG_limit_biomart_query) ) {
729-
warning(paste("DEBUG MODE: Limiting query to", params$DEBUG_limit_biomart_query, "entries"))
730-
message(paste("DEBUG MODE: Limiting query to", params$DEBUG_limit_biomart_query, "entries"))
731-
probe_ids <- probe_ids[1:params$DEBUG_limit_biomart_query]
732-
}
733-
# DEBUG:END
734-
735712
# Create probe map
736713
# Run Biomart Queries in chunks to prevent request timeouts
737714
# Note: If timeout is occuring (possibly due to larger load on biomart), reduce chunk size
@@ -764,7 +741,6 @@ listToUniquePipedString <- function(str_list) {
764741
}
765742

766743
unique_probe_ids <- df_mapping %>%
767-
# note: '!!sym(VAR)' syntax allows usage of variable 'VAR' in dplyr functions due to NSE. ref: https://dplyr.tidyverse.org/articles/programming.html # NON_DPPD
768744
dplyr::mutate(dplyr::across(!!sym(expected_attribute_name), as.character)) %>% # Ensure probeset ids treated as character type
769745
dplyr::group_by(!!sym(expected_attribute_name)) %>%
770746
dplyr::summarise(
@@ -1217,14 +1193,17 @@ norm_data_matrix_annotated <- oligo::exprs(norm_data) %>%
12171193
dplyr::rename( ProbesetID = man_fsetid ) %>% # Rename from getProbeInfo name to ProbesetID
12181194
dplyr::rename( ProbeID = fid ) %>% # Rename from getProbeInfo name to ProbeID
12191195
dplyr::left_join(unique_probe_ids, by = c("ProbesetID" = expected_attribute_name ) ) %>%
1220-
dplyr::left_join(annot, by = "ENSEMBL") %>% # Join with GeneLab Reference Annotation Table
1221-
dplyr::mutate( count_ENSEMBL_mappings = ifelse(is.na(ENSEMBL), 0, count_ENSEMBL_mappings) ) # Convert NA mapping to 0
1196+
dplyr::left_join(annot, by = c("ENSEMBL" = map_primary_keytypes[[unique(df_rs$organism)]])) %>% # Join with GeneLab Reference Annotation Table using key name expected in organism specific annotation table
1197+
dplyr::mutate( count_ENSEMBL_mappings = ifelse(is.na(ENSEMBL), 0, count_ENSEMBL_mappings) ) %>% # Convert NA mapping to 0
1198+
dplyr::rename( !!map_primary_keytypes[[unique(df_rs$organism)]] := ENSEMBL )
1199+
12221200

12231201

12241202
norm_data_matrix_annotated <- norm_data_matrix_annotated %>%
12251203
dplyr::relocate(dplyr::all_of(FINAL_COLUMN_ORDER))
12261204

12271205
write.csv(norm_data_matrix_annotated, file.path(DIR_NORMALIZED_EXPRESSION, "normalized_intensities_probe.csv"), row.names = FALSE)
1206+
12281207
```
12291208

12301209
**Input Data:**
@@ -1237,8 +1216,8 @@ write.csv(norm_data_matrix_annotated, file.path(DIR_NORMALIZED_EXPRESSION, "norm
12371216

12381217
**Output Data:**
12391218

1240-
- **differential_expression.csv** (table containing normalized probeset expression values for each sample, group statistics, Limma probe DE results for each pairwise comparison, and gene annotations. The ProbesetID is the unique index column.)
1219+
- **differential_expression.csv** (table containing normalized probeset expression values for each sample, group statistics, Limma probeset DE results for each pairwise comparison, and gene annotations. The ProbesetID is the unique index column.)
12411220
- **normalized_expression_probeset.csv** (table containing the background corrected, normalized probeset expression values for each sample. The ProbesetID is the unique index column.)
12421221
- visualization_PCA_table.csv (file used to generate GeneLab PCA plots)
12431222
- **raw_intensities_probe.csv** (table containing the background corrected, unnormalized probe intensity values for each sample including gene annotations. The ProbeID is the unique index column.)
1244-
- **normalized_intensities_probe.csv** (table containing the background corrected, normalized probe intensity values for each sample including gene annotations. The ProbeID is the unique index column.)
1223+
- **normalized_intensities_probe.csv** (table containing the background corrected, normalized probe intensity values for each sample including gene annotations. The ProbeID is the unique index column.)

Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/CHANGELOG.md

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,27 @@ All notable changes to this project will be documented in this file.
55
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
66
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
77

8+
## [1.0.2](https://github.com/asaravia-butler/GeneLab_Data_Processing/tree/NF_MAAffymetrix_1.0.2/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix)
9+
10+
### Added
11+
12+
- Workflow now produces a file called meta.sh (in the 'GeneLab' sub-directory) that contains information about the workflow run. This file is used by the post processing workflow to generate a protocol description. (5a8a255)
13+
- POST_PROCESSING will now generate a protocol description using the contents of meta.sh and text templates. (801e2ad)
14+
- Workflow can now be run using an ISA archive by supplying parameter: 'isaArchivePath' (as either a local path or public web uri) (8822069)
15+
16+
### Changed
17+
18+
- Update dp_tools from 1.3.2 to 1.3.4 (158ce5e)
19+
- This updates the POST_PROCESSING workflow assay table to join multiple files by ',' instead of ',<SPACE>' and enables max flag code setting.
20+
- Slightly reduced stringency in V&V check for log2fc computation to account for rounding errors, specifically from 99.9% of rows within tolerance to 99.5%. (9fd2c11)
21+
- Publish directory behavior reworked to use the OSD accession as part of the default name. Now uses `resultsDir` instead of `outputDir` as the parameter name when a user does control the published files directory. (97cba72)
22+
23+
### Fixed
24+
25+
- Halt level flags now properly trigger workflow halt. (0885175)
26+
- Boxplots now show all y-axis labels when working with many samples. (7ec10d4s)
27+
- Density plot legend cex (character expansion) now has a minimum of 0.35 (rather than raising an exception for very large numbers of samples) (9a54fdc)
28+
829
## [1.0.1](https://github.com/asaravia-butler/GeneLab_Data_Processing/tree/NF_MAAffymetrix_1.0.1/Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix) - 2023-04-28
930

1031
### Added

Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/README.md

Lines changed: 23 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ document](../../Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114.md):
3636

3737
|Flag Codes|Flag Name|Interpretation|
3838
|:---------|:--------|:-------------|
39-
| 2 | MANUAL | Special flag that indicates a manual check that is advised. Often used to advise what assess in QA plots. |
39+
| 2 | MANUAL | Special flag that indicates a manual check that is advised. Often used to advise what should be visually assessed in QA plots. |
4040
| 20 | GREEN | Indicates the check passed all validation conditions |
4141
| 30 | YELLOW | Indicates the check was flagged for minor issues (e.g. slight outliers) |
4242
| 50 | RED | Indicates the check was flagged for moderate issues (e.g. major outliers) |
@@ -54,6 +54,7 @@ document](../../Pipeline_GL-DPPD-7114_Versions/GL-DPPD-7114.md):
5454
- [3. Run the Workflow](#3-run-the-workflow)
5555
- [3a. Approach 1: Run the workflow on a GeneLab Affymetrix Microarray dataset](#3a-approach-1-run-the-workflow-on-a-genelab-affymetrix-microarray-dataset)
5656
- [3b. Approach 2: Run the workflow on a non-GLDS dataset using a user-created runsheet](#3b-approach-2-run-the-workflow-on-a-non-glds-dataset-using-a-user-created-runsheet)
57+
- [3c. Approach 3: Run the workflow using an ISA Archive](#3c-approach-3-run-the-workflow-using-an-isa-archive)
5758
- [4. Additional Output Files](#4-additional-output-files)
5859

5960

@@ -96,9 +97,9 @@ All files required for utilizing the NF_MAAffymetrix GeneLab workflow for proces
9697
copy of latest NF_MAAffymetrix version on to your system, the code can be downloaded as a zip file from the release page then unzipped after downloading by running the following commands:
9798
9899
```bash
99-
wget https://github.com/asaravia-butler/GeneLab_Data_Processing/releases/download/NF_MAAffymetrix_1.0.1/NF_MAAffymetrix_1.0.1.zip
100+
wget https://github.com/asaravia-butler/GeneLab_Data_Processing/releases/download/NF_MAAffymetrix_1.0.2/NF_MAAffymetrix_1.0.2.zip
100101
101-
unzip NF_MAAffymetrix_1.0.1.zip
102+
unzip NF_MAAffymetrix_1.0.2.zip
102103
```
103104
104105
<br>
@@ -107,15 +108,15 @@ unzip NF_MAAffymetrix_1.0.1.zip
107108
108109
### 3. Run the Workflow
109110
110-
While in the location containing the `NF_MAAffymetrix_1.0.1` directory that was downloaded in [step 2](#2-download-the-workflow-files), you are now able to run the workflow. Below are three examples of how to run the NF_MAAffymetrix workflow:
111+
While in the location containing the `NF_MAAffymetrix_1.0.2` directory that was downloaded in [step 2](#2-download-the-workflow-files), you are now able to run the workflow. Below are three examples of how to run the NF_MAAffymetrix workflow:
111112
> Note: Nextflow commands use both single hyphen arguments (e.g. -help) that denote general nextflow arguments and double hyphen arguments (e.g. --ensemblVersion) that denote workflow specific parameters. Take care to use the proper number of hyphens for each argument.
112113
113114
<br>
114115
115116
#### 3a. Approach 1: Run the workflow on a GeneLab Affymetrix Microarray dataset
116117
117118
```bash
118-
nextflow run NF_MAAffymetrix_1.0.1/main.nf \
119+
nextflow run NF_MAAffymetrix_1.0.2/main.nf \
119120
-profile singularity \
120121
--osdAccession OSD-266 \
121122
--gldsAccession GLDS-266
@@ -128,16 +129,28 @@ nextflow run NF_MAAffymetrix_1.0.1/main.nf \
128129
> Note: Specifications for creating a runsheet manually are described [here](examples/runsheet/README.md).
129130
130131
```bash
131-
nextflow run NF_MAAffymetrix_1.0.1/main.nf \
132+
nextflow run NF_MAAffymetrix_1.0.2/main.nf \
132133
-profile singularity \
133134
--runsheetPath </path/to/runsheet>
134135
```
135136
136137
<br>
137138
139+
#### 3c. Approach 3: Run the workflow using an ISA Archive
140+
141+
> Note: Specifications for the ISA Tab Archive format can be found [here](https://isa-specs.readthedocs.io/en/latest/isatab.html).
142+
143+
```bash
144+
nextflow run NF_MAAffymetrix_1.0.2/main.nf \
145+
-profile singularity \
146+
--isaArchivePath </path/to/isaArchive>
147+
```
148+
149+
<br>
150+
138151
**Required Parameters For All Approaches:**
139152
140-
* `NF_MAAffymetrix_1.0.1/main.nf` - Instructs Nextflow to run the NF_MAAffymetrix workflow
153+
* `NF_MAAffymetrix_1.0.2/main.nf` - Instructs Nextflow to run the NF_MAAffymetrix workflow
141154
142155
* `-profile` - Specifies the configuration profile(s) to load, `singularity` instructs Nextflow to setup and use singularity for all software called in the workflow
143156
@@ -162,14 +175,14 @@ nextflow run NF_MAAffymetrix_1.0.1/main.nf \
162175
163176
* `--skipVV` - skip the automated V&V processes (Default: the automated V&V processes are active)
164177
165-
* `--outputDir` - specifies the directory to save the raw and processed data files (Default: files are saved in the launch directory)
178+
* `--resultsDir` - specifies the output directory for all files produced by the workflow (Default: <OSD-NNN_GLDS-NNN> if OSD and GLDS accessions are specified. Otherwise, the workflow launch directory.)
166179
167180
<br>
168181
169182
All parameters listed above and additional optional arguments for the NF_MAAffymetrix workflow, including debug related options that may not be immediately useful for most users, can be viewed by running the following command:
170183
171184
```bash
172-
nextflow run NF_MAAffymetrix_1.0.1/main.nf --help
185+
nextflow run NF_MAAffymetrix_1.0.2/main.nf --help
173186
```
174187
175188
See `nextflow run -h` and [Nextflow's CLI run command documentation](https://nextflow.io/docs/latest/cli.html#run) for more options and details common to all nextflow workflows.
@@ -183,7 +196,7 @@ See `nextflow run -h` and [Nextflow's CLI run command documentation](https://nex
183196
All R code steps and output are rendered within a Quarto document yielding the following:
184197
185198
- Output:
186-
- NF_MAAffymetrix_1.0.1.html (html report containing executed code and output including QA plots)
199+
- NF_MAAffymetrix_1.0.2.html (html report containing executed code and output including QA plots)
187200
188201
189202
The outputs from the Analysis Staging and V&V Pipeline Subworkflows are described below:

Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/Affymetrix.qmd

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
---
22
title: "Affymetrix Processing"
3-
subtitle: "Workflow Version: NF_MAAffymetrix_1.0.1"
3+
subtitle: "Workflow Version: NF_MAAffymetrix_1.0.2"
44
date: now
55
title-block-banner: true
66
format:
@@ -210,7 +210,7 @@ legend("topright", legend = colnames(raw_data@assayData$exprs),
210210
lty = c(1,2,3,4,5), # Seems like oligo::hist cycles through these first five line types
211211
col = oligo::darkColors(n = ncol(raw_data)), # Ensure legend color is in sync with plot
212212
ncol = number_of_sets, # Set number of columns by number of sets
213-
cex = 1 + 0.2 - (number_of_sets*0.2) # Reduce scale by 20% for each column beyond 1
213+
cex = max(0.35, 1 + 0.2 - (number_of_sets*0.2)) # Reduce scale by 20% for each column beyond 1 with minimum of 35%
214214
)
215215
216216
# Reset par
@@ -277,7 +277,7 @@ if (inherits(raw_data, "GeneFeatureSet")) {
277277
#| warning: false # NAN can be produced due to log transformations
278278
#| column: screen-inset-right # Allow images to flow all the way to the right
279279
#| fig-width: 14
280-
#| fig-height: !expr max(8, dim(raw_data)[2] * 0.2)
280+
#| fig-height: !expr max(8, 2 + dim(raw_data)[2] * 0.2)
281281
#| fig-align: left
282282
max_samplename_length <- max(nchar(colnames(raw_data)))
283283
dynamic_lefthand_margin <- max(max_samplename_length * 0.7, 10)
@@ -360,7 +360,7 @@ legend("topright", legend = colnames(norm_data@assayData$exprs),
360360
lty = c(1,2,3,4,5), # Seems like oligo::hist cycles through these first five line types
361361
col = oligo::darkColors(n = ncol(norm_data)), # Ensure legend color is in sync with plot
362362
ncol = number_of_sets, # Set number of columns by number of sets
363-
cex = 1 + 0.2 - (number_of_sets*0.2) # Reduce scale by 20% for each column beyond 1
363+
cex = max(0.35, 1 + 0.2 - (number_of_sets*0.2)) # Reduce scale by 20% for each column beyond 1
364364
)
365365
366366
# Reset par
@@ -409,7 +409,7 @@ MA_plot <- oligo::MAplot(
409409
#| warning: false # NAN can be produced due to log transformations
410410
#| column: screen-inset-right # Allow images to flow all the way to the right
411411
#| fig-width: 14
412-
#| fig-height: !expr max(8, dim(norm_data)[2] * 0.2)
412+
#| fig-height: !expr max(8, 2 + dim(norm_data)[2] * 0.2)
413413
#| fig-align: left
414414
max_samplename_length <- max(nchar(colnames(norm_data)))
415415
dynamic_lefthand_margin <- max(max_samplename_length * 0.7, 10)

Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/bin/dp_tools__affymetrix/checks.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -347,7 +347,7 @@ def check_dge_table_log2fc_within_reason(
347347
""" Note: This function assumes the normalized expression values are log2 transformed
348348
"""
349349
LOG2FC_CROSS_METHOD_PERCENT_DIFFERENCE_THRESHOLD = 1 # Percent
350-
PERCENT_ROWS_WITHIN_TOLERANCE = 99.9 # Percent
350+
PERCENT_ROWS_WITHIN_TOLERANCE = 99.5 # Percent
351351

352352
# data specific preprocess
353353
expected_groups = utils_runsheet_to_expected_groups(runsheet, formatting = GroupFormatting.ampersand_join_and_remove_non_ascii, map_to_lists=True)
@@ -380,7 +380,7 @@ def check_dge_table_log2fc_within_reason(
380380
)
381381

382382
if percent_within_tolerance < PERCENT_ROWS_WITHIN_TOLERANCE: # add current query column to error list
383-
error_list.append((query_column,percent_within_tolerance))
383+
error_list.append((query_column,percent_within_tolerance,f"First index out of tolerance: {abs_percent_difference.idxmin()}"))
384384

385385
# inplace sort error list for deterministic order
386386
error_list.sort()

Microarray/Affymetrix/Workflow_Documentation/NF_MAAffymetrix/workflow_code/config/default.config

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,38 @@
11
nextflow.enable.moduleBinaries = true
22

33
params {
4-
/*
5-
Parameters that MUST be supplied
4+
5+
/* Here GLDS and OSD accession are defined.
6+
Default behaviour is as follows:
7+
- If accessions are not set, then either runsheet or an ISA Archive MUST be supplied
8+
- If both accessions are set:
9+
- If runsheet and ISA archive are left unset, then the ISA archive will be fetched from the GeneLab API and runsheet generated from the runsheet.
10+
- If either runsheet or ISA archive are set, they will be used but the output directory and tags will reflect the appropriate accessions. This is useful when processing from the OSDR but OSDR metadata is not ready as is.
11+
- If both runsheet and ISA archive are set, the workflow will halt.
12+
- If only one accession is set, then the workflow will halt.
13+
614
*/
7-
gldsAccession = null // GeneLab Data Accession Number, e.g. GLDS-104
8-
osdAccession = null // OSD Data Accession Number, e.g. OSD-367
15+
gldsAccession = "NOT_OSDR" // GeneLab Data Accession Number, e.g. GLDS-104
16+
osdAccession = "NOT_OSDR" // OSD Data Accession Number, e.g. OSD-367
17+
18+
// Catch case where only one is set
19+
if (params.gldsAccession != "NOT_OSDR" && params.osdAccession == "NOT_OSDR") {
20+
println "ERROR: GLDS accession set but OSD accession is not set. Please set both or neither."
21+
System.exit(1)
22+
}
23+
if (params.gldsAccession == "NOT_OSDR" && params.osdAccession != "NOT_OSDR") {
24+
println "ERROR: OSD accession set but GLDS accession is not set. Please set both or neither."
25+
System.exit(1)
26+
}
927

28+
resultsDir = (params.gldsAccession != "NOT_OSDR" && params.osdAccession != "NOT_OSDR") ? "./${params.osdAccession}_${params.gldsAccession}" : "." // the location for the output from the pipeline (also includes raw data and metadata)
1029

1130
/*
1231
Parameters that CAN be overwritten
1332
*/
1433
runsheetPath = false
1534
biomart_attribute = false // Must be supplied if runsheet 'Array design REF' column doesn't indicate it
16-
outputDir = "." // the location for the output from the pipeline (also includes raw data and metadata)
35+
isaArchivePath = false // Alternative to fetching the ISA archive for an associated OSD/GLDS dataset
1736
publish_dir_mode = "link" // method for creating publish directory. Default here for hardlink
1837
help = false // display help menu and exit workflow program
1938

0 commit comments

Comments
 (0)