
Commit 9038454

Merge remote-tracking branch 'origin' into DEV_NF_MAAffymetrix
2 parents: 0a3ba8e + 7d3ba37

File tree

19 files changed: +463 additions, -163 deletions


Microarray/Agilent_1-channel/Pipeline_GL-DPPD-7112_Versions/GL-DPPD-7112.md

Lines changed: 125 additions & 48 deletions
@@ -64,7 +64,7 @@ Lauren Sanders (acting GeneLab Project Scientist)
 |biomaRt|2.50.0|[https://bioconductor.org/packages/3.14/bioc/html/biomaRt.html](https://bioconductor.org/packages/3.14/bioc/html/biomaRt.html)|
 |matrixStats|0.63.0|[https://github.com/HenrikBengtsson/matrixStats](https://github.com/HenrikBengtsson/matrixStats)|
 |statmod|1.5.0|[https://github.com/cran/statmod](https://github.com/cran/statmod)|
-|dp_tools|1.3.0|[https://github.com/J-81/dp_tools](https://github.com/J-81/dp_tools)|
+|dp_tools|1.3.1|[https://github.com/J-81/dp_tools](https://github.com/J-81/dp_tools)|
 |singularity|3.9|[https://sylabs.io](https://sylabs.io)|
 |Quarto|1.2.313|[https://quarto.org](https://quarto.org)|

@@ -255,6 +255,20 @@ raw_data <- limma::read.maimages(df_local_paths$`Local Paths`,
                                  names = df_local_paths$`Sample Name` # Map column names as Sample Names (instead of default filenames)
                                  )

+# Handle raw data that lacks certain replaceable column data
+
+## This likely arises because Agilent Feature Extraction (the process that generates the raw data files on OSDR)
+## gives the user some flexibility in which probe columns to output
+
+## Missing ProbeUID: "Unique integer for each unique probe in a design"
+### Source: https://www.agilent.com/cs/library/usermanuals/public/GEN-MAN-G4460-90057.pdf Page 178
+### Remedy: Assign unique integers for each probe
+
+if ( !("ProbeUID" %in% colnames(raw_data$genes)) ) {
+  # Assign unique integers for each probe
+  print("Assigning `ProbeUID` as the original files did not include it")
+  raw_data$genes$ProbeUID <- seq_len(nrow(raw_data$genes))
+}

 # Summarize raw data
 print(paste0("Number of Arrays: ", dim(raw_data)[2]))
@@ -568,20 +582,6 @@ shortenedOrganismName <- function(long_name) {
 }


-# locate dataset
-expected_dataset_name <- shortenedOrganismName(unique(df_rs$organism)) %>% stringr::str_c("_gene_ensembl")
-print(paste0("Expected dataset name: '", expected_dataset_name, "'"))
-
-
-# Specify Ensembl version used in current GeneLab reference annotations
-ENSEMBL_VERSION <- '107'
-
-ensembl <- biomaRt::useEnsembl(biomart = "genes",
-                               dataset = expected_dataset_name,
-                               version = ENSEMBL_VERSION)
-print(ensembl)
-
-
 getBioMartAttribute <- function(df_rs) {
   #' Returns resolved biomart attribute source from runsheet

@@ -598,34 +598,107 @@ getBioMartAttribute <- function(df_rs) {
   }
 }

-expected_attribute_name <- getBioMartAttribute(df_rs)
-print(paste0("Expected attribute name: '", expected_attribute_name, "'"))
-
-probe_ids <- unique(norm_data$genes$ProbeName)
-
-
-# Create probe map
-# Run Biomart Queries in chunks to prevent request timeouts
-# Note: If timeout is occuring (possibly due to larger load on biomart), reduce chunk size
-CHUNK_SIZE= 8000
-probe_id_chunks <- split(probe_ids, ceiling(seq_along(probe_ids) / CHUNK_SIZE))
-df_mapping <- data.frame()
-for (i in seq_along(probe_id_chunks)) {
-  probe_id_chunk <- probe_id_chunks[[i]]
-  print(glue::glue("Running biomart query chunk {i} of {length(probe_id_chunks)}. Total probes IDS in query ({length(probe_id_chunk)})"))
-  chunk_results <- biomaRt::getBM(
-    attributes = c(
-      expected_attribute_name,
-      "ensembl_gene_id"
-    ),
-    filters = expected_attribute_name,
-    values = probe_id_chunk,
-    mart = ensembl)
-
-  df_mapping <- df_mapping %>% dplyr::bind_rows(chunk_results)
-  Sys.sleep(10) # Slight break between requests to prevent back-to-back requests
+get_ensembl_genomes_mappings_from_ftp <- function(organism, ensembl_genomes_portal, ensembl_genomes_version, biomart_attribute) {
+  #' Obtain the mapping table directly from the Ensembl Genomes FTP site.
+  #' Useful when the biomart live service no longer hosts the desired version.
+
+  request_url <- glue::glue("https://ftp.ebi.ac.uk/ensemblgenomes/pub/{ensembl_genomes_portal}/release-{ensembl_genomes_version}/mysql/{ensembl_genomes_portal}_mart_{ensembl_genomes_version}/{organism}_eg_gene__efg_{biomart_attribute}__dm.txt.gz")
+
+  print(glue::glue("Mappings file URL: {request_url}"))
+
+  # Create a temporary file name
+  temp_file <- tempfile(fileext = ".gz")
+
+  # Download the gzipped table file; use 'libcurl' to support ftps
+  download.file(url = request_url, destfile = temp_file, method = "libcurl")
+
+  # Uncompress the file
+  uncompressed_temp_file <- tempfile()
+  gzcon <- gzfile(temp_file, "rt")
+  content <- readLines(gzcon)
+  writeLines(content, uncompressed_temp_file)
+  close(gzcon)
+
+  # Load the data into a dataframe
+  mapping <- read.table(uncompressed_temp_file, # Read the uncompressed file
+                        col.names = c("MAPID", "ensembl_gene_id", biomart_attribute),
+                        header = FALSE, # No header in original table
+                        sep = "\t")     # Tab separated
+
+  # Clean up temporary files
+  unlink(temp_file)
+  unlink(uncompressed_temp_file)
+
+  return(mapping)
+}
+
+
+organism <- shortenedOrganismName(unique(df_rs$organism))
+
+if (organism %in% c("athaliana")) {
+  ensembl_genomes_version = "54"
+  ensembl_genomes_portal = "plants"
+  print(glue::glue("Using the Ensembl Genomes FTP site to get a specific version of the probe ID mapping table. Ensembl Genomes portal: {ensembl_genomes_portal}, version: {ensembl_genomes_version}"))
+  expected_attribute_name <- getBioMartAttribute(df_rs)
+  df_mapping <- get_ensembl_genomes_mappings_from_ftp(
+    organism = organism,
+    ensembl_genomes_portal = ensembl_genomes_portal,
+    ensembl_genomes_version = ensembl_genomes_version,
+    biomart_attribute = expected_attribute_name
+  )
+
+  # TAIR IDs in the mapping tables tend to be in the format 'AT1G01010.1', but the raw data has 'AT1G01010',
+  # so here we remove the trailing '.N' suffix (where N is any number) from the mapping table
+  df_mapping$ensembl_gene_id <- stringr::str_replace_all(df_mapping$ensembl_gene_id, "\\.\\d+$", "")
+} else {
+  # Use biomart from the main Ensembl website, which keeps each archived release on the live service
+  # locate dataset
+  expected_dataset_name <- shortenedOrganismName(unique(df_rs$organism)) %>% stringr::str_c("_gene_ensembl")
+  print(paste0("Expected dataset name: '", expected_dataset_name, "'"))
+
+  # Specify Ensembl version used in current GeneLab reference annotations
+  ENSEMBL_VERSION <- '107'
+
+  print(glue::glue("Using Ensembl biomart to get a specific version of the mapping table. Ensembl version: {ENSEMBL_VERSION}"))
+
+  ensembl <- biomaRt::useEnsembl(biomart = "genes",
+                                 dataset = expected_dataset_name,
+                                 version = ENSEMBL_VERSION)
+  print(ensembl)
+
+  expected_attribute_name <- getBioMartAttribute(df_rs)
+  print(paste0("Expected attribute name: '", expected_attribute_name, "'"))
+
+  probe_ids <- unique(norm_data$genes$ProbeName)
+
+  # Create probe map
+  # Run biomart queries in chunks to prevent request timeouts
+  # Note: If timeouts are occurring (possibly due to heavier load on biomart), reduce the chunk size
+  CHUNK_SIZE <- 1500
+  probe_id_chunks <- split(probe_ids, ceiling(seq_along(probe_ids) / CHUNK_SIZE))
+  df_mapping <- data.frame()
+  for (i in seq_along(probe_id_chunks)) {
+    probe_id_chunk <- probe_id_chunks[[i]]
+    print(glue::glue("Running biomart query chunk {i} of {length(probe_id_chunks)}. Total probe IDs in query: {length(probe_id_chunk)}"))
+    chunk_results <- biomaRt::getBM(
+      attributes = c(
+        expected_attribute_name,
+        "ensembl_gene_id"
+      ),
+      filters = expected_attribute_name,
+      values = probe_id_chunk,
+      mart = ensembl)
+
+    df_mapping <- df_mapping %>% dplyr::bind_rows(chunk_results)
+    Sys.sleep(10) # Slight break between requests to prevent back-to-back requests
+  }
 }

+# At this point, df_mapping has been populated either by the biomart live service or by the Ensembl Genomes FTP archive, depending on the organism
+
 # Convert list of multi-mapped genes to string
 listToUniquePipedString <- function(str_list) {
   #! convert lists into strings denoting unique elements separated by '|' characters
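
> Editor's note: two idioms in the added code above are worth a quick illustration. A minimal, self-contained R sketch with toy values (not part of the commit) showing how the fixed-size chunking and the version-suffix stripping behave:

```r
library(stringr)

# 1) Fixed-size chunking: ceiling(seq_along(x) / CHUNK_SIZE) labels elements
#    1,1,1,2,2,2,3,... and split() groups them by that label.
probe_ids <- paste0("probe_", 1:7)   # hypothetical IDs
CHUNK_SIZE <- 3
probe_id_chunks <- split(probe_ids, ceiling(seq_along(probe_ids) / CHUNK_SIZE))
lengths(probe_id_chunks)  # 3 3 1 -> three chunks, the last one partial

# 2) Version-suffix stripping: "\\.\\d+$" matches a literal dot followed by
#    digits at the end of the string, so versioned TAIR IDs lose the suffix
#    while unversioned IDs pass through unchanged.
str_replace_all(c("AT1G01010.1", "AT1G01020.12", "AT1G01030"), "\\.\\d+$", "")
# -> "AT1G01010" "AT1G01020" "AT1G01030"
```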
@@ -827,9 +900,7 @@ reformat_names <- function(colname, group_name_mapping) {
     stringr::str_replace(pattern = stringr::fixed("Genes.ProbeName"), replacement = "ProbeName") %>%
     stringr::str_replace(pattern = stringr::fixed("Genes.count_ENSEMBL_mappings"), replacement = "count_ENSEMBL_mappings") %>%
     stringr::str_replace(pattern = stringr::fixed("Genes.ProbeUID"), replacement = "ProbeUID") %>%
-    stringr::str_replace(pattern = stringr::fixed("Genes.SYMBOL"), replacement = "SYMBOL") %>%
     stringr::str_replace(pattern = stringr::fixed("Genes.ENSEMBL"), replacement = "ENSEMBL") %>%
-    stringr::str_replace(pattern = stringr::fixed("Genes.GOSLIM_IDS"), replacement = "GOSLIM_IDS") %>%
     stringr::str_replace(pattern = ".condition", replacement = "v")

 # remap to group names before make.names was applied
@@ -934,7 +1005,8 @@ map_primary_keytypes <- c(
 df_interim <- merge(
   annot,
   df_interim,
-  by = map_primary_keytypes[[unique(df_rs$organism)]],
+  by.x = map_primary_keytypes[[unique(df_rs$organism)]],
+  by.y = "ENSEMBL",
   # ensure all original dge rows are kept.
   # If unmatched in the annotation database, then fill missing with NAN
   all.y = TRUE
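
> Editor's note: this hunk (and the two matching `merge()` hunks further down) switches from `by` to `by.x`/`by.y` because the annotation table keys on the organism's primary keytype column while the data tables key on a column named `ENSEMBL`. A minimal sketch with hypothetical toy tables (not from the commit) of how that call behaves:

```r
# Toy stand-ins for 'annot' and 'df_interim'; the column names are hypothetical.
annot <- data.frame(TAIR = c("AT1G01010", "AT1G01020"), SYMBOL = c("NAC001", "ARV1"))
data  <- data.frame(ENSEMBL = c("AT1G01010", "AT1G99999"), logFC = c(1.2, -0.4))

merged <- merge(
  annot,
  data,
  by.x = "TAIR",    # key column name in the annotation table
  by.y = "ENSEMBL", # key column name in the data table
  all.y = TRUE      # keep every data row; unmatched annotation fields become NA
)
print(merged)
# The key column keeps the by.x name ("TAIR"); AT1G99999 gets SYMBOL = NA.
```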
@@ -1013,10 +1085,13 @@ FINAL_COLUMN_ORDER <- c(
 
 ## Assert final column order includes all columns from original table
 if (!setequal(FINAL_COLUMN_ORDER, colnames(df_interim))) {
-  FINAL_COLUMN_ORDER_STRING <- paste(FINAL_COLUMN_ORDER, collapse = ":::::")
-  stop(glue::glue("Column reordering attempt resulted in different sets of columns than orignal. Order attempted: {FINAL_COLUMN_ORDER_STRING}"))
+  write.csv(FINAL_COLUMN_ORDER, "FINAL_COLUMN_ORDER.csv")
+  NOT_IN_DF_INTERIM <- paste(setdiff(FINAL_COLUMN_ORDER, colnames(df_interim)), collapse = ":::")
+  NOT_IN_FINAL_COLUMN_ORDER <- paste(setdiff(colnames(df_interim), FINAL_COLUMN_ORDER), collapse = ":::")
+  stop(glue::glue("Column reordering attempt resulted in different sets of columns than original. Names unique to 'df_interim': {NOT_IN_FINAL_COLUMN_ORDER}. Names unique to 'FINAL_COLUMN_ORDER': {NOT_IN_DF_INTERIM}."))
 }

+
 ## Perform reordering
 df_interim <- df_interim %>% dplyr::relocate(dplyr::all_of(FINAL_COLUMN_ORDER))
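
> Editor's note: a minimal sketch (toy column names, not from the commit) of the `setequal()` check and the `setdiff()` diagnostics the improved error message is built from:

```r
FINAL_COLUMN_ORDER <- c("ENSEMBL", "SYMBOL", "ProbeUID")   # hypothetical expected order
df_columns         <- c("ProbeUID", "ENSEMBL", "GOSLIM_IDS") # hypothetical actual columns

setequal(FINAL_COLUMN_ORDER, df_columns) # FALSE -> the stop() branch runs
setdiff(FINAL_COLUMN_ORDER, df_columns)  # "SYMBOL"     -> expected but absent
setdiff(df_columns, FINAL_COLUMN_ORDER)  # "GOSLIM_IDS" -> present but unexpected
```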

@@ -1042,7 +1117,8 @@ raw_data_matrix <- background_corrected_data$genes %>%
 raw_data_matrix_annotated <- merge(
   annot,
   raw_data_matrix,
-  by = map_primary_keytypes[[unique(df_rs$organism)]],
+  by.x = map_primary_keytypes[[unique(df_rs$organism)]],
+  by.y = "ENSEMBL",
   # ensure all original dge rows are kept.
   # If unmatched in the annotation database, then fill missing with NAN
   all.y = TRUE
@@ -1071,7 +1147,8 @@ norm_data_matrix <- norm_data$genes %>%
 norm_data_matrix_annotated <- merge(
   annot,
   norm_data_matrix,
-  by = map_primary_keytypes[[unique(df_rs$organism)]],
+  by.x = map_primary_keytypes[[unique(df_rs$organism)]],
+  by.y = "ENSEMBL",
   # ensure all original dge rows are kept.
   # If unmatched in the annotation database, then fill missing with NAN
   all.y = TRUE
Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/CHANGELOG.md

Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
+# Changelog
+
+All notable changes to this project will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## [1.0.2](https://github.com/asaravia-butler/GeneLab_Data_Processing/tree/NF_MAAgilent1ch_1.0.2/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch) - 2023-04-28
+
+### Added
+
+- Support for Arabidopsis thaliana datasets using the Ensembl Plants FTP server.
+
+### Changed
+
+- When encountering an error about column reordering, the expected order is saved for debugging purposes.
+- Post Processing Workflow: Assay Table Update now adds the '_array_' prefix to processed files instead of the '_microarray_' prefix.
+
+## [1.0.1](https://github.com/asaravia-butler/GeneLab_Data_Processing/tree/NF_MAAgilent1ch_1.0.1/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch) - 2023-03-31
+
+### Removed
+
+- Deprecated column renaming code (abcd380)
+
+### Fixed
+
+- Bumped dp_tools from 1.3.0 to 1.3.1 to address 'ISO-8859-1' encoded ISA archive files (example: OSD-271-v2) (d518f40)
+- Added handling for raw data that lacks the ProbeUID column (example: OSD-271-v2) (efbc237)
+
+### Changed
+
+- Reordering error message is now more informative (007e36c)
+
+## [1.0.0](https://github.com/asaravia-butler/GeneLab_Data_Processing/tree/NF_MAAgilent1ch_1.0.0/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch) - 2023-03-22
+
+### Added
+
+- First internal production-ready release of the Agilent 1 Channel Microarray Processing Workflow

Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/README.md

Lines changed: 17 additions & 8 deletions
@@ -95,9 +95,12 @@ All files required for utilizing the NF_MAAgilent1ch GeneLab workflow for proces
 copy of latest NF_MAAgilent1ch version on to your system, the code can be downloaded as a zip file from the release page then unzipped after downloading by running the following commands:

 ```bash
-wget https://github.com/asaravia-butler/GeneLab_Data_Processing/releases/download/NF_MAAgilent1ch_1.0.0/NF_MAAgilent1ch_1.0.0.zip
+wget https://github.com/asaravia-butler/GeneLab_Data_Processing/releases/download/NF_MAAgilent1ch_1.0.2/NF_MAAgilent1ch_1.0.2.zip

-unzip NF_MAAgilent1ch_1.0.0.zip
+unzip NF_MAAgilent1ch_1.0.2.zip
 ```

 <br>
@@ -106,15 +109,17 @@ unzip NF_MAAgilent1ch_1.0.0.zip
 
 ### 3. Run the Workflow

-While in the location containing the `NF_MAAgilent1ch_1.0.0` directory that was downloaded in [step 2](#2-download-the-workflow-files), you are now able to run the workflow. Below are three examples of how to run the NF_MAAgilent1ch workflow:
+While in the location containing the `NF_MAAgilent1ch_1.0.2` directory that was downloaded in [step 2](#2-download-the-workflow-files), you are now able to run the workflow. Below are three examples of how to run the NF_MAAgilent1ch workflow:
 > Note: Nextflow commands use both single hyphen arguments (e.g. -help) that denote general nextflow arguments and double hyphen arguments (e.g. --ensemblVersion) that denote workflow specific parameters. Take care to use the proper number of hyphens for each argument.

 <br>

 #### 3a. Approach 1: Run the workflow on a GeneLab Agilent 1 Channel Microarray dataset

 ```bash
-nextflow run NF_MAAgilent1ch_1.0.0/main.nf \
+nextflow run NF_MAAgilent1ch_1.0.2/main.nf \
   -profile singularity \
   --osdAccession OSD-548 \
   --gldsAccession GLDS-548
@@ -127,7 +132,8 @@ nextflow run NF_MAAgilent1ch_1.0.0/main.nf \
 > Note: Specifications for creating a runsheet manually are described [here](examples/runsheet/README.md).

 ```bash
-nextflow run NF_MAAgilent1ch_1.0.0/main.nf \
+nextflow run NF_MAAgilent1ch_1.0.2/main.nf \
   -profile singularity \
   --runsheetPath </path/to/runsheet>
 ```
@@ -136,7 +142,8 @@ nextflow run NF_MAAgilent1ch_1.0.0/main.nf \
 
 **Required Parameters For All Approaches:**

-* `NF_MAAgilent1ch_1.0.0/main.nf` - Instructs Nextflow to run the NF_MAAgilent1ch workflow
+* `NF_MAAgilent1ch_1.0.2/main.nf` - Instructs Nextflow to run the NF_MAAgilent1ch workflow

 * `-profile` - Specifies the configuration profile(s) to load, `singularity` instructs Nextflow to setup and use singularity for all software called in the workflow

@@ -168,7 +175,8 @@ nextflow run NF_MAAgilent1ch_1.0.0/main.nf \
 All parameters listed above and additional optional arguments for the NF_MAAgilent1ch workflow, including debug related options that may not be immediately useful for most users, can be viewed by running the following command:

 ```bash
-nextflow run NF_MAAgilent1ch_1.0.0/main.nf --help
+nextflow run NF_MAAgilent1ch_1.0.2/main.nf --help
 ```

 See `nextflow run -h` and [Nextflow's CLI run command documentation](https://nextflow.io/docs/latest/cli.html#run) for more options and details common to all nextflow workflows.
@@ -182,7 +190,8 @@ See `nextflow run -h` and [Nextflow's CLI run command documentation](https://nex
 All R code steps and output are rendered within a Quarto document yielding the following:

 - Output:
-  - NF_MAAgilent1ch_1.0.0.html (html report containing executed code and output including QA plots)
+  - NF_MAAgilent1ch_1.0.2.html (html report containing executed code and output including QA plots)


 The outputs from the Analysis Staging and V&V Pipeline Subworkflows are described below:
