Skip to content

Commit 40e3652

Browse files
Merge pull request #123 from torres-alexis/DEV_GeneLab_Reference_Annotations_vGL-DPPD-7110-A
[GL_RefAnnotTable] Add Docker/Singularity
2 parents 7228880 + 51570c4 commit 40e3652

File tree

5 files changed

+141
-71
lines changed

5 files changed

+141
-71
lines changed

GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110-A/GL-DPPD-7110-A.md

Lines changed: 33 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -136,9 +136,9 @@ The default columns in the annotation table are:
136136
| Program | Version | Relevant Links |
137137
|:----------------|:-------:|:---------------|
138138
| R | 4.4.0 | [https://www.r-project.org/](https://www.r-project.org/) |
139-
| Bioconductor | 3.19.1 | [https://bioconductor.org](https://bioconductor.org) |
139+
| Bioconductor | 3.19 | [https://bioconductor.org](https://bioconductor.org) |
140140
| tidyverse | 2.0.0 | [https://www.tidyverse.org](https://www.tidyverse.org) |
141-
| STRINGdb | 2.16.0 | [https://www.bioconductor.org/packages/release/bioc/html/STRINGdb.html](https://www.bioconductor.org/packages/release/bioc/html/STRINGdb.html) |
141+
| STRINGdb | 2.16.4 | [https://www.bioconductor.org/packages/release/bioc/html/STRINGdb.html](https://www.bioconductor.org/packages/release/bioc/html/STRINGdb.html) |
142142
| PANTHER.db | 1.0.12 | [https://bioconductor.org/packages/release/data/annotation/html/PANTHER.db.html](https://www.bioconductor.org/packages/release/data/annotation/html/PANTHER.db.html) |
143143
| rtracklayer | 1.64.0 | [https://bioconductor.org/packages/release/bioc/html/rtracklayer.html](https://www.bioconductor.org/packages/release/bioc/html/rtracklayer.html) |
144144
| org.At.tair.db | 3.19.1 | [https://bioconductor.org/packages/release/data/annotation/html/org.At.tair.db.html](https://www.bioconductor.org/packages/release/data/annotation/html/org.At.tair.db.html) |
@@ -182,6 +182,10 @@ Current GeneLab annotation tables are available on [figshare](https://figshare.c
182182
## 0. Set Up Environment
183183

184184
```R
185+
# Set R library path to current working directory
186+
lib_path <- file.path(getwd())
187+
.libPaths(lib_path)
188+
185189
# Define variables associated with current pipeline and annotation table versions
186190
GL_DPPD_ID <- "GL-DPPD-7110-A"
187191
ref_tab_path <- "https://raw.githubusercontent.com/nasa/GeneLab_Data_Processing/master/GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110-A/GL-DPPD-7110-A_annotations.csv"
@@ -253,7 +257,7 @@ base_output_name <- str_replace(base_gtf_filename, ".gtf.gz", "")
253257

254258
# Add the species name to base_output_name if the reference source is not ENSEMBL
255259
if (!(ref_source %in% c("ensembl_plants", "ensembl_bacteria", "ensembl"))) {
256-
base_output_name <- paste(str_replace(target_species_designation, " ", "_"), base_output_name, sep = "_")
260+
base_output_name <- paste(str_replace(target_organism, " ", "_"), base_output_name, sep = "_")
257261
}
258262

259263
out_table_filename <- paste0(base_output_name, "-GL-annotations.tsv")
@@ -294,42 +298,52 @@ if ( file.exists(out_table_filename) ) {
294298
# Use AnnotationForge's makeOrgPackageFromNCBI function with default settings to create the organism-specific org.db R package from available NCBI annotations
295299

296300
# Try to download the org.db from Bioconductor, build it locally if installation fails
297-
BiocManager::install(target_org_db, ask = FALSE)
298-
if (!requireNamespace(target_org_db, quietly = TRUE)) {
301+
BiocManager::install(target_org_db, ask = FALSE)
302+
if (!requireNamespace(target_org_db, quietly = TRUE)) {
299303
tryCatch({
300304
# Parse organism's name in the reference table to create the org.db name (target_org_db)
301305
genus_species <- strsplit(target_organism, " ")[[1]]
302306
if (length(genus_species) < 1) {
303-
stop("Species designation is not correctly formatted: ", target_organism)
307+
stop("Species designation is not correctly formatted: ", target_organism)
304308
}
309+
305310
genus <- genus_species[1]
306311
species <- ifelse(length(genus_species) > 1, genus_species[2], "")
307312
strain <- ref_table %>%
308313
filter(species == target_organism) %>%
309314
pull(strain) %>%
310315
gsub("[^A-Za-z0-9]", "", .)
316+
311317
if (!is.na(strain) && strain != "") {
312-
species <- paste0(species, strain)
318+
species <- paste0(species, strain)
319+
}
320+
321+
# Get package name or build it if not provided
322+
target_org_db <- ref_table %>%
323+
filter(species == target_organism) %>%
324+
pull(annotations)
325+
326+
if (is.na(target_org_db) || target_org_db == "") {
327+
cat("\nNo annotation database specified. Constructing package name...\n")
328+
target_org_db <- paste0("org.", substr(genus, 1, 1), species, ".eg.db")
313329
}
314-
target_org_db <- paste0("org.", substr(genus, 1, 1), species, ".eg.db")
315330

316-
BiocManager::install(c("AnnotationForge", "biomaRt", "GO.db"), ask = FALSE)
331+
BiocManager::install(c("AnnotationForge", "biomaRt", "GO.db"), ask = FALSE)
317332
library(AnnotationForge)
318-
makeOrgPackageFromNCBI(
319-
version = "0.1",
320-
author = "Your Name <your.email@example.com>",
321-
maintainer = "Your Name <your.email@example.com>",
322-
outputDir = "./",
323-
tax_id = target_taxid,
324-
genus = genus,
325-
species = species
333+
makeOrgPackageFromNCBI(
334+
version = "0.1",
335+
author = "Your Name <your.email@example.com>",
336+
maintainer = "Your Name <your.email@example.com>",
337+
outputDir = "./",
338+
tax_id = target_taxid,
339+
genus = genus,
340+
species = species
326341
)
327342
install.packages(file.path("./", target_org_db), repos = NULL, type = "source", quiet = TRUE)
328343
cat(paste0("'", target_org_db, "' has been successfully built and installed.\n"))
329344
}, error = function(e) {
330-
stop("Failed to build and load the package: ", target_org_db, "\nError: ", e$message)
345+
stop("Failed to build and load the package: ", target_org_db, "\nError: ", e$message)
331346
})
332-
target_org_db <- install_annotations(target_organism, ref_tab_path)
333347
}
334348
```
335349

GeneLab_Reference_Annotations/Workflow_Documentation/GL_RefAnnotTable-A/CHANGELOG.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,10 +49,10 @@ Bioconductor. Used for:
4949
- Bacteria: Ensembl bacteria release 59
5050
- Updated software:
5151
- tidyverse version updated from 1.3.2 to 2.0.0
52-
- STRINGdb version updated from 2.8.4 to 2.16.0
52+
- STRINGdb version updated from 2.8.4 to 2.16.4
5353
- PANTHER.db version updated from 1.0.11 to 1.0.12
5454
- rtracklayer version updated from 1.56.1 to 1.64.0
55-
- Bioconductor version updated from 3.15.1 to 3.19.1
55+
- Bioconductor version updated from 3.15.1 to 3.19
5656
- Removed org.EcK12.eg.db and replaced it with a locally created annotations
5757
database, as it is no longer available on Bioconductor
5858
- Changed the first argument of GL-DPPD-7110-A_build-genome-annots-tab.R from

GeneLab_Reference_Annotations/Workflow_Documentation/GL_RefAnnotTable-A/README.md

Lines changed: 60 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ The current GeneLab Reference Annotation Table (GL_RefAnnotTable-A) pipeline is
1010
3. [Setup Execution Permission for Workflow Scripts](#3-setup-execution-permission-for-workflow-scripts)
1111
4. [Run the workflow](#4-run-the-workflow)
1212
5. [Run the annotations database creation function as a stand-alone script](#5-run-the-annotations-database-creation-function-as-a-stand-alone-script)
13+
6. [Run the Workflow Using Docker or Singularity](#6-run-the-workflow-using-docker-or-singularity)
1314
<br>
1415

1516
### 1. Install R and R packages
@@ -26,20 +27,20 @@ Once R is installed, open a CLI terminal and run the following command to activa
2627
```bash
2728
R
2829
```
29-
30+
`
3031
Within an active R environment, run the following commands to install the required R packages:
3132

3233
```R
33-
install.packages("tidyverse", version = 2.0.0, repos = "http://cran.us.r-project.org")
34+
install.packages("tidyverse")
3435

35-
install.packages("BiocManager", version = 3.19.1, repos = "http://cran.us.r-project.org")
36+
install.packages("BiocManager")
3637

37-
BiocManager::install("STRINGdb", version = 3.19.1)
38-
BiocManager::install("PANTHER.db", version = 3.19.1)
39-
BiocManager::install("rtracklayer", version = 3.19.1)
40-
BiocManager::install("AnnotationForge", version = 1.46.0)
41-
BiocManager::install("biomaRt", version = 2.60.1)
42-
BiocManager::install("GO.db", version = 3.19.1)
38+
BiocManager::install("STRINGdb")
39+
BiocManager::install("PANTHER.db")
40+
BiocManager::install("rtracklayer")
41+
BiocManager::install("AnnotationForge")
42+
BiocManager::install("biomaRt")
43+
BiocManager::install("GO.db")
4344
```
4445

4546
<br>
@@ -102,3 +103,53 @@ Rscript install-org-db.R 'Bacillus subtilis' /path/to/GL-DPPD-7110-A_annotations
102103
**Output data:**
103104

104105
- org.*.eg.db/ (species-specific annotation database, as a local R package)
106+
107+
### 6. Run the Workflow Using Docker or Singularity
108+
109+
Rather than running the workflow in your local environment, you can use a Docker or Singularity container. This method ensures that all dependencies are correctly installed.
110+
111+
1. **Pull the container image:**
112+
113+
Docker:
114+
```bash
115+
docker pull quay.io/nasa_genelab/gl-refannottable:v1.0.0
116+
```
117+
118+
Singularity:
119+
```bash
120+
singularity pull docker://quay.io/nasa_genelab/gl-refannottable:v1.0.0
121+
```
122+
123+
2. **Download the workflow files:**
124+
125+
```bash
126+
curl -LO https://github.com/nasa/GeneLab_Data_Processing/releases/download/GL_RefAnnotTable-A_1.1.0/GL_RefAnnotTable-A_1.1.0.zip
127+
unzip GL_RefAnnotTable-A_1.1.0.zip
128+
```
129+
130+
3. **Run the workflow:**
131+
132+
Docker:
133+
```bash
134+
docker run -it -v $(pwd)/GL_RefAnnotTable-A_1.1.0:/work \
135+
quay.io/nasa_genelab/gl-refannottable:v1.0.0 \
136+
bash -c "cd /work && Rscript GL-DPPD-7110-A_build-genome-annots-tab.R 'Mus musculus'"
137+
```
138+
139+
Singularity:
140+
```bash
141+
singularity exec -B $(pwd)/GL_RefAnnotTable-A_1.1.0:/work \
142+
gl-refannottable_v1.0.0.sif \
143+
bash -c "cd /work && Rscript GL-DPPD-7110-A_build-genome-annots-tab.R 'Mus musculus'"
144+
```
145+
146+
**Input data:**
147+
148+
- No input files are required. Specify the target organism using a positional command line argument. `Mus musculus` is used in the example above. To see a list of all available organisms, run `Rscript GL-DPPD-7110-A_build-genome-annots-tab.R` without positional arguments. The correct argument for each organism can also be found in the 'species' column of the [GL-DPPD-7110-A_annotations.csv](../../Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110-A/GL-DPPD-7110-A_annotations.csv) file.
149+
150+
- Optional: a reference table CSV can be supplied as a second positional argument instead of using the default [GL-DPPD-7110-A_annotations.csv](../../Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110-A/GL-DPPD-7110-A_annotations.csv)
151+
152+
**Output data:**
153+
154+
- *-GL-annotations.tsv (Tab-delimited table of gene annotations)
155+
- *-GL-build-info.txt (Text file containing information used to create the annotation table, including tools, tool versions, and date of creation)

GeneLab_Reference_Annotations/Workflow_Documentation/GL_RefAnnotTable-A/workflow_code/GL-DPPD-7110-A_build-genome-annots-tab.R

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,10 @@
33
# GeneLab script for generating organism-specific gene annotation tables
44
# Example usage: Rscript GL-DPPD-7110-A_build-genome-annots-tab.R 'Mus musculus'
55

6+
# Set R library path to current working directory
7+
lib_path <- file.path(getwd())
8+
.libPaths(lib_path)
9+
610
# Define variables associated with current pipeline and annotation table versions
711
GL_DPPD_ID <- "GL-DPPD-7110-A"
812
ref_tab_path <- "https://raw.githubusercontent.com/nasa/GeneLab_Data_Processing/master/GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110-A/GL-DPPD-7110-A_annotations.csv"

GeneLab_Reference_Annotations/Workflow_Documentation/GL_RefAnnotTable-A/workflow_code/install-org-db.R

Lines changed: 42 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -13,35 +13,36 @@ install_annotations <- function(target_organism, refTablePath) {
1313
filter(species == target_organism) %>%
1414
pull(taxon)
1515

16+
# Parse organism's name in the reference table to create the org.db name (target_org_db)
17+
target_species_designation <- ref_table %>%
18+
filter(species == target_organism) %>%
19+
pull(species) %>%
20+
gsub("\\s+", " ", .) %>%
21+
gsub("[^A-Za-z0-9 ]", "", .)
22+
23+
genus_species <- strsplit(target_species_designation, " ")[[1]]
24+
if (length(genus_species) < 1) {
25+
stop("Species designation is not correctly formatted: ", target_species_designation)
26+
}
27+
28+
genus <- genus_species[1]
29+
species <- ifelse(length(genus_species) > 1, genus_species[2], "")
30+
strain <- ref_table %>%
31+
filter(species == target_organism) %>%
32+
pull(strain) %>%
33+
gsub("[^A-Za-z0-9]", "", .)
34+
35+
if (!is.na(strain) && strain != "") {
36+
species <- paste0(species, strain)
37+
}
38+
1639
# Get package name or build it if not provided
1740
target_org_db <- ref_table %>%
1841
filter(species == target_organism) %>%
1942
pull(annotations)
2043

2144
if (is.na(target_org_db) || target_org_db == "") {
2245
cat("\nNo annotation database specified. Constructing package name...\n")
23-
target_species_designation <- ref_table %>%
24-
filter(species == target_organism) %>%
25-
pull(species) %>%
26-
gsub("\\s+", " ", .) %>%
27-
gsub("[^A-Za-z0-9 ]", "", .)
28-
29-
genus_species <- strsplit(target_species_designation, " ")[[1]]
30-
if (length(genus_species) < 1) {
31-
stop("Species designation is not correctly formatted: ", target_species_designation)
32-
}
33-
34-
genus <- genus_species[1]
35-
species <- ifelse(length(genus_species) > 1, genus_species[2], "")
36-
strain <- ref_table %>%
37-
filter(species == target_organism) %>%
38-
pull(strain) %>%
39-
gsub("[^A-Za-z0-9]", "", .)
40-
41-
if (!is.na(strain) && strain != "") {
42-
species <- paste0(species, strain)
43-
}
44-
4546
target_org_db <- paste0("org.", substr(genus, 1, 1), species, ".eg.db")
4647
}
4748

@@ -56,25 +57,25 @@ install_annotations <- function(target_organism, refTablePath) {
5657
} else {
5758
cat(paste0("\nInstallation from Bioconductor failed, attempting to build '", target_org_db, "'...\n"))
5859
if (!dir.exists(target_org_db)) {
59-
tryCatch({
60-
BiocManager::install(c("AnnotationForge", "biomaRt", "GO.db"), ask = FALSE)
61-
library(AnnotationForge)
62-
makeOrgPackageFromNCBI(
63-
version = "0.1",
64-
author = "Your Name <your.email@example.com>",
65-
maintainer = "Your Name <your.email@example.com>",
66-
outputDir = "./",
67-
tax_id = target_taxid,
68-
genus = genus,
69-
species = species
70-
)
71-
install.packages(file.path("./", target_org_db), repos = NULL, type = "source", quiet = TRUE)
72-
cat(paste0("'", target_org_db, "' has been successfully built and installed.\n"))
73-
}, error = function(e) {
74-
stop("Failed to build and load the package: ", target_org_db, "\nError: ", e$message)
75-
})
60+
tryCatch({
61+
BiocManager::install(c("AnnotationForge", "biomaRt", "GO.db"), ask = FALSE)
62+
library(AnnotationForge)
63+
makeOrgPackageFromNCBI(
64+
version = "0.1",
65+
author = "Your Name <your.email@example.com>",
66+
maintainer = "Your Name <your.email@example.com>",
67+
outputDir = "./",
68+
tax_id = target_taxid,
69+
genus = genus,
70+
species = species
71+
)
72+
install.packages(file.path("./", target_org_db), repos = NULL, type = "source", quiet = TRUE)
73+
cat(paste0("'", target_org_db, "' has been successfully built and installed.\n"))
74+
}, error = function(e) {
75+
stop("Failed to build and load the package: ", target_org_db, "\nError: ", e$message)
76+
})
7677
} else {
77-
cat(paste0("Local annotation package ", target_org_db, " already exists. This local package will be installed."))
78+
cat(paste0("Local annotation package ", target_org_db, " already exists. This local package will be installed.\n"))
7879
install.packages(file.path("./", target_org_db), repos = NULL, type = "source", quiet = TRUE)
7980
}
8081
}
@@ -83,4 +84,4 @@ install_annotations <- function(target_organism, refTablePath) {
8384
library(target_org_db, character.only = TRUE)
8485
cat(paste0("Using Annotation Database '", target_org_db, "'.\n"))
8586
return(target_org_db)
86-
}
87+
}

0 commit comments

Comments
 (0)