Skip to content

Commit 2749fe5

Browse files
committed
[GL_RefAnnotTable] Fix R packages, add docker instructions
1 parent 7228880 commit 2749fe5

File tree

3 files changed

+121
-75
lines changed

3 files changed

+121
-75
lines changed

GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110-A/GL-DPPD-7110-A.md

Lines changed: 36 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -136,9 +136,9 @@ The default columns in the annotation table are:
136136
| Program | Version | Relevant Links |
137137
|:----------------|:-------:|:---------------|
138138
| R | 4.4.0 | [https://www.r-project.org/](https://www.r-project.org/) |
139-
| Bioconductor | 3.19.1 | [https://bioconductor.org](https://bioconductor.org) |
139+
| Bioconductor | 3.19 | [https://bioconductor.org](https://bioconductor.org) |
140140
| tidyverse | 2.0.0 | [https://www.tidyverse.org](https://www.tidyverse.org) |
141-
| STRINGdb | 2.16.0 | [https://www.bioconductor.org/packages/release/bioc/html/STRINGdb.html](https://www.bioconductor.org/packages/release/bioc/html/STRINGdb.html) |
141+
| STRINGdb | 2.16.4 | [https://www.bioconductor.org/packages/release/bioc/html/STRINGdb.html](https://www.bioconductor.org/packages/release/bioc/html/STRINGdb.html) |
142142
| PANTHER.db | 1.0.12 | [https://bioconductor.org/packages/release/data/annotation/html/PANTHER.db.html](https://www.bioconductor.org/packages/release/data/annotation/html/PANTHER.db.html) |
143143
| rtracklayer | 1.64.0 | [https://bioconductor.org/packages/release/bioc/html/rtracklayer.html](https://www.bioconductor.org/packages/release/bioc/html/rtracklayer.html) |
144144
| org.At.tair.db | 3.19.1 | [https://bioconductor.org/packages/release/data/annotation/html/org.At.tair.db.html](https://www.bioconductor.org/packages/release/data/annotation/html/org.At.tair.db.html) |
@@ -294,42 +294,58 @@ if ( file.exists(out_table_filename) ) {
294294
# Use AnnotationForge's makeOrgPackageFromNCBI function with default settings to create the organism-specific org.db R package from available NCBI annotations
295295

296296
# Try to download the org.db from Bioconductor, build it locally if installation fails
297-
BiocManager::install(target_org_db, ask = FALSE)
298-
if (!requireNamespace(target_org_db, quietly = TRUE)) {
297+
BiocManager::install(target_org_db, ask = FALSE)
298+
if (!requireNamespace(target_org_db, quietly = TRUE)) {
299299
tryCatch({
300-
# Parse organism's name in the reference table to create the org.db name (target_org_db)
301-
genus_species <- strsplit(target_organism, " ")[[1]]
300+
# Define genus and species regardless of target_org_db
301+
target_species_designation <- ref_table %>%
302+
filter(species == target_organism) %>%
303+
pull(species) %>%
304+
gsub("\\s+", " ", .) %>%
305+
gsub("[^A-Za-z0-9 ]", "", .)
306+
307+
genus_species <- strsplit(target_species_designation, " ")[[1]]
302308
if (length(genus_species) < 1) {
303-
stop("Species designation is not correctly formatted: ", target_organism)
309+
stop("Species designation is not correctly formatted: ", target_species_designation)
304310
}
311+
305312
genus <- genus_species[1]
306313
species <- ifelse(length(genus_species) > 1, genus_species[2], "")
307314
strain <- ref_table %>%
308315
filter(species == target_organism) %>%
309316
pull(strain) %>%
310317
gsub("[^A-Za-z0-9]", "", .)
318+
311319
if (!is.na(strain) && strain != "") {
312-
species <- paste0(species, strain)
320+
species <- paste0(species, strain)
321+
}
322+
323+
# Get package name or build it if not provided
324+
target_org_db <- ref_table %>%
325+
filter(species == target_organism) %>%
326+
pull(annotations)
327+
328+
if (is.na(target_org_db) || target_org_db == "") {
329+
cat("\nNo annotation database specified. Constructing package name...\n")
330+
target_org_db <- paste0("org.", substr(genus, 1, 1), species, ".eg.db")
313331
}
314-
target_org_db <- paste0("org.", substr(genus, 1, 1), species, ".eg.db")
315332

316-
BiocManager::install(c("AnnotationForge", "biomaRt", "GO.db"), ask = FALSE)
333+
BiocManager::install(c("AnnotationForge", "biomaRt", "GO.db"), ask = FALSE)
317334
library(AnnotationForge)
318-
makeOrgPackageFromNCBI(
319-
version = "0.1",
320-
author = "Your Name <your.email@example.com>",
321-
maintainer = "Your Name <your.email@example.com>",
322-
outputDir = "./",
323-
tax_id = target_taxid,
324-
genus = genus,
325-
species = species
335+
makeOrgPackageFromNCBI(
336+
version = "0.1",
337+
author = "Your Name <your.email@example.com>",
338+
maintainer = "Your Name <your.email@example.com>",
339+
outputDir = "./",
340+
tax_id = target_taxid,
341+
genus = genus,
342+
species = species
326343
)
327344
install.packages(file.path("./", target_org_db), repos = NULL, type = "source", quiet = TRUE)
328345
cat(paste0("'", target_org_db, "' has been successfully built and installed.\n"))
329346
}, error = function(e) {
330-
stop("Failed to build and load the package: ", target_org_db, "\nError: ", e$message)
347+
stop("Failed to build and load the package: ", target_org_db, "\nError: ", e$message)
331348
})
332-
target_org_db <- install_annotations(target_organism, ref_tab_path)
333349
}
334350
```
335351

GeneLab_Reference_Annotations/Workflow_Documentation/GL_RefAnnotTable-A/README.md

Lines changed: 43 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -26,20 +26,20 @@ Once R is installed, open a CLI terminal and run the following command to activa
2626
```bash
2727
R
2828
```
29-
29+
`
3030
Within an active R environment, run the following commands to install the required R packages:
3131

3232
```R
33-
install.packages("tidyverse", version = 2.0.0, repos = "http://cran.us.r-project.org")
33+
install.packages("tidyverse")
3434

35-
install.packages("BiocManager", version = 3.19.1, repos = "http://cran.us.r-project.org")
35+
install.packages("BiocManager")
3636

37-
BiocManager::install("STRINGdb", version = 3.19.1)
38-
BiocManager::install("PANTHER.db", version = 3.19.1)
39-
BiocManager::install("rtracklayer", version = 3.19.1)
40-
BiocManager::install("AnnotationForge", version = 1.46.0)
41-
BiocManager::install("biomaRt", version = 2.60.1)
42-
BiocManager::install("GO.db", version = 3.19.1)
37+
BiocManager::install("STRINGdb")
38+
BiocManager::install("PANTHER.db")
39+
BiocManager::install("rtracklayer")
40+
BiocManager::install("AnnotationForge")
41+
BiocManager::install("biomaRt")
42+
BiocManager::install("GO.db")
4343
```
4444

4545
<br>
@@ -102,3 +102,37 @@ Rscript install-org-db.R 'Bacillus subtilis' /path/to/GL-DPPD-7110-A_annotations
102102
**Output data:**
103103

104104
- org.*.eg.db/ (species-specific annotation database, as a local R package)
105+
106+
### 6. Run the Workflow Using Docker
107+
108+
Rather than running the workflow in your local environment, you can use a Docker image. This method ensures that all dependencies are correctly installed.
109+
110+
1. **Pull the Docker image:**
111+
112+
```bash
113+
docker pull quay.io/torres-alexis/gl_images:GL_RefAnnotTable_v1.1.0-rc.1
114+
```
115+
116+
2. **Download the workflow files:**
117+
118+
```bash
119+
curl -LO https://github.com/nasa/GeneLab_Data_Processing/releases/download/GL_RefAnnotTable-A_1.1.0/GL_RefAnnotTable-A_1.1.0.zip
120+
unzip GL_RefAnnotTable-A_1.1.0.zip
121+
```
122+
123+
3. **Run the workflow using Docker:**
124+
125+
```bash
126+
docker run -it -v $(pwd)/GL_RefAnnotTable-A_1.1.0:/home/rstudio/work quay.io/torres-alexis/gl_images:GL_RefAnnotTable_v1.1.0-rc.1 bash -c "cd /home/rstudio/work && Rscript GL-DPPD-7110-A_build-genome-annots-tab.R 'Mus musculus'"
127+
```
128+
129+
**Input data:**
130+
131+
- No input files are required. Specify the target organism using a positional command-line argument. `Mus musculus` is used in the example above. To see a list of all available organisms, run `Rscript GL-DPPD-7110-A_build-genome-annots-tab.R` without positional arguments. The correct argument for each organism can also be found in the 'species' column of the [GL-DPPD-7110-A_annotations.csv](../../Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110-A/GL-DPPD-7110-A_annotations.csv) file.
132+
133+
- Optional: a reference table CSV can be supplied as a second positional argument instead of using the default [GL-DPPD-7110-A_annotations.csv](../../Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110-A/GL-DPPD-7110-A_annotations.csv) file.
134+
135+
**Output data:**
136+
137+
- *-GL-annotations.tsv (Tab-delimited table of gene annotations)
138+
- *-GL-build-info.txt (Text file containing information used to create the annotation table, including tool and tool versions and date of creation)
Lines changed: 42 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,3 @@
1-
# install-org-db.R
2-
3-
# Function: Get annotations db from ref table. If no annotations db is defined, create the package name from genus, species, (and strain for microbes),
4-
# Try to Bioconductor install annotations db. If fail then build the package using AnnotationForge, install it into the current directory.
5-
# Requires ~80GB for NCBIFilesDir file caching
61
install_annotations <- function(target_organism, refTablePath) {
72
if (!file.exists(refTablePath)) {
83
stop("Reference table file does not exist at the specified path: ", refTablePath)
@@ -13,35 +8,36 @@ install_annotations <- function(target_organism, refTablePath) {
138
filter(species == target_organism) %>%
149
pull(taxon)
1510

11+
# Define genus and species regardless of target_org_db
12+
target_species_designation <- ref_table %>%
13+
filter(species == target_organism) %>%
14+
pull(species) %>%
15+
gsub("\\s+", " ", .) %>%
16+
gsub("[^A-Za-z0-9 ]", "", .)
17+
18+
genus_species <- strsplit(target_species_designation, " ")[[1]]
19+
if (length(genus_species) < 1) {
20+
stop("Species designation is not correctly formatted: ", target_species_designation)
21+
}
22+
23+
genus <- genus_species[1]
24+
species <- ifelse(length(genus_species) > 1, genus_species[2], "")
25+
strain <- ref_table %>%
26+
filter(species == target_organism) %>%
27+
pull(strain) %>%
28+
gsub("[^A-Za-z0-9]", "", .)
29+
30+
if (!is.na(strain) && strain != "") {
31+
species <- paste0(species, strain)
32+
}
33+
1634
# Get package name or build it if not provided
1735
target_org_db <- ref_table %>%
1836
filter(species == target_organism) %>%
1937
pull(annotations)
2038

2139
if (is.na(target_org_db) || target_org_db == "") {
2240
cat("\nNo annotation database specified. Constructing package name...\n")
23-
target_species_designation <- ref_table %>%
24-
filter(species == target_organism) %>%
25-
pull(species) %>%
26-
gsub("\\s+", " ", .) %>%
27-
gsub("[^A-Za-z0-9 ]", "", .)
28-
29-
genus_species <- strsplit(target_species_designation, " ")[[1]]
30-
if (length(genus_species) < 1) {
31-
stop("Species designation is not correctly formatted: ", target_species_designation)
32-
}
33-
34-
genus <- genus_species[1]
35-
species <- ifelse(length(genus_species) > 1, genus_species[2], "")
36-
strain <- ref_table %>%
37-
filter(species == target_organism) %>%
38-
pull(strain) %>%
39-
gsub("[^A-Za-z0-9]", "", .)
40-
41-
if (!is.na(strain) && strain != "") {
42-
species <- paste0(species, strain)
43-
}
44-
4541
target_org_db <- paste0("org.", substr(genus, 1, 1), species, ".eg.db")
4642
}
4743

@@ -56,25 +52,25 @@ install_annotations <- function(target_organism, refTablePath) {
5652
} else {
5753
cat(paste0("\nInstallation from Bioconductor failed, attempting to build '", target_org_db, "'...\n"))
5854
if (!dir.exists(target_org_db)) {
59-
tryCatch({
60-
BiocManager::install(c("AnnotationForge", "biomaRt", "GO.db"), ask = FALSE)
61-
library(AnnotationForge)
62-
makeOrgPackageFromNCBI(
63-
version = "0.1",
64-
author = "Your Name <your.email@example.com>",
65-
maintainer = "Your Name <your.email@example.com>",
66-
outputDir = "./",
67-
tax_id = target_taxid,
68-
genus = genus,
69-
species = species
70-
)
71-
install.packages(file.path("./", target_org_db), repos = NULL, type = "source", quiet = TRUE)
72-
cat(paste0("'", target_org_db, "' has been successfully built and installed.\n"))
73-
}, error = function(e) {
74-
stop("Failed to build and load the package: ", target_org_db, "\nError: ", e$message)
75-
})
55+
tryCatch({
56+
BiocManager::install(c("AnnotationForge", "biomaRt", "GO.db"), ask = FALSE)
57+
library(AnnotationForge)
58+
makeOrgPackageFromNCBI(
59+
version = "0.1",
60+
author = "Your Name <your.email@example.com>",
61+
maintainer = "Your Name <your.email@example.com>",
62+
outputDir = "./",
63+
tax_id = target_taxid,
64+
genus = genus,
65+
species = species
66+
)
67+
install.packages(file.path("./", target_org_db), repos = NULL, type = "source", quiet = TRUE)
68+
cat(paste0("'", target_org_db, "' has been successfully built and installed.\n"))
69+
}, error = function(e) {
70+
stop("Failed to build and load the package: ", target_org_db, "\nError: ", e$message)
71+
})
7672
} else {
77-
cat(paste0("Local annotation package ", target_org_db, " already exists. This local package will be installed."))
73+
cat(paste0("Local annotation package ", target_org_db, " already exists. This local package will be installed.\n"))
7874
install.packages(file.path("./", target_org_db), repos = NULL, type = "source", quiet = TRUE)
7975
}
8076
}
@@ -83,4 +79,4 @@ install_annotations <- function(target_organism, refTablePath) {
8379
library(target_org_db, character.only = TRUE)
8480
cat(paste0("Using Annotation Database '", target_org_db, "'.\n"))
8581
return(target_org_db)
86-
}
82+
}

0 commit comments

Comments
 (0)