Skip to content

Commit ee00750

Browse files
committed
NF_MAAgilent1ch: #85 add processed data protocol
1 parent 9039193 commit ee00750

File tree

3 files changed

+91
-0
lines changed

3 files changed

+91
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
// Produces the processed-data protocol text file for a dataset by running
// generate_protocol.sh with the workflow version and the study organism.
process GENERATE_PROTOCOL {
    // Tag each task with the dataset accession.
    tag "${params.gldsAccession}"
    // Publish the result under <outputDir>/<accession>/GeneLab.
    publishDir "${params.outputDir}/${params.gldsAccession}/GeneLab", mode: params.publish_dir_mode

    input:
    // Software versions table; generate_protocol.sh reads it by this exact file name.
    path('software_versions_GLmicroarray.md')
    // Organism name, forwarded to the script as its second argument.
    val(organism)

    output:
    path('PROTOCOL_GLmicroarray.txt')

    script:
    """
    generate_protocol.sh ${workflow.manifest.version} "${organism}"
    """
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
#!/bin/bash
#
# Generate the processed-data protocol description for NF_MAAgilent1ch.
#
# Usage: generate_protocol.sh <workflow_version> <organism>
#
# Reads:  software_versions_GLmicroarray.md (markdown table: Program | Version | ...)
#         from the current working directory.
# Writes: PROTOCOL_GLmicroarray.txt in the current working directory.
#
# Every "Program | Version" row of the table is exported as <Program>_VERSION so
# the protocol template below can interpolate it. Because of 'set -u', a version
# that the template needs but the table does not provide aborts the script
# instead of silently producing a protocol with a blank version.
set -u

if [[ $# -lt 2 ]]; then
    echo "Usage: $0 <workflow_version> <organism>" >&2
    exit 1
fi

software_versions_file="software_versions_GLmicroarray.md"

if [[ ! -r $software_versions_file ]]; then
    echo "ERROR: cannot read ${software_versions_file}" >&2
    exit 1
fi

# Read the markdown table (only lines containing '|'; outer pipes stripped)
while read -r line; do
    # Extract program and version (first two columns, surrounding blanks trimmed)
    program=$(printf '%s\n' "$line" | awk -F'|' '{gsub(/^[[:blank:]]+|[[:blank:]]+$/,"",$1); print $1}')
    version=$(printf '%s\n' "$line" | awk -F'|' '{gsub(/^[[:blank:]]+|[[:blank:]]+$/,"",$2); print $2}')

    # Skip the header row and rows without version information
    if [[ $program != "Program" && $version != "Version" && -n $version ]]; then
        # Remove characters that are not valid in a variable name
        # (anything other than letters, digits and underscores is deleted)
        sanitized_program=$(printf '%s' "$program" | tr -cd '[:alnum:]_')

        # Rows such as the markdown separator (|:-|:-|) sanitize to an empty
        # name, and names starting with a digit are not valid identifiers;
        # neither can be exported, so skip them.
        if [[ -z $sanitized_program || ! $sanitized_program =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]]; then
            continue
        fi

        # Create environment variable name
        env_var_name="${sanitized_program}_VERSION"

        # Set the environment variable
        export "$env_var_name=$version"
    fi
done < <(sed -n '/|/p' "$software_versions_file" | sed 's/^ *|//;s/|$//')

# Print the extracted versions
env | grep "_VERSION"

# Get organism. Literal double quotes are stripped once here so that every
# comparison below sees the same normalized value.
organism=${2//\"/}

# List of organisms whose probe mappings come from the main Ensembl database
organism_list=("Homo sapiens" "Mus musculus" "Rattus norvegicus" "Drosophila melanogaster" "Caenorhabditis elegans" "Danio rerio" "Saccharomyces cerevisiae")

# Exact-match membership test: a substring match against the joined list would
# also accept an empty organism or fragments such as "Homo".
is_ensembl_organism=false
for candidate in "${organism_list[@]}"; do
    if [[ $organism == "$candidate" ]]; then
        is_ensembl_organism=true
        break
    fi
done

# Check the value of 'organism' variable and set 'GENE_MAPPING_STEP' accordingly
if [[ $organism == "Arabidopsis thaliana" ]]; then
    GENE_MAPPING_STEP="Ensembl gene ID mappings were retrieved for each probe using the Plants Ensembl database ftp server (plants.ensembl.org, release 54)."
elif [[ $is_ensembl_organism == true ]]; then
    GENE_MAPPING_STEP="Ensembl gene ID mappings were retrieved for each probe using biomaRt (version ${biomaRt_VERSION}), Ensembl database (ensembl.org, release 107)."
else
    GENE_MAPPING_STEP="TBD"
fi

# Check the value of 'organism' variable and set 'GENE_ANNOTATION_DB' accordingly
case "$organism" in
    "Arabidopsis thaliana")     GENE_ANNOTATION_DB="org.At.tair.db" ;;
    "Homo sapiens")             GENE_ANNOTATION_DB="org.Hs.eg.db" ;;
    "Mus musculus")             GENE_ANNOTATION_DB="org.Mm.eg.db" ;;
    "Rattus norvegicus")        GENE_ANNOTATION_DB="org.Rn.eg.db" ;;
    "Drosophila melanogaster")  GENE_ANNOTATION_DB="org.Dm.eg.db" ;;
    "Caenorhabditis elegans")   GENE_ANNOTATION_DB="org.Ce.eg.db" ;;
    "Danio rerio")              GENE_ANNOTATION_DB="org.Dr.eg.db" ;;
    "Saccharomyces cerevisiae") GENE_ANNOTATION_DB="org.Sc.sgd.db" ;;
    *)                          GENE_ANNOTATION_DB="TBD" ;;
esac

# Protocol template; $1 is the workflow version, the *_VERSION variables come
# from the software versions table parsed above.
template="Data were processed as described in GL-DPPD-7112 ([https://github.com/nasa/GeneLab_Data_Processing/blob/master/Microarray/Agilent_1-channel/Pipeline_GL-DPPD-7112_Versions/GL-DPPD-7112.md]), using NF_MAAgilent1ch version $1 ([https://github.com/nasa/GeneLab_Data_Processing/tree/NF_MAAgilent1ch_$1/Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch]). In short, a RunSheet containing raw data file location and processing metadata from the study's *ISA.zip file was generated using dp_tools (version ${dp_tools_VERSION}). The raw array data files were loaded into R (version ${R_VERSION}) using limma (version ${limma_VERSION}). Raw data quality assurance density, pseudo image, MA, and foreground-background plots were generated using limma (version ${limma_VERSION}), and boxplots were generated using ggplot2 (version ${ggplot2_VERSION}). The raw intensity data was background corrected and normalized across arrays via the limma (version ${limma_VERSION}) quantile method. Normalized data quality assurance density, pseudo image, and MA plots were generated using limma (version ${limma_VERSION}), and boxplots were generated using ggplot2 (version ${ggplot2_VERSION}). ${GENE_MAPPING_STEP} Differential expression analysis was performed in R (version ${R_VERSION}) using limma (version ${limma_VERSION}); all groups were compared pairwise for each probe to generate a moderated t-statistic and associated p- and adjusted p-value. Gene annotations were assigned using the custom annotation tables generated in-house as detailed in GL-DPPD-7110 ([https://github.com/nasa/GeneLab_Data_Processing/blob/GL_RefAnnotTable_1.0.0/GeneLab_Reference_Annotations/Pipeline_GL-DPPD-7110_Versions/GL-DPPD-7110/GL-DPPD-7110.md]), with STRINGdb (version 2.8.4), PANTHER.db (version 1.0.11), and ${GENE_ANNOTATION_DB} (version 3.15.0)."

# Output the filled template
echo "$template" > PROTOCOL_GLmicroarray.txt

Microarray/Agilent_1-channel/Workflow_Documentation/NF_MAAgilent1ch/workflow_code/post_processing.nf

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ c_reset = "\033[0m";
77

88
include { GENERATE_MD5SUMS } from './modules/GENERATE_MD5SUMS.nf'
99
include { UPDATE_ISA_TABLES } from './modules/UPDATE_ISA_TABLES.nf'
10+
include { GENERATE_PROTOCOL } from './modules/POST_PROCESSING/GENERATE_PROTOCOL'
1011

1112
/**************************************************
1213
* HELP MENU **************************************
@@ -49,6 +50,7 @@ workflow {
4950
main:
5051
ch_processed_directory = Channel.fromPath("${ params.outputDir }/${ params.gldsAccession }", checkIfExists: true)
5152
ch_runsheet = Channel.fromPath("${ params.outputDir }/${ params.gldsAccession }/Metadata/*_runsheet.csv", checkIfExists: true)
53+
ch_software_versions = Channel.fromPath("${ params.outputDir }/${ params.gldsAccession }/GeneLab/software_versions_GLmicroarray.md", checkIfExists: true)
5254
GENERATE_MD5SUMS(
5355
ch_processed_directory,
5456
ch_runsheet,
@@ -59,4 +61,8 @@ workflow {
5961
ch_runsheet,
6062
"${ projectDir }/bin/dp_tools__agilent_1_channel" // dp_tools plugin
6163
)
64+
GENERATE_PROTOCOL(
65+
ch_software_versions,
66+
ch_runsheet | splitCsv(header: true, quote: '"') | first | map{ row -> row['organism'] }
67+
)
6268
}

0 commit comments

Comments
 (0)