Commit 620598f

Metagenomics Illumina Nextflow conversion (#134)
* improved checkm performance by running separately on every bin
* fixed typo in config
* updated the nextflow version
* changed the default value of accession input parameter
* reverted AmpIllumina pipeline doc to remove updates
* added launchDir variable
* made format changes to config files
* updated README
* updated MGIllumina pipeline doc after CCB approval
* added launch scripts and fixed bugs
* deleted cluster path
* fixed humman utilities mounting bug
* commented out singularity cache_dir
* updated the content of processing_info.zip to match other GeneLab Nextflow workflows
1 parent 703f469 commit 620598f

File tree

19 files changed: +666 additions, -471 deletions
Amplicon/Illumina/Pipeline_GL-DPPD-7104_Versions/GL-DPPD-7104-B.md

4 additions & 4 deletions

@@ -38,7 +38,7 @@ Amanda Saravia-Butler (GeneLab Data Processing Lead)
 <!-- Included R packages -->
 - Assay-specific suffixes were added where needed for GeneLab repo ("GLAmpSeq")
-- The ITS UNITE reference database used was updated to "UNITE_v2023_July2023.RData", from http://www2.decipher.codes/Classification/TrainingSets/
+- The ITS UNITE reference database used was updated to "UNITE_v2023_July2023.RData", from https://www2.decipher.codes/data/Downloads/TrainingSets/
 - Several program versions were updated (all versions listed in [Software used](#software-used) below)

 ---

@@ -103,8 +103,8 @@ Amanda Saravia-Butler (GeneLab Data Processing Lead)
 |Program used| Database| Relevant Links|
 |:-----|:-----:|--------:|
-|DECIPHER| SILVA SSU r138 | [http://www2.decipher.codes/Classification/TrainingSets/SILVA_SSU_r138_2019.RData](http://www2.decipher.codes/Classification/TrainingSets/)|
-|DECIPHER| UNITE v2020 | [http://www2.decipher.codes/Classification/TrainingSets/UNITE_v2020_February2020.RData](http://www2.decipher.codes/Classification/TrainingSets/)|
+|DECIPHER| SILVA SSU r138 | [https://www2.decipher.codes/data/Downloads/TrainingSets/SILVA_SSU_r138_2019.RData](https://www2.decipher.codes/data/Downloads/TrainingSets/)|
+|DECIPHER| UNITE v2023 | [https://www2.decipher.codes/data/Downloads/TrainingSets/UNITE_v2023_July2023.RData](https://www2.decipher.codes/data/Downloads/TrainingSets/)|

 ---

@@ -443,7 +443,7 @@ dna <- DNAStringSet(getSequences(seqtab.nochim))
 Downloading the reference R taxonomy object:
 ```R
-download.file(url="http://www2.decipher.codes/Classification/TrainingSets/SILVA_SSU_r138_2019.RData", destfile="SILVA_SSU_r138_2019.RData")
+download.file(url="https://www2.decipher.codes/data/Downloads/TrainingSets/SILVA_SSU_r138_2019.RData", destfile="SILVA_SSU_r138_2019.RData")
 ```

 **Parameter Definitions:**

Metagenomics/Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md

1 addition & 1 deletion

@@ -4,7 +4,7 @@
 ---

-**Date:** October XX, 2024
+**Date:** October 28, 2024
 **Revision:** -A
 **Document Number:** GL-DPPD-7107
Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/README.md

9 additions & 6 deletions

@@ -51,7 +51,10 @@ Nextflow can be installed either through [Anaconda](https://anaconda.org/biocond
 > conda install -c bioconda nextflow
 > nextflow self-update
 > ```
-
+> You may also install [mamba](https://mamba.readthedocs.io/en/latest/index.html) which is a faster implementation of conda like so:
+> ```bash
+> conda install -c conda-forge mamba
+> ```
 <br>

 #### 1b. Install Singularity

@@ -111,7 +114,7 @@ For options and detailed help on how to run the workflow, run the following comm
 nextflow run main.nf --help
 ```

-> Note: Nextflow commands use both single hyphen arguments (e.g. -help) that denote general nextflow arguments and double hyphen arguments (e.g. --csv_file) that denote workflow specific parameters. Take care to use the proper number of hyphens for each argument.
+> Note: Nextflow commands use both single hyphen arguments (e.g. -help) that denote general nextflow arguments and double hyphen arguments (e.g. --input_file) that denote workflow specific parameters. Take care to use the proper number of hyphens for each argument.

 <br>

@@ -126,15 +129,15 @@ nextflow run main.nf -resume -profile slurm,singularity --accession OSD-574
 #### 4b. Approach 2: Run slurm jobs in singularity containers with a csv file as input

 ```bash
-nextflow run main.nf -resume -profile slurm,singularity --csv_file PE_file.csv
+nextflow run main.nf -resume -profile slurm,singularity --input_file PE_file.csv
 ```

 <br>

 #### 4c. Approach 3: Run jobs locally in conda environments and specify the path to one or more existing conda environment(s)

 ```bash
-nextflow run main.nf -resume -profile conda --csv_file SE_file.csv --conda.qc <path/to/existing/conda/environment>
+nextflow run main.nf -resume -profile mamba --input_file SE_file.csv --conda_megahit <path/to/existing/conda/environment>
 ```

 <br>

@@ -153,7 +156,7 @@ nextflow run main.nf -resume -profile conda --csv_file SE_file.csv --conda.qc <p
 *Required only if --accession is not passed as an argument*

-* `--csv_file` – A single-end or paired-end input csv file containing assay metadata for each sample, including sample_id, forward, reverse, and/or paired. Please see the sample [SE_file.csv](workflow_code/SE_file.csv) and [PE_file.csv](workflow_code/PE_file.csv) in this repository for examples on how to format this file.
+* `--input_file` – A single-end or paired-end input csv file containing assay metadata for each sample, including sample_id, forward, reverse, and/or paired. Please see the sample [SE_file.csv](workflow_code/SE_file.csv) and [PE_file.csv](workflow_code/PE_file.csv) in this repository for examples on how to format this file.

 > See `nextflow run -h` and [Nextflow's CLI run command documentation](https://nextflow.io/docs/latest/cli.html#run) for more options and details on how to run nextflow.

@@ -163,7 +166,7 @@ nextflow run main.nf -resume -profile conda --csv_file SE_file.csv --conda.qc <p
 Additionally, the parameters and workflow resources can be directly specified in the nextflow.config file. For detailed instructions on how to modify and set parameters in the nextflow.config file, please see the [documentation here](https://www.nextflow.io/docs/latest/config.html).

-Once you've downloaded the workflow template, you can modify the parameters in the `params` scope and cpus/memory requirements in the `process` scope in your downloaded version of the [nextflow.config](workflow_code/nextflow.config) file as needed in order to match your dataset and system setup. For example, you can directly set the the full paths to available conda environments in the `conda` scope within the `params` scope. Additionally, if necessary, you'll need to modify each variable in the [nextflow.config](workflow_code/nextflow.config) file to be consistent with the study you want to process and the machine you're using.
+Once you've downloaded the workflow template, you can modify the parameters in the `params` scope and cpus/memory requirements in the `process` scope in your downloaded version of the [nextflow.config](workflow_code/nextflow.config) file as needed in order to match your dataset and system setup. Additionally, if necessary, you'll need to modify each variable in the [nextflow.config](workflow_code/nextflow.config) file to be consistent with the study you want to process and the machine you're using.

 <br>
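The diff above renames `--csv_file` to `--input_file` and describes the runsheet columns (sample_id, forward, reverse, and/or paired). A minimal sketch of what such a paired-end runsheet might look like, using made-up sample and file names (the repository's own PE_file.csv is the authoritative example):

```bash
# Hypothetical paired-end runsheet matching the columns the README describes;
# all sample and fastq file names here are invented for illustration.
cat > PE_file.csv <<'EOF'
sample_id,forward,reverse,paired
Sample-1,Sample-1_R1_raw.fastq.gz,Sample-1_R2_raw.fastq.gz,true
Sample-2,Sample-2_R1_raw.fastq.gz,Sample-2_R2_raw.fastq.gz,true
EOF

# The renamed workflow parameter would then be passed like so
# (printed rather than executed, since nextflow is not assumed here):
echo "nextflow run main.nf -resume -profile slurm,singularity --input_file PE_file.csv"
```

Note the double hyphen on `--input_file` (a workflow parameter) versus the single hyphens on `-resume` and `-profile` (general Nextflow arguments), as the Note in the diff warns.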

Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/clean-paths.sh

16 additions & 3 deletions

@@ -13,12 +13,25 @@ if [ -s t ]; then
 exit
 fi

-
+FILE=$1
 ROOT_DIR=$(echo $2 | awk '{N=split($0,a,"/"); for(i=0; i < N-1; i++) printf "%s/", a[i]}' | sed 's|//|/|')

+# Remove path in paired end runsheet
+if [ `awk 'NR==1{print}' ${FILE} | grep -c reverse` -gt 0 ]; then
+
+    awk 'BEGIN{FS=OFS=","} NR==1{print} NR>1{split($2, f, "/");split($3, r, "/"); print $1,f[length(f)],r[length(r)],$4}' ${FILE} > temp && mv temp ${FILE}
+
+# Remove path in single end runsheet
+elif [ `awk 'NR==1{print}' ${FILE} | grep -c forward` -gt 0 ]; then
+
+    awk 'BEGIN{FS=OFS=","} NR==1{print} NR>1{split($2, f, "/"); print $1,f[length(f)],$3}' ${FILE} > temp && mv temp ${FILE}
+
+fi

-sed -E 's|.*/GLDS_Datasets/(.+)|\1|g' ${1} \
+sed -E 's|.*/GLDS_Datasets/(.+)|\1|g' ${FILE} \
 | sed -E 's|.+/miniconda.+/envs/[^/]*/||g' \
 | sed -E 's|/[^ ]*/GLDS-|GLDS-|g' \
 | sed -E 's|/[a-z]{6}/[^ ]*|<path-removed-for-security-purposes>|g' \
-| sed -E "s|${ROOT_DIR}||g" > t && mv t ${1}
+| sed -E "s|${ROOT_DIR}||g" > t && mv t ${FILE}
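The paired-end branch added above strips directory components from the forward and reverse fastq paths, leaving only basenames in the runsheet. A small standalone sketch of that awk one-liner, run against a hypothetical runsheet (paths and sample names invented):

```bash
# Hypothetical paired-end runsheet with absolute fastq paths (names made up)
cat > runsheet.csv <<'EOF'
sample_id,forward,reverse,paired
S1,/cluster/data/S1_R1.fastq.gz,/cluster/data/S1_R2.fastq.gz,true
EOF

# Same logic as the paired-end branch in clean-paths.sh: keep the header,
# then split fields 2 and 3 on "/" and print only the last path component
awk 'BEGIN{FS=OFS=","} NR==1{print} NR>1{split($2, f, "/"); split($3, r, "/"); print $1, f[length(f)], r[length(r)], $4}' runsheet.csv
```

The data row comes out as `S1,S1_R1.fastq.gz,S1_R2.fastq.gz,true`, which is why the script only needs a header check (`grep -c reverse` versus `grep -c forward`) to pick the paired-end or single-end column layout.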

Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/get-cov-and-depth.sh

0 additions & 67 deletions — this file was deleted.

New file: 102 additions & 0 deletions

```bash
#!/usr/bin/env bash
set -euo pipefail

# Script to launch a nextflow workflow on a slurm cluster

# Usage: bash ./launch.sh [mode] [main.nf] [config] '[extra arguments]'
# Examples
#
# Processing:
#   bash ./launch.sh processing path/to/main.nf path/to/nextflow.config '--accession OSD-574'
#
# Postprocessing:
#   bash ./launch.sh post_processing path/to/post_processing.nf path/to/post_processing.config \
#     '--name FirstName M. LastName --email email@domain.com --GLDS_accession GLDS-574 --OSD_accession OSD-574 --isa_zip ../GeneLab/OSD-574_metadata_GLDS-574-ISA.zip --runsheet ../GeneLab/GLfile.csv'

MODE=${1:-''}    # Script run mode, i.e. processing or post_processing
MAIN=${2:-''}    # Path to the main.nf or post_processing.nf nextflow script for processing and post_processing, respectively
CONFIG=${3:-''}  # Nextflow config file, i.e. nextflow.config or post_processing.config
EXTRA=${4:-''}   # Extra arguments to the nextflow run command

#==============================================================================
# SETUP START
#==============================================================================
eval "$(conda shell.bash hook)"
conda activate /path/to/conda/envs/nextflow
export NXF_SINGULARITY_CACHEDIR=<PATH TO SINGULARITY IMAGES>
export TOWER_ACCESS_TOKEN=<YOUR ACCESS TOKEN>
export TOWER_WORKSPACE_ID=<YOUR WORKSPACE ID>

#==============================================================================
# UMASK CONFIGURATION
#==============================================================================
echo "Setting umask to enable group read-access by default"
umask u=rwx,g=rx
echo "Umask settings for this launch: $(umask -S)"

#==============================================================================
# NEXTFLOW COMMAND START
#==============================================================================
if [ "${MODE}" == "processing" ]; then

    RUN_NAME=MAIN_$(date +%Y%m%d%H%M%S)

    RUN_COMMAND="nextflow -C ${CONFIG} \
                 run \
                 -name ${RUN_NAME} \
                 ${MAIN} \
                 -resume \
                 -profile slurm,singularity \
                 -with-tower \
                 -process.queue 'normal' \
                 -ansi-log false \
                 ${EXTRA}"

    echo "Running command: ${RUN_COMMAND}"
    echo ""
    [ -d processing_scripts ] || mkdir processing_scripts
    eval ${RUN_COMMAND} && echo ${RUN_COMMAND} > processing_scripts/command.txt

    # Save the nextflow log to a file
    echo "Creating Nextflow processing info file..."
    nextflow log ${RUN_NAME} -f name,script > processing_scripts/nextflow_processing_info_GLmetagenomics.txt
    echo nextflow log ${RUN_NAME} -f name,script >> processing_scripts/nextflow_processing_info_GLmetagenomics.txt
    echo "Nextflow processing info written to processing_scripts/nextflow_processing_info_GLmetagenomics.txt"

elif [ "${MODE}" == "post_processing" ]; then

    RUN_NAME=POST_$(date +%Y%m%d%H%M%S)

    RUN_COMMAND="nextflow -C ${CONFIG} \
                 run \
                 -name ${RUN_NAME} \
                 ${MAIN} \
                 -resume \
                 -profile slurm,singularity \
                 -with-tower \
                 -process.queue 'normal' \
                 -ansi-log false \
                 ${EXTRA}"

    echo "Running command: ${RUN_COMMAND}"
    echo ""
    eval ${RUN_COMMAND}

else
    echo 'Please provide a valid mode to run the workflow:'
    echo 'either processing or post_processing for running the processing or post_processing workflows, respectively.'
    exit 1
fi

# Set permissions on launch directory
echo ""
echo "Setting permissions on launch directory..."
chmod -R 755 .
echo "Permissions set to 755 recursively on launch directory"
```
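The launch script above dispatches on a mode argument that defaults to empty via `${1:-''}`, then builds a timestamped `-name` for the Nextflow run. A minimal, nextflow-free sketch of just that dispatch pattern (the `demo_mode` function and its messages are invented for illustration):

```bash
# Standalone sketch of the mode-dispatch pattern used by launch.sh;
# demo_mode and its output strings are hypothetical, not part of the workflow.
demo_mode() {
  local MODE=${1:-''}   # empty string when no argument is given, as in launch.sh
  if [ "${MODE}" = "processing" ]; then
    echo "would launch run: MAIN_<timestamp>"
  elif [ "${MODE}" = "post_processing" ]; then
    echo "would launch run: POST_<timestamp>"
  else
    echo "invalid mode"
    return 1
  fi
}

demo_mode processing
demo_mode post_processing
demo_mode || true   # a missing mode falls through to the error branch
```

Quoting `"${MODE}"` in the test brackets matters under `set -u`/`set -e`: an empty, unquoted expansion would otherwise make `[ ... ]` fail with a syntax error rather than reach the `else` branch.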
New file: 51 additions & 0 deletions

```bash
#!/bin/bash

#SBATCH --job-name="nf_master"        ## Replace nf_master with the name of the job you are running ##
#SBATCH --output=nf_master.o.%j      ## Replace nf_master with the name of the job you are running ##
#SBATCH --error=nf_master.e.%j       ## Replace nf_master with the name of the job you are running ##
#SBATCH --partition=normal           ## Specifies the job queue to use; for urgent jobs change normal to priority ##
#SBATCH --mem=20G                    ## Memory required to run the job; this example requests 20 GB, change this based on how much RAM you need ##
#SBATCH --cpus-per-task=1            ## Number of CPUs to run the job; this example requests 1 CPU, change this based on how many CPUs you need ##
#SBATCH --mail-user=name@domain.com  ## E-mail address to notify when the job is complete; replace with your NASA e-mail address ##
#SBATCH --mail-type=END              ## Tells slurm to e-mail the address above when the job has completed ##

. ~/.profile

echo "nf_master"  ## Replace with the name of the job you are running ##
echo ""

## Add a time-stamp at the start of the job ##
start=$(date +%s)
echo "start time: $start"

## Print the name of the compute node executing the job ##
echo $HOSTNAME

WORKFLOW_DIR='/path/to/nextflow/workflow_code'

# Processing
bash ./launch.sh processing ${WORKFLOW_DIR}/main.nf ${WORKFLOW_DIR}/nextflow.config '--accession OSD-574'

# Post Processing
#bash ./launch.sh post_processing ${WORKFLOW_DIR}/post_processing.nf ${WORKFLOW_DIR}/post_processing.config \
#  '--name First M. Last --email name@domain.com --GLDS_accession GLDS-574 --OSD_accession OSD-574 --isa_zip ../GeneLab/OSD-574_metadata_OSD-574-ISA.zip --runsheet ../GeneLab/GLfile.csv'

## Add a time-stamp at the end of the job, then calculate how long the job took to run in seconds, minutes, and hours ##
echo ""
end=$(date +%s)
echo "end time: $end"
runtime_s=$(( end - start ))
echo "total run time(s): $runtime_s"
sec_per_min=60
sec_per_hr=3600
runtime_m=$(echo "scale=2; $runtime_s / $sec_per_min;" | bc)
echo "total run time(m): $runtime_m"
runtime_h=$(echo "scale=2; $runtime_s / $sec_per_hr;" | bc)
echo "total run time(h): $runtime_h"
echo ""

## Print the slurm job ID so you have it recorded and can view slurm job statistics if needed ##
echo "slurm job ID: ${SLURM_JOB_ID}"
```
