Commit 703f469

Merge pull request #129 from olabiyi/DEV_Metagenomics_Illumina_NF_conversion
Nextflow Metagenomics Illumina conversion: Added missing post-processing script and fixed no assemblies produced bug
2 parents: 0dcec9b + e91d7a5

7 files changed: +73 −20 lines

Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/README.md

Lines changed: 10 additions & 10 deletions

````diff
@@ -115,7 +115,7 @@ nextflow run main.nf --help
 
 <br>
 
-#### 4a. Approach 1: Run slurm jobs in singularity containers with OSD accession as input
+#### 4a. Approach 1: Run slurm jobs in singularity containers with OSD or GLDS accession as input
 
 ```bash
 nextflow run main.nf -resume -profile slurm,singularity --accession OSD-574
@@ -195,30 +195,30 @@ Standard nextflow resource usage logs are also produced as follows:
 For options and detailed help on how to run the post-processing workflow, run the following command:
 
 ```bash
-nextflow run post_processng.nf --help
+nextflow run post_processing.nf --help
 ```
 
 To generate a README file, a protocols file, a md5sums table and a file association table after running the processing workflow successfully, modify and set the parameters in [post_processing.config](workflow_code/post_processing.config) then run the following command:
 
 ```bash
-nextflow -C post_processing.config run post_processng.nf -resume -profile slurm,singularity
+nextflow -C post_processing.config run post_processing.nf -resume -profile slurm,singularity
 ```
 
 The outputs of the run will be in a directory called `Post_Processing` by default and they are as follows:
 
-- Post_processing/FastQC_Outputs/filtered_multiqc_GLmetagenomics_report.zip (Filtered sequence multiqc report with paths purged)
+- Post_processing/FastQC_Outputs/filtered_multiqc_GLmetagenomics_report.zip (Filtered sequence multiqc report with paths purged)
 
-- Post_processing/FastQC_Outputs/raw_multiqc_GLmetagenomics_report.zip (Raw sequence multiqc report with paths purged)
+- Post_processing/FastQC_Outputs/raw_multiqc_GLmetagenomics_report.zip (Raw sequence multiqc report with paths purged)
 
-- Post_processing/<GLDS_accession>_-associated-file-names.tsv (File association table for curation)
+- Post_processing/<GLDS_accession>_-associated-file-names.tsv (File association table for curation)
 
-- Post_processing/<GLDS_accession>_metagenomics-validation.log (Automatic verification and validation log file)
+- Post_processing/<GLDS_accession>_metagenomics-validation.log (Automatic verification and validation log file)
 
-- Post_processing/processed_md5sum_GLmetagenomics.tsv (md5sums for the files to be released on OSDR)
+- Post_processing/processed_md5sum_GLmetagenomics.tsv (md5sums for the files to be released on OSDR)
 
-- Post_processing/processing_info_GLmetagenomics.zip (Zip file containing all files used to run the workflow and required logs with paths purged)
+- Post_processing/processing_info_GLmetagenomics.zip (Zip file containing all files used to run the workflow and required logs with paths purged)
 
-- Post_processing/protocol.txt (File describing the methods used by the workflow)
+- Post_processing/protocol.txt (File describing the methods used by the workflow)
 
 - Post_processing/README_GLmetagenomics.txt (README file listing and describing the outputs of the workflow)
````
generate_protocol.sh (new file)

Lines changed: 36 additions & 0 deletions

```bash
#!/usr/bin/env bash

# Generate protocol according to a pipeline document

# USAGE:
# generate_protocol.sh <software_versions> <protocol_id>
# EXAMPLE:
# generate_protocol.sh ../Metadata/software_versions.txt GL-DPPD-7107-A

FASTQC=`grep -i 'fastqc' $1 | awk '{print $2}' | sed -E 's/v//'`
MULTIQC=`grep -i 'multiqc' $1 | awk '{print $3}'`
BBMAP=`grep -i 'bbtools' $1 | awk '{print $2}'`
HUMANN=`grep -i 'humann' $1 | awk '{print $2}' | sed -E 's/v//'`
MEGAHIT=`grep -i 'megahit' $1 | awk '{print $2}' | sed -E 's/v//'`
PRODIGAL=`grep -i 'prodigal' $1 | awk '{print $2}' | sed -E 's/[vV:]//g'`
CAT=`grep 'CAT' $1 | awk '{print $2}' | sed -E 's/v//'`
KOFAMSCAN=`grep 'exec_annotation' $1 | awk '{print $2}'`
BOWTIE2=`grep -i 'bowtie' $1 | awk '{print $3}'`
SAMTOOLS=`grep -i 'samtools' $1 | awk '{print $2}'`
METABAT2=`grep -i 'metabat' $1 | awk '{print $2}'`
BIT=`grep -i 'bioinformatics tools' $1 | awk '{print $3}' | sed 's/v//' | sed -E 's/.+([0-9]+.[0-9]+.[0-9]+).+/\1/'`
CHECKM=`grep -i 'checkm' $1 | awk '{print $2}' | sed -E 's/v//'`
GTDBTK=`grep -i '^GTDB' $1 | awk '{print $2}' | sed -E 's/v//' | head -n2` # If 2 versions are used, choose the second

PROTOCOL_ID=$2

PROTOCOL="Data were processed as described in ${PROTOCOL_ID} (https://github.com/nasa/GeneLab_Data_Processing/blob/master/Metagenomics/Illumina/Pipeline_GL-DPPD-7107_Versions/${PROTOCOL_ID}.md), using workflow NF_MGIllumina v1.0.0 (https://github.com/nasa/GeneLab_Data_Processing/tree/NF_MGIllumina_1.0.0/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina). \
In brief, quality assessment of reads was performed with FastQC v${FASTQC} and reports were summarized with MultiQC v${MULTIQC}. \
Quality trimming and filtering were performed with bbmap v${BBMAP}. Read-based processing was performed with humann3 v${HUMANN}. \
Individual samples were assembled with megahit v${MEGAHIT}. Genes were called with prodigal v${PRODIGAL}. \
Taxonomic classification of genes and contigs was performed with CAT v${CAT}. Functional annotation was done with KOFamScan v${KOFAMSCAN}. \
Reads were mapped to assemblies with bowtie2 v${BOWTIE2} and coverage information was extracted for reads and contigs with samtools v${SAMTOOLS} and bbmap v${BBMAP}. \
Binning of contigs was performed with metabat2 v${METABAT2}. Bins were summarized with bit v${BIT} and estimates of quality were generated with checkm v${CHECKM}. \
High-quality bins (> 90% est. completeness and < 10% est. redundancy) were taxonomically classified with gtdb-tk v${GTDBTK}."

echo ${PROTOCOL}
```
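All of the version variables above follow the same grep → awk → sed extraction pattern. As a sketch of how that pattern behaves, the snippet below runs the FASTQC pipeline against a made-up versions file (the file name and its contents are illustrative, not the workflow's actual `software_versions.txt` format):

```shell
# Illustrative input only -- the real software_versions.txt produced by the
# workflow may format its entries differently.
printf 'FastQC v0.12.1\nMultiQC version 1.14\n' > /tmp/sample_versions.txt

# Same pipeline as the FASTQC line above: case-insensitive match on the tool
# name, take the second whitespace-delimited field, strip the leading "v".
FASTQC=$(grep -i 'fastqc' /tmp/sample_versions.txt | awk '{print $2}' | sed -E 's/v//')
echo "FastQC version: ${FASTQC}"   # prints "FastQC version: 0.12.1"
```

Note that `sed -E 's/v//'` removes only the first `v` on the line, which is enough when the field is of the form `v0.12.1`.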

Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/assembly.nf

Lines changed: 1 addition & 0 deletions

```diff
@@ -61,6 +61,7 @@ process RENAME_HEADERS {
     output:
         tuple val(sample_id), path("${sample_id}-assembly.fasta"), emit: contigs
         path("versions.txt"), emit: version
+        path("Failed-assemblies.tsv"), optional: true, emit: failed_assembly
 
     script:
         """
         bit-rename-fasta-headers -i ${assembly} \\
```

Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/assembly_based_processing.nf

Lines changed: 6 additions & 0 deletions

```diff
@@ -48,6 +48,12 @@ workflow assembly_based {
                         sample_id, assembly -> file("${assembly}")
                         }.collect()
         SUMMARIZE_ASSEMBLIES(assemblies_ch)
+
+        // Write failed assemblies to a Failed-assemblies file
+        failed_assemblies = RENAME_HEADERS.out.failed_assembly
+        failed_assemblies
+             .map{ it.text }
+             .collectFile(name: "${params.assemblies_dir}/Failed-assemblies.tsv", cache: false)
 
         // Map reads to assembly
         MAPPING(assembly_ch.join(filtered_ch))
```
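The `collectFile` operator above concatenates each sample's optional `Failed-assemblies.tsv` into one table under `params.assemblies_dir` (with `cache: false` so the merged file is rebuilt on `-resume`). A rough plain-shell equivalent of that aggregation, with all paths and sample names made up for illustration:

```shell
# Create two hypothetical per-sample failure records, then merge them the
# way collectFile merges the per-sample Failed-assemblies.tsv files.
mkdir -p /tmp/per_sample /tmp/assemblies
printf 'sampleA\tassembly produced no contigs\n' > /tmp/per_sample/sampleA-failed.tsv
printf 'sampleC\tassembly produced no contigs\n' > /tmp/per_sample/sampleC-failed.tsv

cat /tmp/per_sample/*-failed.tsv > /tmp/assemblies/Failed-assemblies.tsv
wc -l < /tmp/assemblies/Failed-assemblies.tsv   # 2 rows, one per failed sample
```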

Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/create_runsheet.nf

Lines changed: 11 additions & 10 deletions

```diff
@@ -1,36 +1,37 @@
 #!/usr/bin/env nextflow
 nextflow.enable.dsl = 2
 
-//params.GLDS_accession = "OSD-574"
+//params.accession = "OSD-574"
 //params.RawFilePattern = null // Pattern of files on OSDR for the OSD accession you want to process
 
 process GET_RUNSHEET {
 
     beforeScript "chmod +x ${baseDir}/bin/create_runsheet.sh"
+    tag "Downloading raw fastq files and runsheet for ${accession}..."
 
     input:
-        val(GLDS_accession)
+        val(accession)
     output:
         path("a_*metagenomic*.txt"), emit: assay_TABLE
         path("*.zip"), emit: zip
         path("GLfile.csv"), emit: input_file
         path("versions.txt"), emit: version
     script:
         """
-        # Download ISA zip file for the GLDS_accession then unzip it
-        GL-download-GLDS-data -g ${GLDS_accession} -p ISA -f && unzip *-ISA.zip
+        # Download ISA zip file for the GLDS/OSD accession then unzip it
+        GL-download-GLDS-data -g ${accession} -p ISA -f && unzip *-ISA.zip
 
         if [ ${params.RawFilePattern} == null ];then
 
            # Attempt to download the sequences using the assay table, if that fails then
            # attempt retrieving all fastq.gz files
-           GL-download-GLDS-data -f -g ${GLDS_accession} -a a_*metagenomic*.txt -o Raw_Sequence_Data || \\
-           GL-download-GLDS-data -f -g ${GLDS_accession} -p ".fastq.gz" -o Raw_Sequence_Data
+           GL-download-GLDS-data -f -g ${accession} -a a_*metagenomic*.txt -o Raw_Sequence_Data || \\
+           GL-download-GLDS-data -f -g ${accession} -p ".fastq.gz" -o Raw_Sequence_Data
 
         else
 
-           GL-download-GLDS-data -f -g ${GLDS_accession} -p ${params.RawFilePattern} -o Raw_Sequence_Data
+           GL-download-GLDS-data -f -g ${accession} -p ${params.RawFilePattern} -o Raw_Sequence_Data
 
         fi
@@ -39,8 +40,8 @@ process GET_RUNSHEET {
         grep '+' *wanted-file-download-commands.sh | \\
         sort -u | \\
         awk '{gsub(/\\+/,"%2B", \$NF);print}' \\
-        > plus_containing_${GLDS_accession}-wanted-file-download-commands.sh
-        cat plus_containing_${GLDS_accession}-wanted-file-download-commands.sh | parallel -j $task.cpus
+        > plus_containing_${accession}-wanted-file-download-commands.sh
+        cat plus_containing_${accession}-wanted-file-download-commands.sh | parallel -j $task.cpus
         fi
 
         # Create runsheet from the assay table
@@ -52,7 +53,7 @@ process GET_RUNSHEET {
 
 workflow {
 
-    GET_RUNSHEET(params.GLDS_accession)
+    GET_RUNSHEET(params.accession)
     file_ch = GET_RUNSHEET.out.input_file
                   .splitCsv(header:true)
```

Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/read_mapping.nf

Lines changed: 2 additions & 0 deletions

```diff
@@ -30,6 +30,7 @@ process MAPPING {
         else
 
             touch ${sample_id}.sam
+            echo "Mapping not performed for ${sample_id} because the assembly didn't produce anything." > ${sample_id}-mapping-info.txt
             printf "Mapping not performed for ${sample_id} because the assembly didn't produce anything.\\n"
 
         fi
@@ -48,6 +49,7 @@ process MAPPING {
         else
 
             touch ${sample_id}.sam
+            echo "Mapping not performed for ${sample_id} because the assembly didn't produce anything." > ${sample_id}-mapping-info.txt
             printf "Mapping not performed for ${sample_id} because the assembly didn't produce anything.\\n"
 
         fi
```
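Both branches now leave a `${sample_id}-mapping-info.txt` breadcrumb next to the empty placeholder `.sam`, so the skipped mapping is recorded in a file as well as on the console. A standalone sketch of that fallback; the `sampleA` id and the emptiness test are illustrative, as the actual guard condition sits above these hunks in the module:

```shell
# Work in a throwaway directory so nothing collides with real files.
cd "$(mktemp -d)"
sample_id="sampleA"

# Assumed guard: no non-empty assembly fasta exists for this sample.
if [ ! -s "${sample_id}-assembly.fasta" ]; then
    # Empty placeholder so downstream channels still receive a .sam file.
    touch "${sample_id}.sam"
    # Record why mapping was skipped, for curators and logs.
    echo "Mapping not performed for ${sample_id} because the assembly didn't produce anything." \
        > "${sample_id}-mapping-info.txt"
fi
```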

Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/nextflow.config

Lines changed: 7 additions & 0 deletions

```diff
@@ -327,6 +327,13 @@ process {
         publishDir = [path: params.logs_dir, pattern: "*-assembly.log", mode: params.publishDir_mode]
     }
 
+    withName: RENAME_HEADERS {
+
+        publishDir = [path: params.assemblies_dir, pattern: "*-assembly.fasta", mode: params.publishDir_mode]
+
+    }
+
+
     withLabel: mapping {
         conda = {params.conda.mapping != null ? params.conda.mapping : "envs/mapping.yaml"}
         cpus = 8
```
