From 95b29a75fae59379155aea6767b6d68ae03755b4 Mon Sep 17 00:00:00 2001 From: olabiyi Date: Wed, 1 May 2024 15:16:49 -0700 Subject: [PATCH 01/48] first commit --- .../SW_MGIllumina/workflow_code/PE_file.csv | 3 + .../SW_MGIllumina/workflow_code/SE_file.csv | 3 + .../SW_MGIllumina/workflow_code/Snakefile | 1925 ----------------- .../workflow_code/bin/clean-paths.sh | 20 + .../combine-all-gene-tables.py | 0 ...evel-coverages-annots-and-tax-per-group.py | 0 .../workflow_code/bin/create_runsheet.py | 513 +++++ .../workflow_code/bin/download-db.sh | 19 + .../format-contig-tax-classifications.sh | 0 .../format-gene-tax-classifications.sh | 0 .../generate-assembly-based-overview-table.sh | 0 .../workflow_code/bin/get-cov-and-depth.sh | 67 + .../bin/get_MAGs_estimates_and_taxonomy.sh | 26 + .../{scripts => bin}/parse-MAG-annots.py | 0 .../{scripts => bin}/swap-MAG-IDs.py | 0 .../SW_MGIllumina/workflow_code/config.yaml | 258 --- .../workflow_code/config/bbtools_adapters.fa | 317 +++ .../SW_MGIllumina/workflow_code/envs/cat.yaml | 1 + .../SW_MGIllumina/workflow_code/main.nf | 208 ++ .../workflow_code/modules/assembly.nf | 92 + .../modules/assembly_annotation.nf | 195 ++ .../modules/assembly_based_processing.nf | 132 ++ .../workflow_code/modules/binning.nf | 85 + .../modules/combine_contig_annotation.nf | 202 ++ .../workflow_code/modules/coverage.nf | 80 + .../workflow_code/modules/create_runsheet.nf | 32 + .../modules/database_creation.nf | 254 +++ .../modules/quality_assessment.nf | 116 + .../modules/read_based_processing.nf | 269 +++ .../workflow_code/modules/read_mapping.nf | 84 + .../workflow_code/modules/summarize_MAG.nf | 343 +++ .../summarize_assembly-based_processing.nf | 40 + .../workflow_code/modules/summarize_bins.nf | 155 ++ .../workflow_code/nextflow.config | 410 ++++ .../scripts/combine-benchmarks.sh | 18 - .../workflow_code/scripts/slurm-status.py | 17 - .../workflow_code/slurm_submit.slurm | 63 + 37 files changed, 3729 insertions(+), 2218 deletions(-) create mode 100644 Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/PE_file.csv create mode 100644 Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/SE_file.csv delete mode 100644 Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/Snakefile create mode 100755 Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/clean-paths.sh rename Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/{scripts => bin}/combine-all-gene-tables.py (100%) rename Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/{scripts => bin}/combine-gene-level-coverages-annots-and-tax-per-group.py (100%) create mode 100755 Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/create_runsheet.py create mode 100644 Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/download-db.sh rename Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/{scripts => bin}/format-contig-tax-classifications.sh (100%) rename Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/{scripts => bin}/format-gene-tax-classifications.sh (100%) rename Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/{scripts => bin}/generate-assembly-based-overview-table.sh (100%) create mode 100755 Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/get-cov-and-depth.sh create mode 100755 
Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/get_MAGs_estimates_and_taxonomy.sh rename Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/{scripts => bin}/parse-MAG-annots.py (100%) rename Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/{scripts => bin}/swap-MAG-IDs.py (100%) delete mode 100644 Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/config.yaml create mode 100644 Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/config/bbtools_adapters.fa create mode 100644 Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/main.nf create mode 100644 Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/assembly.nf create mode 100644 Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/assembly_annotation.nf create mode 100644 Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/assembly_based_processing.nf create mode 100644 Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/binning.nf create mode 100644 Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/combine_contig_annotation.nf create mode 100644 Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/coverage.nf create mode 100644 Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/create_runsheet.nf create mode 100644 Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/database_creation.nf create mode 100644 Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/quality_assessment.nf create mode 100644 Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/read_based_processing.nf create mode 100644 Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/read_mapping.nf create mode 100644 Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/summarize_MAG.nf create mode 100644 Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/summarize_assembly-based_processing.nf create mode 100644 Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/summarize_bins.nf create mode 100644 Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/nextflow.config delete mode 100755 Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/combine-benchmarks.sh delete mode 100755 Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/slurm-status.py create mode 100644 Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/slurm_submit.slurm diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/PE_file.csv b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/PE_file.csv new file mode 100644 index 00000000..53f4c6fe --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/PE_file.csv @@ -0,0 +1,3 @@ +sample_id,forward,reverse,paired +Sample-1,/global/data/Data_Processing/Metagenomics_Datasets/GLDS_Datasets/test/example-metagenomic-reads/Sample-1_R1_raw.fastq.gz,/global/data/Data_Processing/Metagenomics_Datasets/GLDS_Datasets/test/example-metagenomic-reads/Sample-1_R2_raw.fastq.gz,true 
+Sample-2,/global/data/Data_Processing/Metagenomics_Datasets/GLDS_Datasets/test/example-metagenomic-reads/Sample-2_R1_raw.fastq.gz,/global/data/Data_Processing/Metagenomics_Datasets/GLDS_Datasets/test/example-metagenomic-reads/Sample-2_R2_raw.fastq.gz,true diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/SE_file.csv b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/SE_file.csv new file mode 100644 index 00000000..fa3e269c --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/SE_file.csv @@ -0,0 +1,3 @@ +sample_id,forward,paired +Sample-1,/global/data/Data_Processing/Metagenomics_Datasets/GLDS_Datasets/test/example-metagenomic-reads/Sample-1_R1_raw.fastq.gz,false +Sample-2,/global/data/Data_Processing/Metagenomics_Datasets/GLDS_Datasets/test/example-metagenomic-reads/Sample-2_R1_raw.fastq.gz,false diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/Snakefile b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/Snakefile deleted file mode 100644 index 17eb6870..00000000 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/Snakefile +++ /dev/null @@ -1,1925 +0,0 @@ -############################################################################################ -## Snakefile for GeneLab Illumina metagenomics processing workflow ## -## Developed by Michael D. Lee (Mike.Lee@nasa.gov) ## -## Version 2.0.4 ## -############################################################################################ - -import os - -configfile: "config.yaml" - - -######################################## -############# General Info ############# -######################################## - -""" -See the corresponding 'config.yaml' file for general use information. -Variables that may need to be adjusted should usually be changed there, not here. 
-""" - - -######################################## -####### Assay-specific GL suffix ####### -######################################## - -assay_suffix = "_GLmetagenomics" - - -######################################## -######## Some colors and helpers ####### -######################################## - -tty_colors = { - 'green' : '\033[0;32m%s\033[0m', - 'yellow' : '\033[0;33m%s\033[0m', - 'red' : '\033[0;31m%s\033[0m' -} - -def color_text(text, color='green'): - if sys.stdout.isatty(): - return(tty_colors[color] % text) - else: - return(text) - - -######################################## -#### Reading samples file into list #### -######################################## - -sample_IDs_file = config["sample_info_file"] -sample_ID_list = [line.strip() for line in open(sample_IDs_file)] - -# making sure there are all unique names -if len(set(sample_ID_list)) != len(sample_ID_list): - - print(color_text("\n Not all sample IDs in the " + str(config["sample_info_file"]) + " file are unique :(\n", "yellow")) - print(" Exiting for now.\n") - exit(1) - -######################################## -######## Setting up directories ######## -######################################## - -if config["workflow"] == "both": - - dirs_to_create = [config["fastqc_out_dir"], config["filtered_reads_dir"], config["assembly_based_dir"], - config["read_based_dir"], config["assemblies_dir"], config["genes_dir"], - config["annotations_and_tax_dir"], config["mapping_dir"], config["combined_output_dir"], - config["bins_dir"], config["MAGs_dir"], config["logs_dir"], "benchmarks"] - -elif config["workflow"] == "assembly-based": - - dirs_to_create = [config["fastqc_out_dir"], config["filtered_reads_dir"], config["assembly_based_dir"], - config["assemblies_dir"], config["genes_dir"], - config["annotations_and_tax_dir"], config["mapping_dir"], config["combined_output_dir"], - config["bins_dir"], config["MAGs_dir"], config["logs_dir"], "benchmarks"] - - -elif config["workflow"] == "read-based": - - dirs_to_create = [config["fastqc_out_dir"], config["filtered_reads_dir"], config["read_based_dir"], - config["MAGs_dir"], config["logs_dir"], "benchmarks"] - -else: - - print(color_text("\n The 'workflow' variable in the config.yaml file needs to be one of 'assembly-based', 'read-based', or 'both'.", "yellow")) - print("\n Exiting for now.\n") - - exit(1) - -for dir in dirs_to_create: - try: - os.mkdir(dir) - except: - pass - - -######################################## -############# Rules start ############## -######################################## -# all rule depends on if assembly-based, read-based, or both - -if config["workflow"] == "both": - - rule all: - input: - config["combined_output_dir"] + config["additional_filename_prefix"] + f"Combined-gene-level-KO-function-coverages{assay_suffix}.tsv", - config["combined_output_dir"] + config["additional_filename_prefix"] + f"Combined-gene-level-taxonomy-coverages{assay_suffix}.tsv", - config["combined_output_dir"] + config["additional_filename_prefix"] + f"Combined-contig-level-taxonomy-coverages{assay_suffix}.tsv", - config["MAGs_dir"] + config["additional_filename_prefix"] + f"MAGs-overview{assay_suffix}.tsv", - config["MAGs_dir"] + config["additional_filename_prefix"] + f"MAG-KEGG-Decoder-out{assay_suffix}.tsv", - config["bins_dir"] + config["additional_filename_prefix"] + f"bins-overview{assay_suffix}.tsv", - config["assemblies_dir"] + config["additional_filename_prefix"] + f"assembly-summaries{assay_suffix}.tsv", - config["assembly_based_dir"] + 
config["additional_filename_prefix"] + f"Assembly-based-processing-overview{assay_suffix}.tsv", - config["fastqc_out_dir"] + config["additional_filename_prefix"] + f"raw_multiqc{assay_suffix}_report.zip", - config["fastqc_out_dir"] + config["additional_filename_prefix"] + f"filtered_multiqc{assay_suffix}_report.zip", - config["read_based_dir"] + config["additional_filename_prefix"] + f"Gene-families-cpm{assay_suffix}.tsv", - config["read_based_dir"] + config["additional_filename_prefix"] + f"Gene-families-KO-cpm{assay_suffix}.tsv", - config["read_based_dir"] + config["additional_filename_prefix"] + f"Metaphlan-taxonomy{assay_suffix}.tsv" - shell: - """ - bash scripts/combine-benchmarks.sh - """ - -elif config["workflow"] == "assembly-based": - - rule all: - input: - config["combined_output_dir"] + config["additional_filename_prefix"] + f"Combined-gene-level-KO-function-coverages{assay_suffix}.tsv", - config["combined_output_dir"] + config["additional_filename_prefix"] + f"Combined-gene-level-taxonomy-coverages{assay_suffix}.tsv", - config["combined_output_dir"] + config["additional_filename_prefix"] + f"Combined-contig-level-taxonomy-coverages{assay_suffix}.tsv", - config["MAGs_dir"] + config["additional_filename_prefix"] + f"MAGs-overview{assay_suffix}.tsv", - config["MAGs_dir"] + config["additional_filename_prefix"] + f"MAG-KEGG-Decoder-out{assay_suffix}.tsv", - config["bins_dir"] + config["additional_filename_prefix"] + f"bins-overview{assay_suffix}.tsv", - config["assemblies_dir"] + config["additional_filename_prefix"] + f"assembly-summaries{assay_suffix}.tsv", - config["assembly_based_dir"] + config["additional_filename_prefix"] + f"Assembly-based-processing-overview{assay_suffix}.tsv", - config["fastqc_out_dir"] + config["additional_filename_prefix"] + f"raw_multiqc{assay_suffix}_report.zip", - config["fastqc_out_dir"] + config["additional_filename_prefix"] + f"filtered_multiqc{assay_suffix}_report.zip" - shell: - """ - bash scripts/combine-benchmarks.sh - """ - -elif config["workflow"] == "read-based": - - rule all: - input: - config["fastqc_out_dir"] + config["additional_filename_prefix"] + f"raw_multiqc{assay_suffix}_report.zip", - config["fastqc_out_dir"] + config["additional_filename_prefix"] + f"filtered_multiqc{assay_suffix}_report.zip", - config["read_based_dir"] + config["additional_filename_prefix"] + f"Gene-families-cpm{assay_suffix}.tsv", - config["read_based_dir"] + config["additional_filename_prefix"] + f"Gene-families-KO-cpm{assay_suffix}.tsv", - config["read_based_dir"] + config["additional_filename_prefix"] + f"Metaphlan-taxonomy{assay_suffix}.tsv" - shell: - """ - bash scripts/combine-benchmarks.sh - """ - - -rule summarize_MAG_KO_annots_with_KEGG_Decoder: - conda: - "envs/keggdecoder.yaml" - input: - config["MAGs_dir"] + config["additional_filename_prefix"] + f"MAG-level-KO-annotations{assay_suffix}.tsv" - params: - MAGs_dir = config["MAGs_dir"], - mod_input_annotations = config["MAGs_dir"] + "mod-MAG-level-KO-annotations.tmp", - temp_output = config["MAGs_dir"] + "MAG-KEGG-Decoder-out.tmp", - mapping_file = config["MAGs_dir"] + "MAG-ID-map.tmp", - orig_html_output = config["MAGs_dir"] + "MAG-KEGG-Decoder-out.html", - final_html_output = config["MAGs_dir"] + config["additional_filename_prefix"] + f"MAG-KEGG-Decoder-out{assay_suffix}.html" - output: - config["MAGs_dir"] + config["additional_filename_prefix"] + f"MAG-KEGG-Decoder-out{assay_suffix}.tsv" - benchmark: - "benchmarks/summarize_MAG_KO_annots_with_KEGG_Decoder-benchmarks.tsv" - shell: - """ - # getting 
number of MAGs recovered - num_mags_recovered=$(find {params.MAGs_dir} -name "*.fasta" | wc -l | sed 's/^ *//') - # only running if any MAGs were recovered - if [ ${{num_mags_recovered}} -gt 0 ]; then - - # KEGGDecoder splits on the first underscore to identify unique genome/MAG IDs - # this can be problematic with how things are named, so we are swapping them all to not have - # any "_" first, then afterwards we are changing the output table back to the original names so - # they match elsewhere (they will still be slightly different in the html output, but that is - # only manually explored anyway) - - # making version of input for KEGGDecoder with no underscores - tr "_" "-" < {input} > {params.mod_input_annotations} - - # making mapping file - paste <( cut -f 1 {input} ) <( cut -f 1 {params.mod_input_annotations} ) > {params.mapping_file} - - - # running KEGGDecoder - # can only create html output if there are more than 1 - if [ ${{num_mags_recovered}} -gt 1 ]; then - KEGG-decoder -v interactive -i {params.mod_input_annotations} -o {params.temp_output} - ## adding additional prefix to html output if there is one - if [ {params.orig_html_output} != {params.final_html_output} ]; then - mv {params.orig_html_output} {params.final_html_output} - fi - else - KEGG-decoder -i {params.mod_input_annotations} -o {params.temp_output} - fi - - - # swapping MAG IDs back in output tsv from KEGGDecoder - python scripts/swap-MAG-IDs.py -i {params.temp_output} -m {params.mapping_file} -o {output} - - # removing intermediate files - rm {params.mod_input_annotations} {params.mapping_file} {params.temp_output} - - else - - printf "There were no MAGs recovered.\n" > {output} - - fi - """ - - -rule summarize_MAG_level_KO_annotations: - input: - MAG_overview = config["MAGs_dir"] + config["additional_filename_prefix"] + f"MAGs-overview{assay_suffix}.tsv", - trigger = expand(config["annotations_and_tax_dir"] + "{ID}-gene-coverage-annotation-and-tax.tsv", ID = sample_ID_list) - params: - MAGs_dir = config["MAGs_dir"], - annot_and_tax_dir = config["annotations_and_tax_dir"], - tmp_contig_IDs = "curr-contig-ids.tmp" - output: - config["MAGs_dir"] + config["additional_filename_prefix"] + f"MAG-level-KO-annotations{assay_suffix}.tsv" - benchmark: - "benchmarks/summarize_MAG_level_KO_annotations-benchmarks.tsv" - shell: - """ - # only running if any MAGs were recovered - if [ $(find {params.MAGs_dir} -name "*.fasta" | wc -l | sed 's/^ *//') -gt 0 ]; then - - for MAG in $(cut -f 1 {input.MAG_overview} | tail -n +2) - do - - sample_ID=$(echo $MAG | sed 's/-MAG-[0-9]*$//') - grep "^>" {params.MAGs_dir}${{MAG}}.fasta | tr -d ">" > {params.tmp_contig_IDs} - - python scripts/parse-MAG-annots.py -i {params.annot_and_tax_dir}${{sample_ID}}-gene-coverage-annotation-and-tax.tsv -w {params.tmp_contig_IDs} -M ${{MAG}} -o {output} - rm {params.tmp_contig_IDs} - - done - - else - - printf "There were no MAGs recovered.\n" > {output} - - fi - """ - - -rule generate_assembly_processing_overview_table: - input: - trigger = config["MAGs_dir"] + config["additional_filename_prefix"] + f"MAGs-overview{assay_suffix}.tsv" - params: - sample_IDs_file = config["sample_info_file"], - assemblies_dir = config["assemblies_dir"], - genes_dir = config["genes_dir"], - mapping_dir = config["mapping_dir"], - bins_dir = config["bins_dir"], - MAGs_dir = config["MAGs_dir"], - assembly_summaries = config["bins_dir"] + config["additional_filename_prefix"] + "bin-assembly-summaries.tsv", - checkm_results = config["bins_dir"] + 
config["additional_filename_prefix"] + "bins-checkm-out.tsv" - output: - config["assembly_based_dir"] + config["additional_filename_prefix"] + f"Assembly-based-processing-overview{assay_suffix}.tsv" - benchmark: - "benchmarks/generate_assembly_processing_overview_table-benchmarks.tsv" - shell: - """ - bash scripts/generate-assembly-based-overview-table.sh {params.sample_IDs_file} {params.assemblies_dir} {params.genes_dir} {params.mapping_dir} {params.bins_dir} {params.MAGs_dir} {output} - # removing intermediate files from assembly-based process - rm -rf {params.assembly_summaries} {params.checkm_results} - """ - - -rule generate_MAGs_overview_table: - input: - assembly_summaries = config["MAGs_dir"] + config["additional_filename_prefix"] + "MAG-assembly-summaries.tsv", - checkm_results = config["MAGs_dir"] + config["additional_filename_prefix"] + "MAGs-checkm-out.tsv", - gtdb_done_trigger = config["MAGs_dir"] + "gtdbtk-out" - params: - gtdb_results = config["MAGs_dir"] + "gtdbtk-out/gtdbtk.*.summary.tsv", - checkm_tmp = config["MAGs_dir"] + "checkm-estimates.tmp", - gtdb_tmp = config["MAGs_dir"] + "gtdb-taxonomies.tmp", - checkm_w_header_tmp = config["MAGs_dir"] + "checkm-estimates-with-headers.tmp", - gtdb_w_header_tmp = config["MAGs_dir"] + "gtdb-taxonomies-with-headers.tmp", - overview_tmp = config["MAGs_dir"] + "MAGs-overview.tmp", - overview_header_tmp = config["MAGs_dir"] + "MAGs-overview-header.tmp", - overview_sorted_tmp = config["MAGs_dir"] + "MAGs-overview-sorted.tmp", - MAGs_dir = config["MAGs_dir"] - output: - config["MAGs_dir"] + config["additional_filename_prefix"] + f"MAGs-overview{assay_suffix}.tsv" - benchmark: - "benchmarks/generate_MAGs_overview_table-benchmarks.tsv" - shell: - """ - # only running if any MAGs were recovered - if [ $(find {params.MAGs_dir} -name "*.fasta" | wc -l | sed 's/^ *//') -gt 0 ]; then - - # making sure none of the intermediate files exist already - rm -rf {params.checkm_tmp} {params.gtdb_tmp} {params.checkm_w_header_tmp} {params.gtdb_w_header_tmp} {params.overview_tmp} {params.overview_header_tmp} {params.overview_sorted_tmp} - - for MAG in $(cut -f 1 {input.assembly_summaries} | tail -n +2) - do - - grep -w -m 1 "^${{MAG}}" {input.checkm_results} | cut -f 12,13,14 >> {params.checkm_tmp} - grep -w "^${{MAG}}" {params.gtdb_results} | cut -f 2 | sed 's/^.__//' | sed $'s/;.__/\t/g' | awk -F $'\\t' ' BEGIN {{ OFS=FS }} {{ for (i=1; i<=NF; i++) if ( $i ~/^ *$/) $i = "NA" }}; 1 ' >> {params.gtdb_tmp} - - done - - # adding headers - cat <(printf "est. completeness\\test. redundancy\\test. 
strain heterogeneity\\n") {params.checkm_tmp} > {params.checkm_w_header_tmp} - cat <(printf "domain\\tphylum\\tclass\\torder\\tfamily\\tgenus\\tspecies\\n") {params.gtdb_tmp} > {params.gtdb_w_header_tmp} - - paste {input.assembly_summaries} {params.checkm_w_header_tmp} {params.gtdb_w_header_tmp} > {params.overview_tmp} - - # ordering by taxonomy - head -n 1 {params.overview_tmp} > {params.overview_header_tmp} - tail -n +2 {params.overview_tmp} | sort -t $'\\t' -k 14,20 > {params.overview_sorted_tmp} - - cat {params.overview_header_tmp} {params.overview_sorted_tmp} > {output} - - rm -rf {params.checkm_tmp} {params.gtdb_tmp} {params.checkm_w_header_tmp} {params.gtdb_w_header_tmp} {params.overview_tmp} {params.overview_header_tmp} {params.overview_sorted_tmp} {input} - - else - - rm -rf {params.MAGs_dir}* - - printf "There were no MAGs recovered.\n" > {output} - - fi - """ - - -rule summarize_MAG_assemblies: - """ summarize MAG assemblies """ - - conda: - "envs/bit.yaml" - input: - trigger = config["MAGs_dir"] + config["additional_filename_prefix"] + "MAGs-checkm-out.tsv" - params: - intermediate_file = config["MAGs_dir"] + "MAG-summaries.tmp", - MAGs_dir = config["MAGs_dir"] - output: - config["MAGs_dir"] + config["additional_filename_prefix"] + "MAG-assembly-summaries.tsv" - benchmark: - "benchmarks/summarize_MAG_assemblies-benchmarks.tsv" - shell: - """ - # only running if any MAGs were recovered - if [ $(find {params.MAGs_dir} -name "*.fasta" | wc -l | sed 's/^ *//') -gt 0 ]; then - - bit-summarize-assembly {params.MAGs_dir}*.fasta -o {params.intermediate_file} -t - - # slimming down the output - cut -f 1,2,3,5,6,8,11,18,19,20 {params.intermediate_file} > {output} - rm {params.intermediate_file} - - else - - printf "There were no MAGs recovered.\n" > {output} - - fi - """ - - -rule gtdbtk_on_MAGs: - """ assign taxonomy to MAGs with gtdb-tk """ - - conda: - "envs/gtdb-tk.yaml" - input: - trigger = config["MAGs_dir"] + config["additional_filename_prefix"] + "MAGs-checkm-out.tsv", - gtdbtk_db_trigger = config["REF_DB_ROOT_DIR"] + config["GTDB_DATA_PATH"] + "/" + config["GTDB_TRIGGER_FILE"] - params: - MAGs_dir = config["MAGs_dir"], - gtdbtk_db_dir = config["REF_DB_ROOT_DIR"] + config["GTDB_DATA_PATH"], - pplacer_cpus = config["gtdb_tk_checkm_pplacer_cpus"], - gtdb_tk_scratch_location = config["gtdb_tk_scratch_location"] - output: - directory(config["MAGs_dir"] + "gtdbtk-out") - resources: - cpus = config["gtdb_tk_num_cpus"], - mem_mb = config["gtdbtk_memory_resources"] - log: - config["logs_dir"] + "gtdbtk-run.log" - benchmark: - "benchmarks/run_gtdbtk_on_MAGs-with-1-pplacer-cpu-benchmarks.tsv" - shell: - """ - # making sure database variable is set properly (can be off if using previous db location with new gtdb-tk conda env) - # this runs if the exit status of seeking the help menu isn't 0 (e.g. gtdb-tk tells us something is wrong with where it's looking for the ref db) - if ! 
gtdbtk -h > /dev/null; then - # adding wanted location to this conda env PATH (gtdb-tk looks in the GTDBTK_DATA_PATH variable), - # so will be set when the conda environment is started from now on - mkdir -p ${{CONDA_PREFIX}}/etc/conda/activate.d/ - echo 'export GTDBTK_DATA_PATH={params.gtdbtk_db_dir}' >> ${{CONDA_PREFIX}}/etc/conda/activate.d/set_env_vars.sh - - # but still needs to be set for this particular session that is downloading and setting up the db - GTDBTK_DATA_PATH={params.gtdbtk_db_dir} - fi - - # only running if any MAGs were recovered - if [ $(find {params.MAGs_dir} -name "*.fasta" | wc -l | sed 's/^ *//') -gt 0 ]; then - - if [ "{params.gtdb_tk_scratch_location}" != "" ]; then - - gtdbtk classify_wf --scratch_dir {params.gtdb_tk_scratch_location} --genome_dir {params.MAGs_dir} -x fasta --out_dir {output} --cpus {resources.cpus} --pplacer_cpus {params.pplacer_cpus} > {log} 2>&1 - - else - - gtdbtk classify_wf --genome_dir {params.MAGs_dir} -x fasta --out_dir {output} --cpus {resources.cpus} --pplacer_cpus {params.pplacer_cpus} > {log} 2>&1 - - fi - - else - - mkdir -p {output} - printf "There were no MAGs recovered.\n" > {params.MAGs_dir}No-MAGs-recovered.txt - printf "\n\nThere were no MAGs recovered, so GTDB-tk was not run.\n\n" > {log} - - fi - """ - - - -rule filter_checkm_results_and_copy_MAGs: - """ - Filters checkm results based on est. completion, redundancy, and strain heterogeneity set in 'config.yaml' - Defaults are conservatively 90, 10, and 50 - """ - - input: - config["bins_dir"] + config["additional_filename_prefix"] + "bins-checkm-out.tsv" - output: - config["MAGs_dir"] + config["additional_filename_prefix"] + "MAGs-checkm-out.tsv" - params: - bins_dir = config["bins_dir"], - MAGs_dir = config["MAGs_dir"], - tmp_file = config["MAGs_dir"] + "MAGs-checkm-out.tmp", - min_est_comp = config["minimum_estimated_completion"], - max_est_redund = config["maximum_estimated_redundancy"], - max_est_strain_het = config["maximum_estimated_strain_heterogeneity"] - benchmark: - "benchmarks/filtering_checkm_results_and_copying_MAGs-benchmarks.tsv" - shell: - """ - # only running if there were bins recovered - if [ $(find {params.bins_dir} -name "*.fasta" | wc -l | sed 's/^ *//') -gt 0 ]; then - - cat <( printf "Bin Id\tMarker lineage\t# genomes\t# markers\t# marker sets\t0\t1\t2\t3\t4\t5+\tCompleteness\tContamination\tStrain heterogeneity\n" ) \ - <( awk -F $'\\t' ' $12 >= {params.min_est_comp} && $13 <= {params.max_est_redund} && $14 <= {params.max_est_strain_het} ' {input} ) > {params.tmp_file} - - sed 's/-bin\./-MAG-/' {params.tmp_file} > {output} - - for MAG in $(cut -f 1 {params.tmp_file} | tail -n +2) - do - new_ID=$(echo $MAG | sed 's/-bin\./-MAG-/') - cp {params.bins_dir}${{MAG}}.fasta {params.MAGs_dir}${{new_ID}}.fasta - done - - rm {params.tmp_file} - - else - - printf "There were no MAGs recovered.\n" > {output} - - fi - """ - - -rule generate_bins_overview_table: - input: - assembly_summaries = config["bins_dir"] + config["additional_filename_prefix"] + "bin-assembly-summaries.tsv", - checkm_results = config["bins_dir"] + config["additional_filename_prefix"] + "bins-checkm-out.tsv", - timing_trigger = config["MAGs_dir"] + config["additional_filename_prefix"] + "MAGs-checkm-out.tsv" - params: - checkm_tmp = config["bins_dir"] + "checkm-estimates.tmp", - checkm_w_header_tmp = config["bins_dir"] + "checkm-estimates-with-headers.tmp", - bins_dir = config["bins_dir"] - output: - config["bins_dir"] + config["additional_filename_prefix"] + 
f"bins-overview{assay_suffix}.tsv" - benchmark: - "benchmarks/generate_bins_overview_table-benchmarks.tsv" - shell: - """ - # only running if there were bins recovered - if [ $(find {params.bins_dir} -name "*.fasta" | wc -l | sed 's/^ *//') -gt 0 ]; then - - # making sure none of the intermediate files exist already - rm -rf {params.checkm_tmp} {params.checkm_w_header_tmp} - - for bin in $(cut -f 1 {input.assembly_summaries} | tail -n +2) - do - - grep -w -m 1 "^${{bin}}" {input.checkm_results} | cut -f 12,13,14 >> {params.checkm_tmp} - - done - - # adding header - cat <(printf "est. completeness\\test. redundancy\\test. strain heterogeneity\\n") {params.checkm_tmp} > {params.checkm_w_header_tmp} - - # combining - paste {input.assembly_summaries} {params.checkm_w_header_tmp} > {output} - - rm -rf {params.checkm_tmp} {params.checkm_w_header_tmp} - - else - - rm -rf {params.bins_dir}* - printf "There were no bins recovered.\n" > {output} - - fi - """ - - -rule checkm_on_bins: - """ runs checkm on recovered bins """ - - conda: - "envs/checkm.yaml" - input: - trigger = expand(config["mapping_dir"] + "{ID}-metabat-assembly-depth.tsv", ID = sample_ID_list) - params: - bins_dir = config["bins_dir"], - tmp_output_dir = config["bins_dir"] + "checkm-out-tmp/", - tmp_working_dir = config["bins_dir"] + "checkm-working-tmp/", - num_threads = config["gtdb_tk_checkm_pplacer_cpus"], - reduced_tree = config["reduced_tree"] - resources: - cpus = config["num_cpus"], - mem_mb = config["checkm_memory_resources"] - output: - config["bins_dir"] + config["additional_filename_prefix"] + "bins-checkm-out.tsv" - log: - config["logs_dir"] + "checkm.log" - benchmark: - "benchmarks/run_checkm_on_bins-benchmarks.tsv" - shell: - """ - # only running if there were bins recovered - if [ $(find {params.bins_dir} -name "*fasta" | wc -l | sed 's/^ *//') -gt 0 ]; then - - mkdir -p {params.tmp_working_dir} - - if [ "{params.reduced_tree}" == True ]; then - - checkm lineage_wf -f {output} --tab_table -t {resources.cpus} --reduced_tree --pplacer_threads {params.num_threads} -x fasta {params.bins_dir} {params.tmp_output_dir} --tmpdir {params.tmp_working_dir} > {log} 2>&1 - - else - - checkm lineage_wf -f {output} --tab_table -t {resources.cpus} --pplacer_threads {params.num_threads} -x fasta {params.bins_dir} {params.tmp_output_dir} --tmpdir {params.tmp_working_dir} > {log} 2>&1 - - fi - - rm -rf {params.tmp_output_dir} {params.tmp_working_dir} - - else - - printf "There were no bins recovered, so checkm was not run.\n" > {output} - - fi - """ - - -rule summarize_bin_assemblies: - """ summarize bin assemblies """ - - conda: - "envs/bit.yaml" - input: - trigger = expand(config["mapping_dir"] + "{ID}-metabat-assembly-depth.tsv", ID = sample_ID_list) - params: - intermediate_file = config["bins_dir"] + "bin-summaries.tmp", - bins_dir = config["bins_dir"] - output: - config["bins_dir"] + config["additional_filename_prefix"] + "bin-assembly-summaries.tsv" - benchmark: - "benchmarks/summarize_bin_assemblies-benchmarks.tsv" - shell: - """ - # only running if any bins were recovered - if [ $(find {params.bins_dir} -name "*.fasta" | wc -l | sed 's/^ *//') -gt 0 ]; then - - bit-summarize-assembly {params.bins_dir}*.fasta -o {params.intermediate_file} -t - - # slimming down the output - cut -f 1,2,3,5,6,8,11,18,19,20 {params.intermediate_file} > {output} - rm {params.intermediate_file} - - else - - printf "There were no bins recovered.\n" > {output} - - fi - """ - - -rule metabat_binning: - """ - This rule runs metabat2 for binning contigs. 
- """ - - conda: - "envs/metabat.yaml" - input: - assembly = config["assemblies_dir"] + "{ID}-assembly.fasta", - bam = config["mapping_dir"] + "{ID}.bam" - params: - bins_dir = config["bins_dir"], - prefix = config["bins_dir"] + "{ID}-bin", - tmp_bins_file = "{ID}-bin-files.tmp", - tmp_rename_script = "{ID}-rename.tmp" - resources: - cpus = config["num_threads"] - output: - depth_file = config["mapping_dir"] + "{ID}-metabat-assembly-depth.tsv" - log: - config["logs_dir"] + "{ID}-bam-summarize-and-metabat.log" - benchmark: - "benchmarks/metabat_binning-{ID}-benchmarks.tsv" - shell: - """ - # only running if the assembly produced anything - if [ -s {input.assembly} ]; then - - jgi_summarize_bam_contig_depths --outputDepth {output.depth_file} --percentIdentity 97 --minContigLength 1000 --minContigDepth 1.0 --referenceFasta {input.assembly} {input.bam} > {log} 2>&1 - - # only running if there are contigs with coverage information in the coverage file we just generated - if [ $(wc -l {output.depth_file} | sed 's/^ *//' | cut -f 1 -d " ") -gt 1 ]; then - metabat2 --inFile {input.assembly} --outFile {params.prefix} --abdFile {output.depth_file} -t {resources.cpus} >> {log} 2>&1 - else - printf "\n\nThere was no coverage info generated in {output.depth_file}, so no binning with metabat was performed.\n\n" >> {log} - fi - - # changing extensions from .fa to .fasta to match nt fasta extension elsewhere in GeneLab - find {params.bins_dir} -name {wildcards.ID}*.fa > {params.tmp_bins_file} - - if [ -s {params.tmp_bins_file} ]; then - paste -d " " <( sed 's/^/mv /' {params.tmp_bins_file} ) <( sed 's/.fa/.fasta/' {params.tmp_bins_file} ) > {params.tmp_rename_script} - bash {params.tmp_rename_script} - fi - - rm -rf {params.tmp_bins_file} {params.tmp_rename_script} - - else - - touch {output} - printf "Binning not performed because the assembly didn't produce anything.\n" > {log} - - fi - """ - - -rule combine_read_based_processing_taxonomy: - """ - This rule includes final outputs from read-based functional annotation process as inputs even though they aren't used just so - we can delete those working directories when done with them here (ensuring the other processes are already done with them). - """ - conda: - "envs/humann3.yaml" - input: - in_files = expand(config["read_based_dir"] + "{ID}-humann3-out-dir/{ID}_metaphlan_bugs_list.tsv", ID = sample_ID_list), - trigger1 = config["read_based_dir"] + config["additional_filename_prefix"] + f"Gene-families-cpm{assay_suffix}.tsv", - trigger2 = config["read_based_dir"] + config["additional_filename_prefix"] + f"Gene-families-KO-cpm{assay_suffix}.tsv" - params: - dirs_to_remove = " ".join(expand(config["read_based_dir"] + "{ID}-humann3-out-dir/", ID = sample_ID_list)) - output: - config["read_based_dir"] + config["additional_filename_prefix"] + f"Metaphlan-taxonomy{assay_suffix}.tsv" - benchmark: - "benchmarks/combine_read_based_processing_taxonomy-benchmarks.tsv" - shell: - """ - merge_metaphlan_tables.py {input.in_files} > {output} 2> /dev/null - - # removing redundant text from headers (using the -i flag to keep it portable with darwin shell) - sed -i.tmp 's/_metaphlan_bugs_list//g' {output} - rm -rf {output}.tmp {params.dirs_to_remove} - """ - - -rule gen_read_based_processing_KO_table: - """ - This rule summarizes the read-based humann annotations based on Kegg Orthlogy terms. 
- """ - conda: - "envs/humann3.yaml" - input: - gene_families = config["read_based_dir"] + config["additional_filename_prefix"] + f"Gene-families{assay_suffix}.tsv" - output: - gene_families_KOs_cpm = config["read_based_dir"] + config["additional_filename_prefix"] + f"Gene-families-KO-cpm{assay_suffix}.tsv" - benchmark: - "benchmarks/gen_read_based_processing_KO_table-benchmarks.tsv" - shell: - """ - humann_regroup_table -i {input} -g uniref90_ko 2> /dev/null | humann_rename_table -n kegg-orthology 2> /dev/null | humann_renorm_table -o {output} --update-snames > /dev/null 2>&1 - """ - - -rule gen_normalized_read_based_processing_tables: - """ - This rule generates some normalized tables of the read-based functional outputs from - humann that are more readily suitable for across sample comparisons. - """ - conda: - "envs/humann3.yaml" - input: - gene_families = config["read_based_dir"] + config["additional_filename_prefix"] + f"Gene-families{assay_suffix}.tsv", - path_abundances = config["read_based_dir"] + config["additional_filename_prefix"] + f"Pathway-abundances{assay_suffix}.tsv" - output: - gene_families_cpm = config["read_based_dir"] + config["additional_filename_prefix"] + f"Gene-families-cpm{assay_suffix}.tsv", - path_abundances_cpm = config["read_based_dir"] + config["additional_filename_prefix"] + f"Pathway-abundances-cpm{assay_suffix}.tsv" - benchmark: - "benchmarks/gen_normalized_read_based_processing_tables-benchmarks.tsv" - shell: - """ - humann_renorm_table -i {input.gene_families} -o {output.gene_families_cpm} --update-snames > /dev/null 2>&1 - humann_renorm_table -i {input.path_abundances} -o {output.path_abundances_cpm} --update-snames > /dev/null 2>&1 - """ - - -rule split_read_based_processing_tables: - """ - The read-based functional annotation tables have taxonomic info and non-taxonomic info mixed - together initially. humann comes with utility scripts to split these. This rule does that, - generating non-taxonomically grouped functional info files and taxonomically grouped ones. 
- """ - conda: - "envs/humann3.yaml" - input: - gene_families = config["read_based_dir"] + config["additional_filename_prefix"] + "gene-families-initial.tsv", - path_abundances = config["read_based_dir"] + config["additional_filename_prefix"] + "pathway-abundances-initial.tsv", - path_coverages = config["read_based_dir"] + config["additional_filename_prefix"] + "pathway-coverages-initial.tsv" - params: - read_based_dir = config["read_based_dir"], - gene_families_initial_stratified = config["read_based_dir"] + config["additional_filename_prefix"] + "gene-families-initial_stratified.tsv", - gene_families_initial_unstratified = config["read_based_dir"] + config["additional_filename_prefix"] + "gene-families-initial_unstratified.tsv", - path_abundances_initial_stratified = config["read_based_dir"] + config["additional_filename_prefix"] + "pathway-abundances-initial_stratified.tsv", - path_abundances_initial_unstratified = config["read_based_dir"] + config["additional_filename_prefix"] + "pathway-abundances-initial_unstratified.tsv", - path_coverages_initial_stratified = config["read_based_dir"] + config["additional_filename_prefix"] + "pathway-coverages-initial_stratified.tsv", - path_coverages_initial_unstratified = config["read_based_dir"] + config["additional_filename_prefix"] + "pathway-coverages-initial_unstratified.tsv" - output: - gene_families = config["read_based_dir"] + config["additional_filename_prefix"] + f"Gene-families{assay_suffix}.tsv", - gene_families_grouped = config["read_based_dir"] + config["additional_filename_prefix"] + f"Gene-families-grouped-by-taxa{assay_suffix}.tsv", - path_abundances = config["read_based_dir"] + config["additional_filename_prefix"] + f"Pathway-abundances{assay_suffix}.tsv", - path_abundances_grouped = config["read_based_dir"] + config["additional_filename_prefix"] + f"Pathway-abundances-grouped-by-taxa{assay_suffix}.tsv", - path_coverages = config["read_based_dir"] + config["additional_filename_prefix"] + f"Pathway-coverages{assay_suffix}.tsv", - path_coverages_grouped = config["read_based_dir"] + config["additional_filename_prefix"] + f"Pathway-coverages-grouped-by-taxa{assay_suffix}.tsv", - benchmark: - "benchmarks/split_read_based_processing_tables-benchmarks.tsv" - shell: - """ - humann_split_stratified_table -i {input.gene_families} -o {params.read_based_dir} > /dev/null 2>&1 - mv {params.gene_families_initial_stratified} {output.gene_families_grouped} - mv {params.gene_families_initial_unstratified} {output.gene_families} - - humann_split_stratified_table -i {input.path_abundances} -o {params.read_based_dir} > /dev/null 2>&1 - mv {params.path_abundances_initial_stratified} {output.path_abundances_grouped} - mv {params.path_abundances_initial_unstratified} {output.path_abundances} - - humann_split_stratified_table -i {input.path_coverages} -o {params.read_based_dir} > /dev/null 2>&1 - mv {params.path_coverages_initial_stratified} {output.path_coverages_grouped} - mv {params.path_coverages_initial_unstratified} {output.path_coverages} - - rm {input} - """ - - -rule combine_read_based_processing_tables: - """ - This rule combines the read-based humann3 output functional tables from indiviual samples into single - tables across the GLDS dataset. 
- """ - conda: - "envs/humann3.yaml" - input: - gene_families = expand(config["read_based_dir"] + "{ID}-humann3-out-dir/{ID}_genefamilies.tsv", ID = sample_ID_list), - path_abundances = expand(config["read_based_dir"] + "{ID}-humann3-out-dir/{ID}_pathabundance.tsv", ID = sample_ID_list), - path_coverages = expand(config["read_based_dir"] + "{ID}-humann3-out-dir/{ID}_pathcoverage.tsv", ID = sample_ID_list) - params: - gene_fam_dir = config["read_based_dir"] + "gene-family-results/", - path_abund_dir = config["read_based_dir"] + "path-abundance-results/", - path_cov_dir = config["read_based_dir"] + "path-coverage-results/", - utilities_path = config["REF_DB_ROOT_DIR"] + config["HUMANN3_DBS_DIR"] + "/utility_mapping" - output: - gene_families = config["read_based_dir"] + config["additional_filename_prefix"] + "gene-families-initial.tsv", - path_abundances = config["read_based_dir"] + config["additional_filename_prefix"] + "pathway-abundances-initial.tsv", - path_coverages = config["read_based_dir"] + config["additional_filename_prefix"] + "pathway-coverages-initial.tsv" - benchmark: - "benchmarks/combine_read_based_processing_tables-benchmarks.tsv" - shell: - """ - # setting humann3 utilities location (can be off if we pointed to a previously installed database, and doesn't hurt to reset if it was already good-to-go) - humann_config --update database_folders utility_mapping {params.utilities_path} > /dev/null 2>&1 - - # they each need to be in the same directories to be merged - mkdir -p {params.gene_fam_dir} {params.path_abund_dir} {params.path_cov_dir} - cp {input.gene_families} {params.gene_fam_dir} - cp {input.path_abundances} {params.path_abund_dir} - cp {input.path_coverages} {params.path_cov_dir} - - humann_join_tables -i {params.gene_fam_dir} -o {output.gene_families} > /dev/null 2>&1 - humann_join_tables -i {params.path_abund_dir} -o {output.path_abundances} > /dev/null 2>&1 - humann_join_tables -i {params.path_cov_dir} -o {output.path_coverages} > /dev/null 2>&1 - - rm -rf {params.gene_fam_dir} {params.path_abund_dir} {params.path_cov_dir} - """ - - -if config["single_end_data"] != "TRUE": - # humann3 rule if paired-end data - - rule humann3_PE: - """ - This rule runs humann3 and metaphlan4 on each individual sample generating the - read-based functional annotations and taxonomic classifications. 
- """ - conda: - "envs/humann3.yaml" - input: - R1 = config["filtered_reads_dir"] + "{ID}" + config["filtered_R1_suffix"], - R2 = config["filtered_reads_dir"] + "{ID}" + config["filtered_R2_suffix"], - chocophlan_db_trigger = config["REF_DB_ROOT_DIR"] + config["HUMANN3_DBS_DIR"] + "/" + config["HUMANN3_CHOCOPHLAN_TRIGGER_FILE"], - uniref_db_trigger = config["REF_DB_ROOT_DIR"] + config["HUMANN3_DBS_DIR"] + "/" + config["HUMANN3_UNIREF_TRIGGER_FILE"], - utility_db_trigger = config["REF_DB_ROOT_DIR"] + config["HUMANN3_DBS_DIR"] + "/" + config["HUMANN3_UTILITY_MAPPING_TRIGGER_FILE"], - metaphlan_db_trigger = config["REF_DB_ROOT_DIR"] + config["METAPHLAN4_DB_DIR"] + "/" + config["METAPHLAN_TRIGGER_FILE"] - params: - combined_reads = config["read_based_dir"] + "{ID}-reads.tmp.fq.gz", - output_dir = config["read_based_dir"] + "{ID}-humann3-out-dir", - tmp_metaphlan = config["read_based_dir"] + "{ID}-humann3-out-dir/{ID}_humann_temp/{ID}_metaphlan_bugs_list.tsv", - tmp_dir = config["read_based_dir"] + "{ID}-humann3-out-dir/{ID}_humann_temp/", - metaphlan_dir = config["REF_DB_ROOT_DIR"] + config["METAPHLAN4_DB_DIR"] - output: - config["read_based_dir"] + "{ID}-humann3-out-dir/{ID}_genefamilies.tsv", - config["read_based_dir"] + "{ID}-humann3-out-dir/{ID}_pathabundance.tsv", - config["read_based_dir"] + "{ID}-humann3-out-dir/{ID}_pathcoverage.tsv", - config["read_based_dir"] + "{ID}-humann3-out-dir/{ID}_metaphlan_bugs_list.tsv" - resources: - cpus = config["num_threads"], - mem_mb = config["humann3_memory_resources"] - log: - config["logs_dir"] + "{ID}-humann3-run.log" - benchmark: - "benchmarks/run_humann3-{ID}-benchmarks.tsv" - shell: - """ - cat {input.R1} {input.R2} > {params.combined_reads} - humann --input {params.combined_reads} --output {params.output_dir} --threads {resources.cpus} --output-basename {wildcards.ID} --metaphlan-options "--index mpa_vJan21_CHOCOPhlAnSGB_202103 --bowtie2db {params.metaphlan_dir} --unclassified_estimation --add_viruses --sample_id {wildcards.ID}" --bowtie-options "--sensitive --mm" > {log} 2>&1 - mv {params.tmp_metaphlan} {output[3]} - rm -rf {params.combined_reads} {params.tmp_dir} - """ - -else: - # humann3 rule if single-end data - - rule humann3_SE: - """ - This rule runs humann3 and metaphlan4 on each individual sample generating the - read-based functional annotations and taxonomic classifications. 
- """ - conda: - "envs/humann3.yaml" - input: - R1 = config["filtered_reads_dir"] + "{ID}" + config["filtered_suffix"], - chocophlan_db_trigger = config["REF_DB_ROOT_DIR"] + config["HUMANN3_DBS_DIR"] + "/" + config["HUMANN3_CHOCOPHLAN_TRIGGER_FILE"], - uniref_db_trigger = config["REF_DB_ROOT_DIR"] + config["HUMANN3_DBS_DIR"] + "/" + config["HUMANN3_UNIREF_TRIGGER_FILE"], - utility_db_trigger = config["REF_DB_ROOT_DIR"] + config["HUMANN3_DBS_DIR"] + "/" + config["HUMANN3_UTILITY_MAPPING_TRIGGER_FILE"], - metaphlan_db_trigger = config["REF_DB_ROOT_DIR"] + config["METAPHLAN4_DB_DIR"] + "/" + config["METAPHLAN_TRIGGER_FILE"] - params: - output_dir = config["read_based_dir"] + "{ID}-humann3-out-dir", - tmp_metaphlan = config["read_based_dir"] + "{ID}-humann3-out-dir/{ID}_humann_temp/{ID}_metaphlan_bugs_list.tsv", - tmp_dir = config["read_based_dir"] + "{ID}-humann3-out-dir/{ID}_humann_temp/", - metaphlan_dir = config["REF_DB_ROOT_DIR"] + config["METAPHLAN4_DB_DIR"] - output: - config["read_based_dir"] + "{ID}-humann3-out-dir/{ID}_genefamilies.tsv", - config["read_based_dir"] + "{ID}-humann3-out-dir/{ID}_pathabundance.tsv", - config["read_based_dir"] + "{ID}-humann3-out-dir/{ID}_pathcoverage.tsv", - config["read_based_dir"] + "{ID}-humann3-out-dir/{ID}_metaphlan_bugs_list.tsv" - resources: - cpus = config["num_threads"], - mem_mb = config["humann3_memory_resources"] - log: - config["logs_dir"] + "{ID}-humann3-run.log" - benchmark: - "benchmarks/run_humann3-{ID}-benchmarks.tsv" - shell: - """ - humann --input {input.R1} --output {params.output_dir} --threads {resources.cpus} --output-basename {wildcards.ID} --metaphlan-options "--bowtie2db {params.metaphlan_dir} --unclassified_estimation --add_viruses --sample_id {wildcards.ID}" --bowtie-options "--sensitive --mm" > {log} 2>&1 - mv {params.tmp_metaphlan} {output[3]} - rm -rf {params.tmp_dir} - """ - - -rule make_combined_contig_tax_tables: - conda: - "envs/bit.yaml" - input: - expand(config["annotations_and_tax_dir"] + "{ID}-contig-coverage-and-tax.tsv", ID = sample_ID_list) - params: - out_prefix = config["combined_output_dir"] + config["additional_filename_prefix"] + "Combined", - tmp_out = config["combined_output_dir"] + config["additional_filename_prefix"] + "Combined-contig-level-taxonomy-coverages.tsv", - tmp_out_CPM = config["combined_output_dir"] + config["additional_filename_prefix"] + "Combined-contig-level-taxonomy-coverages-CPM.tsv" - output: - combined_tax = config["combined_output_dir"] + config["additional_filename_prefix"] + f"Combined-contig-level-taxonomy-coverages{assay_suffix}.tsv", - norm_combined_tax = config["combined_output_dir"] + config["additional_filename_prefix"] + f"Combined-contig-level-taxonomy-coverages-CPM{assay_suffix}.tsv" - benchmark: - "benchmarks/make_combined_contig_tax_tables-benchmarks.tsv" - shell: - """ - bit-GL-combine-contig-tax-tables {input} -o {params.out_prefix} - # renaming to have GL assay-specific suffix - mv {params.tmp_out} {output.combined_tax} - mv {params.tmp_out_CPM} {output.norm_combined_tax} - """ - - -rule make_combined_gene_level_tables: - conda: - "envs/bit.yaml" - input: - expand(config["annotations_and_tax_dir"] + "{ID}-gene-coverage-annotation-and-tax.tsv", ID = sample_ID_list) - params: - out_prefix = config["combined_output_dir"] + config["additional_filename_prefix"] + "Combined", - tmp_combined_annots = config["combined_output_dir"] + config["additional_filename_prefix"] + "Combined-gene-level-KO-function-coverages.tsv", - tmp_norm_combined_annots = 
config["combined_output_dir"] + config["additional_filename_prefix"] + "Combined-gene-level-KO-function-coverages-CPM.tsv", - tmp_combined_tax = config["combined_output_dir"] + config["additional_filename_prefix"] + "Combined-gene-level-taxonomy-coverages.tsv", - tmp_norm_combined_tax = config["combined_output_dir"] + config["additional_filename_prefix"] + "Combined-gene-level-taxonomy-coverages-CPM.tsv" - output: - combined_annots = config["combined_output_dir"] + config["additional_filename_prefix"] + f"Combined-gene-level-KO-function-coverages{assay_suffix}.tsv", - norm_combined_annots = config["combined_output_dir"] + config["additional_filename_prefix"] + f"Combined-gene-level-KO-function-coverages-CPM{assay_suffix}.tsv", - combined_tax = config["combined_output_dir"] + config["additional_filename_prefix"] + f"Combined-gene-level-taxonomy-coverages{assay_suffix}.tsv", - norm_combined_tax = config["combined_output_dir"] + config["additional_filename_prefix"] + f"Combined-gene-level-taxonomy-coverages-CPM{assay_suffix}.tsv" - benchmark: - "benchmarks/make_combined_gene_level_tables-benchmarks.tsv" - shell: - """ - bit-GL-combine-KO-and-tax-tables {input} -o {params.out_prefix} - # renaming to have GL assay-specific suffix - mv {params.tmp_combined_annots} {output.combined_annots} - mv {params.tmp_norm_combined_annots} {output.norm_combined_annots} - mv {params.tmp_combined_tax} {output.combined_tax} - mv {params.tmp_norm_combined_tax} {output.norm_combined_tax} - """ - - -rule combine_contig_tax_and_coverage: - """ - This rule combines the contig-level taxonomic and coverage information for each individual sample. - """ - input: - cov = config["mapping_dir"] + "{ID}-contig-coverages.tsv", - tax = config["annotations_and_tax_dir"] + "{ID}-contig-tax.tsv" - params: - assembly = config["assemblies_dir"] + "{ID}-assembly.fasta", - AAs = config["genes_dir"] + "{ID}-genes.faa", - contig_tmp = config["annotations_and_tax_dir"] + "{ID}-contig.tmp", - header_tmp = config["annotations_and_tax_dir"] + "{ID}-contig-header.tmp", - contig_p1_tmp = config["annotations_and_tax_dir"] + "{ID}-contig-p1.tmp", - tax_col_tmp = config["annotations_and_tax_dir"] + "{ID}-tax-col.tmp" - output: - config["annotations_and_tax_dir"] + "{ID}-contig-coverage-and-tax.tsv" - benchmark: - "benchmarks/combine_contig_tax_and_coverage-{ID}-benchmarks.tsv" - shell: - """ - # only running if the assembly produced anything - if [ -s {params.assembly} ]; then - - # if there were no genes called, there is no contig-level taxonomy, so dealing with that here - if [ -s {params.AAs} ]; then - paste <( tail -n +2 {input.cov} | sort -V -k 1 ) <( tail -n +2 {input.tax} | sort -V -k 1 | cut -f 2- ) > {params.contig_tmp} - paste <( head -n 1 {input.cov} ) <( head -n 1 {input.tax} | cut -f 2- ) > {params.header_tmp} - cat {params.header_tmp} {params.contig_tmp} > {output} - rm -rf {params.contig_tmp} {params.header_tmp} - rm -rf {input} - - else - - paste <( tail -n +2 {input.cov} | sort -V -k 1 ) > {params.contig_p1_tmp} - sed 's/.*/NA/g' {params.contig_p1_tmp} > {params.tax_col_tmp} - paste {params.contig_p1_tmp} {params.tax_col_tmp} {params.tax_col_tmp} {params.tax_col_tmp} {params.tax_col_tmp} {params.tax_col_tmp} {params.tax_col_tmp} {params.tax_col_tmp} {params.tax_col_tmp} > {params.contig_tmp} - cat <( printf "contig_ID\tcoverage\ttaxid\tdomain\tphylum\tclass\torder\tfamily\tgenus\tspecies\n" ) {params.contig_tmp} > {output} - rm -rf {params.contig_p1_tmp} {params.tax_col_tmp} {params.contig_tmp} - rm -rf {input} - - fi - - else 
- - printf "contig_ID\tcoverage\ttaxid\tdomain\tphylum\tclass\torder\tfamily\tgenus\tspecies\n" > {output} - rm -rf {input} - - fi - """ - - -rule combine_gene_annots_tax_and_coverage: - """ - This rule combines the gene-level functional annotations, taxonomic classifications, and coverage information for each individual sample. - """ - input: - cov = config["mapping_dir"] + "{ID}-gene-coverages.tsv", - annots = config["annotations_and_tax_dir"] + "{ID}-annotations.tsv", - tax = config["annotations_and_tax_dir"] + "{ID}-gene-tax.tsv" - params: - assembly = config["assemblies_dir"] + "{ID}-assembly.fasta", - AAs = config["genes_dir"] + "{ID}-genes.faa", - gene_tmp = config["annotations_and_tax_dir"] + "{ID}-gene.tmp", - header_tmp = config["annotations_and_tax_dir"] + "{ID}-gene-header.tmp" - output: - config["annotations_and_tax_dir"] + "{ID}-gene-coverage-annotation-and-tax.tsv" - benchmark: - "benchmarks/combine_gene_annots_tax_and_coverage-{ID}-benchmarks.tsv" - shell: - """ - # only running if the assembly produced anything and genes were identified (they are required for this) - if [ -s {params.assembly} ] && [ -s {params.AAs} ]; then - - paste <( tail -n +2 {input.cov} | sort -V -k 1 ) <( tail -n +2 {input.annots} | sort -V -k 1 | cut -f 2- ) <( tail -n +2 {input.tax} | sort -V -k 1 | cut -f 2- ) > {params.gene_tmp} - paste <( head -n 1 {input.cov} ) <( head -n 1 {input.annots} | cut -f 2- ) <( head -n 1 {input.tax} | cut -f 2- ) > {params.header_tmp} - - cat {params.header_tmp} {params.gene_tmp} > {output} - - rm -rf {params.gene_tmp} {params.header_tmp} - rm -rf {input} - - else - - printf "gene_ID\tcoverage\tKO_ID\tKO_function\ttaxid\tdomain\tphylum\tclass\torder\tfamily\tgenus\tspecies\n" > {output} - rm -rf {input} - - fi - """ - - -rule get_cov_and_det: - """ - This rule pulls out coverage and detection information for each sample, gene-level and contig-level, - and filters the gene-level coverage information based on requiring at least 50% detection. 
- """ - - conda: - "envs/mapping.yaml" - input: - bam = config["mapping_dir"] + "{ID}.bam", - nt = config["genes_dir"] + "{ID}-genes.fasta" - params: - assembly = config["assemblies_dir"] + "{ID}-assembly.fasta", - gene_cov_and_det_tmp = config["mapping_dir"] + "{ID}-gene-cov-and-det.tmp", - contig_cov_and_det_tmp = config["mapping_dir"] + "{ID}-contig-cov-and-det.tmp", - gene_cov_tmp = config["mapping_dir"] + "{ID}-gene-cov.tmp", - contig_cov_tmp = config["mapping_dir"] + "{ID}-contig-cov.tmp", - pileup_mem = config["pileup_mem"] - output: - gene_covs = config["mapping_dir"] + "{ID}-gene-coverages.tsv", - contig_covs = config["mapping_dir"] + "{ID}-contig-coverages.tsv" - resources: - mem_mb = config["pileup_memory_resources"] - log: - config["logs_dir"] + "{ID}-pileup.log" - benchmark: - "benchmarks/get_cov_and_det-{ID}-benchmarks.tsv" - shell: - """ - # only running if the assembly produced anything - if [ -s {params.assembly} ]; then - - # only running on genes also if genes were identified - if [ -s {input.nt} ]; then - - pileup.sh -Xmx{params.pileup_mem} -in {input.bam} fastaorf={input.nt} outorf={params.gene_cov_and_det_tmp} out={params.contig_cov_and_det_tmp} > {log} 2>&1 - - # filtering coverages based on detection - # genes - grep -v "#" {params.gene_cov_and_det_tmp} | awk -F $'\\t' ' BEGIN {{OFS=FS}} {{ if ( $10 <= 0.5 ) $4 = 0 }} {{ print $1,$4 }} ' > {params.gene_cov_tmp} - cat <( printf "gene_ID\tcoverage\n" ) {params.gene_cov_tmp} > {output.gene_covs} - - # contigs - grep -v "#" {params.contig_cov_and_det_tmp} | awk -F $'\\t' ' BEGIN {{OFS=FS}} {{ if ( $5 <= 50 ) $2 = 0 }} {{ print $1,$2 }} ' > {params.contig_cov_tmp} - cat <( printf "contig_ID\tcoverage\n" ) {params.contig_cov_tmp} > {output.contig_covs} - - # removing intermediate files - rm {params.gene_cov_and_det_tmp} {params.contig_cov_and_det_tmp} {params.gene_cov_tmp} {params.contig_cov_tmp} - - else - - pileup.sh -in {input.bam} out={params.contig_cov_and_det_tmp} > {log} 2>&1 - - # filtering coverages based on detection - # contigs - grep -v "#" {params.contig_cov_and_det_tmp} | awk -F $'\\t' ' BEGIN {{OFS=FS}} {{ if ( $5 <= 50 ) $2 = 0 }} {{ print $1,$2 }} ' > {params.contig_cov_tmp} - cat <( printf "contig_ID\tcoverage\n" ) {params.contig_cov_tmp} > {output.contig_covs} - - # writing out empty genes coverage file - printf "gene_ID\tcoverage\n" > {output.gene_covs} - printf "\n\nGene-level coverage info not recovered because the assembly didn't have any genes identified.\n" >> {log} - - # removing intermediate files - rm {params.contig_cov_and_det_tmp} {params.contig_cov_tmp} - - fi - - else - - printf "gene_ID\tcoverage\n" > {output.gene_covs} - printf "contig_ID\tcoverage\n" > {output.contig_covs} - printf "Coverage info not recovered because the assembly didn't produce anything.\n" > {log} - - fi - """ - - -if config["single_end_data"] != "TRUE": - # mapping rule if paired-end data - - rule mapping_PE: - """ - This rule builds the bowtie2 index and runs the mapping for each sample. 
- """ - conda: - "envs/mapping.yaml" - input: - assembly = config["assemblies_dir"] + "{ID}-assembly.fasta", - R1 = config["filtered_reads_dir"] + "{ID}" + config["filtered_R1_suffix"], - R2 = config["filtered_reads_dir"] + "{ID}" + config["filtered_R2_suffix"] - params: - index = config["mapping_dir"] + "{ID}-index", - mapping_info = config["mapping_dir"] + "{ID}-mapping-info.txt", - num_threads = config["num_threads"] - resources: - cpus = config["num_threads"], - mem_mb = config["mapping_memory_resources"] - output: - config["mapping_dir"] + "{ID}.bam" - log: - config["logs_dir"] + "{ID}-bowtie2-build.log" - benchmark: - "benchmarks/run_mapping-{ID}-benchmarks.tsv" - shell: - """ - # only running if the assembly produced anything - if [ -s {input.assembly} ]; then - - bowtie2-build {input.assembly} {params.index} > {log} 2>&1 - bowtie2 --mm -q --threads {params.num_threads} -x {params.index} -1 {input.R1} -2 {input.R2} --no-unal 2> {params.mapping_info} | samtools view -b | samtools sort -@ {params.num_threads} > {output} 2> /dev/null - rm {params.index}* - - else - - touch {output} - printf "Mapping not performed because the assembly didn't produce anything.\n" > {log} - - fi - """ - -else: - # mapping rule if single-end data - - rule mapping_SE: - """ - This rule builds the bowtie2 index and runs the mapping for each sample. - """ - conda: - "envs/mapping.yaml" - input: - assembly = config["assemblies_dir"] + "{ID}-assembly.fasta", - R1 = config["filtered_reads_dir"] + "{ID}" + config["filtered_suffix"] - params: - index = config["mapping_dir"] + "{ID}-index", - mapping_info = config["mapping_dir"] + "{ID}-mapping-info.txt", - num_threads = config["num_threads"] - resources: - cpus = config["num_threads"], - mem_mb = config["mapping_memory_resources"] - output: - config["mapping_dir"] + "{ID}.bam" - log: - config["logs_dir"] + "{ID}-bowtie2-build.log" - benchmark: - "benchmarks/run_mapping-{ID}-benchmarks.tsv" - shell: - """ - # only running if the assembly produced anything - if [ -s {input.assembly} ]; then - - bowtie2-build {input.assembly} {params.index} > {log} 2>&1 - bowtie2 --mm -q --threads {params.num_threads} -x {params.index} -r {input.R1} --no-unal 2> {params.mapping_info} | samtools view -b | samtools sort -@ {params.num_threads} > {output} 2> /dev/null - rm {params.index}* - - else - - touch {output} - printf "Mapping not performed because the assembly didn't produce anything.\n" > {log} - - fi - """ - - -rule tax_classification: - """ - This rule runs the gene- and contig-level taxonomic classifications for each assembly. 
- """ - - conda: - "envs/cat.yaml" - input: - assembly = config["assemblies_dir"] + "{ID}-assembly.fasta", - AA = config["genes_dir"] + "{ID}-genes.faa", - cat_db_trigger = config["REF_DB_ROOT_DIR"] + config["CAT_DIR"] + "/" + config["CAT_TRIGGER_FILE"] - output: - gene_tax_out = config["annotations_and_tax_dir"] + "{ID}-gene-tax.tsv", - contig_tax_out = config["annotations_and_tax_dir"] + "{ID}-contig-tax.tsv" - params: - tmp_out_prefix = config["annotations_and_tax_dir"] + "{ID}-tax-out.tmp", - tmp_genes = config["annotations_and_tax_dir"] + "{ID}-gene-tax.tmp", - tmp_contigs = config["annotations_and_tax_dir"] + "{ID}-contig-tax.tmp", - cat_db = config["REF_DB_ROOT_DIR"] + config["CAT_DIR"] + config["CAT_DB"], - cat_tax = config["REF_DB_ROOT_DIR"] + config["CAT_DIR"] + config["CAT_TAX"], - block_size = config["block_size"] - resources: - cpus = config["num_cpus"], - mem_mb = config["CAT_memory_resources"] - log: - config["logs_dir"] + "{ID}-CAT.log" - benchmark: - "benchmarks/run_tax_classification-{ID}-benchmarks.tsv" - shell: - """ - # only running if assembly produced any contigs and genes were identified (they are required for this) - if [ -s {input.assembly} ] && [ -s {input.AA} ]; then - - CAT contigs -d {params.cat_db} -t {params.cat_tax} -n {resources.cpus} -r 3 --top 4 --I_know_what_Im_doing -c {input.assembly} -p {input.AA} -o {params.tmp_out_prefix} --no_stars --block_size {params.block_size} --index_chunks 2 --force > {log} 2>&1 - - # adding names to gene classifications - CAT add_names -i {params.tmp_out_prefix}.ORF2LCA.txt -o {params.tmp_genes} -t {params.cat_tax} --only_official --exclude_scores >> {log} 2>&1 - - # formatting gene classifications - bash scripts/format-gene-tax-classifications.sh {params.tmp_genes} {output.gene_tax_out} - - # adding names to contig classifications - CAT add_names -i {params.tmp_out_prefix}.contig2classification.txt -o {params.tmp_contigs} -t {params.cat_tax} --only_official --exclude_scores >> {log} 2>&1 - - # formatting contig classifications - bash scripts/format-contig-tax-classifications.sh {params.tmp_contigs} {output.contig_tax_out} - - rm -rf {params.tmp_out_prefix}* {params.tmp_genes} {params.tmp_contigs} - - else - - touch {output} - printf "Assembly-based taxonomic classification not performed because the assembly didn't produce anything and/or no genes were identified.\n" > {log} - - fi - """ - - -rule KO_annotation: - """ - This rule runs the gene-level (KO) functional annotation for each sample. 
- """ - conda: - "envs/kofamscan.yaml" - input: - AAs = config["genes_dir"] + "{ID}-genes.faa", - kofamscan_db_trigger = config["REF_DB_ROOT_DIR"] + config["KOFAMSCAN_DIR"] + "/" + config["KOFAMSCAN_TRIGGER_FILE"] - output: - config["annotations_and_tax_dir"] + "{ID}-annotations.tsv" - params: - assembly = config["assemblies_dir"] + "{ID}-assembly.fasta", - ko_db_dir = config["REF_DB_ROOT_DIR"] + config["KOFAMSCAN_DIR"], - tmp_out = config["annotations_and_tax_dir"] + "{ID}-KO-tab.tmp", - tmp_dir = config["annotations_and_tax_dir"] + "{ID}-tmp-KO-dir" - resources: - cpus = config["num_cpus"], - mem_mb = config["KOFamScan_memory_resources"] - log: - config["logs_dir"] + "{ID}-kofamscan.log" - benchmark: - "benchmarks/run_KO_annotation-{ID}-benchmarks.tsv" - shell: - """ - # only running if assembly produced any contigs and genes were identified (they are required for this) - if [ -s {params.assembly} ] && [ -s {input.AAs} ]; then - - exec_annotation -p {params.ko_db_dir}/profiles/ -k {params.ko_db_dir}/ko_list --cpu {resources.cpus} -f detail-tsv -o {params.tmp_out} --tmp-dir {params.tmp_dir} --report-unannotated {input.AAs} > {log} 2>&1 - - bit-filter-KOFamScan-results -i {params.tmp_out} -o {output} - - rm -rf {params.tmp_out} {params.tmp_dir} - - else - - touch {output} - printf "Functional annotations not performed because the assembly didn't produce anything and/or no genes were identified.\n" > {log} - - fi - """ - - -rule call_genes: - """ - This rule calls genes on each assembly file. - """ - - conda: - "envs/prodigal.yaml" - input: - assembly = config["assemblies_dir"] + "{ID}-assembly.fasta" - output: - AA = config["genes_dir"] + "{ID}-genes.faa", - nt = config["genes_dir"] + "{ID}-genes.fasta", - gff = config["genes_dir"] + "{ID}-genes.gff" - log: - config["logs_dir"] + "{ID}-prodigal.log" - benchmark: - "benchmarks/call_genes-{ID}-benchmarks.tsv" - shell: - """ - # only running if assembly produced any contigs - if [ -s {input.assembly} ]; then - - prodigal -q -c -p meta -a {output.AA} -d {output.nt} -f gff -o {output.gff} -i {input.assembly} > {log} 2>&1 - - # removing line-wraps - bit-remove-wraps {output.AA} > {output.AA}.tmp 2> /dev/null && mv {output.AA}.tmp {output.AA} - bit-remove-wraps {output.nt} > {output.nt}.tmp 2> /dev/null && mv {output.nt}.tmp {output.nt} - - else - - touch {output} - printf "Gene-calling not performed because the assembly didn't produce anything.\n" > {log} - - fi - """ - - -rule summarize_assemblies: - """ - This rule summarizes and reports general stats for all individual sample assemblies in one table. - """ - conda: - "envs/bit.yaml" - input: - expand(config["assemblies_dir"] + "{ID}-assembly.fasta", ID = sample_ID_list) - output: - config["assemblies_dir"] + config["additional_filename_prefix"] + f"assembly-summaries{assay_suffix}.tsv" - benchmark: - "benchmarks/summarize_assemblies-benchmarks.tsv" - shell: - """ - bit-summarize-assembly -o {output} {input} - """ - - -if config["single_end_data"] != "TRUE": - # assembly rule if paired-end data - rule assemble_PE: - """ - This rule handles running the assembly for each individual sample. 
- """ - conda: - "envs/megahit.yaml" - input: - R1 = config["filtered_reads_dir"] + "{ID}" + config["filtered_R1_suffix"], - R2 = config["filtered_reads_dir"] + "{ID}" + config["filtered_R2_suffix"] - params: - assemblies_dir = config["assemblies_dir"], - max_mem = config["max_mem_megahit"], - failed_assemblies_file = config["assemblies_dir"] + config["additional_filename_prefix"] + f"Failed-assemblies{assay_suffix}.tsv" - resources: - cpus = config["num_threads"], - mem_mb = config["megahit_memory_resources"] - output: - config["assemblies_dir"] + "{ID}-assembly.fasta" - log: - config["logs_dir"] + "{ID}-assembly.log" - benchmark: - "benchmarks/assemble-{ID}-benchmarks.tsv" - shell: - """ - # removing output directory if exists already but rule still needs to be run (because there is no --force option to megahit i dont't think): - rm -rf {params.assemblies_dir}{wildcards.ID}-megahit-out/ - - megahit -1 {input.R1} -2 {input.R2} -m {params.max_mem} -t {resources.cpus} --min-contig-len 500 -o {params.assemblies_dir}{wildcards.ID}-megahit-out > {log} 2>&1 - bit-rename-fasta-headers -i {params.assemblies_dir}{wildcards.ID}-megahit-out/final.contigs.fa -w c_{wildcards.ID} -o {output} - - rm -rf {params.assemblies_dir}{wildcards.ID}-megahit-out/ - - # checking the assembly produced anything (megahit can run, produce the output fasta, but it will be empty if no contigs were assembled) - if [ ! -s {output} ]; then - printf "{wildcards.ID}\tNo contigs assembled\n" >> {params.failed_assemblies_file} - fi - """ - -else: - # assembly rule if single-end data - rule assemble_SE: - """ - This rule handles running the assembly for each individual sample. - """ - conda: - "envs/megahit.yaml" - input: - R1 = config["filtered_reads_dir"] + "{ID}" + config["filtered_suffix"] - params: - assemblies_dir = config["assemblies_dir"], - max_mem = config["max_mem_megahit"], - failed_assemblies_file = config["assemblies_dir"] + config["additional_filename_prefix"] + f"Failed-assemblies{assay_suffix}.tsv" - resources: - cpus = config["num_threads"], - mem_mb = config["megahit_memory_resources"] - output: - config["assemblies_dir"] + "{ID}-assembly.fasta" - log: - config["logs_dir"] + "{ID}-assembly.log" - benchmark: - "benchmarks/assemble-{ID}-benchmarks.tsv" - shell: - """ - # removing output directory if exists already but rule still needs to be run (because there is no --force option to megahit i dont't think): - rm -rf {params.assemblies_dir}{wildcards.ID}-megahit-out/ - - megahit -r {input.R1} -m {params.max_mem} -t {resources.cpus} --min-contig-len 500 -o {params.assemblies_dir}{wildcards.ID}-megahit-out > {log} 2>&1 - bit-rename-fasta-headers -i {params.assemblies_dir}{wildcards.ID}-megahit-out/final.contigs.fa -w c_{wildcards.ID} -o {output} - - rm -rf {params.assemblies_dir}{wildcards.ID}-megahit-out/ - - # checking the assembly produced anything (megahit can run, produce the output fasta, but it will be empty if no contigs were assembled) - if [ ! -s {output} ]; then - printf "{wildcards.ID}\tNo contigs assembled\n" >> {params.failed_assemblies_file} - fi - """ - - -if config["single_end_data"] != "TRUE": - # quality-trimming/filtering rule if this is paired-end data - # quality-trimming/filtering rule run slightly different if data are generated with Swift 1S library prep - if config["swift_1S"] == "TRUE": - - rule bbduk_PE: - """ - This rule runs quality filtering/trimming on raw input fastq files for each individual sample. 
- """ - - conda: - "envs/qc.yaml" - input: - in1 = config["raw_reads_dir"] + "{ID}" + config["raw_R1_suffix"], - in2 = config["raw_reads_dir"] + "{ID}" + config["raw_R2_suffix"] - output: - out1 = config["filtered_reads_dir"] + "{ID}" + config["filtered_R1_suffix"], - out2 = config["filtered_reads_dir"] + "{ID}" + config["filtered_R2_suffix"] - log: - config["logs_dir"] + "{ID}-bbduk.log" - benchmark: - "benchmarks/bbduk-{ID}-benchmarks.tsv" - shell: - """ - bbduk.sh in={input.in1} in2={input.in2} out1={output.out1} out2={output.out2} \ - ref=${{CONDA_PREFIX}}/opt/bbmap-38.86-0/resources/adapters.fa ktrim=l k=17 ftm=5 qtrim=rl \ - trimq=10 mlf=0.5 maxns=0 swift=t > {log} 2>&1 - """ - - else: - - rule bbduk_PE: - """ - This rule runs quality filtering/trimming on raw input fastq files for each individual sample. - """ - - conda: - "envs/qc.yaml" - input: - in1 = config["raw_reads_dir"] + "{ID}" + config["raw_R1_suffix"], - in2 = config["raw_reads_dir"] + "{ID}" + config["raw_R2_suffix"] - output: - out1 = config["filtered_reads_dir"] + "{ID}" + config["filtered_R1_suffix"], - out2 = config["filtered_reads_dir"] + "{ID}" + config["filtered_R2_suffix"] - log: - config["logs_dir"] + "{ID}-bbduk.log" - benchmark: - "benchmarks/bbduk-{ID}-benchmarks.tsv" - shell: - """ - bbduk.sh in={input.in1} in2={input.in2} out1={output.out1} out2={output.out2} \ - ref=${{CONDA_PREFIX}}/opt/bbmap-38.86-0/resources/adapters.fa ktrim=l k=17 ftm=5 qtrim=rl \ - trimq=10 mlf=0.5 maxns=0 > {log} 2>&1 - """ - -else: - # quality-trimming/filtering rule if this is single-end data - # quality-trimming/filtering rule run slightly different if data are generated with Swift 1S library prep - if config["swift_1S"] == "TRUE": - - rule bbduk_SE: - """ - This rule runs quality filtering/trimming on raw input fastq files for each individual sample. - """ - - conda: - "envs/qc.yaml" - input: - in1 = config["raw_reads_dir"] + "{ID}" + config["raw_suffix"] - output: - out1 = config["filtered_reads_dir"] + "{ID}" + config["filtered_suffix"] - log: - config["logs_dir"] + "{ID}-bbduk.log" - benchmark: - "benchmarks/bbduk-{ID}-benchmarks.tsv" - shell: - """ - bbduk.sh in={input.in1} out1={output.out1} \ - ref=${{CONDA_PREFIX}}/opt/bbmap-38.86-0/resources/adapters.fa ktrim=l k=17 ftm=5 qtrim=rl \ - trimq=10 mlf=0.5 maxns=0 swift=t > {log} 2>&1 - """ - - else: - - rule bbduk_SE: - """ - This rule runs quality filtering/trimming on raw input fastq files for each individual sample. - """ - - conda: - "envs/qc.yaml" - input: - in1 = config["raw_reads_dir"] + "{ID}" + config["raw_suffix"] - output: - out1 = config["filtered_reads_dir"] + "{ID}" + config["filtered_suffix"] - log: - config["logs_dir"] + "{ID}-bbduk.log" - benchmark: - "benchmarks/bbduk-{ID}-benchmarks.tsv" - shell: - """ - bbduk.sh in={input.in1} out1={output.out1} \ - ref=${{CONDA_PREFIX}}/opt/bbmap-38.86-0/resources/adapters.fa ktrim=l k=17 ftm=5 qtrim=rl \ - trimq=10 mlf=0.5 maxns=0 > {log} 2>&1 - """ - - -if config["single_end_data"] != "TRUE": - - # QC rules if this is paired-end data - rule raw_multiqc_PE: - """ - This rule collates all raw fastqc outputs. 
- """ - - conda: - "envs/qc.yaml" - input: - expand(config["raw_reads_dir"] + "{ID}" + config["raw_R1_suffix"].rsplit(".", 2)[0] + "_fastqc.zip", ID = sample_ID_list), - expand(config["raw_reads_dir"] + "{ID}" + config["raw_R2_suffix"].rsplit(".", 2)[0] + "_fastqc.zip", ID = sample_ID_list) - params: - reads_dir = config["raw_reads_dir"], - int_out_dir = config["additional_filename_prefix"] + "raw_multiqc_report", - out_filename_prefix = config["additional_filename_prefix"] + "raw_multiqc", - int_out_data_dir = config["additional_filename_prefix"] + "raw_multiqc_data", - int_html_file = config["additional_filename_prefix"] + "raw_multiqc.html", - int_zip = config["additional_filename_prefix"] + "raw_multiqc_report.zip", - config_file = "config/multiqc.config" - output: - final_out_zip = config["fastqc_out_dir"] + config["additional_filename_prefix"] + f"raw_multiqc{assay_suffix}_report.zip" - benchmark: - "benchmarks/raw_multiqc-benchmarks.tsv" - shell: - """ - multiqc -q -n {params.out_filename_prefix} --force --cl-config 'max_table_rows: 99999999' --interactive --config {params.config_file} {input} > /dev/null 2>&1 - - # removing the individual fastqc files - rm -rf {params.reads_dir}*fastqc* - - # making an output report directory and moving things into it - mkdir -p {params.int_out_dir} - mv {params.int_html_file} {params.int_out_data_dir} {params.int_out_dir} - - # zipping and removing unzipped dir - zip -q -r {params.int_zip} {params.int_out_dir} && rm -rf {params.int_out_dir} - - # moving to final wanted location - mv {params.int_zip} {output.final_out_zip} - """ - - - rule raw_fastqc_PE: - """ - This rule runs fastqc on all raw input fastq files. - """ - - conda: - "envs/qc.yaml" - input: - config["raw_reads_dir"] + "{ID}" + config["raw_R1_suffix"], - config["raw_reads_dir"] + "{ID}" + config["raw_R2_suffix"] - output: - config["raw_reads_dir"] + "{ID}" + config["raw_R1_suffix"].rsplit(".", 2)[0] + "_fastqc.zip", - config["raw_reads_dir"] + "{ID}" + config["raw_R2_suffix"].rsplit(".", 2)[0] + "_fastqc.zip" - benchmark: - "benchmarks/raw_fastqc-{ID}-benchmarks.tsv" - shell: - """ - fastqc {input} -t 2 -q - """ - - - use rule raw_multiqc_PE as filtered_multiqc_PE with: - input: - expand(config["filtered_reads_dir"] + "{ID}" + config["filtered_R1_suffix"].rsplit(".", 2)[0] + "_fastqc.zip", ID = sample_ID_list), - expand(config["filtered_reads_dir"] + "{ID}" + config["filtered_R2_suffix"].rsplit(".", 2)[0] + "_fastqc.zip", ID = sample_ID_list) - params: - reads_dir = config["filtered_reads_dir"], - int_out_dir = config["additional_filename_prefix"] + "filtered_multiqc_report", - out_filename_prefix = config["additional_filename_prefix"] + "filtered_multiqc", - int_out_data_dir = config["additional_filename_prefix"] + "filtered_multiqc_data", - int_html_file = config["additional_filename_prefix"] + "filtered_multiqc.html", - int_zip = config["additional_filename_prefix"] + "filtered_multiqc_report.zip", - config_file = "config/multiqc.config" - output: - final_out_zip = config["fastqc_out_dir"] + config["additional_filename_prefix"] + f"filtered_multiqc{assay_suffix}_report.zip" - benchmark: - "benchmarks/filtered_multiqc-benchmarks.tsv" - - - use rule raw_fastqc_PE as filtered_fastqc_PE with: - input: - config["filtered_reads_dir"] + "{ID}" + config["filtered_R1_suffix"], - config["filtered_reads_dir"] + "{ID}" + config["filtered_R2_suffix"] - output: - config["filtered_reads_dir"] + "{ID}" + config["filtered_R1_suffix"].rsplit(".", 2)[0] + "_fastqc.zip", - config["filtered_reads_dir"] 
+ "{ID}" + config["filtered_R2_suffix"].rsplit(".", 2)[0] + "_fastqc.zip" - benchmark: - "benchmarks/filtered_fastqc-{ID}-benchmarks.tsv" - - - -else: - # QC rules if this is single-end data - rule raw_multiqc_SE: - """ - This rule collates all raw fastqc outputs. - """ - - conda: - "envs/qc.yaml" - input: - expand(config["raw_reads_dir"] + "{ID}" + config["raw_suffix"].rsplit(".", 2)[0] + "_fastqc.zip", ID = sample_ID_list) - params: - reads_dir = config["raw_reads_dir"], - int_out_dir = config["additional_filename_prefix"] + "raw_multiqc_report", - out_filename_prefix = config["additional_filename_prefix"] + "raw_multiqc", - int_out_data_dir = config["additional_filename_prefix"] + "raw_multiqc_data", - int_html_file = config["additional_filename_prefix"] + "raw_multiqc.html", - int_zip = config["additional_filename_prefix"] + "raw_multiqc_report.zip", - config_file = "config/multiqc.config" - output: - final_out_zip = config["fastqc_out_dir"] + config["additional_filename_prefix"] + f"raw_multiqc{assay_suffix}_report.zip" - benchmark: - "benchmarks/raw_multiqc-benchmarks.tsv" - shell: - """ - multiqc -q -n {params.out_filename_prefix} --force --cl-config 'max_table_rows: 99999999' --interactive --config {params.config_file} {input} > /dev/null 2>&1 - - # removing the individual fastqc files - rm -rf {params.reads_dir}*fastqc* - - # making an output report directory and moving things into it - mkdir -p {params.int_out_dir} - mv {params.int_html_file} {params.int_out_data_dir} {params.int_out_dir} - - # zipping and removing unzipped dir - zip -q -r {params.int_zip} {params.int_out_dir} && rm -rf {params.int_out_dir} - - # moving to final wanted location - mv {params.int_zip} {output.final_out_zip} - """ - - - rule raw_fastqc_SE: - """ - This rule runs fastqc on all raw input fastq files. - """ - - conda: - "envs/qc.yaml" - input: - config["raw_reads_dir"] + "{ID}" + config["raw_suffix"] - output: - config["raw_reads_dir"] + "{ID}" + config["raw_suffix"].rsplit(".", 2)[0] + "_fastqc.zip" - benchmark: - "benchmarks/raw_fastqc-{ID}-benchmarks.tsv" - shell: - """ - fastqc {input} -t 2 -q - """ - - - use rule raw_multiqc_SE as filtered_multiqc_SE with: - input: - expand(config["filtered_reads_dir"] + "{ID}" + config["filtered_suffix"].rsplit(".", 2)[0] + "_fastqc.zip", ID = sample_ID_list) - params: - reads_dir = config["filtered_reads_dir"], - int_out_dir = config["additional_filename_prefix"] + "filtered_multiqc_report", - out_filename_prefix = config["additional_filename_prefix"] + "filtered_multiqc", - int_out_data_dir = config["additional_filename_prefix"] + "filtered_multiqc_data", - int_html_file = config["additional_filename_prefix"] + "filtered_multiqc.html", - int_zip = config["additional_filename_prefix"] + "filtered_multiqc_report.zip", - config_file = "config/multiqc.config" - output: - final_out_zip = config["fastqc_out_dir"] + config["additional_filename_prefix"] + f"filtered_multiqc{assay_suffix}_report.zip" - benchmark: - "benchmarks/filtered_multiqc-benchmarks.tsv" - - - use rule raw_fastqc_SE as filtered_fastqc_SE with: - input: - config["filtered_reads_dir"] + "{ID}" + config["filtered_suffix"] - output: - config["filtered_reads_dir"] + "{ID}" + config["filtered_suffix"].rsplit(".", 2)[0] + "_fastqc.zip" - benchmark: - "benchmarks/filtered_fastqc-{ID}-benchmarks.tsv" - - -### database checking and setup rules ### -rule setup_CAT_db: - """ - This rule checks for the CAT reference database, and downloads if needed. 
- """ - - conda: - "envs/cat.yaml" - output: - cat_db_trigger = config["REF_DB_ROOT_DIR"] + config["CAT_DIR"] + "/" + config["CAT_TRIGGER_FILE"] - params: - cat_db_dir = config["REF_DB_ROOT_DIR"] + config["CAT_DIR"], - compressed_cat = config["REF_DB_ROOT_DIR"] + config["CAT_DL_FILE"], - compressed_nr_faa = config["REF_DB_ROOT_DIR"] + config["CAT_DIR"] + config["CAT_DB"] + "/2021-01-07.nr.gz", - cat_dl_link = config["CAT_DL_LINK"], - REF_DB_ROOT_DIR = config["REF_DB_ROOT_DIR"] - log: - config["logs_dir"] + "setup-CAT-db.log" - benchmark: - "benchmarks/setup_CAT_db-benchmarks.tsv" - shell: - """ - mkdir -p {params.REF_DB_ROOT_DIR} - - printf "### Setting up CAT reference database ###\n\n" > {log} 2>&1 - - printf " Downloading reference db:\n\n" >> {log} 2>&1 - curl -L -C - -o {params.compressed_cat} {params.cat_dl_link} >> {log} 2>&1 - - printf "\n\n Extracting reference db:\n\n" >> {log} 2>&1 - tar -xvzf {params.compressed_cat} -C {params.REF_DB_ROOT_DIR} >> {log} 2>&1 - - rm {params.compressed_cat} {params.compressed_nr_faa} - - touch {output.cat_db_trigger} - """ - - -rule setup_KOFamScan_db: - """ - This rule checks for the KOFamScan db (minimally currently) and downloads if needed. - """ - - conda: - "envs/kofamscan.yaml" - output: - kofamscan_db_trigger = config["REF_DB_ROOT_DIR"] + config["KOFAMSCAN_DIR"] + "/" + config["KOFAMSCAN_TRIGGER_FILE"] - params: - ko_db_dir = config["REF_DB_ROOT_DIR"] + config["KOFAMSCAN_DIR"], - compressed_ko_list = config["REF_DB_ROOT_DIR"] + config["KOFAMSCAN_DIR"] + "/ko_list.gz", - compressed_profiles = config["REF_DB_ROOT_DIR"] + config["KOFAMSCAN_DIR"] + "/profiles.tar.gz" - log: - config["logs_dir"] + "setup-kofamscan-db.log" - benchmark: - "benchmarks/setup_KOFamScan_db-benchmarks.tsv" - shell: - """ - mkdir -p {params.ko_db_dir} - - printf "### Setting up KOFamScan reference database ###\n\n" > {log} 2>&1 - - # using https instead of ftp for those whose systems that don't have access to the ftp servers - - printf "\n Downloading ko_list file:\n\n" >> {log} 2>&1 - - if ! curl -L -C - --connect-timeout 15 -o {params.compressed_ko_list} ftp://ftp.genome.jp/pub/db/kofam/ko_list.gz >> {log} 2>&1 - then - printf "\n\n Downloading via http since ftp seemed to fail making the connection:\n\n" - curl -L -C - -o {params.compressed_ko_list} https://www.genome.jp/ftp/db/kofam/ko_list.gz >> {log} 2>&1 - fi - - printf "\n\n Downloading profiles.tar.gz file:\n\n" >> {log} 2>&1 - - - if ! curl -L -C - --connect-timeout 15 -o {params.compressed_profiles} ftp://ftp.genome.jp/pub/db/kofam/profiles.tar.gz >> {log} 2>&1 - then - printf "\n\n Downloading via http since ftp seemed to fail making the connection:\n\n" - curl -L -C - -o {params.compressed_profiles} https://www.genome.jp/ftp/db/kofam/profiles.tar.gz >> {log} 2>&1 - fi - - printf "\n\n Decompressing profiles.tar.gz file:\n\n" >> {log} 2>&1 - tar -xzf {params.compressed_profiles} -C {params.ko_db_dir} >> {log} 2>&1 - rm {params.compressed_profiles} - - gunzip {params.compressed_ko_list} - - touch {output.kofamscan_db_trigger} - """ - - -rule setup_gtdbtk_db: - """ - This rule checks for the gtdb-tk db (minimally currently) and downloads if needed. 
- """ - - conda: - "envs/gtdb-tk.yaml" - output: - gtdbtk_db_trigger = config["REF_DB_ROOT_DIR"] + config["GTDB_DATA_PATH"] + "/" + config["GTDB_TRIGGER_FILE"] - params: - gtdbtk_db_dir = config["REF_DB_ROOT_DIR"] + config["GTDB_DATA_PATH"] - log: - config["logs_dir"] + "setup-gtdbtk-db.log" - benchmark: - "benchmarks/setup_gtdbtk_db-benchmarks.tsv" - shell: - """ - mkdir -p {params.gtdbtk_db_dir} - - # storing current working directory to be able to send the log file here - working_dir=$(pwd) - - cd {params.gtdbtk_db_dir} - - # adding wanted location to this conda env PATH (gtdb-tk looks in the GTDBTK_DATA_PATH variable), - # so will be set when the conda environment is started from now on - mkdir -p ${{CONDA_PREFIX}}/etc/conda/activate.d/ - echo 'export GTDBTK_DATA_PATH={params.gtdbtk_db_dir}' >> ${{CONDA_PREFIX}}/etc/conda/activate.d/set_env_vars.sh - - # but still needs to be set for this particular session that is downloading and setting up the db - GTDBTK_DATA_PATH={params.gtdbtk_db_dir} - - # now downloading - download-db.sh > ${{working_dir}}/{log} 2>&1 - - cd - > /dev/null - - touch {output.gtdbtk_db_trigger} - """ - - -rule setup_humann3_dbs: - """ - This rule checks for the databases required for humann3, downloads if needed. - """ - - conda: - "envs/humann3.yaml" - output: - chocophlan_db_trigger = config["REF_DB_ROOT_DIR"] + config["HUMANN3_DBS_DIR"] + "/" + config["HUMANN3_CHOCOPHLAN_TRIGGER_FILE"], - uniref_db_trigger = config["REF_DB_ROOT_DIR"] + config["HUMANN3_DBS_DIR"] + "/" + config["HUMANN3_UNIREF_TRIGGER_FILE"], - utility_db_trigger = config["REF_DB_ROOT_DIR"] + config["HUMANN3_DBS_DIR"] + "/" + config["HUMANN3_UTILITY_MAPPING_TRIGGER_FILE"], - metaphlan_db_trigger = config["REF_DB_ROOT_DIR"] + config["METAPHLAN4_DB_DIR"] + "/" + config["METAPHLAN_TRIGGER_FILE"] - params: - humann3_dbs_dir = config["REF_DB_ROOT_DIR"] + config["HUMANN3_DBS_DIR"], - metaphlan_dir = config["REF_DB_ROOT_DIR"] + config["METAPHLAN4_DB_DIR"] - resources: - mem_mb = 100000 - log: - config["logs_dir"] + "setup-humann3-dbs.log" - benchmark: - "benchmarks/setup_humann3_dbs-benchmarks.tsv" - shell: - """ - mkdir -p {params} - - printf "### Setting up humann3 reference databases ###\n\n" > {log} 2>&1 - - if [ ! -f {output.chocophlan_db_trigger} ] - then - printf " Downloading full chocophlan db:\n\n" >> {log} 2>&1 - humann3_databases --download chocophlan full {params.humann3_dbs_dir} >> {log} 2>&1 - touch {output.chocophlan_db_trigger} - fi - - if [ ! -f {output.uniref_db_trigger} ] - then - printf "\n\n Downloading uniref90_ec_filtered_diamond db:\n\n" >> {log} 2>&1 - humann3_databases --download uniref uniref90_ec_filtered_diamond {params.humann3_dbs_dir} >> {log} 2>&1 - touch {output.uniref_db_trigger} - fi - - if [ ! -f {output.utility_db_trigger} ] - then - printf "\n\n Downloading full utility_mapping db:\n\n" >> {log} 2>&1 - humann3_databases --download utility_mapping full {params.humann3_dbs_dir} >> {log} 2>&1 - touch {output.utility_db_trigger} - fi - - if [ ! 
-f {output.metaphlan_db_trigger} ] - then - printf "\n\n Downloading metaphlan db:\n\n" >> {log} 2>&1 - metaphlan --install --index mpa_vJan21_CHOCOPhlAnSGB_202103 --bowtie2db {params.metaphlan_dir} >> {log} 2>&1 - # above added due to issues discussed here: https://forum.biobakery.org/t/metaphlan-v4-0-2-and-huma-3-6-metaphlan-taxonomic-profile-provided-was-not-generated-with-the-expected-database/4296/29 - # metaphlan --install --bowtie2db {params.metaphlan_dir} >> {log} 2>&1 - touch {output.metaphlan_db_trigger} - fi - """ - - -rule clean_all: - shell: - """ - rm -rf {dirs_to_create} - """ diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/clean-paths.sh b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/clean-paths.sh new file mode 100755 index 00000000..4ac5d2de --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/clean-paths.sh @@ -0,0 +1,20 @@ +#!/usr/bin/env bash +set -e +# only built for use on N288 cluster +# example usage: bash clean-paths.sh +# making sure by chance we are not overwriting a wanted file called 't' + +if [ -s t ]; then +printf "\n This simple program temporarily writes to a file called 't'\n" +printf " Since that exists already here, we are not going to continue.\n\n" +exit +fi + + +sed 's|/global/data/Data_Processing/Metagenomics_Datasets/GLDS_Datasets/||g' ${1} \ +| sed 's|/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/||g' \ +| sed 's|/global/data/Data_Processing/Metagenomics_Datasets/||g' \ +| sed 's|/global/data/Data_Processing/Amplicon_Datasets/||g' \ +| sed 's|/global/smf/miniconda38_admin/envs/[^/]*/||g' \ +| sed 's|/[^ ]*/GLDS-|GLDS-|g' \ +| sed 's|/global/[^ ]*||g' > t && mv t ${1} \ No newline at end of file diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/combine-all-gene-tables.py b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/combine-all-gene-tables.py similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/combine-all-gene-tables.py rename to Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/combine-all-gene-tables.py diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/combine-gene-level-coverages-annots-and-tax-per-group.py b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/combine-gene-level-coverages-annots-and-tax-per-group.py similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/combine-gene-level-coverages-annots-and-tax-per-group.py rename to Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/combine-gene-level-coverages-annots-and-tax-per-group.py diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/create_runsheet.py b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/create_runsheet.py new file mode 100755 index 00000000..b0b4a3cb --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/create_runsheet.py @@ -0,0 +1,513 @@ +#!/usr/bin/env python + +import argparse +import subprocess +import os +import sys +import tempfile +import re +import shutil +import pandas as pd +import requests + + +#################### +## 1. For OSD ARG # +#################### +# 1. Process the OSD arg to proper format +# 2. 
Download the ISA file +# 3. Convert to runsheet(s) +# 4. Select which runsheet to use + +######################## +## 1. For runsheet arg # +######################## +# 1. Select which runsheet to use + +########################## +## 2. Neutral flow after # +########################## +# 1. Validate schema of runsheet +# 2. Check if read_paths are URLs, prompt for download + + +# Process OSD arg: if numeric, append OSD-, if OSD-# or GLDS-#, leave it +def process_osd_argument(osd_arg): + # Check if the argument is just numeric + if osd_arg.isdigit(): + return f"OSD-{osd_arg}" + # Check if it's already in the correct format (OSD-numeric or GLDS-numeric) + elif re.match(r'^(OSD|GLDS)-\d+$', osd_arg): + return osd_arg + else: + print("Invalid format for --OSD argument. Use 'numeric', 'OSD-numeric', or 'GLDS-numeric'.") + sys.exit(1) + +# Check provided OSD/GLDS is not on the list of those that can't be autoprocessed +def check_provided_osd_or_glds(osd_arg): + # dictionaries of OSD/GLDS accessions and reason for not running, key = ID: value = reason + # there are 3 because ID can be provided prefixed with "OSD-", "GLDS-", or nothing - not the most efficient here, but ¯\_(ツ)_/¯ + not_autoprocessable_OSD_dict = { + "OSD-65": "This dataset has multiple different primers mixed in different orientations in each individual sample, and the workflow is unable to handle it in an automated fashion.", + "OSD-66": "This dataset is not a standard amplicon dataset. It is comprised of hundreds of different primers targeting different regions of specific organisms, and the workflow is unable to handle it.", + "OSD-82": "This dataset is still multiplexed, and we don't yet have the mapping information to split the samples apart appropriately." + } + + not_autoprocessable_GLDS_dict = { + "GLDS-65": "This dataset has multiple different primers mixed in different orientations in each individual sample, and the workflow is unable to handle it in an automated fashion.", + "GLDS-66": "This dataset is not a standard amplicon dataset. It is comprised of hundreds of different primers targeting different regions of specific organisms, and the workflow is unable to handle it.", + "GLDS-82": "This dataset is still multiplexed, and we don't yet have the mapping information to split the samples apart appropriately." + } + + not_autoprocessable_dict = { + "65": "This dataset has multiple different primers mixed in different orientations in each individual sample, and the workflow is unable to handle it in an automated fashion.", + "66": "This dataset is not a standard amplicon dataset. It is comprised of hundreds of different primers targeting different regions of specific organisms, and the workflow is unable to handle it.", + "82": "This dataset is still multiplexed, and we don't yet have the mapping information to split the samples apart appropriately." 
+ } + + # Checking based on OSD IDs + if osd_arg in not_autoprocessable_OSD_dict: + print(f"\nThe specified dataset {osd_arg} is unable to be processed with this workflow.") + print(f" Reason: {not_autoprocessable_OSD_dict[osd_arg]}\n") + sys.exit(1) + + # checking based on GLDS IDs + if osd_arg in not_autoprocessable_GLDS_dict: + print(f"\n The specified dataset {osd_arg} is unable to be processed with this workflow.") + print(f" Reason: {not_autoprocessable_GLDS_dict[osd_arg]}\n") + sys.exit(1) + + # checking based on plain IDs + if osd_arg in not_autoprocessable_dict: + print(f"\n The specified dataset {osd_arg} is unable to be processed with this workflow.") + print(f" Reason: {not_autoprocessable_dict[osd_arg]}\n") + sys.exit(1) + +# Run dpt-get-isa-archive in a temp folder, move it back to cd, return the filename +def download_isa_archive(accession_number): + with tempfile.TemporaryDirectory() as temp_dir: + try: + # Run the command in the temporary directory + subprocess.run( + ["dpt-get-isa-archive", "--accession", str(accession_number)], + check=True, + text=True, + cwd=temp_dir + ) + + # Find the downloaded zip file in the temp directory + downloaded_files = [f for f in os.listdir(temp_dir) if f.endswith('.zip')] + if not downloaded_files: + print("No ISA archive file was downloaded.", file=sys.stderr) + return None + + # Assuming there's only one file, get its name + downloaded_file = downloaded_files[0] + + # Move the file back to the current directory + shutil.move(os.path.join(temp_dir, downloaded_file), downloaded_file) + + full_path = os.path.abspath(downloaded_file) + return full_path + + except subprocess.CalledProcessError as e: + print("An error occurred while downloading ISA archive.", file=sys.stderr) + sys.exit(1) + +# Run dpt-isa-to-runsheet in a temp folder, move runsheet(s) back to cd, return list of runsheet(s) +def convert_isa_to_runsheet(accession_number, isa_zip): + with tempfile.TemporaryDirectory() as temp_dir: + # Copy the ISA archive to the temporary directory + temp_isa_zip_path = shutil.copy(isa_zip, temp_dir) + + try: + # Run the dpt-isa-to-runsheet command in the temporary directory + subprocess.run( + ["dpt-isa-to-runsheet", "--accession", accession_number, "--config-type", "amplicon", "--config-version", "Latest", "--isa-archive", os.path.basename(temp_isa_zip_path)], + check=True, + cwd=temp_dir, + stdout=sys.stdout, + stderr=sys.stderr + ) + + # Get the list of created files in the temp directory + created_files = [f for f in os.listdir(temp_dir) if os.path.isfile(os.path.join(temp_dir, f)) and f != os.path.basename(temp_isa_zip_path)] + + # Move the created files back to the current directory + moved_files = [] + for file in created_files: + shutil.move(os.path.join(temp_dir, file), file) + moved_files.append(file) + + return moved_files + + except subprocess.CalledProcessError as e: + print("An error occurred while converting ISA archive to runsheet.", file=sys.stderr) + sys.exit(1) + + +def handle_runsheet_selection(runsheet_files, target=None, specified_runsheet=None): + selected_runsheet = None + + # Use the specified runsheet if provided + if specified_runsheet and specified_runsheet in runsheet_files: + selected_runsheet = specified_runsheet + print(f"Using specified runsheet: {selected_runsheet}") + return selected_runsheet + + if len(runsheet_files) == 1: + if target: + runsheet = runsheet_files[0] + try: + runsheet_df = pd.read_csv(runsheet) + target_region = runsheet_df['Parameter Value[Library Selection]'].unique()[0] + if 
target.lower() == target_region.lower(): + selected_runsheet = runsheet + except Exception as e: + print(f"Error reading {runsheet}: {e}") + print(f"Using runsheet: {selected_runsheet}") + + elif len(runsheet_files) > 1: + if target: + matching_runsheets = [] + for runsheet in runsheet_files: + try: + runsheet_df = pd.read_csv(runsheet) + target_region = runsheet_df['Parameter Value[Library Selection]'].unique()[0] + if target.lower() == target_region.lower(): + matching_runsheets.append(runsheet) + except Exception as e: + print(f"Error reading {runsheet}: {e}") + + if len(matching_runsheets) == 1: + # One matching runsheet found + selected_runsheet = matching_runsheets[0] + print(f"Using runsheet: {selected_runsheet}") + + elif len(matching_runsheets) > 1: + # Multiple matching runsheets found + print("The study contains multiple assays with the same target. Please specify one of the following runsheet names as a parameter for the --specify-runsheet argument:") + for rs in matching_runsheets: + print(rs) + return None + + else: + # No matching runsheets found + print("No runsheet matches the specified genomic target. Please check the target or specify a runsheet using --specify-runsheet.") + return None + + else: + # No target specified and multiple runsheets are available + print("Multiple runsheets found but no genomic target specified. Cannot proceed. Use -t {16S, 18S, ITS} or --target {16S, 18S, ITS} to specify which assay/dataset to use.") + return None + + # Remove unselected runsheet files if a runsheet was selected + if selected_runsheet: + unselected_runsheets = [file for file in runsheet_files if file != selected_runsheet] + for file in unselected_runsheets: + try: + os.remove(file) + except Exception as e: + pass + + return selected_runsheet + +def check_runsheet_read_paths(runsheet_df): + # Check if a string is a URL / genelab URL + def is_url(s): + return "http://" in s or "https://" in s or "genelab-data.ndc.nasa.gov" in s + + + # Check if 'read2_path' column exists + paired_end = runsheet_df['paired_end'].eq(True).all() + + # Check the first row to determine if the paths are URLs or local paths + first_row = runsheet_df.iloc[0] + + uses_url = is_url(first_row['read1_path']) + if uses_url: + print("Runsheet references URLs.") + else: + print("Runsheet references local read files.") + + return uses_url + +def sample_IDs_from_local(runsheet_df, output_file='unique-sample-IDs.txt'): + # Check if the DataFrame is paired-end + paired_end = runsheet_df['paired_end'].eq(True).all() + + with open(output_file, 'w') as file: + for index, row in runsheet_df.iterrows(): + # Extract base names minus the suffixes + base_read1 = os.path.basename(row['read1_path']).replace(row['raw_R1_suffix'], '') + + if paired_end: + base_read2 = os.path.basename(row['read2_path']).replace(row['raw_R2_suffix'], '') + # Check if base names match for paired-end data, necessary for snakemake arg expansion + if base_read1 != base_read2: + print(f"Mismatch in sample IDs in row {index}: {base_read1} vs {base_read2}") + sys.exit(1) + + # Write the base name to the file + file.write(f"{base_read1}\n") + + print(f"Unique sample IDs written to {output_file}") + +def handle_url_downloads(runsheet_df, output_file='unique-sample-IDs.txt'): + print("Downloading read files...") + # Check if the DataFrame is paired-end + paired_end = runsheet_df['paired_end'].eq(True).all() + # Write 'Sample Name' into unique-sample-IDs.txt + with open(output_file, 'w') as file: + for sample_name in runsheet_df['Sample Name']: + 
file.write(sample_name + '\n') + + # Create ./raw_reads/ directory if it does not exist + raw_reads_dir = os.path.abspath('./raw_reads/') + if not os.path.exists(raw_reads_dir): + os.makedirs(raw_reads_dir) + + # Initialize count for skipped downloads + skipped_downloads_count = 0 + # Iterate over each row and download files if they don't exist + for _, row in runsheet_df.iterrows(): + sample_id = row['Sample Name'] + read1_path = os.path.join(raw_reads_dir, sample_id + row['raw_R1_suffix']) + read2_path = os.path.join(raw_reads_dir, sample_id + row['raw_R2_suffix']) if paired_end else None + + # Download Read 1 if it doesn't exist + if not os.path.exists(read1_path): + download_url_to_file(row['read1_path'], read1_path) + else: + skipped_downloads_count += 1 + + # Download Read 2 if it doesn't exist and if paired_end + if paired_end and read2_path and not os.path.exists(read2_path): + download_url_to_file(row['read2_path'], read2_path) + elif paired_end and read2_path: + skipped_downloads_count += 1 + + # Print the number of skipped downloads + if skipped_downloads_count > 0: + print(f"{skipped_downloads_count} read file(s) were already present and were not downloaded.") + +def download_url_to_file(url, file_path, max_retries=3, timeout_seconds=120): + retries = 0 + success = False + + while retries < max_retries and not success: + try: + response = requests.get(url, stream=True, timeout=timeout_seconds) + response.raise_for_status() # Raises an HTTPError for bad status codes + + with open(file_path, 'wb') as file: + shutil.copyfileobj(response.raw, file) + success = True + + except (requests.exceptions.HTTPError, requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e: + retries += 1 + print(f"Attempt {retries}: Error occurred: {e}") + + except requests.exceptions.RequestException as e: + print(f"An unexpected error occurred: {e}") + break + + if not success: + print("Failed to download the read files.") + + +def write_params(runsheet_df, uses_urls): + + # Extract necessary variables from runsheet_df + data_type = "PE" if runsheet_df['paired_end'].eq(True).all() else "SE" + raw_R1_suffix = runsheet_df['raw_R1_suffix'].unique()[0] + raw_R2_suffix = runsheet_df['raw_R2_suffix'].unique()[0] if data_type == "PE" else "" + f_primer = runsheet_df['F_Primer'].unique()[0] + r_primer = runsheet_df['R_Primer'].unique()[0] if data_type == "PE" else "" + target_region = runsheet_df['Parameter Value[Library Selection]'].unique()[0] + + # Determine raw_reads_directory + if uses_urls: + raw_reads_directory = os.path.abspath('./raw_reads/') + '/' + else: + read1_path_dir = os.path.dirname(runsheet_df['read1_path'].iloc[0]) + raw_reads_directory = os.path.abspath(read1_path_dir) + '/' if read1_path_dir else "./" + + with open("GLparams_file.csv", "w") as f: + f.write("raw_reads_directory,raw_R1_suffix,raw_R2_suffix,f_primer,r_primer,target_region,data_type\n") + if data_type == "PE": + f.write(f"{raw_reads_directory},{raw_R1_suffix},{raw_R2_suffix},{f_primer},{r_primer},{target_region},{data_type}\n") + else: + f.write(f"{raw_reads_directory},{raw_R1_suffix},{f_primer},{r_primer},{target_region},{data_type}\n") + + + +def write_input_file(runsheet_df): + """ Write input file for the workflow...""" + + print("writing out GLfile.csv...") + # Check if the DataFrame is paired-end + paired_end = runsheet_df['paired_end'].eq(True).all() + + # Create ./raw_reads/ directory if it does not exist + raw_reads_dir = os.path.abspath('./raw_reads/') + if not os.path.exists(raw_reads_dir): + 
os.makedirs(raw_reads_dir) + + # Create input file + with open("GLfile.csv", 'w') as file: + + if paired_end: + file.write(f"sample_id,forward,reverse,paired\n") + # Iterate over each row and download files if they don't exist + for _, row in runsheet_df.iterrows(): + sample_id = row['Sample Name'] + read1_path = os.path.join(raw_reads_dir, sample_id + row['raw_R1_suffix']) + read2_path = os.path.join(raw_reads_dir, sample_id + row['raw_R2_suffix']) + file.write(f"{sample_id},{read1_path},{read2_path},true\n") + else: + file.write(f"sample_id,forward,paired\n") + for _, row in runsheet_df.iterrows(): + sample_id = row['Sample Name'] + read1_path = os.path.join(raw_reads_dir, sample_id + row['raw_R1_suffix']) + file.write(f"{sample_id},{read1_path},false\n") + + +# Check for single primer set, also check for invalid characters in primers used, exit if either +def validate_primer_sequences(runsheet_df): + errors = [] + + # Check that there is only 1 entry in each primer column + if len(runsheet_df['F_Primer'].unique()) > 1: + errors.append(f"Multiple primer sequences present in F_Primer: {runsheet_df['F_Primer'].unique()}.") + + if len(runsheet_df['R_Primer'].unique()) > 1: + errors.append(f"Multiple primer sequences present in R_primer: {runsheet_df['R_Primer'].unique()}.") + + + # Check for non-letter characters in primer sequences + def has_non_letter_characters(primer): + # Pattern to find any character that is not a letter + non_letter_pattern = re.compile(r'[^A-Za-z]') + return non_letter_pattern.search(primer) + + # Check each unique primer in the F_Primer and R_Primer columns + for f_primer in runsheet_df['F_Primer'].unique(): + if has_non_letter_characters(f_primer): + errors.append(f"Non-letter characters detected in F_Primer: '{f_primer}'") + + for r_primer in runsheet_df['R_Primer'].unique(): + if has_non_letter_characters(r_primer): + errors.append(f"Non-letter characters detected in R_Primer: '{r_primer}'") + + if errors: + print("Error: Invalid primer sequence(s) detected in the runsheet.") + for error in errors: + print(f" - {error}") + print("Correct the primer sequences in the runsheet and rerun the workflow from the runsheet using the --runsheetPath argument.") + sys.exit(1) + + +def main(): + # Argument parser setup with short argument names and an automatic help option + parser = argparse.ArgumentParser( + description='Create Runsheet from Genelab ID.', + add_help=True, + usage='%(prog)s [options]' # Custom usage message + ) + + parser.add_argument('-o', '--OSD', + metavar='osd_number', + help='A GeneLab OSD dataset accession number to pull its read files and associated metadata. Acceptable formats: ###, OSD-###, GLDS-###', + type=str) + + parser.add_argument('-t', '--target', + choices=['16S', '18S', 'ITS'], + help='Specify the amplicon target for the assay. Options: 16S, 18S, ITS. This is used to select the appropriate dataset from an OSD study when multiple options are available.', + type=str) + + parser.add_argument('-r', '--runsheetPath', + metavar='/path/to/runsheet.csv', + help='Set up the Snakemake workflow using a specified runsheet file.', + type=str) + + + parser.add_argument('--specify-runsheet', + help='Specifies the runsheet for an OSD dataset by name. 
Only used if there are multiple datasets with the same target in the study.', + metavar='runsheet_name', + type=str) + + + # Check if no arguments were provided + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) + + try: + args = parser.parse_args() + except SystemExit: + parser.print_help() + sys.exit(1) + + target = args.target + isa_zip = "" + + # If OSD is used, pull ISA metadata for the study, create and select the runsheet + if args.OSD: + accession_number = process_osd_argument(args.OSD) + + # checking OSD/GLDS ID is not on the list of those the workflow definitely can't handle + check_provided_osd_or_glds(args.OSD) + + isa_zip = download_isa_archive(accession_number) + if isa_zip: + runsheet_files = convert_isa_to_runsheet(accession_number, isa_zip) + if runsheet_files: + runsheet_file = handle_runsheet_selection(runsheet_files, target, args.specify_runsheet) + if runsheet_file is None: + sys.exit() + else: + print("No runsheet files were created.") + else: + print("No ISA archive was downloaded. Cannot proceed to runsheet conversion.", file=sys.stderr) + sys.exit(1) + + # If a runsheet is specified, use that runsheet + elif args.runsheetPath: + runsheet_file = args.runsheetPath + + # Load the runsheet if a file is specified + # Create unique-sample-IDs.txt based on filenames or 'Sample Name' if URLs + # Download files if necessary + if args.OSD or args.runsheetPath: + if runsheet_file: + #runsheet_df = validate_runsheet_schema(runsheet_file) + runsheet_df = pd.read_csv(runsheet_file) + if runsheet_df is not None: + uses_urls = check_runsheet_read_paths(runsheet_df) + + # Check for primer file / invalid primers + validate_primer_sequences(runsheet_df) + + # Create the 'unique-sample-IDs.txt' file and download read files if necessary + if uses_urls: + handle_url_downloads(runsheet_df, output_file='unique-sample-IDs.txt') + else: + sample_IDs_from_local(runsheet_df, output_file='unique-sample-IDs.txt') + + # Create the config.yaml file + write_params(runsheet_df=runsheet_df, uses_urls=uses_urls) + # Create input file required by the workflow + write_input_file(runsheet_df=runsheet_df) + else: + print("Failed to validate the runsheet file.", file=sys.stderr) + sys.exit(1) + else: + print("No runsheet file specified.", file=sys.stderr) + sys.exit(1) + + + + +if __name__ == "__main__": + main() diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/download-db.sh b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/download-db.sh new file mode 100644 index 00000000..53a711e4 --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/download-db.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash + +set -e + +echo "Downloading the GTDB-Tk database to ${GTDBTK_DATA_PATH}..." + +# GTDBTK_DB_PATH is defined in build.sh, store the db there + + +db_url=https://data.gtdb.ecogenomic.org/releases/release202/202.0/auxillary_files/gtdbtk_r202_data.tar.gz + + +wget $db_url -P ${GTDBTK_DATA_PATH} +tar xvzf ${GTDBTK_DATA_PATH}/gtdbtk_r202_data.tar.gz -C ${GTDBTK_DATA_PATH} --strip 1 +rm ${GTDBTK_DATA_PATH}/gtdbtk_r202_data.tar.gz + +echo "GTDB-Tk database has been successfully downloaded." 
+
+exit 0
diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/format-contig-tax-classifications.sh b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/format-contig-tax-classifications.sh
similarity index 100%
rename from Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/format-contig-tax-classifications.sh
rename to Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/format-contig-tax-classifications.sh
diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/format-gene-tax-classifications.sh b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/format-gene-tax-classifications.sh
similarity index 100%
rename from Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/format-gene-tax-classifications.sh
rename to Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/format-gene-tax-classifications.sh
diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/generate-assembly-based-overview-table.sh b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/generate-assembly-based-overview-table.sh
similarity index 100%
rename from Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/generate-assembly-based-overview-table.sh
rename to Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/generate-assembly-based-overview-table.sh
diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/get-cov-and-depth.sh b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/get-cov-and-depth.sh
new file mode 100755
index 00000000..a0641fed
--- /dev/null
+++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/get-cov-and-depth.sh
@@ -0,0 +1,67 @@
+#!/usr/bin/env bash
+SAMPLE_ID=$1
+ASSEMBLY=$2
+NT=$3
+BAM=$4
+PILEUP_MEM=$5
+
+
+# Only running if the assembly produced anything
+if [ -s ${ASSEMBLY} ]; then
+
+    # Only running on genes also if genes were identified
+    if [ -s ${NT} ]; then
+
+        pileup.sh -Xmx${PILEUP_MEM} -in ${BAM} \
+                  fastaorf=${NT} outorf=${SAMPLE_ID}-gene-cov-and-det.tmp \
+                  out=${SAMPLE_ID}-contig-cov-and-det.tmp
+
+        # Filtering coverages based on detection
+        # Genes
+        grep -v "#" ${SAMPLE_ID}-gene-cov-and-det.tmp | \
+            awk -F $'\t' ' BEGIN {OFS=FS} { if ( $10 <= 0.5 ) $4 = 0 } { print $1,$4 } ' \
+            > ${SAMPLE_ID}-gene-cov.tmp
+
+        cat <( printf "gene_ID\tcoverage\n" ) ${SAMPLE_ID}-gene-cov.tmp \
+            > ${SAMPLE_ID}-gene-coverages.tsv
+
+        # Contigs
+        grep -v "#" ${SAMPLE_ID}-contig-cov-and-det.tmp | \
+            awk -F $'\t' ' BEGIN {OFS=FS} { if ( $5 <= 50 ) $2 = 0 } { print $1,$2 } ' \
+            > ${SAMPLE_ID}-contig-cov.tmp
+
+        cat <( printf "contig_ID\tcoverage\n" ) ${SAMPLE_ID}-contig-cov.tmp \
+            > ${SAMPLE_ID}-contig-coverages.tsv
+
+        # Removing intermediate files
+        rm ${SAMPLE_ID}-gene-cov-and-det.tmp ${SAMPLE_ID}-contig-cov-and-det.tmp \
+           ${SAMPLE_ID}-gene-cov.tmp ${SAMPLE_ID}-contig-cov.tmp
+
+    else
+
+        pileup.sh -in ${BAM} out=${SAMPLE_ID}-contig-cov-and-det.tmp
+
+        # Filtering coverages based on detection
+        # Contigs
+        grep -v "#" ${SAMPLE_ID}-contig-cov-and-det.tmp | \
+            awk -F $'\t' ' BEGIN {OFS=FS} { if ( $5 <= 50 ) $2 = 0 } { print $1,$2 } ' \
+            > ${SAMPLE_ID}-contig-cov.tmp
+        cat <( printf "contig_ID\tcoverage\n" ) ${SAMPLE_ID}-contig-cov.tmp \
+            > ${SAMPLE_ID}-contig-coverages.tsv
+
+        # Writing out empty genes coverage file
+        printf "gene_ID\tcoverage\n" > ${SAMPLE_ID}-gene-coverages.tsv
+        printf "\n\nGene-level coverage info not recovered because the assembly didn't have any genes identified.\n"
+
+        # Removing intermediate files
+        rm ${SAMPLE_ID}-contig-cov-and-det.tmp ${SAMPLE_ID}-contig-cov.tmp
+
+    fi
+
+else
+
+    printf "gene_ID\tcoverage\n" > ${SAMPLE_ID}-gene-coverages.tsv
+    printf "contig_ID\tcoverage\n" > ${SAMPLE_ID}-contig-coverages.tsv
+    printf "Coverage info not recovered because the assembly didn't produce anything.\n"
+
+fi
\ No newline at end of file
diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/get_MAGs_estimates_and_taxonomy.sh b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/get_MAGs_estimates_and_taxonomy.sh
new file mode 100755
index 00000000..2bf57eb5
--- /dev/null
+++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/get_MAGs_estimates_and_taxonomy.sh
@@ -0,0 +1,26 @@
+#!/usr/bin/env bash
+MAGs_dir=$1
+MAG_assembly_summaries=$2
+MAGs_checkm_out=$3
+gtdbtk_out=$4
+
+# Making sure none of the intermediate files exist already
+rm -rf checkm-estimates.tmp \
+       gtdb-taxonomies.tmp \
+       checkm-estimates-with-headers.tmp \
+       gtdb-taxonomies-with-headers.tmp \
+       MAGs-overview.tmp \
+       MAGs-overview-header.tmp \
+       MAGs-overview-sorted.tmp
+for MAG in $(cut -f 1 ${MAG_assembly_summaries} | tail -n +2); do
+
+    grep -w -m 1 "^${MAG}" ${MAGs_checkm_out} | \
+        cut -f 12,13,14 >> checkm-estimates.tmp
+
+    grep -w "^${MAG}" ${gtdbtk_out}/gtdbtk.*.summary.tsv | \
+        cut -f 2 | sed 's/^.__//' | \
+        sed 's/;.__/\t/g' | \
+        awk 'BEGIN{ OFS=FS="\t" } { for (i=1; i<=NF; i++) if ( $i ~ /^ *$/ ) $i = "NA" }; 1' \
+        >> gtdb-taxonomies.tmp
+done
+
diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/parse-MAG-annots.py b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/parse-MAG-annots.py
similarity index 100%
rename from Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/parse-MAG-annots.py
rename to Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/parse-MAG-annots.py
diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/swap-MAG-IDs.py b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/swap-MAG-IDs.py
similarity index 100%
rename from Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/swap-MAG-IDs.py
rename to Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/swap-MAG-IDs.py
diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/config.yaml b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/config.yaml
deleted file mode 100644
index d340c671..00000000
--- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/config.yaml
+++ /dev/null
@@ -1,258 +0,0 @@
-############################################################################################
-##   Configuration file for GeneLab Illumina metagenomics processing workflow             ##
-## Developed by Michael D.
Lee (Mike.Lee@nasa.gov) ## -############################################################################################ - -############################################################ -##################### VARIABLES TO SET ##################### -############################################################ - -############################################################################ -##### This first set of variables needs to match what is on our system ##### -############################################################################ - -## single-column file with unique portion of sample names -sample_info_file: - "unique-sample-IDs.txt" - -## raw reads directory (can be relative to workflow directory, or needs to be full path) -raw_reads_dir: - "../Raw_Sequence_Data/" - -## if data are single-end only (only one read-file per sample), set this to "TRUE", anything else is considered paired-end -single_end_data: - "" - -## raw read suffixes (region following the unique part of the sample names) - # e.g. for "Sample-1_R1_raw.fastq.gz" would be "_R1_raw.fastq.gz" -raw_R1_suffix: - "_R1_raw.fastq.gz" -raw_R2_suffix: - "_R2_raw.fastq.gz" - - # if single-end data, set this one (others above don't matter) -raw_suffix: - "_raw.fastq.gz" - -## root directory of reference databases (or where they will be downloaded if they don't exist yet) - # this should be provided as a full path (starting with `/`) and include the ending `/` as in the - # below example (note that the the `~/` home shortcut is not expanded - # by snakemake's evaluation of files, so don't use that) - # also note, if this is a GeneLab processed dataset config file, the path may - # have been modified for security purposes and no longer be listed as a full path here -REF_DB_ROOT_DIR: - "/path/to/ref-dbs/" - -###################################################################### -##### The rest only need to be altered if we want to change them ##### -###################################################################### - -## run assembly-based workflow, read-based, or both -# (values need to be one of: "assembly-based", "read-based", or "both") -workflow: - "both" - -## number of threads to use PER snakemake job (which is set with the -j parameter passed to snakemake call) - # passed to megahit, bowtie2, samtools, metabat2, checkm-pplacer (many may be running concurrently) -num_threads: - 8 - -## number of CPUs to use PER snakemake job - # passed to KOFamScan, CAT, checkm (many may be running concurrently) -num_cpus: - 8 - -## number of cpus passed to pplacer by gtdb-tk and checkm, pplacer can have issues with memory with multiple cpus; see e.g. https://ecogenomics.github.io/GTDBTk/faq.html#gtdb-tk-reaches-the-memory-limit-pplacer-crashes -gtdb_tk_checkm_pplacer_cpus: - 1 - -## number of CPUs to use for gtdb-tk (only 1 gtdb-tk job will be run, so not multiplied) -gtdb_tk_num_cpus: - 8 - -## scratch directory for gtdb-tk, if wanting to use disk space instead of RAM, can be memory intensive; see https://ecogenomics.github.io/GTDBTk/faq.html#gtdb-tk-reaches-the-memory-limit-pplacer-crashes - # leave empty if wanting to use memory, the default, put in quotes the path to a directory that already exists if wanting to use disk space -gtdb_tk_scratch_location: - "" - -## maximum memory allowed passed to megahit assembler - # can be set either by proportion of available on system, e.g. 0.5 - # or by absolute value in bytes, e.g. 
100e9 would be 100 GB -max_mem_megahit: - 100e9 - -## Block size variable for CAT/diamond, lower value means less RAM usage; see https://github.com/bbuchfink/diamond/wiki/3.-Command-line-options#memory--performance-options -block_size: - 4 - -## reduced_tree option for checkm, limits the RAM usage to 16GB; https://github.com/Ecogenomics/CheckM/wiki/Genome-Quality-Commands#tree - # "TRUE" for yes, anything will be considered "FALSE" and the default full tree will be used -reduced_tree: - "" - -## MAG filtering cutoffs based on checkm quality assessments (in percent); see https://github.com/Ecogenomics/CheckM/wiki/Reported-Statistics -minimum_estimated_completion: - 90 -maximum_estimated_redundancy: - 10 -maximum_estimated_strain_heterogeneity: - 50 - -## quality trimmed/filtered suffixes -filtered_R1_suffix: - "_R1_filtered.fastq.gz" -filtered_R2_suffix: - "_R2_filtered.fastq.gz" - -# if single-end -filtered_suffix: - "_filtered.fastq.gz" - -## output directories (all relative to processing directory, will be created) -fastqc_out_dir: - "../FastQC_Outputs/" -filtered_reads_dir: - "../Filtered_Sequence_Data/" -assembly_based_dir: - "../Assembly-based_Processing/" -assemblies_dir: - "../Assembly-based_Processing/assemblies/" -genes_dir: - "../Assembly-based_Processing/predicted-genes/" -annotations_and_tax_dir: - "../Assembly-based_Processing/annotations-and-taxonomy/" -mapping_dir: - "../Assembly-based_Processing/read-mapping/" -combined_output_dir: - "../Assembly-based_Processing/combined-outputs/" -bins_dir: - "../Assembly-based_Processing/bins/" -MAGs_dir: - "../Assembly-based_Processing/MAGs/" -read_based_dir: - "../Read-based_Processing/" -logs_dir: - "logs/" - - -## additional prefix to add to output files that describe more than one sample (to make them unique compared to other datasets) -# leave as empty, i.e. "", if not wanted, include separator at end if adding one, e.g. 
"Swift1S_" -additional_filename_prefix: - "" - - -## setting for trimming recommended when working with Swift 1S libraries - # adds `swift=t` setting to bbduk quality trimming/filtering command - # for info on this see, e.g., https://swiftbiosci.com/wp-content/uploads/2019/03/16-0853-Tail-Trim-Final-442019.pdf - # set to "TRUE" if data was generated with Swift 1S library prep -swift_1S: - "FALSE" - -## memory used by bbmap's pileup.sh (within the get_cov_and_det rule) -# passed as the -Xmx parameter, 20g means 20 gigs of RAM, 20m means 20 megabytes -# 5g should be sufficient for most assemblies, but if that rule is failing, this may need to be increased -pileup_mem: - "5g" - -################################################################################################################ -##### Resource specifications that may need to be changed (mostly only necessary if using a job scheduler) ##### -####### Could leave these as-is to start, but they are here to be increased if a job fails due to memory ####### -################################################################################################################ - -### these are all passed in the "resources" directive of their respective rules in the Snakefile, going to - # the "mem_mb" argument (so should be provided in terms of megabytes) - -# passed to megahit in the assembly_PE and assembly_SE rules - # this should match what is passed to "max_mem_megahit" above, though it needs to be written differently - # this is passed as "mem_mb", so 100000 would be equal to the default 100e9 set above for "max_mem_megahit" -megahit_memory_resources: - 100000 - -# passed to pileup.sh within the get_cov_and_det rule - # should match what is passed to "pileup_mem" above, though needs to be written differently - # this is passed as "mem_mb", e.g., 5g above, for 5 gigabytes, would be 5000 megabytes, so we need to set this variable to 5000 -pileup_memory_resources: - 5000 - -# passed to mapping_SE and mapping_PE rules, passed as "mem_mb", so 25000 here means 25 gigabytes of memory will be allocated by the scheduler -mapping_memory_resources: - 25000 - -# passed to rule gtdbtk_on_MAGs -gtdbtk_memory_resources: - 500000 - -# passed to rule checkm_on_bins: -checkm_memory_resources: - 250000 - -# passed to humann3 on rules humann3_PE and humann3_SE: -humann3_memory_resources: - 100000 - -# passed to CAT in tax_classification rule: -CAT_memory_resources: - 40000 - -# passed to KOFamScan in rule KO_annotation -KOFamScan_memory_resources: - 5000 - - -####################################################### -################# REFERENCE DATABASES ################# -####################################################### -# The below variables probably shouldn't be changed unless we really want to for some reason. -# The workflow will check the location pointed to above for the below databases, and install them -# if they are not already there. It looks for the below "TRIGGER" filenames (they -# all end with "*_DB_SETUP") in the directory for each database, which it creates when -# it sets them up initially. If we want to point to DBs that already exist on our setup, -# we need to add these (empty) files to their respective directories. The -# workflow just checks the file is there to know it doesn't need to setup the DB. -# -# All together, after installed and unpacked, these will take up about 240 GB. But may -# require up to 500 GB during installation and initial un-packing. 
- -## specific database locations -KOFAMSCAN_DIR: - "kofamscan_db" -KOFAMSCAN_TRIGGER_FILE: - "KO_DB_SETUP" -CAT_DIR: - "CAT_prepare_20210107" -CAT_DL_FILE: - "CAT_prepare_20210107.tar.gz" -CAT_DL_LINK: - "tbb.bio.uu.nl/bastiaan/CAT_prepare/CAT_prepare_20210107.tar.gz" -CAT_TRIGGER_FILE: - "CAT_DB_SETUP" -CAT_DB: - "/2021-01-07_CAT_database" -CAT_TAX: - "/2021-01-07_taxonomy" -GTDB_DATA_PATH: - "GTDB-tk-ref-db" -GTDB_TRIGGER_FILE: - "GTDBTK_DB_SETUP" -HUMANN3_DBS_DIR: - "humann3-db" -HUMANN3_CHOCOPHLAN_TRIGGER_FILE: - "CHOCOPHLAN_DB_SETUP" -HUMANN3_UNIREF_TRIGGER_FILE: - "UNIREF_DB_SETUP" -HUMANN3_UTILITY_MAPPING_TRIGGER_FILE: - "UTILITY_MAPPING_SETUP" -METAPHLAN4_DB_DIR: - "metaphlan4-db" -METAPHLAN_TRIGGER_FILE: - "METAPHLAN4_DB_SETUP" - -## example usage command ## -# snakemake --use-conda --conda-prefix ${CONDA_PREFIX}/envs -j 2 -p - -# `--use-conda` – this specifies to use the conda environments included in the workflow -# `--conda-prefix` – this allows us to point to where the needed conda environments should be stored. Including this means if we use the workflow on a different dataset somewhere else in the future, it will re-use the same conda environments rather than make new ones. The value listed here, `${CONDA_PREFIX}/envs`, is the default location for conda environments (the variable `${CONDA_PREFIX}` will be expanded to the appropriate location on whichever system it is run on). -# `-j` – this lets us set how many jobs Snakemake should run concurrently (keep in mind that many of the thread and cpu parameters set in the config.yaml file will be multiplied by this) -# `-p` – specifies to print out each command being run to the screen - -# See `snakemake -h` for more options and details. diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/config/bbtools_adapters.fa b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/config/bbtools_adapters.fa new file mode 100644 index 00000000..a87d4258 --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/config/bbtools_adapters.fa @@ -0,0 +1,317 @@ +>Reverse_adapter +AGATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTG +>TruSeq_Universal_Adapter +AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT +>pcr_dimer +AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCTAGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGATCTCGTATGCCGTCTTCTGCTTG +>PCR_Primers +AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCTCAAGCAGAAGACGGCATACGAGCTCTTCCGATCT +>TruSeq_Adapter_Index_1_6 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTG +>TruSeq_Adapter_Index_2 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACCGATGTATCTCGTATGCCGTCTTCTGCTTG +>TruSeq_Adapter_Index_3 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACTTAGGCATCTCGTATGCCGTCTTCTGCTTG +>TruSeq_Adapter_Index_4 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACTGACCAATCTCGTATGCCGTCTTCTGCTTG +>TruSeq_Adapter_Index_5 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACACAGTGATCTCGTATGCCGTCTTCTGCTTG +>TruSeq_Adapter_Index_6 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACGCCAATATCTCGTATGCCGTCTTCTGCTTG +>TruSeq_Adapter_Index_7 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACCAGATCATCTCGTATGCCGTCTTCTGCTTG +>TruSeq_Adapter_Index_8 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACACTTGAATCTCGTATGCCGTCTTCTGCTTG +>TruSeq_Adapter_Index_9 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACGATCAGATCTCGTATGCCGTCTTCTGCTTG +>TruSeq_Adapter_Index_10 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACTAGCTTATCTCGTATGCCGTCTTCTGCTTG +>TruSeq_Adapter_Index_11 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACGGCTACATCTCGTATGCCGTCTTCTGCTTG 
+>TruSeq_Adapter_Index_12 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACCTTGTAATCTCGTATGCCGTCTTCTGCTTG +>TruSeq_Adapter_Index_13 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACAGTCAACAATCTCGTATGCCGTCTTCTGCTTG +>TruSeq_Adapter_Index_14 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACAGTTCCGTATCTCGTATGCCGTCTTCTGCTTG +>TruSeq_Adapter_Index_15 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACATGTCAGAATCTCGTATGCCGTCTTCTGCTTG +>TruSeq_Adapter_Index_16 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACCCGTCCCGATCTCGTATGCCGTCTTCTGCTTG +>TruSeq_Adapter_Index_18_7 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTCCGCACATCTCGTATGCCGTCTTCTGCTTG +>TruSeq_Adapter_Index_19 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTGAAACGATCTCGTATGCCGTCTTCTGCTTG +>TruSeq_Adapter_Index_20 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTGGCCTTATCTCGTATGCCGTCTTCTGCTTG +>TruSeq_Adapter_Index_21 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTTTCGGAATCTCGTATGCCGTCTTCTGCTTG +>TruSeq_Adapter_Index_22 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACCGTACGTAATCTCGTATGCCGTCTTCTGCTTG +>TruSeq_Adapter_Index_23 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACGAGTGGATATCTCGTATGCCGTCTTCTGCTTG +>TruSeq_Adapter_Index_25 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACACTGATATATCTCGTATGCCGTCTTCTGCTTG +>TruSeq_Adapter_Index_27 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACATTCCTTTATCTCGTATGCCGTCTTCTGCTTG +>I5_Nextera_Transposase_1 +CTGTCTCTTATACACATCTGACGCTGCCGACGA +>I7_Nextera_Transposase_1 +CTGTCTCTTATACACATCTCCGAGCCCACGAGAC +>I5_Nextera_Transposase_2 +CTGTCTCTTATACACATCTCTGATGGCGCGAGGGAGGC +>I7_Nextera_Transposase_2 +CTGTCTCTTATACACATCTCTGAGCGGGCTGGCAAGGC +>I5_Primer_Nextera_XT_and_Nextera_Enrichment_[N/S/E]501 +GACGCTGCCGACGAGCGATCTAGTGTAGATCTCGGTGGTCGCCGTATCATT +>I5_Primer_Nextera_XT_and_Nextera_Enrichment_[N/S/E]502 +GACGCTGCCGACGAATAGAGAGGTGTAGATCTCGGTGGTCGCCGTATCATT +>I5_Primer_Nextera_XT_and_Nextera_Enrichment_[N/S/E]503 +GACGCTGCCGACGAAGAGGATAGTGTAGATCTCGGTGGTCGCCGTATCATT +>I5_Primer_Nextera_XT_and_Nextera_Enrichment_[N/S/E]504 +GACGCTGCCGACGATCTACTCTGTGTAGATCTCGGTGGTCGCCGTATCATT +>I5_Primer_Nextera_XT_and_Nextera_Enrichment_[N/S/E]505 +GACGCTGCCGACGACTCCTTACGTGTAGATCTCGGTGGTCGCCGTATCATT +>I5_Primer_Nextera_XT_and_Nextera_Enrichment_[N/S/E]506 +GACGCTGCCGACGATATGCAGTGTGTAGATCTCGGTGGTCGCCGTATCATT +>I5_Primer_Nextera_XT_and_Nextera_Enrichment_[N/S/E]507 +GACGCTGCCGACGATACTCCTTGTGTAGATCTCGGTGGTCGCCGTATCATT +>I5_Primer_Nextera_XT_and_Nextera_Enrichment_[N/S/E]508 +GACGCTGCCGACGAAGGCTTAGGTGTAGATCTCGGTGGTCGCCGTATCATT +>I5_Primer_Nextera_XT_and_Nextera_Enrichment_[N/S/E]517 +GACGCTGCCGACGATCTTACGCGTGTAGATCTCGGTGGTCGCCGTATCATT +>I7_Primer_Nextera_XT_and_Nextera_Enrichment_N701 +CCGAGCCCACGAGACTAAGGCGAATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_and_Nextera_Enrichment_N702 +CCGAGCCCACGAGACCGTACTAGATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_and_Nextera_Enrichment_N703 +CCGAGCCCACGAGACAGGCAGAAATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_and_Nextera_Enrichment_N704 +CCGAGCCCACGAGACTCCTGAGCATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_and_Nextera_Enrichment_N705 +CCGAGCCCACGAGACGGACTCCTATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_and_Nextera_Enrichment_N706 +CCGAGCCCACGAGACTAGGCATGATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_and_Nextera_Enrichment_N707 +CCGAGCCCACGAGACCTCTCTACATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_and_Nextera_Enrichment_N708 +CCGAGCCCACGAGACCAGAGAGGATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_and_Nextera_Enrichment_N709 +CCGAGCCCACGAGACGCTACGCTATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_and_Nextera_Enrichment_N710 +CCGAGCCCACGAGACCGAGGCTGATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_and_Nextera_Enrichment_N711 
+CCGAGCCCACGAGACAAGAGGCAATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_and_Nextera_Enrichment_N712 +CCGAGCCCACGAGACGTAGAGGAATCTCGTATGCCGTCTTCTGCTTG +>I5_Primer_Nextera_XT_Index_Kit_v2_S502 +GACGCTGCCGACGAATAGAGAGGTGTAGATCTCGGTGGTCGCCGTATCATT +>I5_Primer_Nextera_XT_Index_Kit_v2_S503 +GACGCTGCCGACGAAGAGGATAGTGTAGATCTCGGTGGTCGCCGTATCATT +>I5_Primer_Nextera_XT_Index_Kit_v2_S505 +GACGCTGCCGACGACTCCTTACGTGTAGATCTCGGTGGTCGCCGTATCATT +>I5_Primer_Nextera_XT_Index_Kit_v2_S506 +GACGCTGCCGACGATATGCAGTGTGTAGATCTCGGTGGTCGCCGTATCATT +>I5_Primer_Nextera_XT_Index_Kit_v2_S507 +GACGCTGCCGACGATACTCCTTGTGTAGATCTCGGTGGTCGCCGTATCATT +>I5_Primer_Nextera_XT_Index_Kit_v2_S508 +GACGCTGCCGACGAAGGCTTAGGTGTAGATCTCGGTGGTCGCCGTATCATT +>I5_Primer_Nextera_XT_Index_Kit_v2_S510 +GACGCTGCCGACGAATTAGACGGTGTAGATCTCGGTGGTCGCCGTATCATT +>I5_Primer_Nextera_XT_Index_Kit_v2_S511 +GACGCTGCCGACGACGGAGAGAGTGTAGATCTCGGTGGTCGCCGTATCATT +>I5_Primer_Nextera_XT_Index_Kit_v2_S513 +GACGCTGCCGACGACTAGTCGAGTGTAGATCTCGGTGGTCGCCGTATCATT +>I5_Primer_Nextera_XT_Index_Kit_v2_S515 +GACGCTGCCGACGAAGCTAGAAGTGTAGATCTCGGTGGTCGCCGTATCATT +>I5_Primer_Nextera_XT_Index_Kit_v2_S516 +GACGCTGCCGACGAACTCTAGGGTGTAGATCTCGGTGGTCGCCGTATCATT +>I5_Primer_Nextera_XT_Index_Kit_v2_S517 +GACGCTGCCGACGATCTTACGCGTGTAGATCTCGGTGGTCGCCGTATCATT +>I5_Primer_Nextera_XT_Index_Kit_v2_S518 +GACGCTGCCGACGACTTAATAGGTGTAGATCTCGGTGGTCGCCGTATCATT +>I5_Primer_Nextera_XT_Index_Kit_v2_S520 +GACGCTGCCGACGAATAGCCTTGTGTAGATCTCGGTGGTCGCCGTATCATT +>I5_Primer_Nextera_XT_Index_Kit_v2_S521 +GACGCTGCCGACGATAAGGCTCGTGTAGATCTCGGTGGTCGCCGTATCATT +>I5_Primer_Nextera_XT_Index_Kit_v2_S522 +GACGCTGCCGACGATCGCATAAGTGTAGATCTCGGTGGTCGCCGTATCATT +>I7_Primer_Nextera_XT_Index_Kit_v2_N701 +CCGAGCCCACGAGACTAAGGCGAATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_Index_Kit_v2_N702 +CCGAGCCCACGAGACCGTACTAGATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_Index_Kit_v2_N703 +CCGAGCCCACGAGACAGGCAGAAATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_Index_Kit_v2_N704 +CCGAGCCCACGAGACTCCTGAGCATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_Index_Kit_v2_N705 +CCGAGCCCACGAGACGGACTCCTATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_Index_Kit_v2_N706 +CCGAGCCCACGAGACTAGGCATGATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_Index_Kit_v2_N707 +CCGAGCCCACGAGACCTCTCTACATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_Index_Kit_v2_N710 +CCGAGCCCACGAGACCGAGGCTGATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_Index_Kit_v2_N711 +CCGAGCCCACGAGACAAGAGGCAATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_Index_Kit_v2_N712 +CCGAGCCCACGAGACGTAGAGGAATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_Index_Kit_v2_N714 +CCGAGCCCACGAGACGCTCATGAATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_Index_Kit_v2_N715 +CCGAGCCCACGAGACATCTCAGGATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_Index_Kit_v2_N716 +CCGAGCCCACGAGACACTCGCTAATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_Index_Kit_v2_N718 +CCGAGCCCACGAGACGGAGCTACATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_Index_Kit_v2_N719 +CCGAGCCCACGAGACGCGTAGTAATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_Index_Kit_v2_N720 +CCGAGCCCACGAGACCGGAGCCTATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_Index_Kit_v2_N721 +CCGAGCCCACGAGACTACGCTGCATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_Index_Kit_v2_N722 +CCGAGCCCACGAGACATGCGCAGATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_Index_Kit_v2_N723 +CCGAGCCCACGAGACTAGCGCTCATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_Index_Kit_v2_N724 +CCGAGCCCACGAGACACTGAGCGATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_Index_Kit_v2_N726 +CCGAGCCCACGAGACCCTAAGACATCTCGTATGCCGTCTTCTGCTTG 
+>I7_Primer_Nextera_XT_Index_Kit_v2_N727 +CCGAGCCCACGAGACCGATCAGTATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_Index_Kit_v2_N728 +CCGAGCCCACGAGACTGCAGCTAATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_Index_Kit_v2_N729 +CCGAGCCCACGAGACTCGACGTCATCTCGTATGCCGTCTTCTGCTTG +>I5_Adapter_Nextera +CTGATGGCGCGAGGGAGGCGTGTAGATCTCGGTGGTCGCCGTATCATT +>I7_Adapter_Nextera_No_Barcode +CTGAGCGGGCTGGCAAGGCAGACCGATCTCGTATGCCGTCTTCTGCTTG +>Nextera_LMP_Read1_External_Adapter +GATCGGAAGAGCACACGTCTGAACTCCAGTCAC +>Nextera_LMP_Read2_External_Adapter +GATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT +>RNA_Adapter_(RA5)_part_#_15013205 +GATCGTCGGACTGTAGAACTCTGAAC +>RNA_Adapter_(RA3)_part_#_15013207 +CCTTGGCACCCGAGAATTCCA +>Stop_Oligo_(STP)_8 +CCACGGGAACGTGGTGGAATTC +>RNA_RT_Primer_(RTP)_part_#_15013981 +TGGAATTCTCGGGTGCCAAGGC +>RNA_PCR_Primer_(RP1)_part_#_15013198 +TCGGACTGTAGAACTCTGAACGTGTAGATCTCGGTGGTCGCCGTATCATT +>RNA_PCR_Primer_Index_1_(RPI1)_2,9 +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_2_(RPI2) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCGATGTATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_3_(RPI3) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACTTAGGCATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_4_(RPI4) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACTGACCAATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_5_(RPI5) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACACAGTGATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_6_(RPI6) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAATATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_7_(RPI7) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCAGATCATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_8_(RPI8) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACACTTGAATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_9_(RPI9) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGATCAGATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_10_(RPI10) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACTAGCTTATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_11_(RPI11) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGGCTACATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_12_(RPI12) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCTTGTAATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_13_(RPI13) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACAGTCAAATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_14_(RPI14) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACAGTTCCATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_15_(RPI15) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACATGTCAATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_16_(RPI16) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCCGTCCATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_17_(RPI17) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGTAGAGATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_18_(RPI18) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGTCCGCATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_19_(RPI19) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGTGAAAATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_20_(RPI20) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGTGGCCATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_21_(RPI21) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGTTTCGATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_22_(RPI22) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCGTACGATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_23_(RPI23) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGAGTGGATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_24_(RPI24) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGGTAGCATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_25_(RPI25) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACACTGATATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_26_(RPI26) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACATGAGCATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_27_(RPI27) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACATTCCTATCTCGTATGCCGTCTTCTGCTTG 
+>RNA_PCR_Primer_Index_28_(RPI28) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCAAAAGATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_29_(RPI29) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCAACTAATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_30_(RPI30) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCACCGGATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_31_(RPI31) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCACGATATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_32_(RPI32) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCACTCAATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_33_(RPI33) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCAGGCGATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_34_(RPI34) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCATGGCATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_35_(RPI35) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCATTTTATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_36_(RPI36) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCCAACAATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_37_(RPI37) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCGGAATATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_38_(RPI38) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCTAGCTATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_39_(RPI39) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCTATACATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_40_(RPI40) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCTCAGAATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_41_(RPI41) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGACGACATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_42_(RPI42) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACTAATCGATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_43_(RPI43) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACTACAGCATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_44_(RPI44) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACTATAATATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_45_(RPI45) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACTCATTCATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_46_(RPI46) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACTCCCGAATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_47_(RPI47) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACTCGAAGATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_48_(RPI48) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACTCGGCAATCTCGTATGCCGTCTTCTGCTTG +>PhiX_read1_adapter +AGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGATCTCGTATGCCGTCTTCTGCTTGAAA +>PhiX_read2_adapter +AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATTAAAAAA +>Bisulfite_R1 +AGATCGGAAGAGCACACGTCTGAAC +>Bisulfite_R2 +AGATCGGAAGAGCGTCGTGTAGGGA +>Illumina Small RNA v1.5 3p Adapter +ATCTCGTATGCCGTCTTCTGCTTG +>Illumina RNA 3p Adapter (RA3) +TGGAATTCTCGGGTGCCAAGG +>Illumina RNA 5p Adapter (RA5) +GTTCAGAGTTCTACAGTCCGACGATC +>Illumina 3p RNA Adapter +TCGTATGCCGTCTTCTGCTTGT + diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/envs/cat.yaml b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/envs/cat.yaml index f228b5fa..1e286a7c 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/envs/cat.yaml +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/envs/cat.yaml @@ -1,3 +1,4 @@ +name: CAT channels: - conda-forge - bioconda diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/main.nf b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/main.nf new file mode 100644 index 00000000..85600dbd --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/main.nf @@ -0,0 +1,208 @@ +nextflow.enable.dsl=2 + +/************************************************** +* HELP MENU ************************************** 
+************************************************** +if (params.help) { + println("┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅") + println("┇ RNASeq Consensus Pipeline: $workflow.manifest.version ┇") + println("┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅") + println("Usage example 1: Processing GLDS datasets using genome fasta and gtf from Ensembl") + println(" > nextflow run ./main.nf --gldsAccession GLDS-194 -resume -profile conda --paired true") + println() + println("Usage example 2: Processing GLDS datasets using local genome fasta and gtf") + println(" Note: ensemblVersion and ref_source are used here to label subdirectories for derived reference files.") + println(" > nextflow run ./main.nf --gldsAccession GLDS-194 --ensemblVersion 96 --ref_source --ref_fasta --ref_gtf ") + println() + println("Usage example 3: Processing Other datasets") + println(" Note: This requires a user-created runsheet.") + println(" > nextflow run ./main.nf --runsheetPath ") + println() + println("arguments:") + println(" --help show this help message and exit") + println(" --gldsAccession GLDS-000") + println(" the GLDS accession id to process through the RNASeq Concensus Pipeline.") + println(" --runsheetPath Use a local runsheet instead one automatically generated from a GLDS ISA archive.") + println(" --ensemblVersion n Specifies the ensembl Version to use for the reference genome. The default version is ") + println(" --skipVV Skip automated V&V. Default: false") + println(" --paired Are the input reads paired-end. Default: true. set to false if single-end") + println(" --outputDir Directory to save staged raw files and processed files. Default: ") + exit 0 + } + +println "PARAMS: $params" +println "\n" +println "Storing any newly fetched primary references files here: ${params.referenceStorePath}" +println "Storing any newly generated derived reference files here: ${params.derivedStorePath}" + +/************************************************** +* CHECK REQUIRED PARAMS AND LOAD ***************** +************************************************** +// Get all params sourced data into channels +// Set up channel containing glds accession number +if ( params.gldsAccession ){ + ch_glds_accession = Channel.from( params.gldsAccession ) + } else { + exit 1, "Missing Required Parameter: gldsAccession. Example for setting on CLI: --gldsAccession GLDS-194" + } + +// Check conditionally required parameter (if using direct fasta, an ensemblVersion must also be supplied) +if ( params.ref_fasta ) { + if ( !params.ensemblVersion ) { exit 1, "Missing Required Parameter: ensemblVersion. Example for setting on CLI: --ensemblVersion 96" } +} + +if ( !params.outputDir ) { params.outputDir = "$workflow.launchDir" } + +ch_multiqc_config = params.multiqcConfig ? 
Channel.fromPath( params.multiqcConfig ) : Channel.fromPath("NO_FILE")
+
+
+
+*/
+
+// color defs
+c_back_bright_red = "\u001b[41;1m";
+c_bright_green = "\u001b[32;1m";
+c_blue = "\033[0;34m";
+c_reset = "\033[0m";
+
+// Processes to create the required database(s) if not provided
+include { SETUP_CAT_DB; SETUP_KOFAMSCAN_DB; SETUP_GTDBTK_DB;
+          SETUP_CHOCOPHLAN; SETUP_UNIREF; SETUP_UTILITY_MAPPING;
+          SETUP_METAPHLAN } from "./modules/database_creation.nf"
+include { make_humann_db } from "./modules/database_creation.nf"
+
+// Runsheet and input file creation from a GLDS/OSD accession
+include { GET_RUNSHEET } from "./modules/create_runsheet.nf"
+
+// Read quality check and filtering
+include { quality_check as raw_qc; BBDUK } from "./modules/quality_assessment.nf"
+include { quality_check as filtered_qc } from "./modules/quality_assessment.nf"
+
+// Read-based workflow
+include { read_based } from "./modules/read_based_processing.nf"
+
+// Assembly-based workflow
+include { assembly_based } from "./modules/assembly_based_processing.nf"
+
+
+// Workflow to perform read-based analysis
+workflow run_read_based_analysis {
+
+
+    take:
+        filtered_ch
+
+    main:
+
+    if(!params.database.chocophlan_dir || !params.database.uniref_dir ||
+       !params.database.metaphlan_db_dir || !params.database.utilities_dir) {
+
+        make_humann_db()
+        read_based(filtered_ch,
+                   make_humann_db.out.chocophlan_dir,
+                   make_humann_db.out.uniref_dir,
+                   make_humann_db.out.metaphlan_db_dir,
+                   make_humann_db.out.utilities_dir)
+    }else{
+
+        read_based(filtered_ch,
+                   params.database.chocophlan_dir,
+                   params.database.uniref_dir,
+                   params.database.metaphlan_db_dir,
+                   params.database.utilities_dir)
+    }
+
+}
+
+// Workflow to perform assembly-based analysis
+workflow run_assembly_based_analysis {
+
+    take:
+        file_ch
+        filtered_ch
+
+
+    main:
+    kofam_db = params.database.ko_db_dir
+    if(!params.database.ko_db_dir) {
+        SETUP_KOFAMSCAN_DB()
+        kofam_db = SETUP_KOFAMSCAN_DB.out.ko_db_dir
+    }
+
+    cat_db = params.database.cat_db
+    if(!params.database.cat_db){
+
+        SETUP_CAT_DB(params.database.CAT_DB_LINK)
+        cat_db = SETUP_CAT_DB.out.cat_db
+    }
+
+    gtdbtk_db_dir = params.database.gtdbtk_db_dir
+    if(!params.database.gtdbtk_db_dir){
+        SETUP_GTDBTK_DB()
+        gtdbtk_db_dir = SETUP_GTDBTK_DB.out.gtdbtk_db_dir
+    }
+
+    // Run assembly based workflow
+    assembly_based(file_ch, filtered_ch, kofam_db,
+                   cat_db, gtdbtk_db_dir, params.use_gtdbtk_scratch_location)
+
+}
+
+
+
+// A function to delete white spaces from an input string and convert it to lower case
+def deleteWS(string){
+
+    return string.replaceAll(/\s+/, '').toLowerCase()
+
+}
+
+// Main workflow
+workflow {
+
+    // Parse file input
+    if(params.GLDS_accession){
+
+        GET_RUNSHEET()
+        GET_RUNSHEET.out.input_file
+                    .splitCsv(header:true)
+                    .set{file_ch}
+
+        GET_RUNSHEET.out.params_file
+                    .splitCsv(header:true)
+                    .set{params_ch}
+
+
+    }else{
+
+        Channel.fromPath(params.csv_file, checkIfExists: true)
+                    .splitCsv(header:true)
+                    .set{file_ch}
+    }
+
+
+    file_ch.map{
+        row -> deleteWS(row.paired) == 'true' ?
tuple( "${row.sample_id}", [file("${row.forward}"), file("${row.reverse}")], deleteWS(row.paired)) : + tuple( "${row.sample_id}", [file("${row.forward}")], deleteWS(row.paired)) + }.set{reads_ch} + //reads_ch.view() + //return + + // Qality check and trim the input reads + raw_qc(Channel.of("raw"), params.multiqc_config,reads_ch) + filtered_ch = BBDUK(reads_ch, params.adapters) + filtered_qc(Channel.of("filtered"), params.multiqc_config, filtered_ch) + + // Run the analysis based on selection i.e, read-based, assembly-based or both + // it will run both by default + if(params.workflow == 'read-based'){ + run_read_based_analysis(filtered_ch) + }else if(params.workflow == 'assembly-based') { + run_assembly_based_analysis(file_ch,filtered_ch) + }else{ + run_read_based_analysis(filtered_ch) + run_assembly_based_analysis(file_ch, filtered_ch) + } + +} + +workflow.onComplete { + log.info ( workflow.success ? "\nDone! Workflow completed without any error\n" : "Oops .. something went wrong" ) +} diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/assembly.nf b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/assembly.nf new file mode 100644 index 00000000..af87331a --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/assembly.nf @@ -0,0 +1,92 @@ +#!/usr/bin/env nextflow +nextflow.enable.dsl = 2 +params.paired = false +params.max_mem = 100e9 + +/**************************************************************************************** +************************** Sequence assembly and summary ******************************* +****************************************************************************************/ + +// This process handles running the assembly for each individual sample. +process ASSEMBLE { + + tag "Assembling ${sample_id}-s reads using megahit..." + + input: + tuple val(sample_id), path(reads), val(isPaired) + output: + tuple val(sample_id), path("${sample_id}_final.contigs.fa") + script: + """ + # Removing output directory if exists already but process still needs to be + # run (because there is no --force option to megahit i dont't think): + [ -d ${sample_id}-megahit-out/ ] && rm -rf ${sample_id}-megahit-out/ + + if [ ${isPaired} == true ]; then + + BASENAME_FORWARD=`basename -s '.gz' ${reads[0]}` + BASENAME_REVERSE=`basename -s '.gz' ${reads[1]}` + + zcat ${reads[0]} > \${BASENAME_FORWARD} + zcat ${reads[1]} > \${BASENAME_REVERSE} + + megahit -1 \${BASENAME_FORWARD} -2 \${BASENAME_REVERSE} \\ + -m ${params.max_mem} -t ${task.cpus} \\ + --min-contig-len 500 -o ${sample_id}-megahit-out + + else + + BASENAME=`basename -s '.gz' ${reads[0]}` + zcat ${reads[0]} > \${BASENAME} + megahit -r \${BASENAME} -m ${params.max_mem} -t ${task.cpus} \\ + --min-contig-len 500 -o ${sample_id}-megahit-out + fi + + mv ${sample_id}-megahit-out/final.contigs.fa ${sample_id}_final.contigs.fa + """ +} + + +process RENAME_HEADERS { + + tag "Renaming ${sample_id}-s assembly fasta file-s headers..." + label "bit" + label "assembly" + + input: + tuple val(sample_id), path(assembly) + output: + tuple val(sample_id), path("${sample_id}-assembly.fasta") + script: + """ + bit-rename-fasta-headers -i ${assembly} \\ + -w c_${sample_id} \\ + -o ${sample_id}-assembly.fasta + + # Checking the assembly produced anything (megahit can run, produce + # the output fasta, but it will be empty if no contigs were assembled) + if [ ! 
-s ${sample_id}-assembly.fasta ]; then + printf "${sample_id}\\tNo contigs assembled\\n" > Failed-assemblies.tsv + fi + """ +} + + +// This process summarizes and reports general stats for all individual sample assemblies in one table. +process SUMMARIZE_ASSEMBLIES { + + tag "Generating a summary of all the assemblies..." + label "bit" + label "assembly" + + input: + path(assemblies) + output: + path("${params.additional_filename_prefix}assembly-summaries${params.assay_suffix}.tsv") + script: + """ + bit-summarize-assembly \\ + -o ${params.additional_filename_prefix}assembly-summaries${params.assay_suffix}.tsv \\ + ${assemblies} + """ +} diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/assembly_annotation.nf b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/assembly_annotation.nf new file mode 100644 index 00000000..62352746 --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/assembly_annotation.nf @@ -0,0 +1,195 @@ +#!/usr/bin/env nextflow +nextflow.enable.dsl = 2 +params.cat_db = "/mnt/c/Users/olabi/Documents/bioinformatics/test/processing_info/ref-dbs/CAT_prepare_20210107/2021-01-07_CAT_database/" +params.ko_db_dir = "/mnt/c/Users/olabi/Documents/bioinformatics/test/processing_info/ref-dbs/kofamscan_db/" +params.block_size = 4 + +/**************************************************************************************** +************************** Sequence Assembly Annotation ******************************* +****************************************************************************************/ +// This process calls genes on each assembly file. +process CALL_GENES { + + tag "Predicting genes for ${sample_id}-s assembly" + label "call_genes" + + input: + tuple val(sample_id), path(assembly) + output: + // Amino acids, nucleotides and gff + tuple val(sample_id), path("${sample_id}-genes.faa"), path("${sample_id}-genes.fasta"), path("${sample_id}-genes.gff") + script: + """ + # Only running if assembly produced any contigs + if [ -s ${assembly} ]; then + + prodigal -q -c -p meta -a ${sample_id}-genes.faa \\ + -d ${sample_id}-genes.fasta \\ + -f gff -o ${sample_id}-genes.gff \\ + -i ${assembly} + else + + touch ${sample_id}-genes.faa ${sample_id}-genes.fasta ${sample_id}-genes.gff + printf "Gene-calling not performed because the assembly didn't produce anything.\\n" + + fi + """ +} + +// Removing line-wraps using bit +process REMOVE_LINEWRAPS { + + tag "Remove line wraps in ${sample_id}-s nucleotide and amino acid file..." + label "call_genes" + label "bit" + + input: + tuple val(sample_id), path(aa), path(nt), path(gff) + + output: + tuple val(sample_id), path("${sample_id}-genes.faa"), path("${sample_id}-genes.fasta"), path(gff) + + script: + """ + if [ -s ${aa} ] && [ -s ${nt} ]; then + # Removing line-wraps + bit-remove-wraps ${aa} > ${sample_id}-genes.faa.tmp 2> /dev/null && \\ + mv ${sample_id}-genes.faa.tmp ${sample_id}-genes.faa + + bit-remove-wraps ${nt} > ${sample_id}-genes.fasta.tmp 2> /dev/null && \\ + mv ${sample_id}-genes.fasta.tmp ${sample_id}-genes.fasta + else + + touch ${sample_id}-genes.faa ${sample_id}-genes.fasta + printf "Line wrapping not performed because gene-calling wasn't performed on ${sample_id}.\\n" + fi + """ +} + + +// This process runs the gene-level (KO) functional annotation for each sample. 
+// KO annotatiuon of the predicted amino acids +process KO_ANNOTATION { + + tag "Running KO annotation of ${sample_id}-s predicted amino acids.." + //label "contig_annotation" + + input: + tuple val(sample_id), path(assembly), path(aa), path(nt), path(gff) + path(ko_db_dir) + output: + tuple val(sample_id), path("${sample_id}-KO-tab.tmp") + + script: + """ + # only running if assembly produced any contigs and genes were identified (they are required for this) + if [ -s ${assembly} ] && [ -s ${aa} ]; then + + exec_annotation -p ${ko_db_dir}/profiles/ \\ + -k ${ko_db_dir}/ko_list \\ + --cpu ${task.cpus} -f detail-tsv \\ + -o ${sample_id}-KO-tab.tmp --tmp-dir ${sample_id}-tmp-KO-dir \\ + --report-unannotated ${aa} + + else + + touch ${sample_id}-KO-tab.tmp + printf "Functional annotations not performed because the assembly didn't produce anything and/or no genes were identified.\\n" + + fi + """ +} + + +process FILTER_KFAMSCAN { + + tag "Filtering ${sample_id}-s KO annotation results..." + label "bit" + label "contig_annotation" + + input: + tuple val(sample_id), path(KO_tab_tmp) + output: + tuple val(sample_id), path("${sample_id}-annotations.tsv") + + script: + """ + if [ -s ${KO_tab_tmp} ]; then + + bit-filter-KOFamScan-results -i ${KO_tab_tmp} -o ${sample_id}-annotations.tsv + + else + + touch ${sample_id}-annotations.tsv + printf "Nothing to filter since functional annotation was not performed.\\n" + + fi + """ + +} + +// This process runs the gene- and contig-level taxonomic classifications for each assembly. +process TAX_CLASSIFICATION { + + tag "Taxonomy classification of ${sample_id}-s " + label "contig_annotation" + + input: + tuple val(sample_id), path(assembly), path(aa), path(nt), path(gff) + path(cat_db) + output: + // Gene and contig taxonomy + tuple val(sample_id), path("${sample_id}-gene-tax.tsv"), path("${sample_id}-contig-tax.tsv") + script: + """ + # Only running if assembly produced any contigs and + # genes were identified (they are required for this) + if [ -s ${assembly} ] && [ -s ${aa} ]; then + + CAT contigs -d ${cat_db}/${params.cat_db_sub_dir} -t ${cat_db}/${params.cat_taxonomy_dir} \\ + -n ${task.cpus} -r 3 --top 4 \\ + --I_know_what_Im_doing -c ${assembly} \\ + -p ${aa} -o ${sample_id}-tax-out.tmp \\ + --no_stars --block_size ${params.block_size} \\ + --index_chunks 2 --force + + # Adding names to gene classifications + CAT add_names -i ${sample_id}-tax-out.tmp.ORF2LCA.txt \\ + -o ${sample_id}-gene-tax.tmp -t ${cat_db}/${params.cat_taxonomy_dir} \\ + --only_official --exclude_scores + + # Formatting gene classifications + bash format-gene-tax-classifications.sh \\ + ${sample_id}-gene-tax.tmp ${sample_id}-gene-tax.tsv + + # Adding names to contig classifications + CAT add_names -i ${sample_id}-tax-out.tmp.contig2classification.txt \\ + -o ${sample_id}-contig-tax.tmp -t ${cat_db}/${params.cat_taxonomy_dir} \\ + --only_official --exclude_scores + + # Formatting contig classifications + bash format-contig-tax-classifications.sh \\ + ${sample_id}-contig-tax.tmp ${sample_id}-contig-tax.tsv + + else + + touch ${sample_id}-gene-tax.tsv ${sample_id}-contig-tax.tsv + printf "Assembly-based taxonomic classification not performed because the assembly didn't produce anything and/or no genes were identified.\\n" + + fi + """ +} + +workflow annotate_assembly { + take: + assembly_ch + ko_db_dir + cat_db + + main: + genes_ch = CALL_GENES(assembly_ch) | REMOVE_LINEWRAPS + + KO_ANNOTATION(assembly_ch.join(genes_ch) ko_db_dir) | FILTER_KFAMSCAN + TAX_CLASSIFICATION(assembly_ch, 
genes_ch, cat_db) + +} diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/assembly_based_processing.nf b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/assembly_based_processing.nf new file mode 100644 index 00000000..2e8287e1 --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/assembly_based_processing.nf @@ -0,0 +1,132 @@ +#!/usr/bin/env nextflow +nextflow.enable.dsl = 2 + +/**************************************************************************************** +*************************** Assembly-based processing workflow ************************* +****************************************************************************************/ + +// Assembly-based workflow +include { ASSEMBLE; RENAME_HEADERS; SUMMARIZE_ASSEMBLIES } from "./assembly.nf" +include { MAPPING; SAM_TO_BAM } from "./read_mapping.nf" +include { CALL_GENES; REMOVE_LINEWRAPS } from "./assembly_annotation.nf" +include { KO_ANNOTATION; FILTER_KFAMSCAN } from "./assembly_annotation.nf" +include { TAX_CLASSIFICATION } from "./assembly_annotation.nf" +include { GET_COV_AND_DET } from "./coverage.nf" +include { COMBINE_GENE_ANNOTS_TAX_AND_COVERAGE; MAKE_COMBINED_GENE_LEVEL_TABLES } from "./combine_contig_annotation.nf" +include { COMBINE_CONTIG_TAX_AND_COVERAGE; MAKE_COMBINED_CONTIG_TAX_TABLES } from "./combine_contig_annotation.nf" +include { METABAT_BINNING } from "./binning.nf" +include { summarize_bins } from "./summarize_bins.nf" +include { summarize_mags } from "./summarize_MAG.nf" +include { GENERATE_ASSEMBLY_PROCESSING_OVERVIEW_TABLE } from "./summarize_assembly-based_processing.nf" + +workflow assembly_based { + + take: + file_ch + filtered_ch + ko_db_dir + cat_db + gtdbtk_db_dir + use_gtdbtk_scratch_location + + main: + /***************************************************** + ************* Assembly-based analysis **************** + *****************************************************/ + // Assemble reads to contigs + assembly_ch = ASSEMBLE(filtered_ch) | RENAME_HEADERS + assemblies_ch = assembly_ch.map{ + sample_id, assembly -> file("${assembly}") + }.collect() + SUMMARIZE_ASSEMBLIES(assemblies_ch) + + // Map reads to assembly + read_mapping_ch = MAPPING(assembly_ch.join(filtered_ch)) | SAM_TO_BAM + + // Annotate assembly + genes_ch = CALL_GENES(assembly_ch) | REMOVE_LINEWRAPS + if (ko_db_dir){ + annotations_ch = KO_ANNOTATION(assembly_ch.join(genes_ch), ko_db_dir) | FILTER_KFAMSCAN + }else{ + SETUP_KOFAMSCAN_DB() + annotations_ch = KO_ANNOTATION(assembly_ch.join(genes_ch), + SETUP_KOFAMSCAN_DB.out.ko_db_dir) | FILTER_KFAMSCAN + } + + if (cat_db){ + taxonomy_ch = TAX_CLASSIFICATION(assembly_ch.join(genes_ch), cat_db) + }else{ + SETUP_CAT_DB(params.database.CAT_DB_LINK) + taxonomy_ch = TAX_CLASSIFICATION(assembly_ch.join(genes_ch), SETUP_CAT_DB.out.cat_db) + } + + // Calculate gene coverage and depth + coverage_ch = GET_COV_AND_DET(read_mapping_ch + .join(assembly_ch) + .join(genes_ch)) + + // Combine contig annotation + tax_and_cov_ch = COMBINE_GENE_ANNOTS_TAX_AND_COVERAGE(coverage_ch + .join(annotations_ch) + .join(taxonomy_ch) + .join(genes_ch) + .join(assembly_ch)) + + gene_coverage_annotation_and_tax_files_ch = tax_and_cov_ch.map{ + sample_id, coverage -> file("${coverage}") + }.collect() + + MAKE_COMBINED_GENE_LEVEL_TABLES(gene_coverage_annotation_and_tax_files_ch) + + combined_cov_ch = COMBINE_CONTIG_TAX_AND_COVERAGE(coverage_ch + .join(taxonomy_ch) + .join(genes_ch) + 
.join(assembly_ch)) + + MAKE_COMBINED_CONTIG_TAX_TABLES(combined_cov_ch.map{ + sample_id, coverage -> file("${coverage}") + }.collect()) + + // Assembly binning + binning_ch = METABAT_BINNING(assembly_ch.join(read_mapping_ch)) + binning_ch | summarize_bins + metabat_assembly_depth_files_ch = binning_ch.map{ + sample_id, depth, bins -> file("${depth}") + }.collect() + bins_ch = binning_ch.map{ + sample_id, depth, bins -> bins instanceof List ? bins.each{it}: bins + }.flatten().collect() + + + // Check Bins and Summarize MAGs + if(gtdbtk_db_dir){ + summarize_mags(summarize_bins.out.bins_checkm_results, + bins_ch, + gtdbtk_db_dir, use_gtdbtk_scratch_location, + gene_coverage_annotation_and_tax_files_ch) + }else{ + SETUP_GTDBTK_DB() + summarize_mags(summarize_bins.out.bins_checkm_results, + bins_ch, + SETUP_GTDBTK_DB.out.gtdbtk_db_dir, use_gtdbtk_scratch_location, + gene_coverage_annotation_and_tax_files_ch) + } + + // Get the predicted amino acids for all the samples + genes_aa_ch = genes_ch.map{sample_id, aa, nt, gff -> file("${aa}")}.collect() + + // Generating a file with sample ids on a new line + file_ch.map{row -> "${row.sample_id}"} + .collectFile(name: "${baseDir}/unique-sample-IDs.txt", newLine: true) + .set{sample_ids_ch} + + + bam_files = read_mapping_ch.map{sample_id, bam -> file("${bam}")}.collect() + // Summarize Assembly-based analysis + GENERATE_ASSEMBLY_PROCESSING_OVERVIEW_TABLE(sample_ids_ch, summarize_mags.out.MAGs_overview, + summarize_mags.out.MAGs_dir, assemblies_ch, + genes_aa_ch, + metabat_assembly_depth_files_ch, + bins_ch, + bam_files) +} diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/binning.nf b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/binning.nf new file mode 100644 index 00000000..db175ce0 --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/binning.nf @@ -0,0 +1,85 @@ +#!/usr/bin/env nextflow +nextflow.enable.dsl = 2 + +/**************************************************************************************** +********************* Assembly binning ************************************************* +****************************************************************************************/ + + + +// This process runs metabat2 for binning contigs. +process METABAT_BINNING { + + tag "Binning ${sample_id}-s contigs with metabat2..." + + input: + tuple val(sample_id), path(assembly), path(bam) + output: + tuple val(sample_id), path("${sample_id}-metabat-assembly-depth.tsv"), path("${sample_id}-bin*") + + script: + """ + # Only running if the assembly produced anything + if [ -s ${assembly} ]; then + + jgi_summarize_bam_contig_depths \\ + --outputDepth ${sample_id}-metabat-assembly-depth.tsv \\ + --percentIdentity 97 \\ + --minContigLength 1000 \\ + --minContigDepth 1.0 \\ + --referenceFasta ${assembly} ${bam} + + # only running if there are contigs with coverage + # information in the coverage file we just generated + if [ `wc -l ${sample_id}-metabat-assembly-depth.tsv | sed 's/^ *//' | cut -f 1 -d " "` -gt 1 ]; then + + metabat2 \\ + --inFile ${assembly} \\ + --outFile ${sample_id}-bin \\ + --abdFile ${sample_id}-metabat-assembly-depth.tsv \\ + -t ${task.cpus} + + else + + printf "\\n\\nThere was no coverage info generated in ${sample_id}-metabat-assembly-depth.tsv, so no binning with metabat was performed.\\n\\n" + + fi + + # changing extensions from .fa to .fasta to match nt fasta extension elsewhere in GeneLab + find . 
-name '${sample_id}*.fa' > ${sample_id}-bin-files.tmp + + if [ -s ${sample_id}-bin-files.tmp ]; then + paste -d " " <( sed 's/^/mv /' ${sample_id}-bin-files.tmp ) \\ + <( sed 's/.fa/.fasta/' ${sample_id}-bin-files.tmp ) \\ + > ${sample_id}-rename.tmp + bash ${sample_id}-rename.tmp + fi + + rm -rf ${sample_id}-bin-files.tmp ${sample_id}-rename.tmp + + else + + touch ${sample_id}-metabat-assembly-depth.tsv + printf "Binning not performed because the assembly didn't produce anything.\\n" + fi + """ +} + + + +workflow binning { + + take: + assembly_ch + read_mapping_ch + + + main: + binning_ch = METABAT_BINNING(assembly_ch.join(read_mapping_ch)) + + + emit: + binning_results = binning_ch + +} + diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/combine_contig_annotation.nf b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/combine_contig_annotation.nf new file mode 100644 index 00000000..22897d57 --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/combine_contig_annotation.nf @@ -0,0 +1,202 @@ +#!/usr/bin/env nextflow +nextflow.enable.dsl = 2 + + +/**************************************************************************************** +************************** Combine Contig Annotation *********************************** +****************************************************************************************/ + + +/* +This process combines the gene-level functional annotations, taxonomic classifications, + and coverage information for each individual sample. +*/ +process COMBINE_GENE_ANNOTS_TAX_AND_COVERAGE { + + tag "Combining gene and taxonomy annotations for ${sample_id}" + label "bit" + label "contig_annotation" + + input: + tuple val(sample_id), path(gene_coverages), path(contig_coverages), + path(annotations), path(gene_tax), path(contig_tax), + path(aa), path(nt), path(gff), path(assembly) + output: + tuple val(sample_id), path("${sample_id}-gene-coverage-annotation-and-tax.tsv") + script: + """ + # Only running if the assembly produced anything + # and genes were identified (they are required for this) + if [ -s ${assembly} ] && [ -s ${aa} ]; then + + paste <( tail -n +2 ${gene_coverages} | \\ + sort -V -k 1 ) <( tail -n +2 ${annotations} | \\ + sort -V -k 1 | cut -f 2- ) <( tail -n +2 ${gene_tax} | \\ + sort -V -k 1 | \\ + cut -f 2- ) > ${sample_id}-gene.tmp + + paste <( head -n 1 ${gene_coverages} ) \\ + <( head -n 1 ${annotations} | \\ + cut -f 2- ) <( head -n 1 ${gene_tax} | \\ + cut -f 2- ) > ${sample_id}-gene-header.tmp + + cat ${sample_id}-gene-header.tmp ${sample_id}-gene.tmp \\ + > ${sample_id}-gene-coverage-annotation-and-tax.tsv + + rm -rf ${sample_id}-gene.tmp ${sample_id}-gene-header.tmp + + + else + + printf "gene_ID\\tcoverage\\tKO_ID\\tKO_function\\ttaxid\\tdomain\\tphylum\\tclass\\torder\\tfamily\\tgenus\\tspecies\\n" \\ + > ${sample_id}-gene-coverage-annotation-and-tax.tsv + + fi + """ +} + + +process MAKE_COMBINED_GENE_LEVEL_TABLES { + + tag "Combining all gene level annotations...." 
+ label "bit" + label "combine_outputs" + + input: + path(gene_coverage_annotation_and_tax_files) + output: + path("${params.additional_filename_prefix}Combined-gene-level-KO-function-coverages${params.assay_suffix}.tsv") + path("${params.additional_filename_prefix}Combined-gene-level-KO-function-coverages-CPM${params.assay_suffix}.tsv") + path("${params.additional_filename_prefix}Combined-gene-level-taxonomy-coverages${params.assay_suffix}.tsv") + path("${params.additional_filename_prefix}Combined-gene-level-taxonomy-coverages-CPM${params.assay_suffix}.tsv") + script: + """ + bit-GL-combine-KO-and-tax-tables ${gene_coverage_annotation_and_tax_files} -o ${params.additional_filename_prefix}Combined + + # Renaming to have GL assay-specific suffix + mv "${params.additional_filename_prefix}Combined-gene-level-KO-function-coverages.tsv" \\ + "${params.additional_filename_prefix}Combined-gene-level-KO-function-coverages${params.assay_suffix}.tsv" + + mv "${params.additional_filename_prefix}Combined-gene-level-KO-function-coverages-CPM.tsv" \\ + "${params.additional_filename_prefix}Combined-gene-level-KO-function-coverages-CPM${params.assay_suffix}.tsv" + + mv "${params.additional_filename_prefix}Combined-gene-level-taxonomy-coverages.tsv" \\ + "${params.additional_filename_prefix}Combined-gene-level-taxonomy-coverages${params.assay_suffix}.tsv" + + mv "${params.additional_filename_prefix}Combined-gene-level-taxonomy-coverages-CPM.tsv" \\ + "${params.additional_filename_prefix}Combined-gene-level-taxonomy-coverages-CPM${params.assay_suffix}.tsv" + """ +} + +// This process combines the contig-level taxonomic and +// coverage information for each individual sample. +process COMBINE_CONTIG_TAX_AND_COVERAGE { + + tag "Combining taxonomy and coverage for ${sample_id}...." 
+ label "bit" + label "contig_annotation" + + input: + tuple val(sample_id), path(gene_coverages), path(contig_coverages), + path(gene_tax), path(contig_tax), + path(aa), path(nt), path(gff), path(assembly) + output: + tuple val(sample_id), path("${sample_id}-contig-coverage-and-tax.tsv") + script: + """ + # Only running if the assembly produced anything + if [ -s ${assembly} ]; then + + # If there were no genes called, there is no contig-level taxonomy, so dealing with that here + if [ -s ${aa} ]; then + + paste <( tail -n +2 ${contig_coverages} | \\ + sort -V -k 1 ) <( tail -n +2 ${contig_tax} | \\ + sort -V -k 1 | cut -f 2- ) > ${sample_id}-contig.tmp + paste <( head -n 1 ${contig_coverages} ) \\ + <( head -n 1 ${contig_tax} | cut -f 2- ) \\ + > ${sample_id}-contig-header.tmp + + cat ${sample_id}-contig-header.tmp ${sample_id}-contig.tmp \\ + > ${sample_id}-contig-coverage-and-tax.tsv + + rm -rf ${sample_id}-contig.tmp ${sample_id}-contig-header.tmp + + else + + paste <( tail -n +2 ${contig_coverages} | sort -V -k 1 ) > ${sample_id}-contig-p1.tmp + + sed 's/.*/NA/g' ${sample_id}-contig-p1.tmp > ${sample_id}-tax-col.tmp + + paste ${sample_id}-contig-p1.tmp ${sample_id}-tax-col.tmp ${sample_id}-tax-col.tmp \\ + ${sample_id}-tax-col.tmp ${sample_id}-tax-col.tmp ${sample_id}-tax-col.tmp \\ + ${sample_id}-tax-col.tmp ${sample_id}-tax-col.tmp ${sample_id}-tax-col.tmp \\ + > ${sample_id}-contig.tmp + + cat <( printf "contig_ID\\tcoverage\\ttaxid\\tdomain\\tphylum\\tclass\\torder\\tfamily\\tgenus\\tspecies\\n" ) \\ + ${sample_id}-contig.tmp > ${sample_id}-contig-coverage-and-tax.tsv + rm -rf ${sample_id}-contig-p1.tmp ${sample_id}-tax-col.tmp ${sample_id}-contig.tmp + + fi + + else + + printf "contig_ID\\tcoverage\\ttaxid\\tdomain\\tphylum\\tclass\\torder\\tfamily\\tgenus\\tspecies\\n" \\ + > ${sample_id}-contig-coverage-and-tax.tsv + + fi + """ +} + +process MAKE_COMBINED_CONTIG_TAX_TABLES { + + tag "Making a summary contig taxonomy table...." 
+ label "bit" + label "combine_outputs" + + input: + path(contig_coverage_and_tax_files) + output: + path("${params.additional_filename_prefix}Combined-contig-level-taxonomy-coverages${params.assay_suffix}.tsv") + path("${params.additional_filename_prefix}Combined-contig-level-taxonomy-coverages-CPM${params.assay_suffix}.tsv") + script: + """ + bit-GL-combine-contig-tax-tables ${contig_coverage_and_tax_files} -o ${params.additional_filename_prefix}Combined + + # Renaming to have GL assay-specific suffix + mv "${params.additional_filename_prefix}Combined-contig-level-taxonomy-coverages.tsv" \\ + "${params.additional_filename_prefix}Combined-contig-level-taxonomy-coverages${params.assay_suffix}.tsv" + + mv "${params.additional_filename_prefix}Combined-contig-level-taxonomy-coverages-CPM.tsv" \\ + "${params.additional_filename_prefix}Combined-contig-level-taxonomy-coverages-CPM${params.assay_suffix}.tsv" + """ +} + + + + +workflow { + take: + coverages_ch + annotations_ch + taxonomy_ch + gene_call_ch + assembly_ch + + main: + tax_and_cov_ch = COMBINE_GENE_ANNOTS_TAX_AND_COVERAGE(coverages_ch + .join(annotations_ch) + .join(taxonomy_ch) + .join(gene_call_ch) + .join(assembly_ch)) + + MAKE_COMBINED_GENE_LEVEL_TABLES(tax_and_cov_ch.map{sample_id, coverage -> file("${coverage}")}.collect()) + + combined_cov_ch = COMBINE_CONTIG_TAX_AND_COVERAGE(coverages_ch + .join(taxonomy_ch) + .join(gene_call_ch) + .join(assembly_ch)) + + MAKE_COMBINED_CONTIG_TAX_TABLES(combined_cov_ch.map{sample_id, coverage -> file("${coverage}")}.collect()) +} + diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/coverage.nf b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/coverage.nf new file mode 100644 index 00000000..1c34c086 --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/coverage.nf @@ -0,0 +1,80 @@ +#!/usr/bin/env nextflow +nextflow.enable.dsl = 2 +params.pileup_mem = "5g" + +/* + This process pulls out coverage and detection information for each sample, gene-level and contig-level, + and filters the gene-level coverage information based on requiring at least 50% detection. +*/ + +process GET_COV_AND_DET { + + tag "Calculating gene and contig coverage for ${sample_id}..." 
+ + input: + tuple val(sample_id), path(bam), path(assembly), path(aa), path(nt), path(gff) + output: + // Gene_covs and contig_covs + tuple val(sample_id), path("${sample_id}-gene-coverages.tsv"), path("${sample_id}-contig-coverages.tsv") + script: + """ + # get-cov-and-depth.sh ${sample_id} ${assembly} ${nt} ${bam} ${params.pileup_mem} + + # Only running if the assembly produced anything + if [ -s ${assembly} ]; then + + # Only running on genes also if genes were identified + if [ -s ${nt} ]; then + + pileup.sh -Xmx${params.pileup_mem} -in ${bam} \\ + fastaorf=${nt} outorf=${sample_id}-gene-cov-and-det.tmp \\ + out=${sample_id}-contig-cov-and-det.tmp + + # Filtering coverages based on detection + # Genes + grep -v "#" ${sample_id}-gene-cov-and-det.tmp | \\ + awk -F \$'\\t' ' BEGIN {OFS=FS} { if ( \$10 <= 0.5 ) \$4 = 0 } { print \$1,\$4 } ' \\ + > ${sample_id}-gene-cov.tmp + + cat <( printf "gene_ID\\tcoverage\\n" ) ${sample_id}-gene-cov.tmp > ${sample_id}-gene-coverages.tsv + + # Contigs + grep -v "#" ${sample_id}-contig-cov-and-det.tmp | \\ + awk -F \$'\\t' ' BEGIN {OFS=FS} { if ( \$5 <= 50 ) \$2 = 0 } { print \$1,\$2 } ' \\ + > ${sample_id}-contig-cov.tmp + + cat <( printf "contig_ID\\tcoverage\\n" ) ${sample_id}-contig-cov.tmp > ${sample_id}-contig-coverages.tsv + + # Removing intermediate files + rm ${sample_id}-gene-cov-and-det.tmp ${sample_id}-contig-cov-and-det.tmp \\ + ${sample_id}-gene-cov.tmp ${sample_id}-contig-cov.tmp + + else + + pileup.sh -in ${bam} out=${sample_id}-contig-cov-and-det.tmp + + # Filtering coverages based on detection + # Contigs + grep -v "#" ${sample_id}-contig-cov-and-det.tmp | \\ + awk -F \$'\\t' ' BEGIN {OFS=FS} { if ( \$5 <= 50 ) \$2 = 0 } { print \$1,\$2 } ' \\ + > ${sample_id}-contig-cov.tmp + cat <( printf "contig_ID\\tcoverage\\n" ) ${sample_id}-contig-cov.tmp > ${sample_id}-contig-coverages.tsv + + # Writing out empty genes coverage file + printf "gene_ID\\tcoverage\\n" > ${sample_id}-gene-coverages.tsv + printf "\\n\\nGene-level coverage info not recovered because the assembly didn't have any genes identified.\\n" + + # Removing intermediate files + rm ${sample_id}-contig-cov-and-det.tmp ${sample_id}-contig-cov.tmp + + fi + + else + + printf "gene_ID\\tcoverage\\n" > ${sample_id}-gene-coverages.tsv + printf "contig_ID\\tcoverage\\n" > ${sample_id}-contig-coverages.tsv + printf "Coverage info not recovered because the assembly didn't produce anything.\\n" + + fi + """ +} diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/create_runsheet.nf b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/create_runsheet.nf new file mode 100644 index 00000000..5883b490 --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/create_runsheet.nf @@ -0,0 +1,32 @@ +#!/usr/bin/env nextflow +nextflow.enable.dsl = 2 + +params.GLDS_accession = "OSS-466" + +process GET_RUNSHEET { + + beforeScript "chmod +x ${baseDir}/bin/create_runsheet.py" + + output: + path("*_runsheet.csv"), emit: runsheet + path("*.zip"), emit: zip + path("GLparams_file.csv"), emit: params_file + path("GLfile.csv"), emit: input_file + + script: + """ + create_runsheet.py --OSD ${params.GLDS_accession} + """ +} + + +workflow { + + GET_RUNSHEET() + file_ch = GET_RUNSHEET.out.input_file + .splitCsv() + + params_ch = GET_RUNSHEET.out.params_file + .splitCsv(header:true) + +} diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/database_creation.nf 
b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/database_creation.nf new file mode 100644 index 00000000..e77e6965 --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/database_creation.nf @@ -0,0 +1,254 @@ +#!/usr/bin/env nextflow +nextflow.enable.dsl = 2 + +params.CAT_DL_LINK = "https://tbb.bio.uu.nl/bastiaan/CAT_prepare/CAT_prepare_20210107.tar.gz" + +/**************************************************************************************** +*************************** Metagenomics databases set-up ****************************** +****************************************************************************************/ + +// This process download CAT reference database. +process SETUP_CAT_DB { + + tag "Downloading and setting up contig annotation tool-s (CAT) database..." + label "db_setup" + + input: + val(CAT_DB_LINK) + output: + path("CAT_prepare_20210107/"), emit: cat_db + path("CAT_prepare_20210107/CAT_DB_SETUP"), emit: completion_indicator + script: + """ + printf "### Setting up CAT's reference database ###\\n\\n" + printf " Downloading reference db:\\n\\n" + curl -L -C - -o CAT_prepare_20210107.tar.gz ${CAT_DB_LINK} + + printf "\\n\\n Extracting reference db:\\n\\n" + tar -xvzf CAT_prepare_20210107.tar.gz + + rm CAT_prepare_20210107.tar.gz CAT_prepare_20210107/2021-01-07_CAT_database/2021-01-07.nr.gz + touch CAT_prepare_20210107/CAT_DB_SETUP + printf "### Set up completed successfully ###\\n\\n" + """ +} + +// This process downloads KOFamScan db (minimally currently). +process SETUP_KOFAMSCAN_DB { + + tag "Downloading and setting up kofam scan-s database..." + label "db_setup" + + output: + path("kofamscan_db/"), emit: ko_db_dir + path("kofamscan_db/KO_DB_SETUP"), emit: completion_indicator + script: + """ + printf "### Setting up KOFamScan reference database ###\\n\\n" + + # Using https instead of ftp for those whose systems don't have access to the ftp servers + + printf "\\n Downloading ko_list file:\\n\\n" + + if ! curl -L -C - --connect-timeout 15 -o ko_list.gz ftp://ftp.genome.jp/pub/db/kofam/ko_list.gz + then + printf "\\n\\n Downloading via http since ftp seemed to fail making the connection:\\n\\n" + curl -L -C - -o ko_list.gz https://www.genome.jp/ftp/db/kofam/ko_list.gz + fi + + printf "\\n\\n Downloading profiles.tar.gz file:\\n\\n" + + + if ! curl -L -C - --connect-timeout 15 -o profiles.tar.gz ftp://ftp.genome.jp/pub/db/kofam/profiles.tar.gz + then + printf "\\n\\n Downloading via http since ftp seemed to fail making the connection:\\n\\n" + curl -L -C - -o profiles.tar.gz https://www.genome.jp/ftp/db/kofam/profiles.tar.gz + fi + + [ -d kofamscan_db/ ] || mkdir kofamscan_db/ + printf "\\n\\n Decompressing profiles.tar.gz file:\\n\\n" + tar -xzf profiles.tar.gz -C kofamscan_db/ + rm profiles.tar.gz + + gunzip ko_list.gz && \\ + mv ko_list kofamscan_db/ && \\ + touch kofamscan_db/KO_DB_SETUP + printf "### Set up completed successfully ###\\n\\n" + """ +} + +// This process downloads the gtdb-tk db (minimally currently) +process SETUP_GTDBTK_DB { + + tag "Downloading and setting up genome taxonomy database toolkit-s (GTDBTK) database..." 
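+    /*
+       Sketch of what the sed edit in the shell block below does to GTDB-Tk's bundled download-db.sh
+       (the wget line shown here is illustrative, not the script's exact contents): any call using
+       $db_url gains a one-hour timeout.
+
+         echo 'wget $db_url -O gtdbtk_data.tar.gz' | sed 's/\$db_url/--timeout=3600 $db_url/'
+         # wget --timeout=3600 $db_url -O gtdbtk_data.tar.gz
+    */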
+ label "db_setup" + + output: + path("GTDB-tk-ref-db/"), emit: gtdbtk_db_dir + path("GTDB-tk-ref-db/SETUP_GTDBTK_DB_SETUP"), emit: completion_indicator + shell: + ''' + [ -d GTDB-tk-ref-db/ ] || mkdir -p GTDB-tk-ref-db/ + + # But still needs to be set for this particular session that is downloading and setting up the db + export GTDBTK_DATA_PATH=GTDB-tk-ref-db/ + + # Make a copy of the download script to edit wget's timeout duration + cat `which download-db.sh` |sed 's/\$db_url/--timeout=3600 $db_url/' > download-db.sh && \ + chmod +x ./download-db.sh + # Downloading + ./download-db.sh && touch GTDB-tk-ref-db/SETUP_GTDBTK_DB_SETUP + printf "### Set up completed successfully ###\\n\\n" + ''' +} + +// The processes below download the databases required by humann3. +process SETUP_CHOCOPHLAN { + + tag "Downloading and setting up Humann-s chocoplan nucleotide database..." + label "humann_setup" + label "db_setup" + + output: + path("humann3-db/chocophlan"), emit: chocophlan_dir + path("humann3-db/CHOCOPHLAN_DB_SETUP"), emit: completion_indicator + script: + """ + [ -d humann3-db/ ] || mkdir -p humann3-db/ + printf "### Setting up humann3 reference databases ###\\n\\n" + + if [ ! -f humann3-db/CHOCOPHLAN_DB_SETUP ]; then + printf " Downloading full chocophlan db:\\n\\n" + # No need to update locations since I pass them as arguaments to the script + humann3_databases --update-config no --download chocophlan full humann3-db/ && \\ + touch humann3-db/CHOCOPHLAN_DB_SETUP + printf "### Set up completed successfully ###\\n\\n" + fi + """ +} + + +process SETUP_UNIREF { + + tag "Downloading and setting up Humann-s uniref protein database..." + label "humann_setup" + label "db_setup" + + output: + path("humann3-db/uniref/"), emit: uniref_dir + path("humann3-db/UNIREF_DB_SETUP"), emit: completion_indicator + script: + """ + [ -d humann3-db/ ] || mkdir -p humann3-db/ + printf "### Setting up humann3's uniref database ###\\n\\n" + if [ ! -f humann3-db/UNIREF_DB_SETUP ];then + printf "\\n\\n Downloading uniref90_ec_filtered_diamond db:\\n\\n" + # No need to update locations since I pass them as arguaments to the script + humann3_databases --update-config no --download uniref uniref90_ec_filtered_diamond humann3-db/ && \\ + touch humann3-db/UNIREF_DB_SETUP + printf "### Set up completed successfully ###\\n\\n" + fi + """ +} + +process SETUP_UTILITY_MAPPING { + + tag "Downloading and setting up Humann-s utilities mapping database..." + label "humann_setup" + label "db_setup" + + output: + path("humann3-db/utility_mapping/"), emit: utilities_dir + path("humann3-db/UTILITY_MAPPING_SETUP"), emit: completion_indicator + script: + """ + [ -d humann3-db/ ] || mkdir -p humann3-db/ + printf "### Setting up humann3's utilities mapping database ###\\n\\n" + if [ ! -f humann3-db/UTILITY_MAPPING_SETUP ];then + printf "\\n\\n Downloading full utility_mapping db:\\n\\n" + + # Containers are read only but conda environments can be modified + if [ ${params.use_conda} == 'true'];then + + humann3_databases --update-config yes --download utility_mapping full humann3-db/ && \\ + touch humann3-db/UTILITY_MAPPING_SETUP + + else + + humann3_databases --update-config no --download utility_mapping full humann3-db/ && \\ + touch humann3-db/UTILITY_MAPPING_SETUP + + fi + printf "### Set up completed successfully ###\\n\\n" + fi + """ +} + + +process SETUP_METAPHLAN { + tag "Downloading and setting up Humann-s utilities mapping database..." 
+ label "humann_setup" + label "db_setup" + + output: + path("metaphlan4-db/"), emit: metaphlan_db_dir + path("metaphlan4-db/METAPHLAN4_DB_SETUP"), emit: completion_indicator + + script: + """ + [ -d metaphlan4-db/ ] || mkdir -p metaphlan4-db/ + printf "### Setting up metaphlan's reference database ###\\n\\n" + + if [ ! -f metaphlan4-db/METAPHLAN4_DB_SETUP ];then + printf "\\n\\n Downloading metaphlan db:\\n\\n" + metaphlan --install --bowtie2db metaphlan4-db/ && \\ + touch metaphlan4-db/METAPHLAN4_DB_SETUP + printf "### Set up completed successfully ###\\n\\n" + fi + """ +} + + +workflow make_humann_db { + + main: + SETUP_CHOCOPHLAN() + SETUP_UNIREF() + SETUP_UTILITY_MAPPING() + SETUP_METAPHLAN() + + emit: + chocophlan_dir = SETUP_CHOCOPHLAN.out.chocophlan_dir + uniref_dir = SETUP_UNIREF.out.uniref_dir + metaphlan_db_dir = SETUP_METAPHLAN.out.metaphlan_db_dir + utilities_dir = SETUP_UTILITY_MAPPING.out.utilities_dir + +} + +workflow make_databases { + + take: + CAT_DB_LINK + + main: + SETUP_CAT_DB(CAT_DB_LINK) + SETUP_KOFAMSCAN_DB() + SETUP_GTDBTK_DB() + make_humann_db() + + emit: + cat_db = SETUP_CAT_DB.out.cat_db + kofam_db = SETUP_KOFAMSCAN_DB.out.ko_db_dir + gtdbtk_db_dir = SETUP_GTDBTK_DB.out.gtdbtk_db_dir + chocophlan_dir = make_humann_db.out.chocophlan_dir + uniref_dir = make_humann_db.out.uniref_dir + metaphlan_db_dir = make_humann_db.out.metaphlan_db_dir + utilities_dir = make_humann_db.out.utilities_dir + + } + + + +workflow { + make_databases(Channel.of(params.CAT_DB_LINK)) +} diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/quality_assessment.nf b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/quality_assessment.nf new file mode 100644 index 00000000..c4371eb7 --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/quality_assessment.nf @@ -0,0 +1,116 @@ +#!/usr/bin/env nextflow +nextflow.enable.dsl = 2 + +/**************************************************************************************** +********************* Sequence quality assessment and control processes **************** +****************************************************************************************/ + +// a 2-column (single-end) or 3-column (paired-end) file +params.prefix = "raw" // "filetered" +params.csv_file = "file.csv" +params.swift_1S = false +params.adapters = "${baseDir}/config/bbtools_dapters.fa" +params.multiqc_config = "config/multiqc.config" + +process FASTQC { + // FastQC performed on reads + tag "Running fastqc on ${sample_id}" + + input: + tuple val(sample_id), path(reads), val(isPaired) + output: + tuple path("*.html"), path("*.zip") + script: + """ + fastqc -o . \\ + -t ${task.cpus} \\ + ${reads} + """ +} + +process MULTIQC { + + tag "Running multiqc on the ${prefix} files.." 
+ + input: + val(prefix) + path(multiqc_config) + path(files) + output: + path("${params.additional_filename_prefix}${prefix}_multiqc${params.assay_suffix}_report.zip") + script: + """ + multiqc -q --filename ${params.additional_filename_prefix}${prefix}_multiqc \\ + --force --cl-config 'max_table_rows: 99999999' \\ + --interactive --config ${multiqc_config} \\ + --outdir ${params.additional_filename_prefix}${prefix}_multiqc_report ${files} > /dev/null 2>&1 + + + # zipping and removing unzipped dir + zip -q -r \\ + ${params.additional_filename_prefix}${prefix}_multiqc${params.assay_suffix}_report.zip \\ + ${params.additional_filename_prefix}${prefix}_multiqc_report + + """ + } + + +// This process runs quality filtering/trimming on input fastq files. +process BBDUK { + + + tag "Quality filtering ${sample_id}-s reads.." + beforeScript "chmod +x ${baseDir}/bin/*" + + input: + tuple val(sample_id), path(reads), val(isPaired) + path(adapters) + output: + tuple val(sample_id), path("*${params.filtered_suffix}"), val(isPaired) + script: + def isSwift = params.swift_1S ? 't' : 'f' + """ + if [ ${isPaired} == true ];then + + bbduk.sh in=${reads[0]} in2=${reads[1]} \\ + out1=${sample_id}${params.filtered_R1_suffix} \\ + out2=${sample_id}${params.filtered_R2_suffix} \\ + ref=${adapters} \\ + ktrim=l k=17 ftm=5 qtrim=rl \\ + trimq=10 mlf=0.5 maxns=0 swift=${isSwift} + else + + bbduk.sh in=${reads[0]} out1=${sample_id}${params.filtered_suffix} \\ + ref=${adapters} \\ + ktrim=l k=17 ftm=5 qtrim=rl \\ + trimq=10 mlf=0.5 maxns=0 swift=${isSwift} + + fi + """ +} + + +workflow quality_check { + + take: + prefix_ch + multiqc_config + reads_ch + + + main: + fastqc_ch = FASTQC(reads_ch).flatten().collect() + MULTIQC(prefix_ch, multiqc_config, fastqc_ch) +} + +workflow { + + Channel.fromPath(params.csv_file) + .splitCsv() + .map{ row -> row.paired ? tuple( "${row.sample_id}", [file("${row.forward}"), file("${row.reverse}")], row.paired) : + tuple( "${row.sample_id}", [file("${row.forward}")], row.paired)} + .set{reads_ch} + + res_ch = quality_check(Channel.of(params.prefix), params.multiqc_config, reads_ch) + BBDUK(reads_ch) +} diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/read_based_processing.nf b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/read_based_processing.nf new file mode 100644 index 00000000..84c02acc --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/read_based_processing.nf @@ -0,0 +1,269 @@ +#!/usr/bin/env nextflow +nextflow.enable.dsl = 2 +params.additional_filename_prefix = "" +params.assay_suffix = "_GLmetagenomics" + +/**************************************************************************************** +********************* Read-based processing using Humann3 ******************************* +****************************************************************************************/ + + +/* + This process runs humann3 and metaphlan4 on each individual sample generating the + read-based functional annotations and taxonomic classifications. +*/ + +process HUMANN { + + tag "Running humann on ${sample_id}-s reads..." 
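+    /*
+       HUMAnN takes a single input fastq, so for paired-end samples the zcat step in the script below
+       simply concatenates both files first; for a hypothetical paired sample this is equivalent to:
+
+         zcat Sample-1_R1_filtered.fastq.gz Sample-1_R2_filtered.fastq.gz > Sample-1-reads.tmp.fq
+    */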
+ label "read_based" + + input: + tuple val(sample_id), path(reads), val(isPaired) + path(chocophlan_dir) + path(uniref_dir) + path(metaphlan_dir) + + output: + path("${sample_id}-humann3-out-dir/${sample_id}_genefamilies.tsv"), emit: genefamilies + path("${sample_id}-humann3-out-dir/${sample_id}_pathabundance.tsv"), emit: pathabundance + path("${sample_id}-humann3-out-dir/${sample_id}_pathcoverage.tsv"), emit: pathcoverage + path("${sample_id}-humann3-out-dir/${sample_id}_metaphlan_bugs_list.tsv"), emit: metaphlan_bugs_list + script: + """ + zcat ${reads} > ${sample_id}-reads.tmp.fq + + humann --input ${sample_id}-reads.tmp.fq \\ + --output ${sample_id}-humann3-out-dir/ \\ + --threads ${task.cpus} \\ + --output-basename ${sample_id} \\ + --metaphlan-options "--bowtie2db ${metaphlan_dir} --unclassified_estimation --add_viruses --sample_id ${sample_id}" \\ + --nucleotide-database ${chocophlan_dir} \\ + --protein-database ${uniref_dir} \\ + --bowtie-options "--sensitive --mm" + + mv ${sample_id}-humann3-out-dir/${sample_id}_humann_temp/${sample_id}_metaphlan_bugs_list.tsv \\ + ${sample_id}-humann3-out-dir/${sample_id}_metaphlan_bugs_list.tsv + + """ +} + + +/* + This process combines the read-based humann3 output functional + tables from indiviual samples into single tables across the GLDS dataset. +*/ + +process COMBINE_READ_BASED_PROCESSING_TABLES { + + tag "Combining the read based processing tables..." + label "read_based" + + input: + path(gene_families) + path(path_abundances) + path(path_coverages) + path(utilities_path) + output: + path("${params.additional_filename_prefix}gene-families-initial.tsv"), emit: gene_families + path("${params.additional_filename_prefix}pathway-abundances-initial.tsv"), emit: path_abundances + path("${params.additional_filename_prefix}pathway-coverages-initial.tsv"), emit: path_coverages + script: + """ + if [ ${params.use_conda} == true ]; then + # Setting humann3 utilities location (can be off if we pointed to + # a previously installed database, and doesn't hurt to reset if it was already good-to-go) + humann_config --update database_folders utility_mapping ${utilities_path} > /dev/null 2>&1 + fi + + # they each need to be in the same directories to be merged + mkdir -p gene-family-results/ path-abundance-results/ path-coverage-results/ + cp ${gene_families} gene-family-results/ + cp ${path_abundances} path-abundance-results/ + cp ${path_coverages} path-coverage-results/ + + humann_join_tables -i gene-family-results/ -o ${params.additional_filename_prefix}gene-families-initial.tsv > /dev/null 2>&1 + humann_join_tables -i path-abundance-results/ -o ${params.additional_filename_prefix}pathway-abundances-initial.tsv > /dev/null 2>&1 + humann_join_tables -i path-coverage-results/ -o ${params.additional_filename_prefix}pathway-coverages-initial.tsv > /dev/null 2>&1 + """ +} + + +/* + The read-based functional annotation tables have taxonomic info and non-taxonomic info mixed + together initially. Humann comes with utility scripts to split these. This process does that, + generating non-taxonomically grouped functional info files and taxonomically grouped ones. +*/ + +process SPLIT_READ_BASED_PROCESSING_TABLES { + + tag "Splitting humann stratified tables..." 
+ label "read_based" + + input: + path(gene_families) + path(path_abundances) + path(path_coverages) + output: + path("${params.additional_filename_prefix}Gene-families.tsv"), emit: gene_families + path("${params.additional_filename_prefix}Gene-families-grouped-by-taxa.tsv"), emit: gene_families_grouped + path("${params.additional_filename_prefix}Pathway-abundances.tsv"), emit: path_abundances + path("${params.additional_filename_prefix}Pathway-abundances-grouped-by-taxa.tsv"), emit: path_abundances_grouped + path("${params.additional_filename_prefix}Pathway-coverages.tsv"), emit: path_coverages + path("${params.additional_filename_prefix}Pathway-coverages-grouped-by-taxa.tsv"), emit: path_coverages_grouped + script: + """ + [ -d temp_processing/ ] && rm -rf temp_processing/ + mkdir temp_processing/ + + # Gene Families + humann_split_stratified_table -i ${gene_families} -o temp_processing/ > /dev/null 2>&1 + mv temp_processing/${params.additional_filename_prefix}gene-families-initial_stratified.tsv ${params.additional_filename_prefix}Gene-families-grouped-by-taxa.tsv + mv temp_processing/${params.additional_filename_prefix}gene-families-initial_unstratified.tsv ${params.additional_filename_prefix}Gene-families.tsv + + # Pathway Abundance + humann_split_stratified_table -i ${path_abundances} -o temp_processing/ > /dev/null 2>&1 + mv temp_processing/${params.additional_filename_prefix}pathway-abundances-initial_stratified.tsv ${params.additional_filename_prefix}Pathway-abundances-grouped-by-taxa.tsv + mv temp_processing/${params.additional_filename_prefix}pathway-abundances-initial_unstratified.tsv ${params.additional_filename_prefix}Pathway-abundances.tsv + + # Pathway Coverage + humann_split_stratified_table -i ${path_coverages} -o temp_processing/ > /dev/null 2>&1 + mv temp_processing/${params.additional_filename_prefix}pathway-coverages-initial_stratified.tsv ${params.additional_filename_prefix}Pathway-coverages-grouped-by-taxa.tsv + mv temp_processing/${params.additional_filename_prefix}pathway-coverages-initial_unstratified.tsv ${params.additional_filename_prefix}Pathway-coverages.tsv + """ +} + + +/* + This process generates some normalized tables of the read-based functional outputs from + humann that are more readily suitable for across sample comparisons. +*/ + +process GEN_NORMALIZED_READ_BASED_PROCESSING_TABLES { + + tag "Generating normalized humann tables..." + label "read_based" + + input: + path(gene_families) + path(path_abundances) + output: + path("${params.additional_filename_prefix}Gene-families-cpm${params.assay_suffix}.tsv"), emit: gene_families + path("${params.additional_filename_prefix}Pathway-abundances-cpm${params.assay_suffix}.tsv"), emit: path_abundances + + script: + """ + humann_renorm_table \\ + -i ${gene_families} \\ + -o ${params.additional_filename_prefix}Gene-families-cpm${params.assay_suffix}.tsv \\ + --update-snames > /dev/null 2>&1 + + humann_renorm_table \\ + -i ${path_abundances} \\ + -o ${params.additional_filename_prefix}Pathway-abundances-cpm${params.assay_suffix}.tsv \\ + --update-snames > /dev/null 2>&1 + """ +} + + +/* + This process summarizes the read-based humann annotations based on Kegg Orthlogy terms. +*/ + +process GEN_READ_BASED_PROCESSING_KO_TABLE { + + tag "Retrieving Kegg Orthologs..." 
+ label "read_based" + + input: + path(gene_families) + output: + path("${params.additional_filename_prefix}Gene-families-KO-cpm${params.assay_suffix}.tsv") + + script: + """ + humann_regroup_table \\ + -i ${gene_families} \\ + -g uniref90_ko 2> /dev/null | \\ + humann_rename_table \\ + -n kegg-orthology 2> /dev/null | \\ + humann_renorm_table \\ + -o ${params.additional_filename_prefix}Gene-families-KO-cpm${params.assay_suffix}.tsv \\ + --update-snames > /dev/null 2>&1 + """ +} + + + +//This process merges the taxonomy tables generated by metaphlan +process COMBINE_READ_BASED_PROCESSING_TAXONOMY { + + tag "Merging metaphlan taxonomy tables..." + label "read_based" + + input: + path(metaphlan_bugs_list_files) + output: + path("${params.additional_filename_prefix}Metaphlan-taxonomy${params.assay_suffix}.tsv") + script: + """ + merge_metaphlan_tables.py ${metaphlan_bugs_list_files} \\ + > ${params.additional_filename_prefix}Metaphlan-taxonomy${params.assay_suffix}.tsv 2> /dev/null + + # Removing redundant text from headers + sed -i 's/_metaphlan_bugs_list//g' ${params.additional_filename_prefix}Metaphlan-taxonomy${params.assay_suffix}.tsv + """ +} + + + +workflow read_based { + + take: + filtered_reads + chocophlan_dir + uniref_dir + metaphlan_dir + utilities_path + + main: + + + HUMANN(filtered_reads, chocophlan_dir, uniref_dir, metaphlan_dir) + + gene_families_ch = HUMANN.out.genefamilies.collect() + pathabundance_ch = HUMANN.out.pathabundance.collect() + pathcoverage_ch = HUMANN.out.pathcoverage.collect() + metaphlan_bugs_list_ch = HUMANN.out.metaphlan_bugs_list.collect() + + COMBINE_READ_BASED_PROCESSING_TABLES(gene_families_ch, pathabundance_ch, pathcoverage_ch, utilities_path) + + SPLIT_READ_BASED_PROCESSING_TABLES(COMBINE_READ_BASED_PROCESSING_TABLES.out.gene_families, + COMBINE_READ_BASED_PROCESSING_TABLES.out.path_abundances, + COMBINE_READ_BASED_PROCESSING_TABLES.out.path_coverages) + + GEN_NORMALIZED_READ_BASED_PROCESSING_TABLES(SPLIT_READ_BASED_PROCESSING_TABLES.out.gene_families, + SPLIT_READ_BASED_PROCESSING_TABLES.out.path_abundances) + + ko_table_ch = GEN_READ_BASED_PROCESSING_KO_TABLE(SPLIT_READ_BASED_PROCESSING_TABLES.out.gene_families) + + taxonomy_ch = COMBINE_READ_BASED_PROCESSING_TAXONOMY(metaphlan_bugs_list_ch) + + emit: + gene_families = GEN_NORMALIZED_READ_BASED_PROCESSING_TABLES.out.gene_families + path_abundances = GEN_NORMALIZED_READ_BASED_PROCESSING_TABLES.out.path_abundances + ko_table = ko_table_ch + taxonomy = taxonomy_ch + +} + + +workflow { + + read_based(filtered_reads_ch, + params.database.chocophlan_dir, + params.database.uniref_dir, + params.database.metaphlan_db_dir, + params.database.utilities_dir) + +} diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/read_mapping.nf b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/read_mapping.nf new file mode 100644 index 00000000..0fb61d8a --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/read_mapping.nf @@ -0,0 +1,84 @@ +#!/usr/bin/env nextflow +nextflow.enable.dsl = 2 + +/**************************************************************************************** +********************* Read mapping to contig assembly using Bowtie2 ******************** +****************************************************************************************/ + +// This process builds the bowtie2 index and runs the mapping for each sample +process MAPPING { + + tag "Mapping ${sample_id}-s reads to its 
assembly ${assembly}..."
+    label "mapping"
+
+    input:
+        tuple val(sample_id), path(assembly), path(reads), val(isPaired)
+    output:
+        tuple val(sample_id), path("${sample_id}.sam")
+    script:
+        """
+        if [ ${isPaired} == 'true' ]; then
+
+            # Only running if the assembly produced anything
+            if [ -s ${assembly} ]; then
+
+                bowtie2-build ${assembly} ${sample_id}-index
+                bowtie2 --mm -q --threads ${task.cpus} \\
+                        -x ${sample_id}-index -1 ${reads[0]} -2 ${reads[1]} \\
+                        --no-unal > ${sample_id}.sam 2> ${sample_id}-mapping-info.txt
+                rm ${sample_id}-index*
+            else
+
+                touch ${sample_id}.sam
+                printf "Mapping not performed for ${sample_id} because the assembly didn't produce anything.\\n"
+
+            fi
+
+        # Single-end
+        else
+
+            # Only running if the assembly produced anything
+            if [ -s ${assembly} ]; then
+
+                bowtie2-build ${assembly} ${sample_id}-index
+                bowtie2 --mm -q --threads ${task.cpus} \\
+                        -x ${sample_id}-index -r ${reads[0]} \\
+                        --no-unal > ${sample_id}.sam 2> ${sample_id}-mapping-info.txt
+
+                rm ${sample_id}-index*
+            else
+
+                touch ${sample_id}.sam
+                printf "Mapping not performed for ${sample_id} because the assembly didn't produce anything.\\n"
+
+            fi
+
+        fi
+        """
+}
+
+
+
+// This process sorts each sample's sam file and converts it to a bam file
+process SAM_TO_BAM {
+
+    tag "Sorting and converting ${sample_id}'s sam to a bam file..."
+    label "mapping"
+
+    input:
+        tuple val(sample_id), path(sam)
+    output:
+        tuple val(sample_id), path("${sample_id}.bam")
+    script:
+        """
+        # Only running if read mapping produced anything
+        if [ -s ${sam} ]; then
+
+            samtools sort -@ ${task.cpus} ${sam} > ${sample_id}.bam 2> /dev/null
+
+        else
+
+            touch ${sample_id}.bam
+            printf "Sorting and converting not performed for ${sample_id} because read mapping didn't produce anything.\\n"
+
+        fi
+        """
+}
diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/summarize_MAG.nf b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/summarize_MAG.nf
new file mode 100644
index 00000000..746ab665
--- /dev/null
+++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/summarize_MAG.nf
@@ -0,0 +1,343 @@
+#!/usr/bin/env nextflow
+nextflow.enable.dsl = 2
+
+/****************************************************************************************
+********************* Summarize metagenome-assembled genomes (MAGs) ********************
+****************************************************************************************/
+
+
+params.min_est_comp = 90
+params.max_est_redund = 10
+params.max_est_strain_het = 50
+
+/*
+   Scratch directory for gtdb-tk. Classification can be memory intensive; if wanting to use disk
+   space instead of RAM, put in quotes the path to a directory that already exists
+   (see https://ecogenomics.github.io/GTDBTk/faq.html#gtdb-tk-reaches-the-memory-limit-pplacer-crashes).
+   Leave it empty to use memory, the default.
+*/
+
+params.gtdb_tk_scratch_location = ""
+
+/*
+   Filters checkm results based on estimated completeness, redundancy (contamination), and
+   strain heterogeneity. Defaults are a conservative 90, 10, and 50.
+*/
+
+process FILTER_CHECKM_RESULTS_AND_COPY_MAGS {
+
+    tag "Filtering checkm's results..."
+    label "mags"
+    label "bit"
+
+    input:
+        path(bins_checkm_results)
+        path(bins)
+    output:
+        path("${params.additional_filename_prefix}MAGs-checkm-out.tsv"), emit: MAGs_checkm_out
+        path("MAGs_dir/"), emit: MAGs_dir
+    script:
+        """
+        # Only running if there were bins recovered
+        if [ `find -L .
-name '*.fasta' | wc -l | sed 's/^ *//'` -gt 0 ]; then + + cat <( printf "Bin Id\\tMarker lineage\\t# genomes\\t# markers\\t# marker sets\\t0\\t1\\t2\\t3\\t4\\t5+\\tCompleteness\\tContamination\\tStrain heterogeneity\\n" ) \\ + <( awk -F '\\t' ' \$12 >= ${params.min_est_comp} && \$13 <= ${params.max_est_redund} && \$14 <= ${params.max_est_strain_het} ' ${bins_checkm_results} ) \\ + > MAGs-checkm-out.tmp + + sed 's/-bin\\./-MAG-/' MAGs-checkm-out.tmp > ${params.additional_filename_prefix}MAGs-checkm-out.tsv + + [ -d MAGs_dir/ ] || mkdir MAGs_dir/ + for MAG in `cut -f 1 MAGs-checkm-out.tmp | tail -n +2` + do + new_ID=`echo \$MAG | sed 's/-bin\\./-MAG-/'` + cp \$MAG.fasta MAGs_dir/\${new_ID}.fasta + done + + else + + printf "There were no MAGs recovered.\\n" > ${params.additional_filename_prefix}MAGs-checkm-out.tsv + + fi + """ +} + + + +// Assign taxonomy to MAGs with gtdb-tk + +process GTDBTK_ON_MAGS { + + tag "Assigning taxonomy to your MAGs with gtdb-tk..." + label "mags" + + input: + path(MAGs_checkm_out) + path(MAGs_dir) + path(gtdbtk_db_dir) + val(use_gtdbtk_scratch_location) + env(GTDBTK_DATA_PATH) + + output: + path("gtdbtk-out/") + script: + """ + # Only running if any MAGs were recovered + if [ `find -L ${MAGs_dir} -name '*.fasta' | wc -l | sed 's/^ *//'` -gt 0 ]; then + + if [ ${use_gtdbtk_scratch_location} == 'true' ]; then + + [ -d gtdbtk_scratch_location/ ] || mkdir gtdbtk_scratch_location/ + + gtdbtk classify_wf \\ + --scratch_dir gtdbtk_scratch_location/ \\ + --genome_dir ${MAGs_dir} \\ + -x fasta \\ + --out_dir gtdbtk-out/ \\ + --cpus ${task.cpus} \\ + --pplacer_cpus 1 + + else + + gtdbtk classify_wf \\ + --genome_dir ${MAGs_dir} \\ + -x fasta \\ + --out_dir gtdbtk-out/ \\ + --cpus ${task.cpus} \\ + --pplacer_cpus 1 + + fi + + else + + mkdir -p gtdbtk-out/ + printf "There were no MAGs recovered.\\n" \\ + > gtdbtk-out/No-MAGs-recovered.txt + + printf "\\n\\nThere were no MAGs recovered, so GTDB-tk was not run.\\n\\n" + + fi + """ +} + + + +// Summarize MAG assemblies +process SUMMARIZE_MAG_ASSEMBLIES { + + tag "Summarizing MAG assemblies..." + label "mags" + label "bit" + + input: + path(MAGs_dir) + output: + path("${params.additional_filename_prefix}MAG-assembly-summaries.tsv") + script: + """ + # Only running if any MAGs were recovered + if [ `find -L ${MAGs_dir} -name '*.fasta' | wc -l | sed 's/^ *//'` -gt 0 ]; then + + bit-summarize-assembly ${MAGs_dir}/*.fasta -o MAG-summaries.tmp -t + + # Slimming down the output + cut -f 1,2,3,5,6,8,11,18,19,20 MAG-summaries.tmp \\ + > ${params.additional_filename_prefix}MAG-assembly-summaries.tsv + + else + + printf "There were no MAGs recovered.\\n" \\ + > ${params.additional_filename_prefix}MAG-assembly-summaries.tsv + + fi + """ +} + +process GENERATE_MAGS_OVERVIEW_TABLE { + + tag "Generating an overview table of all MAGs..." 
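+    /*
+       The bin -> MAG renaming used in FILTER_CHECKM_RESULTS_AND_COPY_MAGS above is a simple sed swap;
+       for a hypothetical bin ID:
+
+         echo "Sample-1-bin.2" | sed 's/-bin\./-MAG-/'
+         # Sample-1-MAG-2
+    */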
+ label "mags" + label "bit" + + input: + path(MAG_assembly_summaries) + path(MAGs_checkm_out) + path(gtdbtk_out) + path(MAGs_dir) + output: + path("${params.additional_filename_prefix}MAGs-overview${params.assay_suffix}.tsv") + + script: + """ + # Only running if any MAGs were recovered + if [ `find -L ${MAGs_dir} -name '*.fasta' | wc -l | sed 's/^ *//'` -gt 0 ]; then + + #--------------------------- get_MAGs_estimates_and_taxonomy.sh ------------------------------------# + get_MAGs_estimates_and_taxonomy.sh ${MAGs_dir} ${MAG_assembly_summaries} ${MAGs_checkm_out} ${gtdbtk_out} + #----------------------------------------------------------------------------------------------------# + + # Adding headers + cat <(printf "est. completeness\\test. redundancy\\test. strain heterogeneity\\n") \\ + checkm-estimates.tmp > checkm-estimates-with-headers.tmp + + cat <(printf "domain\\tphylum\\tclass\\torder\\tfamily\\tgenus\\tspecies\\n") \\ + gtdb-taxonomies.tmp > gtdb-taxonomies-with-headers.tmp + + paste ${MAG_assembly_summaries} \\ + checkm-estimates-with-headers.tmp \\ + gtdb-taxonomies-with-headers.tmp \\ + > MAGs-overview.tmp + + # Ordering by taxonomy + head -n 1 MAGs-overview.tmp > MAGs-overview-header.tmp + + tail -n +2 MAGs-overview.tmp | \\ + sort -t \$'\\t' -k 14,20 > MAGs-overview-sorted.tmp + + cat MAGs-overview-header.tmp MAGs-overview-sorted.tmp \\ + > ${params.additional_filename_prefix}MAGs-overview${params.assay_suffix}.tsv + + else + + printf "There were no MAGs recovered.\\n" \\ + > ${params.additional_filename_prefix}MAGs-overview${params.assay_suffix}.tsv + + fi + """ +} + + +process SUMMARIZE_MAG_LEVEL_KO_ANNOTATIONS { + + tag "Parsing MAG KO annotations..." + label "mags" + label "bit" + + input: + path(MAGs_overview) + path(gene_coverage_annotation_and_tax_files) + path(MAGs_dir) + output: + path("${params.additional_filename_prefix}MAG-level-KO-annotations${params.assay_suffix}.tsv") + + script: + """ + # Only running if any MAGs were recovered + if [ `find -L ${MAGs_dir} -name '*.fasta' | wc -l | sed 's/^ *//'` -gt 0 ]; then + + for MAG in `cut -f 1 ${MAGs_overview} | tail -n +2` + do + + sample_ID=`echo \$MAG | sed 's/-MAG-[0-9]*\$//'` + grep "^>" ${MAGs_dir}/\$MAG.fasta | tr -d ">" > curr-contig-ids.tmp + + parse-MAG-annots.py \\ + -i \${sample_ID}-gene-coverage-annotation-and-tax.tsv \\ + -w curr-contig-ids.tmp \\ + -M \$MAG \\ + -o ${params.additional_filename_prefix}MAG-level-KO-annotations${params.assay_suffix}.tsv + + done + + else + + printf "There were no MAGs recovered.\\n" \\ + > ${params.additional_filename_prefix}MAG-level-KO-annotations${params.assay_suffix}.tsv + + fi + """ +} + + +process SUMMARIZE_MAG_KO_ANNOTS_WITH_KEGG_DECODER { + + tag "Summarizing MAG KO annotations using kegg decoder..." 
+ label "mags" + + + input: + path(MAG_level_KO_annotations) + path(MAGs_dir) + output: + path("${params.additional_filename_prefix}MAG-KEGG-Decoder-out${params.assay_suffix}.tsv") + + script: + """ + # Getting number of MAGs recovered + num_mags_recovered=`find -L ${MAGs_dir}/ -name '*.fasta' | wc -l | sed 's/^ *//'` + # Only running if any MAGs were recovered + if [ \$num_mags_recovered -gt 0 ]; then + + # KEGGDecoder splits on the first underscore to identify unique genome/MAG IDs + # this can be problematic with how things are named, so we are swapping them all to not have + # any "_" first, then afterwards we are changing the output table back to the original names so + # they match elsewhere (they will still be slightly different in the html output, but that is + # only manually explored anyway) + + # Making version of input for KEGGDecoder with no underscores + tr "_" "-" < ${MAG_level_KO_annotations} > mod-MAG-level-KO-annotations.tmp + + # Making mapping file + paste <( cut -f 1 ${MAG_level_KO_annotations} ) \\ + <( cut -f 1 mod-MAG-level-KO-annotations.tmp ) \\ + > MAG-ID-map.tmp + + # Running KEGGDecoder + # can only create html output if there are more than 1 + if [ \$num_mags_recovered -gt 1 ]; then + KEGG-decoder -v interactive -i mod-MAG-level-KO-annotations.tmp -o MAG-KEGG-Decoder-out.tmp + else + KEGG-decoder -i mod-MAG-level-KO-annotations.tmp -o MAG-KEGG-Decoder-out.tmp + fi + + # Swapping MAG IDs back in output tsv from KEGGDecoder + swap-MAG-IDs.py -i MAG-KEGG-Decoder-out.tmp -m MAG-ID-map.tmp -o MAG-KEGG-Decoder-out.tsv && \\ + mv MAG-KEGG-Decoder-out.tsv \\ + ${params.additional_filename_prefix}MAG-KEGG-Decoder-out${params.assay_suffix}.tsv + + + else + + printf "There were no MAGs recovered.\\n" \\ + > ${params.additional_filename_prefix}MAG-KEGG-Decoder-out${params.assay_suffix}.tsv + + fi + """ +} + + + +workflow summarize_mags { + take: + bins_checkm_results_ch + bins_ch + gtdbtk_db_dir + use_gtdbtk_scratch_location + gene_coverage_annotation_and_tax_files_ch + + + main: + FILTER_CHECKM_RESULTS_AND_COPY_MAGS(bins_checkm_results_ch, bins_ch) + MAGs_checkm_out_ch = FILTER_CHECKM_RESULTS_AND_COPY_MAGS.out.MAGs_checkm_out + MAGs_dir_ch = FILTER_CHECKM_RESULTS_AND_COPY_MAGS.out.MAGs_dir + + gtdbtk_out_ch = GTDBTK_ON_MAGS(MAGs_checkm_out_ch, MAGs_dir_ch, gtdbtk_db_dir, use_gtdbtk_scratch_location, gtdbtk_db_dir) + + MAG_assembly_summaries_ch = SUMMARIZE_MAG_ASSEMBLIES(MAGs_dir_ch) + + MAGs_overview_ch = GENERATE_MAGS_OVERVIEW_TABLE(MAG_assembly_summaries_ch, + MAGs_checkm_out_ch, + gtdbtk_out_ch, + MAGs_dir_ch) + + MAG_level_KO_annotations_ch = SUMMARIZE_MAG_LEVEL_KO_ANNOTATIONS(MAGs_overview_ch, + gene_coverage_annotation_and_tax_files_ch, + MAGs_dir_ch) + + SUMMARIZE_MAG_KO_ANNOTS_WITH_KEGG_DECODER(MAG_level_KO_annotations_ch, MAGs_dir_ch) + + emit: + MAGs_overview = MAGs_overview_ch + MAGs_dir = MAGs_dir_ch + +} diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/summarize_assembly-based_processing.nf b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/summarize_assembly-based_processing.nf new file mode 100644 index 00000000..3822fc91 --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/summarize_assembly-based_processing.nf @@ -0,0 +1,40 @@ +#!/usr/bin/env nextflow +nextflow.enable.dsl = 2 + +/**************************************************************************************** +********************* Summarize Assembly based metagenomics 
processing ************************** +****************************************************************************************/ + +process GENERATE_ASSEMBLY_PROCESSING_OVERVIEW_TABLE { + + tag "Summarizing the results of assemnly processing...." + label "bit" + + input: + path(sample_IDs_file) + path(MAGs_overview) + path(MAGs_dir) + path(assemblies) + path(genes_aa) + path(metabat_assembly_depth_files) + path(bins) + path(bam_files) + output: + path("${params.additional_filename_prefix}Assembly-based-processing-overview${params.assay_suffix}.tsv") + script: + """ + mkdir assemblies_dir/ && mv *-assembly.fasta assemblies_dir/ + mkdir genes_dir/ && mv *-genes.faa genes_dir/ + mkdir mapping_dir/ && mv *-metabat-assembly-depth.tsv *.bam mapping_dir/ + mkdir bins_dir/ && mv *-bin*.fasta bins_dir/ + bash generate-assembly-based-overview-table.sh \\ + ${sample_IDs_file} \\ + assemblies_dir/ \\ + genes_dir/ \\ + mapping_dir/ \\ + bins_dir/ \\ + ${MAGs_dir}/ \\ + ${params.additional_filename_prefix}Assembly-based-processing-overview${params.assay_suffix}.tsv + """ +} + diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/summarize_bins.nf b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/summarize_bins.nf new file mode 100644 index 00000000..24a790ed --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/summarize_bins.nf @@ -0,0 +1,155 @@ +#!/usr/bin/env nextflow +nextflow.enable.dsl = 2 + +params.reduced_tree = "True" + +/**************************************************************************************** +********************* Bin check and summary ******************************************** +****************************************************************************************/ + +// Summarize bin assemblies +process SUMMARIZE_BIN_ASSEMBLIES { + + tag "Getting a summary of the recovered bins..." + label "bins" + label "bit" + + input: + path(bins) + output: + path("${params.additional_filename_prefix}bin-assembly-summaries.tsv") + + script: + """ + # Only running if any bins were recovered + if [ `find -L . -name '*.fasta' | wc -l | sed 's/^ *//'` -gt 0 ]; then + + bit-summarize-assembly *.fasta -o bin-summaries.tmp -t + + # Slimming down the output + cut -f 1,2,3,5,6,8,11,18,19,20 bin-summaries.tmp \\ + > ${params.additional_filename_prefix}bin-assembly-summaries.tsv + + else + + printf "There were no bins recovered.\\n" \\ + > ${params.additional_filename_prefix}bin-assembly-summaries.tsv + + fi + """ +} + + +// Runs checkm on recovered bins +process CHECKM_ON_BINS { + + tag "Running checkm on the recovered bins..." + label "bins" + + input: + path(bins) + output: + path("${params.additional_filename_prefix}bins-checkm-out.tsv") + + script: + """ + # only running if there were bins recovered + if [ `find -L . -name '*fasta' | wc -l | sed 's/^ *//'` -gt 0 ]; then + + mkdir -p checkm-working-tmp/ + + if [ ${params.reduced_tree} == "True" ]; then + + checkm lineage_wf \\ + -f ${params.additional_filename_prefix}bins-checkm-out.tsv \\ + --tab_table \\ + -t ${task.cpus} \\ + --reduced_tree \\ + --pplacer_threads 1 \\ + -x fasta . checkm-out-tmp/ \\ + --tmpdir checkm-working-tmp/ + + else + + checkm lineage_wf \\ + -f ${params.additional_filename_prefix}bins-checkm-out.tsv \\ + --tab_table \\ + -t ${task.cpus} \\ + --pplacer_threads 1 \\ + -x fasta . 
checkm-out-tmp/ \\ + --tmpdir checkm-working-tmp/ + + fi + + else + + printf "There were no bins recovered, so checkm was not run.\\n" \\ + > ${params.additional_filename_prefix}bins-checkm-out.tsv + + fi + """ +} + +process GENERATE_BINS_OVERVIEW_TABLE { + + tag "Generating an overall overview of the recovered bins..." + label "bins" + label "bit" + + input: + path(bin_assembly_summaries) + path(bins_checkm_results) + path(bins) + output: + path("${params.additional_filename_prefix}bins-overview${params.assay_suffix}.tsv") + + script: + """ + # Only running if there were bins recovered + if [ `find -L . -name '*.fasta' | wc -l | sed 's/^ *//'` -gt 0 ]; then + + # Making sure none of the intermediate files exist already + [ -f checkm-estimates.tmp ] && rm -rf checkm-estimates.tmp + [ -f checkm-estimates-with-headers.tmp ] && rm -rf checkm-estimates-with-headers.tmp + + for bin in `cut -f 1 ${bin_assembly_summaries} | tail -n +2` + do + + grep -w -m 1 "^\$bin" ${bins_checkm_results} | \\ + cut -f 12,13,14 >> checkm-estimates.tmp + + done + + # Adding header + cat <(printf "est. completeness\\test. redundancy\\test. strain heterogeneity\\n") \\ + checkm-estimates.tmp > checkm-estimates-with-headers.tmp + + # Combining + paste ${bin_assembly_summaries} checkm-estimates-with-headers.tmp \\ + > ${params.additional_filename_prefix}bins-overview${params.assay_suffix}.tsv + + else + + printf "There were no bins recovered.\\n" \\ + > ${params.additional_filename_prefix}bins-overview${params.assay_suffix}.tsv + + fi + """ +} + + +workflow summarize_bins { + + take: + binning_ch + + main: + bins = binning_ch.map{ sample_id, depth, bins -> bins instanceof List ? bins.each{it}: bins }.flatten().collect() + bin_assembly_summaries_ch = SUMMARIZE_BIN_ASSEMBLIES(bins) + bins_checkm_results_ch = CHECKM_ON_BINS(bins) + table = GENERATE_BINS_OVERVIEW_TABLE(bin_assembly_summaries_ch, bins_checkm_results_ch, bins) + + emit: + bins_checkm_results = bins_checkm_results_ch + overview_table = table +} diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/nextflow.config b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/nextflow.config new file mode 100644 index 00000000..e4a23d60 --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/nextflow.config @@ -0,0 +1,410 @@ +// global parameter +params { + + + // input file + // a 3-column (single-end) or 4-column (paired-end) file + csv_file = "${baseDir}/file.csv" + /* Run assembly-based workflow, read-based, or both + (values need to be one of: "assembly-based", "read-based", or "both") + It runs both by default + */ + workflow = "both" + assay_suffix = "_GLmetagenomics" + // additional prefix to add to output files that describe more than one sample (to make them unique compared to other datasets) + // leave as empty, i.e. "", if not wanted, include separator at end if adding one, e.g. 
"Swift1S_" + additional_filename_prefix = "" + publishDir_mode = "link" // "copy", "link", "symlink" + + // Quality trimmed/filtered suffixes + filtered_R1_suffix = "_R1_filtered.fastq.gz" + filtered_R2_suffix = "_R2_filtered.fastq.gz" + + // If single-end + filtered_suffix = "_filtered.fastq.gz" + + // Directories + + // Raw reads directory (can be relative to workflow directory, or needs to be full path) + raw_reads_dir = "${baseDir}/Raw_Sequence_Data/" + // output directories (all relative to processing directory, will be created) + fastqc_out_dir = "${baseDir}/FastQC_Outputs/" + filtered_reads_dir = "${baseDir}/Filtered_Sequence_Data/" + assembly_based_dir = "${baseDir}/Assembly-based_Processing/" + assemblies_dir = "${baseDir}/Assembly-based_Processing/assemblies/" + genes_dir = "${baseDir}/Assembly-based_Processing/predicted-genes/" + annotations_and_tax_dir = "${baseDir}/Assembly-based_Processing/annotations-and-taxonomy/" + mapping_dir = "${baseDir}/Assembly-based_Processing/read-mapping/" + combined_output_dir = "${baseDir}/Assembly-based_Processing/combined-outputs/" + bins_dir = "${baseDir}/Assembly-based_Processing/bins/" + MAGs_dir = "${baseDir}/Assembly-based_Processing/MAGs/" + read_based_dir = "${baseDir}/Read-based_Processing/" + + // Database creation + database { + CAT_DB_LINK = "https://tbb.bio.uu.nl/bastiaan/CAT_prepare/CAT_prepare_20210107.tar.gz" + cat_db = false //"/global/data/Data_Processing/Metagenomics_Datasets/Reference_DBs/CAT_prepare_20210107/" + ko_db_dir = false //"/global/data/Data_Processing/Metagenomics_Datasets/Reference_DBs/kofamscan_db/" + metaphlan_db_dir = false // "/global/data/Data_Processing/Metagenomics_Datasets/Reference_DBs/metaphlan4-db/" + chocophlan_dir = false // "/global/data/Data_Processing/Metagenomics_Datasets/Reference_DBs/humann3-db/chocophlan/" + uniref_dir = false //"/global/data/Data_Processing/Metagenomics_Datasets/Reference_DBs/humann3-db/uniref/" + utilities_dir = false //"/global/data/Data_Processing/Metagenomics_Datasets/Reference_DBs/humann3-db/utility_mapping/" + gtdbtk_db_dir = false //"/global/data/Data_Processing/Metagenomics_Datasets/Reference_DBs/GTDB-tk-ref-db/" + } + + // Quality assessment parameters + swift_1S = false + adapters = "${baseDir}/config/bbtools_adapters.fa" + multiqc_config = "${baseDir}/config/multiqc.config" + + + // Assembly + max_mem = 100e9 // 100GB + + // Binning parameters + reduced_tree = "True" + + // Annotation parameters + pileup_mem = "5g" // pileup.sh paramater for calculating contig coverage and depth + block_size = 4 // CAT blocksize + + // ---------- CAT database directory strings -----------------------------------------// + // The string below will be added to the end of the params.database.cat_db provided above + // cat taxonomy directory with cat_db path provided above + cat_taxonomy_dir = "2021-01-07_taxonomy/" + cat_db_sub_dir = "2021-01-07_CAT_database/" + + // MAG parameters + min_est_comp = 90 + max_est_redund = 10 + max_est_strain_het = 50 + + /* + Scratch directory for gtdb-tk, if wanting to use disk space instead of RAM, can be memory intensive; + see https://ecogenomics.github.io/GTDBTk/faq.html#gtdb-tk-reaches-the-memory-limit-pplacer-crashes + leave empty if wanting to use memory, the default, put in quotes the path to a directory that + already exists if wanting to use disk space + */ + + use_gtdbtk_scratch_location = false + + + conda{ + // Specify the paths to your existing conda environments + qc = "/global/smf/miniconda38_admin/envs/a67a313b901bffeb949ee679820981a4_" 
+ humann3 = "/global/smf/miniconda38_admin/envs/8cadfa296db4337f4d5f4ee6bd59f2d3_" + cat = "/global/smf/miniconda38_admin/envs/genelab-utils/envs/CAT" + prodigal = "/global/smf/miniconda38_admin/envs/31039157d8d2320a653a50b2353ce0c9_" + metabat = "/global/smf/miniconda38_admin/envs/1e48cd71cb5739eefcdf849c11557b7e" + gtdbtk = "/global/smf/miniconda38_admin/envs/05cd6e2c70e60a034af2af7d2e5abb8f" + kegg_decoder = "/global/smf/miniconda38_admin/envs/5dc7e7553274029dcf87b4ce64b4391e/" + megahit = "/global/smf/miniconda38_admin/envs/8600b7db4da6dc22ed180c125d86b864" + bit = "/global/smf/miniconda38_admin/envs/0bb3c81a02cf9007313e2d71282d6319" + kofamscan = "/global/smf/miniconda38_admin/envs/56ab8e5f1bff6ce5c2441dcaca538be7" + mapping = "/global/smf/miniconda38_admin/envs/c6fadba5e06e7acb62dac81b8e83183a" + checkm = "/global/smf/miniconda38_admin/envs/3402ea06a7a3579585df8cbd75611bf1" + } + + GLDS_accession = false + executor = "local" // "slurm" + errorStrategy = "ignore" + params.use_conda = false +} + + +profiles { + + slurm { + executor = "slurm" + process.queueSize = 32 // how many jobs should be submitted at one time + singularity.enabled = true + singularity.autoMounts = true + singularity.cacheDir = '/global/data/temp_scratch/oobayomi/metagenomics/version2/singularity/' + } + + conda { + conda.enabled = true + params.use_conda = true + } + + singularity { + singularity.enabled = true + singularity.autoMounts = true + singularity.cacheDir = '/global/data/temp_scratch/oobayomi/metagenomics/version2/singularity/' + process.ext.singularity_pull_docker_container = true + } + + docker { + docker.enabled = true + docker.runOptions = '-u $(id -u):$(id -g)' + docker.userEmulation = true + } + +} + +params.DB_ROOT = "${baseDir}/Reference_DBs" +params.use_conda = false + + +// Mount the databases to their predefined locations in the Biobakery container +if(!params.database.chocophlan_dir || !params.database.uniref_dir || + !params.database.metaphlan_db_dir || !params.database.utilities_dir) { + + //biobakery/humann:3.6 - replace /usr/local/lib/python3.6/dist-packages/humann/data/ + //chocophlan = "${params.DB_ROOT}/humann3-db/chocophlan/:/opt/conda/envs/humann3/lib/python3.10/site-packages/humann/data/chocophlan_DEMO" + //uniref = "${params.DB_ROOT}/humann3-db/uniref/:/opt/conda/envs/humann3/lib/python3.10/site-packages/humann/data/uniref_DEMO" + //utilities = "${params.DB_ROOT}/humann3-db/utility_mapping/:/opt/conda/envs/humann3/lib/python3.10/site-packages/humann/data/misc" + utilities = "${params.DB_ROOT}/humann3-db/utility_mapping/:/usr/local/lib/python3.6/dist-packages/humann/data/misc" + +}else{ + + //biobakery/humann:3.6 - replace /usr/local/lib/python3.6/dist-packages/humann/data/ + //chocophlan = "${params.database.chocophlan_dir}:/opt/conda/envs/humann3/lib/python3.10/site-packages/humann/data/chocophlan_DEMO" + //uniref = "${params.database.uniref_dir}:/opt/conda/envs/humann3/lib/python3.10/site-packages/humann/data/uniref_DEMO" + //utilities = "${params.database.utilities_dir}:/opt/conda/envs/humann3/lib/python3.10/site-packages/humann/data/misc" + utilities = "${params.database.utilities_dir}:/usr/local/lib/python3.6/dist-packages/humann/data/misc" +} + + + +process { + + executor = "slurm" //"local" + //errorStrategy = "ignore" //{ params.errorStrategy ? params.errorStrategy : "ignore"} + queue = "normal,priority" + maxRetries = 2 + memory = '5 GB' + cache = 'lenient' + cpus = 8 + //debug = true + + + withLabel: bit { + cpus = 2 + conda = {params.conda.bit ? 
params.conda.bit : "envs/bit.yaml"} + container = "olabiyi/bit-astrobiomike:1.0" + memory = "5 GB" + } + +// Database set-up + withLabel: humann_setup { + conda = {params.conda.humann3 ? params.conda.humann3 : "envs/humann3.yaml"} + container = "biobakery/humann:3.9" //"olabiyi/humann3.6-metaphlan4.0.1:1.0" // "biobakery/humann:3.6" + } + + withName: SETUP_METAPHLAN { + memory = "100 GB" + } + + + withLabel: db_setup { + storeDir = "${params.DB_ROOT}/" + } + + withName: SETUP_CAT_DB { + conda = {params.conda.cat ? params.conda.cat : "envs/cat.yaml"} + container = "olabiyi/bit-astrobiomike:1.0" + } + + withName: SETUP_KOFAMSCAN_DB { + conda = {params.conda.kofamscan ? params.conda.kofamscan : "envs/kofamscan.yaml"} + container = "olabiyi/bit-astrobiomike:1.0" + } + + withName: SETUP_GTDBTK_DB { + conda = {params.conda.gtdbtk ? params.conda.gtdbtk : "envs/gtdb-tk.yaml.yaml"} + container = "quay.io/biocontainers/gtdbtk:1.5.0--pyhdfd78af_0" + } + +// Qaulity control and assesment + withName: FASTQC { + conda = {params.conda.qc ? params.conda.qc : "envs/qc.yaml"} + container = "staphb/fastqc:0.12.1" + cpus = 2 + publishDir = [path: params.raw_reads_dir, mode: params.publishDir_mode] + } + + withName: MULTIQC { + conda = {params.conda.qc ? params.conda.qc: "envs/qc.yaml"} + container = "staphb/multiqc:1.19" + cpus = 2 + publishDir = [path: params.fastqc_out_dir, mode: params.publishDir_mode] + } + + withName: BBDUK { + conda = {params.conda.qc ? params.conda.qc: "envs/qc.yaml"} + container = "staphb/bbtools:38.86" + cpus = 5 + memory = "20 GB" + publishDir = [path: params.filtered_reads_dir, mode: params.publishDir_mode] + } + + +// Read-based processing + + withLabel: read_based { + conda = {params.conda.humann3 ? params.conda.humann3 : "envs/humann3.yaml"} + // this -> "biobakery/humann:3.9" is the latest version + container = "biobakery/humann:3.9" // "olabiyi/humann3.6-metaphlan4.0.1:1.0" //"biobakery/humann:3.6" // version compatible with our prebuilt database + publishDir = [path: params.read_based_dir, mode: params.publishDir_mode] + } + + + withName: HUMANN { + cpus = 8 + memory = "100 GB" + } + + withName: GEN_READ_BASED_PROCESSING_KO_TABLE { + containerOptions = "-B ${utilities}" + } + + +// Assembly-based proessing + + withLabel: assembly { + publishDir = [path: params.assemblies_dir, mode: params.publishDir_mode] + } + + withName: ASSEMBLE { + conda = {params.conda.megahit ? params.conda.megahit : "envs/megahit.yaml"} + container = "biocontainers/megahit:1.2.9_cv1" + cpus = 8 + memory = "20 GB" + } + + withLabel: mapping { + conda = {params.conda.mapping ? params.conda.mapping : "envs/mapping.yaml"} + cpus = 8 + memory = "5 GB" + } + + withName: MAPPING { + container = "biocontainers/bowtie2:v2.4.1_cv1" + } + + withName: SAM_TO_BAM { + container = "staphb/samtools:1.20" + publishDir = [path: params.mapping_dir, mode: params.publishDir_mode] + } + + withName: CALL_GENES { + conda = {params.conda.prodigal ? params.conda.prodigal : "envs/prodigal.yaml"} + container = "quay.io/biocontainers/prodigal:2.6.3--h031d066_8" + cpus = 8 + } + + withLabel: call_genes { + publishDir = [path: params.genes_dir, mode: params.publishDir_mode] + } + + withLabel: contig_annotation { + publishDir = [path: params.annotations_and_tax_dir, mode: params.publishDir_mode] + } + + withName: KO_ANNOTATION { + conda = {params.conda.kofamscan ? 
params.conda.kofamscan : "envs/kofamscan.yaml"} + container = "quay.io/biocontainers/kofamscan:1.3.0--hdfd78af_2" + cpus = 8 + memory = "10 GB" + disk = "20 GB" + publishDir = [path: params.annotations_and_tax_dir, mode: params.publishDir_mode] + } + + withName: TAX_CLASSIFICATION { + conda = {params.conda.cat ? params.conda.cat : "envs/cat.yaml"} + container = "nanozoo/catbat:5.2.3--e9c0a44" + cpus = 8 + memory = "50 GB" + disk = "100 GB" + } + + withName: GET_COV_AND_DET { + conda = {params.conda.mapping ? params.conda.mapping : "envs/mapping.yaml"} + container = "staphb/bbtools:38.86" + cpus = 8 + memory = "20 GB" + publishDir = [path: params.mapping_dir, mode: params.publishDir_mode] + } + + withLabel: combine_outputs { + publishDir = [path: params.combined_output_dir, mode: params.publishDir_mode] + } + + + withName: METABAT_BINNING { + conda = {params.conda.metabat ? params.conda.metabat : "envs/metabat.yaml"} + container = "nanozoo/metabat2:2.15--c1941c7" + cpus = 8 + publishDir = [[path: params.mapping_dir, mode: params.publishDir_mode, pattern: "*-metabat-assembly-depth.tsv"], + [path: params.bins_dir, mode: params.publishDir_mode, pattern: "*-bin*"]] + } + + withLabel: bins { + publishDir = [path: params.bins_dir, mode: params.publishDir_mode] + } + + withName: CHECKM_ON_BINS { + conda = {params.conda.checkm ? params.conda.checkm : "envs/checkm.yaml"} + container = "nanozoo/checkm:1.1.3--c79a047" + cpus = 8 + memory = "50 GB" + disk = "50 GB" + } + + withLabel: mags { + publishDir = [path: params.MAGs_dir, mode: params.publishDir_mode] + } + + withName: GTDBTK_ON_MAGS { + conda = {params.conda.gtdbtk ? params.conda.gtdbtk : "envs/gtdb-tk.yaml.yaml"} + container = "quay.io/biocontainers/gtdbtk:1.5.0--pyhdfd78af_0" + containerOptions = {"-B \${PWD}:/data -B ${gtdbtk_db_dir}:/refdata"} // { ${workflow.containerEngine} == 'singularity' ? "-B \${PWD}:/data -B ${gtdbtk_db_dir}:/refdata" : "-v \${PWD}:/data -v ${gtdbtk_db_dir}:/refdata" } + cpus = 8 + memory = "600 GB" + disk = "700 GB" + } + + withName: SUMMARIZE_MAG_KO_ANNOTS_WITH_KEGG_DECODER { + conda = {params.conda.kegg_decoder ? 
params.conda.kegg_decoder : "envs/keggdecoder.yaml"} + container = "fmalmeida/keggdecoder:latest" + cpus = 8 + } + + withName: GENERATE_ASSEMBLY_PROCESSING_OVERVIEW_TABLE { + publishDir = [path: params.assembly_based_dir, mode: params.publishDir_mode] + } + +} + + +// Adapted from : https://github.com/nf-core/rnaseq/blob/master/nextflow.config +def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') +timeline { + enabled = true + file = "${baseDir}/Resource_Usage/execution_timeline_${trace_timestamp}.html" +} +report { + enabled = true + file = "${baseDir}/Resource_Usage/execution_report_${trace_timestamp}.html" +} +trace { + enabled = true + file = "${baseDir}/Resource_Usage/execution_trace_${trace_timestamp}.txt" +} +dag { + enabled = false // TODO: DISCUSS, setting up nextflow env with graphviz to output the svg diagram + file = "${baseDir}/Resource_Usage/pipeline_dag_${trace_timestamp}.svg" +} + + + +manifest { + author = 'Olabiyi Aderemi Obayomi, Mike Lee' + homePage = 'https://github.com/nasa/GeneLab_Data_Processing/blob/master/Metagenomics/' + description = 'GeneLab bioinformatics processing pipelines for metagenomics sequencing data' + mainScript = 'main.nf' + defaultBranch = 'main' + nextflowVersion = '>=22.10.1' + version = '1.0.0' +} diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/combine-benchmarks.sh b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/combine-benchmarks.sh deleted file mode 100755 index 7c006303..00000000 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/combine-benchmarks.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/usr/bin/env bash -set -e - -ls benchmarks/ > benchmark-filenames.tmp - -head -n 1 benchmarks/$( head -n 1 benchmark-filenames.tmp ) > benchmark-header.tmp - -paste <( printf "process" ) benchmark-header.tmp > building-tab.tmp - -for file in $(cat benchmark-filenames.tmp) -do - - cat <( paste <( echo ${file} | sed 's/-benchmarks.tsv//' ) <( tail -n +2 benchmarks/${file} ) ) >> building-tab.tmp - -done - -mv building-tab.tmp all-benchmarks.tsv -rm -rf benchmark-filenames.tmp benchmark-header.tmp diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/slurm-status.py b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/slurm-status.py deleted file mode 100755 index 2acb7e3e..00000000 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/slurm-status.py +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env python -import subprocess -import sys - -jobid = sys.argv[1] - -# if wanting to use, this should be added to the snakemake call from the root workflow dir: `--cluster-status scripts/slurm-status.py` - -output = str(subprocess.check_output("sacct -j %s --format State --noheader | head -1 | awk '{print $1}'" % jobid, shell=True).strip()) - -running_status=["PENDING", "CONFIGURING", "COMPLETING", "RUNNING", "SUSPENDED"] -if "COMPLETED" in output: - print("success") -elif any(r in output for r in running_status): - print("running") -else: - print("failed") diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/slurm_submit.slurm b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/slurm_submit.slurm new file mode 100644 index 00000000..24c812c5 --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/slurm_submit.slurm @@ -0,0 +1,63 @@ +#!/bin/bash + 
+#SBATCH --job-name="nf_master" ## Replace job_name with the name of the job you are running ## +#SBATCH --output=nf_master.o.%j ## Replace job_name with the name of the job you are running ## +#SBATCH --error=nf_master.e.%j ## Replace job_name with the name of the job you are running ## +#SBATCH --partition=normal ## Specifies the job queue to use, for urgent jobs change normal to priority ## +#SBATCH --mem=2G ## Memory required to run the job, this example requests 2 GB, change this number based on how much RAM you need ## +#SBATCH --cpus-per-task=1 ## Number of CPUs to run the job, this example requests 1 CPU, change this number based on how many CPUs you need ## +#SBATCH --mail-user=olabiyi.a.obayomi@nasa.gov ## Specifies the e-mail address to e-mail when the job is complete, replace this e-mail address with your NASA e-mail address ## +#SBATCH --mail-type=END ## Tells slurm to e-mail the address above when the job has completed ## + +. ~/.profile + + +echo "nf_master" ## Replace job_name with the name of the job you are running ## +echo "" + + +## Add a time-stamp at the start of the job ## +start=$(date +%s) +echo "start time: $start" + +## Print the name of the compute node executing the job ## +echo $HOSTNAME + + +## Activate the conda environment containing the tools you need to run your job ## +## You can see a list of all available environments by running the command: conda env list ## +## If you need a conda environment installed request it using JIRA ## + +source activate /global/smf/miniconda38_admin/envs/genelab-utils ## Replace conda_env_name with the name of the environment ## + + +## Print the version of the tool you are using to ensure the tool version is recorded ## +echo "" +echo "Nextflow version: " ## Replace Tool with the name of the tool you are using ## +nextflow -v ## Replace this command with the command the tool uses to print its version ## +echo "" + + +## The command(s) that you want to run in this slurm job ## +export NXF_SINGULARITY_CACHEDIR=singularity/ +nextflow run main.nf -profile singularity -resume --csv_file SE_file.csv ## Replace command with the command(s) you want to run ## + + +## Add a time-stamp at the end of the job then calculate how long the job took to run in seconds, minutes, and hours ## +echo "" +end=$(date +%s) +echo "end time: $end" +runtime_s=$(echo $(( end - start ))) +echo "total run time(s): $runtime_s" +sec_per_min=60 +sec_per_hr=3600 +runtime_m=$(echo "scale=2; $runtime_s / $sec_per_min;" | bc) +echo "total run time(m): $runtime_m" +runtime_h=$(echo "scale=2; $runtime_s / $sec_per_hr;" | bc) +echo "total run time(h): $runtime_h" +echo "" + + +## Print the slurm job ID so you have it recorded and can view slurm job statistics if needed ## +echo "slurm job ID: ${SLURM_JOB_ID}" + From ca431fc81a11c72547037b7396a0c43f65ef4e0e Mon Sep 17 00:00:00 2001 From: olabiyi Date: Wed, 15 May 2024 17:15:02 -0700 Subject: [PATCH 02/48] Added Documentation --- .../SW_MGIllumina/workflow_code/main.nf | 261 +++++++++++++----- .../workflow_code/nextflow.config | 103 ++++--- 2 files changed, 246 insertions(+), 118 deletions(-) diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/main.nf b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/main.nf index 85600dbd..49ed41b3 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/main.nf +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/main.nf @@ -1,69 +1,197 @@
+#!/usr/bin/env nextflow nextflow.enable.dsl=2 +// color defs +c_back_bright_red = "\u001b[41;1m"; +c_bright_green = "\u001b[32;1m"; +c_blue = "\033[0;34m"; +c_reset = "\033[0m"; + + /************************************************** * HELP MENU ************************************** -************************************************** +**************************************************/ if (params.help) { - println("┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅") - println("┇ RNASeq Consensus Pipeline: $workflow.manifest.version ┇") - println("┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅") - println("Usage example 1: Processing GLDS datasets using genome fasta and gtf from Ensembl") - println(" > nextflow run ./main.nf --gldsAccession GLDS-194 -resume -profile conda --paired true") println() - println("Usage example 2: Processing GLDS datasets using local genome fasta and gtf") - println(" Note: ensemblVersion and ref_source are used here to label subdirectories for derived reference files.") - println(" > nextflow run ./main.nf --gldsAccession GLDS-194 --ensemblVersion 96 --ref_source --ref_fasta --ref_gtf ") + println("Nextflow Metagenomics Illumina Consensus Pipeline: $workflow.manifest.version") + println("USAGE:") + println("Example 1: Submit and run jobs with slurm in singularity containers.") + println(" > nextflow run main.nf -resume -profile slurm_sing --csv_file PE_file.csv") + println() + println("Example 2: : Submit and run jobs with slurm in conda environments.") + println(" > nextflow run main.nf -resume -profile slurm_conda --csv_file SE_file.csv") println() - println("Usage example 3: Processing Other datasets") - println(" Note: This requires a user-created runsheet.") - println(" > nextflow run ./main.nf --runsheetPath ") + println("Example 3: Run jobs locally in conda environments, supply a GLDS accession, and specify the path to an existing conda environment.") + println(" > nextflow run main.nf -resume -profile conda --GLDS_accession OSD-456 --conda.qc ") println() - println("arguments:") - println(" --help show this help message and exit") - println(" --gldsAccession GLDS-000") - println(" the GLDS accession id to process through the RNASeq Concensus Pipeline.") - println(" --runsheetPath Use a local runsheet instead one automatically generated from a GLDS ISA archive.") - println(" --ensemblVersion n Specifies the ensembl Version to use for the reference genome. The default version is ") - println(" --skipVV Skip automated V&V. Default: false") - println(" --paired Are the input reads paired-end. Default: true. set to false if single-end") - println(" --outputDir Directory to save staged raw files and processed files. Default: ") + println("Required arguments:") + println("""-profile [STRING] What profile should be used to run the workflow. Options are [singularity, docker, conda, slurm_sing, slurm_conda]. + singularity, docker and conda will run the pipeline locally using singularity, docker, and conda, respectively. + slurm_sing and slurm_conda will submit and run jobs using slurm in singularity containers and conda environments, respectively. """) + println("--csv_file [PATH] A 3-column (single-end) or 4-column (paired-end) input file (sample_id, forward, [reverse,] paired). 
Mandatory if a GLDS accession is not provided.") + println(" Please see the files: SE_file.csv and PE_file.csv for single-end and paired-end examples, respectively.") + println(" The sample_id column should contain unique sample ids.") + println(" The forward and reverse columns should contain the absolute or relative path to the sample's forward and reverse reads.") + println(" The paired column should be true for paired-end or anything else for single-end reads.") + + println("Optional arguments:") + println(" --help Print this help message and exit") + println(" --workflow [STRING] Which workflow should be run. Options are one of [read-based, assembly-based, both]. Default: both") + println(" --publishDir_mode [STRING] How should nextflow publish file outputs. Options can be found here https://www.nextflow.io/docs/latest/process.html#publishdir Default: link.") + println(" --errorStrategy [STRING] How should nextflow handle errors. Options can be found here https://www.nextflow.io/docs/latest/process.html#errorstrategy. Default: ignore") + println(" --swift_1S [BOOLEAN] Setting for trimming recommended when working with Swift 1S libraries.") + println(" adds `swift=t` setting to bbduk quality trimming/filtering command. For info on this, see example, ") + println(" https://swiftbiosci.com/wp-content/uploads/2019/03/16-0853-Tail-Trim-Final-442019.pdf.") + println(" Set to true if data was generated with Swift 1S library prep. Default: false") + println(" --adapters [PATH] Path to BBtools adapters for reads filtering. Default: config/bbtools_adapters.fa") + println(" --multiqc_config [PATH] Path to a custom multiqc config file. Default: config/multiqc.config") + println(" --use_gtdbtk_scratch_location [BOOLEAN] Should a scratch location be used to store GTDBTK temp files? true or false.") + println(" Scratch directory for gtdb-tk, if wanting to use disk space instead of RAM, can be memory intensive;") + println(" see https://ecogenomics.github.io/GTDBTk/faq.html#gtdb-tk-reaches-the-memory-limit-pplacer-crashes") + println(" leave empty if wanting to use memory, the default, put in quotes the path to a directory that") + println(" already exists if wanting to use disk space. Default: false") + + println("MAG parameters: MAG filtering cutoffs based on checkm quality assessments (in percent); see https://github.com/Ecogenomics/CheckM/wiki/Reported-Statistics") + println(" --min_est_comp [INT] Minimum estimated completion. Default: 90") + println(" --max_est_redund [INT] Minimum estimated redundancy. Default: 10") + println(" --max_est_strain_het [INT] Minimum estimated strain heterogeneity. Default: 50") + println(" --reduced_tree [STRING] reduced_tree option for checkm, limits the RAM usage to 16GB; https://github.com/Ecogenomics/CheckM/wiki/Genome-Quality-Commands#tree.") + println(" 'True' for yes, anything else will be considered 'False' and the default full tree will be used. Default: 'True' ") + println(" --max_mem [INT] Maximum memory allowed passed to megahit assembler. Can be set either by proportion of available on system, e.g. 0.5") + println(" or by absolute value in bytes, e.g. 100e9 would be 100 GB. Default: 100e9") + + println(" --pileup_mem [STRING] pileup.sh paramater for calculating contig coverage and depth. Memory used by bbmap's pileup.sh (within the GET_COV_AND_DET process). 
") + println(" passed as the -Xmx parameter, 20g means 20 gigs of RAM, 20m means 20 megabytes.") + println(" 5g should be sufficient for most assemblies, but if that rule is failing, this may need to be increased.Default: '5g' ") + println(" --block_size [int] Block size variable for CAT/diamond, lower value means less RAM usage; see https://github.com/bbuchfink/diamond/wiki/3.-Command-line-options#memory--performance-options. Default: 4") + + println("File Suffixes:") + println(" --filtered_suffix [STRING] Suffix to use for naming your quality filtered reads. Only applicable when input reads are single-end. Default: _filtered.fastq.gz") + println(" --filtered_R1_suffix [STRING] Suffix to use for naming your quality filtered forward reads. Default: _R1_filtered.fastq.gz") + println(" --filtered_R2_suffix [STRING] Suffix to use for naming your quality filtered reverse reads. Default: _R2_filtered.fastq.gz") + println("Output directories:") + println(" --raw_reads_dir [PATH] Where should the fastqc report of the raw reads be stored. Default: Raw_Sequence_Data/") + println(" --fastqc_out_dir [PATH] Where should multiqc outputs be stored. Default: FastQC_Outputs/") + println(" --filtered_reads_dir [PATH] Where should your filtered reads be stored. Default: Filtered_Sequence_Data/") + println(" --assembly_based_dir [PATH] Where should the results of assembly-based analysis be stored. Default: Assembly-based_Processing/") + println(" --assemblies_dir [PATH] Where should your assemblies be stored. Default: Assembly-based_Processing/assemblies/") + println(" --genes_dir [PATH] Where should the predicted genes from your assemblies be stored. Default: Assembly-based_Processing/predicted-genes/") + println(" --annotations_and_tax_dir [PATH] Contig taxonomy and annotation directory. Default: Assembly-based_Processing/annotations-and-taxonomy/") + println(" --mapping_dir [PATH] Read mapping to assembly directory. Default: Assembly-based_Processing/read-mapping/") + println(" --combined_output_dir [PATH] Assembly summuries and reports across samples directory. Default: Assembly-based_Processing/combined-outputs/") + println(" --bins_dir [PATH] Assembly bins directory. Default: Assembly-based_Processing/bins/") + println(" --MAGs_dir [PATH] Meta assembled genomes (MAGs) directory. Default: Assembly-based_Processing/MAGs/") + println(" --read_based_dir [PATH] Read-based analysis outputs directory. Default: Read-based_Processing/") + + println("Genelab specific arguements:") + println(" --GLDS_accession [STRING] A Genelab accession number if the --csv_file parameter is not set. If this parameter is set, it will ignore the --csv_file parameter.") + println(" --assay_suffix [STRING] Genelabs assay suffix. Default: _GLmetagenomics.") + println(" --additional_filename_prefix [STRING] additional prefix to add to output files that describe more than one sample (to make them unique compared to other datasets)") + println(" include separator at end if adding one, e.g. Swift1S_ if wanted. Default: '' ") + + println("Paths to existing databases and database links.") + println("CAT database directory strings:") + println("The strings below will be added to the end of the --database.cat_db path arguement provided below") + println(" --cat_taxonomy_dir [PATH] CAT taxonomy database directory. Default: 2021-01-07_taxonomy/") + println(" --cat_db_sub_dir [PATH] CAT database sub directory. Default: 2021-01-07_CAT_database/") + println(" --CAT_DB_LINK [URL] CAT database online download link. 
Default: https://tbb.bio.uu.nl/bastiaan/CAT_prepare/CAT_prepare_20210107.tar.gz.") + println(" --database.cat_db [PATH] Path to CAT database. Example, /path/to/Reference_DBs/CAT_prepare_20210107/. Default: null.") + println(" --database.ko_db_dir [PATH] Path to kofam scan database. Example, /path/to/Reference_DBs/kofamscan_db/. Default: null.") + println(" --database.metaphlan_db_dir [PATH] Path to metaphlan database. Example, /path/to/Reference_DBs/metaphlan4-db/. Default: null.") + println(" --database.chocophlan_dir [PATH] Path to Humann's chocophlan nucleotide database. Example, /path/to/Reference_DBs/humann3-db/chocophlan/. Default: null.") + println(" --database.uniref_dir [PATH] Path to Humann's Uniref protein database. Example, /path/to/Reference_DBs/humann3-db/uniref/. Default: null.") + println(" --database.utilities_dir [PATH] Path to Humann's untilities database. Example, /path/to/Reference_DBs/humann3-db/utility_mapping/. Default: null.") + println(" --database.gtdbtk_db_dir [PATH] Path to GTDBTK database. Example, /path/Reference_DBs/GTDB-tk-ref-db/. Default: null.") + + println("Paths to existing conda environments to use otherwise a new one will be created using the yaml file in envs/.") + println(" --conda.qc [PATH] Path to a conda environment containing fastqc, multiqc, zip and python. Default: null.") + println(" --conda.humann3 [PATH] Path to a conda environment with humann3 installed. Default: null.") + println(" --conda.cat [PATH] Path to a conda environment containing CAT (Contig annotation tool). Default: null.") + println(" --conda.prodigal [PATH] Path to a conda environment with prodigal installed. Default: null.") + println(" --conda.metabat [PATH] Path to a conda environment containing metabat. Default: null.") + println(" --conda.gtdbtk [PATH] Path to a conda environment containing gtdbtk. Default: null.") + println(" --conda.kegg_decoder [PATH] Path to a conda environment with kegg_decoder installed. Default: null.") + println(" --conda.megahit [PATH] Path to a conda environment containing megahit. Default: null.") + println(" --conda.bit [PATH] Path to a conda environment with bit installed. Default: null.") + println(" --conda.kofamscan [PATH] Path to a conda environment containing KOFAM SCAN . Default: null.") + println(" --conda.mapping [PATH] Path to a conda environment with bowtie and samtools installed. Default: null.") + println(" --conda.checkm [PATH] Path to a conda environment with checkm installed. Default: null.") + print("Advanced users can edit the nextflow.config file for more control over default settings such container choice, number cpus, memory per task etc.") exit 0 } -println "PARAMS: $params" -println "\n" -println "Storing any newly fetched primary references files here: ${params.referenceStorePath}" -println "Storing any newly generated derived reference files here: ${params.derivedStorePath}" - -/************************************************** -* CHECK REQUIRED PARAMS AND LOAD ***************** -************************************************** -// Get all params sourced data into channels -// Set up channel containing glds accession number -if ( params.gldsAccession ){ - ch_glds_accession = Channel.from( params.gldsAccession ) - } else { - exit 1, "Missing Required Parameter: gldsAccession. 
Example for setting on CLI: --gldsAccession GLDS-194" - } - -// Check conditionally required parameter (if using direct fasta, an ensemblVersion must also be supplied) -if ( params.ref_fasta ) { - if ( !params.ensemblVersion ) { exit 1, "Missing Required Parameter: ensemblVersion. Example for setting on CLI: --ensemblVersion 96" } -} - -if ( !params.outputDir ) { params.outputDir = "$workflow.launchDir" } - -ch_multiqc_config = params.multiqcConfig ? Channel.fromPath( params.multiqcConfig ) : Channel.fromPath("NO_FILE") - - - -*/ - -// color defs -c_back_bright_red = "\u001b[41;1m"; -c_bright_green = "\u001b[32;1m"; -c_blue = "\033[0;34m"; -c_reset = "\033[0m"; +log.info """ + Nextflow Metagenomics Illumina Consensus Pipeline: $workflow.manifest.version + + You have set the following parameters: + Profile: ${workflow.profile} + Input csv file : ${params.csv_file} + GLDS Accession : ${params.GLDS_accession} + Workflow : ${params.workflow} + Nextflow Directory publishing mode: ${params.publishDir_mode} + Swift 1S Libraries: ${params.swift_1S} + Nextflow Error strategy: ${params.errorStrategy} + BBDUK Adapters: ${params.adapters} + Use GTDBTK Scratch Location: ${params.use_gtdbtk_scratch_location} + MultiQC configuration file: ${params.multiqc_config} + Megahit Maximum Memory: ${params.max_mem} + Pile-up Memory: ${params.pileup_mem} + CAT block size: ${params.block_size} + + File Suffixes: + Filtered Reads Suffix (if single-end): ${params.filtered_suffix} + Filtered Forward Reads Suffix: ${params.filtered_R1_suffix} + Filtered Reverse Reads Suffix: ${params.filtered_R2_suffix} + + MAG Parameters: + Minimum completion: ${params.min_est_comp} + Maximum redundancy: ${params.max_est_redund} + Maximum strain heterogeneity: ${params.max_est_strain_het} + Use Reduced Tree: ${params.reduced_tree} + + Output Directories: + Raw reads: ${params.raw_reads_dir} + FastQC: ${params.fastqc_out_dir} + Filtered Reads: ${params.filtered_reads_dir} + Assembly-based Analysis: ${params.assembly_based_dir} + Assemblies: ${params.assemblies_dir} + Predicted Genes: ${params.genes_dir} + Contigs Taxonomy and Annotation: ${params.annotations_and_tax_dir} + Read mapping: ${params.mapping_dir} + Assemblies Summary: ${params.combined_output_dir} + Bins: ${params.bins_dir} + Meta Assembled Genomes (MAGs): ${params.MAGs_dir} + Read-based Analysis: ${params.read_based_dir} + + Genelab Assay Suffix: ${params.assay_suffix} + Additional Filename Prefix: ${params.additional_filename_prefix} + + Conda Environments: + qc: ${params.conda.qc} + humann3: ${params.conda.humann3} + CAT: ${params.conda.cat} + prodigal: ${params.conda.prodigal} + metabat: ${params.conda.metabat} + gtdbtk: ${params.conda.gtdbtk} + kegg decoder: ${params.conda.kegg_decoder} + megahit: ${params.conda.megahit} + bit: ${params.conda.bit} + kofamscan: ${params.conda.kofamscan} + mapping: ${params.conda.mapping} + checkm: ${params.conda.checkm} + + Databases: + CAT Taxonomy: ${params.cat_taxonomy_dir} + CAT DB sub directory: ${params.cat_db_sub_dir} + CAT URL: ${params.database.CAT_DB_LINK} + CAT DB: ${params.database.cat_db} + KOFAM Scan: ${params.database.ko_db_dir} + Metaphlan: ${params.database.metaphlan_db_dir} + Chocophlan: ${params.database.chocophlan_dir} + Uniref: ${params.database.uniref_dir} + Utilities: ${params.database.utilities_dir} + GTDBTK: ${params.database.gtdbtk_db_dir} + """.stripIndent() // Processes to create the required database(s) if not provided include { SETUP_CAT_DB; SETUP_KOFAMSCAN_DB; SETUP_GTDBTK_DB; @@ -91,8 +219,14 @@ workflow 
run_read_based_analysis { main: - if(!params.database.chocophlan_dir ||!params.database.uniref_dir || - !params.database.metaphlan_db_dir || !params.database.utilities_dir) { + chocophlanDirExists = params.database.chocophlan_dir != null + unirefDirExists = params.database.uniref_dir != null + metaphlanDirExists = params.database.metaphlan_db_dir != null + utilitiesDirExists = params.database.utilities_dir != null + + // if any of the four databases + if(!chocophlanDirExists ||!unirefDirExists || + !metaphlanDirExists || !utilitiesDirExists) { make_humann_db() read_based(filtered_ch, @@ -121,20 +255,20 @@ workflow run_assembly_based_analysis { main: kofam_db = params.database.ko_db_dir - if(!params.database.ko_db_dir) { + if(params.database.ko_db_dir == null) { SETUP_KOFAMSCAN_DB() kofam_db = SETUP_KOFAMSCAN_DB.out.ko_db_dir } cat_db = params.database.cat_db - if(!params.database.cat_db){ + if(params.database.cat_db == null){ - SETUP_CAT_DB(params.database.CAT_DB_LINK) + SETUP_CAT_DB(params.database.CAT_DB_LINK) cat_db = SETUP_CAT_DB.out.cat_db } gtdbtk_db_dir = params.database.gtdbtk_db_dir - if(!params.database.gtdbtk_db_dir){ + if(params.database.gtdbtk_db_dir == null){ SETUP_GTDBTK_DB() gtdbtk_db_dir = SETUP_GTDBTK_DB.out.gtdbtk_db_dir } @@ -145,8 +279,6 @@ workflow run_assembly_based_analysis { } - - // A function to delete white spaces from an input string and covert it to lower case def deleteWS(string){ @@ -169,7 +301,6 @@ workflow { .splitCsv(header:true) .set{params_ch} - }else{ Channel.fromPath(params.csv_file, checkIfExists: true) @@ -182,8 +313,6 @@ workflow { row -> deleteWS(row.paired) == 'true' ? tuple( "${row.sample_id}", [file("${row.forward}"), file("${row.reverse}")], deleteWS(row.paired)) : tuple( "${row.sample_id}", [file("${row.forward}")], deleteWS(row.paired)) }.set{reads_ch} - //reads_ch.view() - //return // Qality check and trim the input reads raw_qc(Channel.of("raw"), params.multiqc_config,reads_ch) diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/nextflow.config b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/nextflow.config index e4a23d60..fe64672c 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/nextflow.config +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/nextflow.config @@ -1,7 +1,6 @@ -// global parameter +// Global parameter params { - // input file // a 3-column (single-end) or 4-column (paired-end) file csv_file = "${baseDir}/file.csv" /* Run assembly-based workflow, read-based, or both (values need to be one of: "assembly-based", "read-based", or "both") It runs both by default @@ -43,13 +42,13 @@ params { // Database creation database { CAT_DB_LINK = "https://tbb.bio.uu.nl/bastiaan/CAT_prepare/CAT_prepare_20210107.tar.gz" - cat_db = false //"/global/data/Data_Processing/Metagenomics_Datasets/Reference_DBs/CAT_prepare_20210107/" - ko_db_dir = false //"/global/data/Data_Processing/Metagenomics_Datasets/Reference_DBs/kofamscan_db/" - metaphlan_db_dir = false // "/global/data/Data_Processing/Metagenomics_Datasets/Reference_DBs/metaphlan4-db/" - chocophlan_dir = false // "/global/data/Data_Processing/Metagenomics_Datasets/Reference_DBs/humann3-db/chocophlan/" - uniref_dir = false //"/global/data/Data_Processing/Metagenomics_Datasets/Reference_DBs/humann3-db/uniref/" - utilities_dir = false //"/global/data/Data_Processing/Metagenomics_Datasets/Reference_DBs/humann3-db/utility_mapping/" - gtdbtk_db_dir = false //"/global/data/Data_Processing/Metagenomics_Datasets/Reference_DBs/GTDB-tk-ref-db/" + cat_db = null // "/path/to/Reference_DBs/CAT_prepare_20210107/" + ko_db_dir
= null // "/path/to/Reference_DBs/kofamscan_db/" + metaphlan_db_dir = null // "/path/to/Reference_DBs/metaphlan4-db/" + chocophlan_dir = null // "/path/to/Reference_DBs/humann3-db/chocophlan/" + uniref_dir = null // "/path/to/Reference_DBs/humann3-db/uniref/" + utilities_dir = null // "/path/to/Reference_DBs/humann3-db/utility_mapping/" + gtdbtk_db_dir = null // "/path/Reference_DBs/GTDB-tk-ref-db/" } // Quality assessment parameters @@ -91,53 +90,56 @@ params { conda{ // Specify the paths to your existing conda environments - qc = "/global/smf/miniconda38_admin/envs/a67a313b901bffeb949ee679820981a4_" - humann3 = "/global/smf/miniconda38_admin/envs/8cadfa296db4337f4d5f4ee6bd59f2d3_" - cat = "/global/smf/miniconda38_admin/envs/genelab-utils/envs/CAT" - prodigal = "/global/smf/miniconda38_admin/envs/31039157d8d2320a653a50b2353ce0c9_" - metabat = "/global/smf/miniconda38_admin/envs/1e48cd71cb5739eefcdf849c11557b7e" - gtdbtk = "/global/smf/miniconda38_admin/envs/05cd6e2c70e60a034af2af7d2e5abb8f" - kegg_decoder = "/global/smf/miniconda38_admin/envs/5dc7e7553274029dcf87b4ce64b4391e/" - megahit = "/global/smf/miniconda38_admin/envs/8600b7db4da6dc22ed180c125d86b864" - bit = "/global/smf/miniconda38_admin/envs/0bb3c81a02cf9007313e2d71282d6319" - kofamscan = "/global/smf/miniconda38_admin/envs/56ab8e5f1bff6ce5c2441dcaca538be7" - mapping = "/global/smf/miniconda38_admin/envs/c6fadba5e06e7acb62dac81b8e83183a" - checkm = "/global/smf/miniconda38_admin/envs/3402ea06a7a3579585df8cbd75611bf1" + qc = null // "/path/to/envs/qc" + humann3 = null //"/path/to/envs/humann3" + cat = null // "/path/to/envs/CAT" + prodigal = null // "/path/to/envs/prodigal" + metabat = null // "/path/to/envs/metabat" + gtdbtk = null // "/path/to/envs/gtdbtk" + kegg_decoder = null // "/path/to/envs/kegg_decoder" + megahit = null // "/path/to/envs/megahit" + bit = null // "/path/to/envs/bit" + kofamscan = null // "/path/to/envs/kofamscan" + mapping = null // "/path/to/envs/mapping" + checkm = null // "/path/to/envs/checkm" } GLDS_accession = false - executor = "local" // "slurm" errorStrategy = "ignore" - params.use_conda = false } +// Setting the default container engine as singularity +containerEngine = "singularity" profiles { - slurm { - executor = "slurm" - process.queueSize = 32 // how many jobs should be submitted at one time - singularity.enabled = true - singularity.autoMounts = true - singularity.cacheDir = '/global/data/temp_scratch/oobayomi/metagenomics/version2/singularity/' + slurm_conda { + executor = 'slurm' + conda.enabled = true } - - conda { - conda.enabled = true - params.use_conda = true + conda { + conda.enabled = true } + slurm_sing { + executor = 'slurm' + singularity.enabled = true + singularity.autoMounts = true + singularity.cacheDir = "singularity/" // local singularity images location + containerEngine = "singularity" + } singularity { singularity.enabled = true singularity.autoMounts = true - singularity.cacheDir = '/global/data/temp_scratch/oobayomi/metagenomics/version2/singularity/' - process.ext.singularity_pull_docker_container = true + singularity.cacheDir = "singularity/" // local singularity images location + containerEngine = "singularity" } docker { docker.enabled = true docker.runOptions = '-u $(id -u):$(id -g)' docker.userEmulation = true + containerEngine = "docker" } } @@ -146,9 +148,14 @@ params.DB_ROOT = "${baseDir}/Reference_DBs" params.use_conda = false +chocophlanDirExists = params.database.chocophlan_dir != null +unirefDirExists = params.database.uniref_dir != null 
+metaphlanDirExists = params.database.metaphlan_db_dir != null +utilitiesDirExists = params.database.utilities_dir != null + + // Mount the databases to their predefined locations in the Biobakery container -if(!params.database.chocophlan_dir || !params.database.uniref_dir || - !params.database.metaphlan_db_dir || !params.database.utilities_dir) { +if(!chocophlanDirExists ||!unirefDirExists || !metaphlanDirExists || !utilitiesDirExists) { //biobakery/humann:3.6 - replace /usr/local/lib/python3.6/dist-packages/humann/data/ //chocophlan = "${params.DB_ROOT}/humann3-db/chocophlan/:/opt/conda/envs/humann3/lib/python3.10/site-packages/humann/data/chocophlan_DEMO" @@ -168,9 +175,7 @@ if(!params.database.chocophlan_dir || !params.database.uniref_dir || process { - - executor = "slurm" //"local" - //errorStrategy = "ignore" //{ params.errorStrategy ? params.errorStrategy : "ignore"} + errorStrategy = { params.errorStrategy ? params.errorStrategy : "ignore"} queue = "normal,priority" maxRetries = 2 memory = '5 GB' @@ -189,7 +194,7 @@ process { // Database set-up withLabel: humann_setup { conda = {params.conda.humann3 ? params.conda.humann3 : "envs/humann3.yaml"} - container = "biobakery/humann:3.9" //"olabiyi/humann3.6-metaphlan4.0.1:1.0" // "biobakery/humann:3.6" + container = "biobakery/humann:3.9" } withName: SETUP_METAPHLAN { @@ -216,7 +221,7 @@ process { container = "quay.io/biocontainers/gtdbtk:1.5.0--pyhdfd78af_0" } -// Qaulity control and assesment +// Quality control and assesment withName: FASTQC { conda = {params.conda.qc ? params.conda.qc : "envs/qc.yaml"} container = "staphb/fastqc:0.12.1" @@ -245,7 +250,7 @@ process { withLabel: read_based { conda = {params.conda.humann3 ? params.conda.humann3 : "envs/humann3.yaml"} // this -> "biobakery/humann:3.9" is the latest version - container = "biobakery/humann:3.9" // "olabiyi/humann3.6-metaphlan4.0.1:1.0" //"biobakery/humann:3.6" // version compatible with our prebuilt database + container = "biobakery/humann:3.9" publishDir = [path: params.read_based_dir, mode: params.publishDir_mode] } @@ -256,7 +261,7 @@ process { } withName: GEN_READ_BASED_PROCESSING_KO_TABLE { - containerOptions = "-B ${utilities}" + containerOptions = { ${containerEngine} == 'singularity' ? "-B ${utilities}" : "-v ${utilities}"} } @@ -359,7 +364,7 @@ process { withName: GTDBTK_ON_MAGS { conda = {params.conda.gtdbtk ? params.conda.gtdbtk : "envs/gtdb-tk.yaml.yaml"} container = "quay.io/biocontainers/gtdbtk:1.5.0--pyhdfd78af_0" - containerOptions = {"-B \${PWD}:/data -B ${gtdbtk_db_dir}:/refdata"} // { ${workflow.containerEngine} == 'singularity' ? "-B \${PWD}:/data -B ${gtdbtk_db_dir}:/refdata" : "-v \${PWD}:/data -v ${gtdbtk_db_dir}:/refdata" } + containerOptions = { ${containerEngine} == 'singularity' ? 
"-B \${PWD}:/data -B ${gtdbtk_db_dir}:/refdata" : "-v \${PWD}:/data -v ${gtdbtk_db_dir}:/refdata" } cpus = 8 memory = "600 GB" disk = "700 GB" @@ -392,15 +397,9 @@ trace { enabled = true file = "${baseDir}/Resource_Usage/execution_trace_${trace_timestamp}.txt" } -dag { - enabled = false // TODO: DISCUSS, setting up nextflow env with graphviz to output the svg diagram - file = "${baseDir}/Resource_Usage/pipeline_dag_${trace_timestamp}.svg" -} - - manifest { - author = 'Olabiyi Aderemi Obayomi, Mike Lee' + author = 'Olabiyi Aderemi Obayomi, Mike Douglas Lee' homePage = 'https://github.com/nasa/GeneLab_Data_Processing/blob/master/Metagenomics/' description = 'GeneLab bioinformatics processing pipelines for metagenomics sequencing data' mainScript = 'main.nf' From d4fcd127facef811838203e22dfed1e39cc82138 Mon Sep 17 00:00:00 2001 From: olabiyi Date: Thu, 16 May 2024 11:55:32 -0700 Subject: [PATCH 03/48] fixed params.conda --- .../SW_MGIllumina/workflow_code/nextflow.config | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/nextflow.config b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/nextflow.config index fe64672c..5265f968 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/nextflow.config +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/nextflow.config @@ -110,15 +110,20 @@ params { // Setting the default container engine as singularity containerEngine = "singularity" +// Conda shouldn't be used be default except when using conda-based profiles +// i.e., slurm_conda and conda +params.use_conda = false profiles { slurm_conda { executor = 'slurm' - conda.enabled = true + conda.enabled = true + params.use_conda = true } conda { - conda.enabled = true + conda.enabled = true + params.use_conda = true } slurm_sing { @@ -145,8 +150,6 @@ profiles { } params.DB_ROOT = "${baseDir}/Reference_DBs" -params.use_conda = false - chocophlanDirExists = params.database.chocophlan_dir != null unirefDirExists = params.database.uniref_dir != null From f239d0e5855a6d3ab74ecb1d5ff677a6af6aa5bb Mon Sep 17 00:00:00 2001 From: olabiyi Date: Thu, 16 May 2024 14:20:39 -0700 Subject: [PATCH 04/48] Fixed paths and executor issues --- .../SW_MGIllumina/workflow_code/PE_file.csv | 4 +- .../SW_MGIllumina/workflow_code/SE_file.csv | 4 +- .../SW_MGIllumina/workflow_code/main.nf | 70 +++++++++---------- .../workflow_code/nextflow.config | 16 ++--- .../workflow_code/slurm_submit.slurm | 2 +- 5 files changed, 48 insertions(+), 48 deletions(-) diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/PE_file.csv b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/PE_file.csv index 53f4c6fe..99c223ed 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/PE_file.csv +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/PE_file.csv @@ -1,3 +1,3 @@ sample_id,forward,reverse,paired -Sample-1,/global/data/Data_Processing/Metagenomics_Datasets/GLDS_Datasets/test/example-metagenomic-reads/Sample-1_R1_raw.fastq.gz,/global/data/Data_Processing/Metagenomics_Datasets/GLDS_Datasets/test/example-metagenomic-reads/Sample-1_R2_raw.fastq.gz,true 
-Sample-2,/global/data/Data_Processing/Metagenomics_Datasets/GLDS_Datasets/test/example-metagenomic-reads/Sample-2_R1_raw.fastq.gz,/global/data/Data_Processing/Metagenomics_Datasets/GLDS_Datasets/test/example-metagenomic-reads/Sample-2_R2_raw.fastq.gz,true +Sample-1,/path/to/Sample-1_R1_raw.fastq.gz,/path/to/Sample-1_R2_raw.fastq.gz,true +Sample-2,/path/to/Sample-2_R1_raw.fastq.gz,/path/to/Sample-2_R2_raw.fastq.gz,true \ No newline at end of file diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/SE_file.csv b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/SE_file.csv index fa3e269c..2da86a1f 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/SE_file.csv +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/SE_file.csv @@ -1,3 +1,3 @@ sample_id,forward,paired -Sample-1,/global/data/Data_Processing/Metagenomics_Datasets/GLDS_Datasets/test/example-metagenomic-reads/Sample-1_R1_raw.fastq.gz,false -Sample-2,/global/data/Data_Processing/Metagenomics_Datasets/GLDS_Datasets/test/example-metagenomic-reads/Sample-2_R1_raw.fastq.gz,false +Sample-1,/path/to/Sample-1_R1_raw.fastq.gz,false +Sample-2,/path/to/Sample-2_R1_raw.fastq.gz,false diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/main.nf b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/main.nf index 49ed41b3..4c419449 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/main.nf +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/main.nf @@ -36,64 +36,64 @@ if (params.help) { println("Optional arguments:") println(" --help Print this help message and exit") - println(" --workflow [STRING] Which workflow should be run. Options are one of [read-based, assembly-based, both]. Default: both") + println(" --workflow [STRING] Which workflow should be run. Options are one of [read-based, assembly-based, both]. Default: both.") println(" --publishDir_mode [STRING] How should nextflow publish file outputs. Options can be found here https://www.nextflow.io/docs/latest/process.html#publishdir Default: link.") println(" --errorStrategy [STRING] How should nextflow handle errors. Options can be found here https://www.nextflow.io/docs/latest/process.html#errorstrategy. Default: ignore") println(" --swift_1S [BOOLEAN] Setting for trimming recommended when working with Swift 1S libraries.") println(" adds `swift=t` setting to bbduk quality trimming/filtering command. For info on this, see example, ") println(" https://swiftbiosci.com/wp-content/uploads/2019/03/16-0853-Tail-Trim-Final-442019.pdf.") - println(" Set to true if data was generated with Swift 1S library prep. Default: false") - println(" --adapters [PATH] Path to BBtools adapters for reads filtering. Default: config/bbtools_adapters.fa") - println(" --multiqc_config [PATH] Path to a custom multiqc config file. Default: config/multiqc.config") + println(" Set to true if data was generated with Swift 1S library prep. Default: false.") + println(" --adapters [PATH] Path to BBtools adapters for reads filtering. Default: config/bbtools_adapters.fa.") + println(" --multiqc_config [PATH] Path to a custom multiqc config file. Default: config/multiqc.config.") println(" --use_gtdbtk_scratch_location [BOOLEAN] Should a scratch location be used to store GTDBTK temp files? 
true or false.") println(" Scratch directory for gtdb-tk, if wanting to use disk space instead of RAM, can be memory intensive;") println(" see https://ecogenomics.github.io/GTDBTk/faq.html#gtdb-tk-reaches-the-memory-limit-pplacer-crashes") println(" leave empty if wanting to use memory, the default, put in quotes the path to a directory that") - println(" already exists if wanting to use disk space. Default: false") + println(" already exists if wanting to use disk space. Default: false.") - println("MAG parameters: MAG filtering cutoffs based on checkm quality assessments (in percent); see https://github.com/Ecogenomics/CheckM/wiki/Reported-Statistics") - println(" --min_est_comp [INT] Minimum estimated completion. Default: 90") - println(" --max_est_redund [INT] Minimum estimated redundancy. Default: 10") - println(" --max_est_strain_het [INT] Minimum estimated strain heterogeneity. Default: 50") + println("MAG parameters: MAG filtering cutoffs based on checkm quality assessments (in percent); see https://github.com/Ecogenomics/CheckM/wiki/Reported-Statistics.") + println(" --min_est_comp [INT] Minimum estimated completion. Default: 90.") + println(" --max_est_redund [INT] Minimum estimated redundancy. Default: 10.") + println(" --max_est_strain_het [INT] Minimum estimated strain heterogeneity. Default: 50.") println(" --reduced_tree [STRING] reduced_tree option for checkm, limits the RAM usage to 16GB; https://github.com/Ecogenomics/CheckM/wiki/Genome-Quality-Commands#tree.") - println(" 'True' for yes, anything else will be considered 'False' and the default full tree will be used. Default: 'True' ") + println(" 'True' for yes, anything else will be considered 'False' and the default full tree will be used. Default: 'True'. ") println(" --max_mem [INT] Maximum memory allowed passed to megahit assembler. Can be set either by proportion of available on system, e.g. 0.5") - println(" or by absolute value in bytes, e.g. 100e9 would be 100 GB. Default: 100e9") + println(" or by absolute value in bytes, e.g. 100e9 would be 100 GB. Default: 100e9.") println(" --pileup_mem [STRING] pileup.sh paramater for calculating contig coverage and depth. Memory used by bbmap's pileup.sh (within the GET_COV_AND_DET process). ") println(" passed as the -Xmx parameter, 20g means 20 gigs of RAM, 20m means 20 megabytes.") - println(" 5g should be sufficient for most assemblies, but if that rule is failing, this may need to be increased.Default: '5g' ") - println(" --block_size [int] Block size variable for CAT/diamond, lower value means less RAM usage; see https://github.com/bbuchfink/diamond/wiki/3.-Command-line-options#memory--performance-options. Default: 4") + println(" 5g should be sufficient for most assemblies, but if that rule is failing, this may need to be increased.Default: '5g' .") + println(" --block_size [int] Block size variable for CAT/diamond, lower value means less RAM usage; see https://github.com/bbuchfink/diamond/wiki/3.-Command-line-options#memory--performance-options. Default: 4.") println("File Suffixes:") - println(" --filtered_suffix [STRING] Suffix to use for naming your quality filtered reads. Only applicable when input reads are single-end. Default: _filtered.fastq.gz") - println(" --filtered_R1_suffix [STRING] Suffix to use for naming your quality filtered forward reads. Default: _R1_filtered.fastq.gz") - println(" --filtered_R2_suffix [STRING] Suffix to use for naming your quality filtered reverse reads. 
Default: _R2_filtered.fastq.gz") + println(" --filtered_suffix [STRING] Suffix to use for naming your quality filtered reads. Only applicable when input reads are single-end. Default: _filtered.fastq.gz.") + println(" --filtered_R1_suffix [STRING] Suffix to use for naming your quality filtered forward reads. Default: _R1_filtered.fastq.gz.") + println(" --filtered_R2_suffix [STRING] Suffix to use for naming your quality filtered reverse reads. Default: _R2_filtered.fastq.gz.") println("Output directories:") - println(" --raw_reads_dir [PATH] Where should the fastqc report of the raw reads be stored. Default: Raw_Sequence_Data/") - println(" --fastqc_out_dir [PATH] Where should multiqc outputs be stored. Default: FastQC_Outputs/") - println(" --filtered_reads_dir [PATH] Where should your filtered reads be stored. Default: Filtered_Sequence_Data/") - println(" --assembly_based_dir [PATH] Where should the results of assembly-based analysis be stored. Default: Assembly-based_Processing/") - println(" --assemblies_dir [PATH] Where should your assemblies be stored. Default: Assembly-based_Processing/assemblies/") - println(" --genes_dir [PATH] Where should the predicted genes from your assemblies be stored. Default: Assembly-based_Processing/predicted-genes/") - println(" --annotations_and_tax_dir [PATH] Contig taxonomy and annotation directory. Default: Assembly-based_Processing/annotations-and-taxonomy/") - println(" --mapping_dir [PATH] Read mapping to assembly directory. Default: Assembly-based_Processing/read-mapping/") - println(" --combined_output_dir [PATH] Assembly summuries and reports across samples directory. Default: Assembly-based_Processing/combined-outputs/") - println(" --bins_dir [PATH] Assembly bins directory. Default: Assembly-based_Processing/bins/") - println(" --MAGs_dir [PATH] Meta assembled genomes (MAGs) directory. Default: Assembly-based_Processing/MAGs/") - println(" --read_based_dir [PATH] Read-based analysis outputs directory. Default: Read-based_Processing/") + println(" --raw_reads_dir [PATH] Where should the fastqc report of the raw reads be stored. Default: Raw_Sequence_Data/.") + println(" --fastqc_out_dir [PATH] Where should multiqc outputs be stored. Default: FastQC_Outputs/.") + println(" --filtered_reads_dir [PATH] Where should your filtered reads be stored. Default: Filtered_Sequence_Data/.") + println(" --assembly_based_dir [PATH] Where should the results of assembly-based analysis be stored. Default: Assembly-based_Processing/.") + println(" --assemblies_dir [PATH] Where should your assemblies be stored. Default: Assembly-based_Processing/assemblies/.") + println(" --genes_dir [PATH] Where should the predicted genes from your assemblies be stored. Default: Assembly-based_Processing/predicted-genes/.") + println(" --annotations_and_tax_dir [PATH] Contig taxonomy and annotation directory. Default: Assembly-based_Processing/annotations-and-taxonomy/.") + println(" --mapping_dir [PATH] Read mapping to assembly directory. Default: Assembly-based_Processing/read-mapping/.") + println(" --combined_output_dir [PATH] Assembly summuries and reports across samples directory. Default: Assembly-based_Processing/combined-outputs/.") + println(" --bins_dir [PATH] Assembly bins directory. Default: Assembly-based_Processing/bins/.") + println(" --MAGs_dir [PATH] Meta assembled genomes (MAGs) directory. Default: Assembly-based_Processing/MAGs/.") + println(" --read_based_dir [PATH] Read-based analysis outputs directory. 
Default: Read-based_Processing/.") println("Genelab specific arguements:") println(" --GLDS_accession [STRING] A Genelab accession number if the --csv_file parameter is not set. If this parameter is set, it will ignore the --csv_file parameter.") println(" --assay_suffix [STRING] Genelabs assay suffix. Default: _GLmetagenomics.") - println(" --additional_filename_prefix [STRING] additional prefix to add to output files that describe more than one sample (to make them unique compared to other datasets)") - println(" include separator at end if adding one, e.g. Swift1S_ if wanted. Default: '' ") + println(" --additional_filename_prefix [STRING] additional prefix to add to output files that describe more than one sample (to make them unique compared to other datasets).") + println(" include separator at end if adding one, e.g. Swift1S_ if wanted. Default: '' .") println("Paths to existing databases and database links.") println("CAT database directory strings:") - println("The strings below will be added to the end of the --database.cat_db path arguement provided below") - println(" --cat_taxonomy_dir [PATH] CAT taxonomy database directory. Default: 2021-01-07_taxonomy/") - println(" --cat_db_sub_dir [PATH] CAT database sub directory. Default: 2021-01-07_CAT_database/") + println("The strings below will be added to the end of the --database.cat_db path arguement provided below.") + println(" --cat_taxonomy_dir [PATH] CAT taxonomy database directory. Default: 2021-01-07_taxonomy/.") + println(" --cat_db_sub_dir [PATH] CAT database sub directory. Default: 2021-01-07_CAT_database/.") println(" --CAT_DB_LINK [URL] CAT database online download link. Default: https://tbb.bio.uu.nl/bastiaan/CAT_prepare/CAT_prepare_20210107.tar.gz.") println(" --database.cat_db [PATH] Path to CAT database. Example, /path/to/Reference_DBs/CAT_prepare_20210107/. Default: null.") println(" --database.ko_db_dir [PATH] Path to kofam scan database. Example, /path/to/Reference_DBs/kofamscan_db/. Default: null.") @@ -113,10 +113,10 @@ if (params.help) { println(" --conda.kegg_decoder [PATH] Path to a conda environment with kegg_decoder installed. Default: null.") println(" --conda.megahit [PATH] Path to a conda environment containing megahit. Default: null.") println(" --conda.bit [PATH] Path to a conda environment with bit installed. Default: null.") - println(" --conda.kofamscan [PATH] Path to a conda environment containing KOFAM SCAN . Default: null.") + println(" --conda.kofamscan [PATH] Path to a conda environment containing KOFAM SCAN. Default: null.") println(" --conda.mapping [PATH] Path to a conda environment with bowtie and samtools installed. Default: null.") println(" --conda.checkm [PATH] Path to a conda environment with checkm installed. 
Default: null.") - print("Advanced users can edit the nextflow.config file for more control over default settings such container choice, number cpus, memory per task etc.") + print("Advanced users can edit the nextflow.config file for more control over default settings such container choice, number of cpus, memory per task etc.") exit 0 } diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/nextflow.config b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/nextflow.config index 5265f968..9b9d0436 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/nextflow.config +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/nextflow.config @@ -3,7 +3,7 @@ params { // input file // a 3-column (single-end) or 4-column (paired-end) file - csv_file = "${baseDir}/file.csv" + csv_file = "${baseDir}/PE_file.csv" /* Run assembly-based workflow, read-based, or both (values need to be one of: "assembly-based", "read-based", or "both") It runs both by default @@ -105,7 +105,7 @@ params { } GLDS_accession = false - errorStrategy = "ignore" + errorStrategy = "terminate" } // Setting the default container engine as singularity @@ -117,7 +117,7 @@ params.use_conda = false profiles { slurm_conda { - executor = 'slurm' + process.executor = 'slurm' conda.enabled = true params.use_conda = true } @@ -127,7 +127,7 @@ profiles { } slurm_sing { - executor = 'slurm' + process.executor = 'slurm' singularity.enabled = true singularity.autoMounts = true singularity.cacheDir = "singularity/" // local singularity images location @@ -151,10 +151,10 @@ profiles { params.DB_ROOT = "${baseDir}/Reference_DBs" -chocophlanDirExists = params.database.chocophlan_dir != null -unirefDirExists = params.database.uniref_dir != null -metaphlanDirExists = params.database.metaphlan_db_dir != null -utilitiesDirExists = params.database.utilities_dir != null +chocophlanDirExists = {params.database.chocophlan_dir != null} +unirefDirExists = {params.database.uniref_dir != null} +metaphlanDirExists = {params.database.metaphlan_db_dir != null} +utilitiesDirExists = {params.database.utilities_dir != null} // Mount the databases to their predefined locations in the Biobakery container diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/slurm_submit.slurm b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/slurm_submit.slurm index 24c812c5..a7ad4d3b 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/slurm_submit.slurm +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/slurm_submit.slurm @@ -40,7 +40,7 @@ echo "" ## The command(s) that you want to run in this slurm job ## export NXF_SINGULARITY_CACHEDIR=singularity/ -nextflow run main.nf -profile singularity -resume --csv_file SE_file.csv ## Replace command with the command(s) you want to run ## +nextflow run main.nf -profile slurm_sing -resume --csv_file PE_file.csv ## Replace command with the command(s) you want to run ## ## Add a time-stamp at the end of the job then calculate how long the job took to run in seconds, minutes, and hours ## From 2bb804911ba67ebdfb4be7cccee5cf3735eb9274 Mon Sep 17 00:00:00 2001 From: olabiyi Date: Fri, 17 May 2024 01:28:16 -0700 Subject: [PATCH 05/48] Limited queue size --- .../SW_MGIllumina/workflow_code/main.nf | 15 ++++++++------- .../SW_MGIllumina/workflow_code/nextflow.config | 5 ++++- 2 files changed, 12 insertions(+), 8 
deletions(-) diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/main.nf b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/main.nf index 4c419449..2dd10e63 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/main.nf +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/main.nf @@ -33,7 +33,7 @@ if (params.help) { println(" The sample_id column should contain unique sample ids.") println(" The forward and reverse columns should contain the absolute or relative path to the sample's forward and reverse reads.") println(" The paired column should be true for paired-end or anything else for single-end reads.") - + println() println("Optional arguments:") println(" --help Print this help message and exit") println(" --workflow [STRING] Which workflow should be run. Options are one of [read-based, assembly-based, both]. Default: both.") @@ -50,7 +50,7 @@ if (params.help) { println(" see https://ecogenomics.github.io/GTDBTk/faq.html#gtdb-tk-reaches-the-memory-limit-pplacer-crashes") println(" leave empty if wanting to use memory, the default, put in quotes the path to a directory that") println(" already exists if wanting to use disk space. Default: false.") - + println() println("MAG parameters: MAG filtering cutoffs based on checkm quality assessments (in percent); see https://github.com/Ecogenomics/CheckM/wiki/Reported-Statistics.") println(" --min_est_comp [INT] Minimum estimated completion. Default: 90.") println(" --max_est_redund [INT] Minimum estimated redundancy. Default: 10.") @@ -59,16 +59,17 @@ if (params.help) { println(" 'True' for yes, anything else will be considered 'False' and the default full tree will be used. Default: 'True'. ") println(" --max_mem [INT] Maximum memory allowed passed to megahit assembler. Can be set either by proportion of available on system, e.g. 0.5") println(" or by absolute value in bytes, e.g. 100e9 would be 100 GB. Default: 100e9.") - + println() println(" --pileup_mem [STRING] pileup.sh paramater for calculating contig coverage and depth. Memory used by bbmap's pileup.sh (within the GET_COV_AND_DET process). ") println(" passed as the -Xmx parameter, 20g means 20 gigs of RAM, 20m means 20 megabytes.") println(" 5g should be sufficient for most assemblies, but if that rule is failing, this may need to be increased.Default: '5g' .") println(" --block_size [int] Block size variable for CAT/diamond, lower value means less RAM usage; see https://github.com/bbuchfink/diamond/wiki/3.-Command-line-options#memory--performance-options. Default: 4.") - + println() println("File Suffixes:") println(" --filtered_suffix [STRING] Suffix to use for naming your quality filtered reads. Only applicable when input reads are single-end. Default: _filtered.fastq.gz.") println(" --filtered_R1_suffix [STRING] Suffix to use for naming your quality filtered forward reads. Default: _R1_filtered.fastq.gz.") println(" --filtered_R2_suffix [STRING] Suffix to use for naming your quality filtered reverse reads. Default: _R2_filtered.fastq.gz.") + println() println("Output directories:") println(" --raw_reads_dir [PATH] Where should the fastqc report of the raw reads be stored. Default: Raw_Sequence_Data/.") println(" --fastqc_out_dir [PATH] Where should multiqc outputs be stored. Default: FastQC_Outputs/.") @@ -82,13 +83,13 @@ if (params.help) { println(" --bins_dir [PATH] Assembly bins directory. 
Default: Assembly-based_Processing/bins/.") println(" --MAGs_dir [PATH] Meta assembled genomes (MAGs) directory. Default: Assembly-based_Processing/MAGs/.") println(" --read_based_dir [PATH] Read-based analysis outputs directory. Default: Read-based_Processing/.") - + println() println("Genelab specific arguements:") println(" --GLDS_accession [STRING] A Genelab accession number if the --csv_file parameter is not set. If this parameter is set, it will ignore the --csv_file parameter.") println(" --assay_suffix [STRING] Genelabs assay suffix. Default: _GLmetagenomics.") println(" --additional_filename_prefix [STRING] additional prefix to add to output files that describe more than one sample (to make them unique compared to other datasets).") println(" include separator at end if adding one, e.g. Swift1S_ if wanted. Default: '' .") - + println() println("Paths to existing databases and database links.") println("CAT database directory strings:") println("The strings below will be added to the end of the --database.cat_db path arguement provided below.") @@ -102,7 +103,7 @@ if (params.help) { println(" --database.uniref_dir [PATH] Path to Humann's Uniref protein database. Example, /path/to/Reference_DBs/humann3-db/uniref/. Default: null.") println(" --database.utilities_dir [PATH] Path to Humann's untilities database. Example, /path/to/Reference_DBs/humann3-db/utility_mapping/. Default: null.") println(" --database.gtdbtk_db_dir [PATH] Path to GTDBTK database. Example, /path/Reference_DBs/GTDB-tk-ref-db/. Default: null.") - + println() println("Paths to existing conda environments to use otherwise a new one will be created using the yaml file in envs/.") println(" --conda.qc [PATH] Path to a conda environment containing fastqc, multiqc, zip and python. Default: null.") println(" --conda.humann3 [PATH] Path to a conda environment with humann3 installed. Default: null.") diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/nextflow.config b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/nextflow.config index 9b9d0436..6134bfd4 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/nextflow.config +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/nextflow.config @@ -114,6 +114,7 @@ containerEngine = "singularity" // i.e., slurm_conda and conda params.use_conda = false + profiles { slurm_conda { @@ -150,6 +151,8 @@ profiles { } params.DB_ROOT = "${baseDir}/Reference_DBs" +// Number of jobs to run in parallel +executor.queueSize = 10 chocophlanDirExists = {params.database.chocophlan_dir != null} unirefDirExists = {params.database.uniref_dir != null} @@ -243,7 +246,7 @@ process { conda = {params.conda.qc ? 
params.conda.qc: "envs/qc.yaml"} container = "staphb/bbtools:38.86" cpus = 5 - memory = "20 GB" + memory = "50 GB" publishDir = [path: params.filtered_reads_dir, mode: params.publishDir_mode] } From ea51ab3f704138adcd6a7ba404bda6bb26b9ceb2 Mon Sep 17 00:00:00 2001 From: olabiyi Date: Fri, 17 May 2024 13:21:42 -0700 Subject: [PATCH 06/48] Enabled retrial of failed processes --- .../SW_MGIllumina/workflow_code/main.nf | 2 +- .../SW_MGIllumina/workflow_code/nextflow.config | 14 ++++++++++---- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/main.nf b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/main.nf index 2dd10e63..0f54be20 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/main.nf +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/main.nf @@ -122,7 +122,7 @@ if (params.help) { } log.info """ - Nextflow Metagenomics Illumina Consensus Pipeline: $workflow.manifest.version + Nextflow MGIllumina Consensus Pipeline: $workflow.manifest.version You have set the following parameters: Profile: ${workflow.profile} diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/nextflow.config b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/nextflow.config index 6134bfd4..a8470789 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/nextflow.config +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/nextflow.config @@ -246,7 +246,9 @@ process { conda = {params.conda.qc ? params.conda.qc: "envs/qc.yaml"} container = "staphb/bbtools:38.86" cpus = 5 - memory = "50 GB" + errorStrategy = "retry" + maxRetries = 2 + memory = {50.GB * task.attempt} publishDir = [path: params.filtered_reads_dir, mode: params.publishDir_mode] } @@ -287,7 +289,9 @@ process { withLabel: mapping { conda = {params.conda.mapping ? params.conda.mapping : "envs/mapping.yaml"} cpus = 8 - memory = "5 GB" + errorStrategy = "retry" + maxRetries = 2 + memory = {20.GB * task.attempt} } withName: MAPPING { @@ -317,8 +321,10 @@ process { conda = {params.conda.kofamscan ? 
params.conda.kofamscan : "envs/kofamscan.yaml"} container = "quay.io/biocontainers/kofamscan:1.3.0--hdfd78af_2" cpus = 8 - memory = "10 GB" - disk = "20 GB" + errorStrategy = "retry" + maxRetries = 2 + memory = {20.GB * task.attempt} + disk = {30.GB * task.attempt} publishDir = [path: params.annotations_and_tax_dir, mode: params.publishDir_mode] } From 9f07478e343e491328bafd9ac1b4a9639f3087c5 Mon Sep 17 00:00:00 2001 From: olabiyi Date: Mon, 20 May 2024 13:57:49 -0700 Subject: [PATCH 07/48] Edited container engine variable --- .../SW_MGIllumina/workflow_code/main.nf | 4 ++-- .../SW_MGIllumina/workflow_code/nextflow.config | 14 +++++++------- .../SW_MGIllumina/workflow_code/slurm_submit.slurm | 4 ++-- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/main.nf b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/main.nf index 0f54be20..17268b92 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/main.nf +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/main.nf @@ -7,13 +7,13 @@ c_bright_green = "\u001b[32;1m"; c_blue = "\033[0;34m"; c_reset = "\033[0m"; - +params.help = false /************************************************** * HELP MENU ************************************** **************************************************/ if (params.help) { println() - println("Nextflow Metagenomics Illumina Consensus Pipeline: $workflow.manifest.version") + println("Nextflow MGIllumina Consensus Pipeline: $workflow.manifest.version") println("USAGE:") println("Example 1: Submit and run jobs with slurm in singularity containers.") println(" > nextflow run main.nf -resume -profile slurm_sing --csv_file PE_file.csv") diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/nextflow.config b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/nextflow.config index a8470789..b321090c 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/nextflow.config +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/nextflow.config @@ -109,7 +109,7 @@ params { } // Setting the default container engine as singularity -containerEngine = "singularity" +params.containerEngine = "singularity" // Conda shouldn't be used be default except when using conda-based profiles // i.e., slurm_conda and conda params.use_conda = false @@ -132,20 +132,20 @@ profiles { singularity.enabled = true singularity.autoMounts = true singularity.cacheDir = "singularity/" // local singularity images location - containerEngine = "singularity" + params.containerEngine = "singularity" } singularity { singularity.enabled = true singularity.autoMounts = true singularity.cacheDir = "singularity/" // local singularity images location - containerEngine = "singularity" + params.containerEngine = "singularity" } docker { docker.enabled = true docker.runOptions = '-u $(id -u):$(id -g)' docker.userEmulation = true - containerEngine = "docker" + params.containerEngine = "docker" } } @@ -182,7 +182,7 @@ if(!chocophlanDirExists ||!unirefDirExists || !metaphlanDirExists || !utilitiesD process { errorStrategy = { params.errorStrategy ? 
params.errorStrategy : "ignore"} - queue = "normal,priority" + //queue = "normal,priority" maxRetries = 2 memory = '5 GB' cache = 'lenient' @@ -269,7 +269,7 @@ process { } withName: GEN_READ_BASED_PROCESSING_KO_TABLE { - containerOptions = { ${containerEngine} == 'singularity' ? "-B ${utilities}" : "-v ${utilities}"} + containerOptions = { params.containerEngine == "singularity" ? "-B ${utilities}" : "-v ${utilities}"} } @@ -376,7 +376,7 @@ process { withName: GTDBTK_ON_MAGS { conda = {params.conda.gtdbtk ? params.conda.gtdbtk : "envs/gtdb-tk.yaml.yaml"} container = "quay.io/biocontainers/gtdbtk:1.5.0--pyhdfd78af_0" - containerOptions = { ${containerEngine} == 'singularity' ? "-B \${PWD}:/data -B ${gtdbtk_db_dir}:/refdata" : "-v \${PWD}:/data -v ${gtdbtk_db_dir}:/refdata" } + containerOptions = { params.containerEngine == "singularity" ? "-B \${PWD}:/data -B ${gtdbtk_db_dir}:/refdata" : "-v \${PWD}:/data -v ${gtdbtk_db_dir}:/refdata" } cpus = 8 memory = "600 GB" disk = "700 GB" diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/slurm_submit.slurm b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/slurm_submit.slurm index a7ad4d3b..beb00294 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/slurm_submit.slurm +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/slurm_submit.slurm @@ -6,7 +6,7 @@ #SBATCH --partition=normal ## Specifies the job queue to use, for urgent jobs change normal to priority ## #SBATCH --mem=2G ## Memory required to run the job in MB, this example is showing 10,000 MB or 10GB, change this number based on how much RAM you need ## #SBATCH --cpus-per-task=1 ## Number of CPUs to run the job, this example is showing 5 CPUs, change this number based on how many CPUs you need ## -#SBATCH --mail-user=olabiyi.a.obayomi@nasa.gov ## Specifies the e-mail address to e-mail when the job is complete, replace this e-mail address with your NASA e-mail address ## +#SBATCH --mail-user=email@domain.com ## Specifies the e-mail address to e-mail when the job is complete, replace this e-mail address with your NASA e-mail address ## #SBATCH --mail-type=END ## Tells slurm to e-mail the address above when the job has completed ## . 
~/.profile @@ -28,7 +28,7 @@ echo $HOSTNAME ## You can see a list of all available environments by running the command: conda env list ## ## If you need a conda envrionment installed request it using JIRA ## -source activate /global/smf/miniconda38_admin/envs/genelab-utils ## Replace conda_env_name with the name of the environment ## +source activate /path/to/envs/nextflow ## Replace conda_env_name with the name of the environment ## ## Print the version of the tool you are using to ensure the tool version is recorded ## From deea523bd8ceb5160d75e09968f8dd2592406e2a Mon Sep 17 00:00:00 2001 From: olabiyi Date: Tue, 21 May 2024 12:31:35 -0700 Subject: [PATCH 08/48] Handled pyfastx fasta index error --- .../SW_MGIllumina/workflow_code/modules/summarize_MAG.nf | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/summarize_MAG.nf b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/summarize_MAG.nf index 746ab665..ea87c6dc 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/summarize_MAG.nf +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/summarize_MAG.nf @@ -138,6 +138,8 @@ process SUMMARIZE_MAG_ASSEMBLIES { # Only running if any MAGs were recovered if [ `find -L ${MAGs_dir} -name '*.fasta' | wc -l | sed 's/^ *//'` -gt 0 ]; then + # Remove fasta index if it exists + rm -rf ${MAGs_dir}/*.fxi bit-summarize-assembly ${MAGs_dir}/*.fasta -o MAG-summaries.tmp -t # Slimming down the output From 6e68d0e859956338be8a6f6ae8582fd1abda5442 Mon Sep 17 00:00:00 2001 From: olabiyi Date: Mon, 24 Jun 2024 15:23:20 -0700 Subject: [PATCH 09/48] Now accepts GLDS accessions --- .../SW_MGIllumina/workflow_code/PE_file.csv | 31 +- .../SW_MGIllumina/workflow_code/SE_file.csv | 31 +- .../workflow_code/bin/create_runsheet.py | 513 ------------------ .../workflow_code/bin/create_runsheet.sh | 15 + .../workflow_code/bin/download-GTDBTK-db.sh | 27 + .../workflow_code/bin/download-db.sh | 19 - .../workflow_code/envs/genelab.yaml | 8 + .../SW_MGIllumina/workflow_code/main.nf | 128 +++-- .../workflow_code/modules/assembly.nf | 19 +- .../modules/assembly_annotation.nf | 36 +- .../modules/assembly_based_processing.nf | 95 +++- .../workflow_code/modules/binning.nf | 8 +- .../modules/combine_contig_annotation.nf | 20 +- .../workflow_code/modules/coverage.nf | 7 +- .../workflow_code/modules/create_runsheet.nf | 47 +- .../modules/database_creation.nf | 45 +- .../modules/quality_assessment.nf | 46 +- .../modules/read_based_processing.nf | 80 ++- .../workflow_code/modules/read_mapping.nf | 10 +- .../workflow_code/modules/summarize_MAG.nf | 57 +- .../workflow_code/modules/summarize_bins.nf | 37 +- .../workflow_code/modules/zip_fasta.nf | 63 +++ .../workflow_code/nextflow.config | 175 +++--- .../workflow_code/slurm_submit.slurm | 9 +- 24 files changed, 734 insertions(+), 792 deletions(-) delete mode 100755 Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/create_runsheet.py create mode 100755 Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/create_runsheet.sh create mode 100755 Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/download-GTDBTK-db.sh delete mode 100644 Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/download-db.sh create mode 100644 Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/envs/genelab.yaml 
create mode 100644 Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/zip_fasta.nf diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/PE_file.csv b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/PE_file.csv index 99c223ed..06cdd986 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/PE_file.csv +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/PE_file.csv @@ -1,3 +1,28 @@ -sample_id,forward,reverse,paired -Sample-1,/path/to/Sample-1_R1_raw.fastq.gz,/path/to/Sample-1_R2_raw.fastq.gz,true -Sample-2,/path/to/Sample-2_R1_raw.fastq.gz,/path/to/Sample-2_R2_raw.fastq.gz,true \ No newline at end of file +sample_id,forward,reverse,paired +RR23_FCS_FLT_F1,/path/to/Raw_Sequence_Data/RR23_FCS_FLT_F1_R1_HRremoved_raw.fastq.gz,/path/to/Raw_Sequence_Data/RR23_FCS_FLT_F1_R2_HRremoved_raw.fastq.gz,true +RR23_FCS_FLT_F2,/path/to/Raw_Sequence_Data/RR23_FCS_FLT_F2_R1_HRremoved_raw.fastq.gz,/path/to/Raw_Sequence_Data/RR23_FCS_FLT_F2_R2_HRremoved_raw.fastq.gz,true +RR23_FCS_FLT_F3,/path/to/Raw_Sequence_Data/RR23_FCS_FLT_F3_R1_HRremoved_raw.fastq.gz,/path/to/Raw_Sequence_Data/RR23_FCS_FLT_F3_R2_HRremoved_raw.fastq.gz,true +RR23_FCS_FLT_F4,/path/to/Raw_Sequence_Data/RR23_FCS_FLT_F4_R1_HRremoved_raw.fastq.gz,/path/to/Raw_Sequence_Data/RR23_FCS_FLT_F4_R2_HRremoved_raw.fastq.gz,true +RR23_FCS_FLT_F5,/path/to/Raw_Sequence_Data/RR23_FCS_FLT_F5_R1_HRremoved_raw.fastq.gz,/path/to/Raw_Sequence_Data/RR23_FCS_FLT_F5_R2_HRremoved_raw.fastq.gz,true +RR23_FCS_FLT_F6,/path/to/Raw_Sequence_Data/RR23_FCS_FLT_F6_R1_HRremoved_raw.fastq.gz,/path/to/Raw_Sequence_Data/RR23_FCS_FLT_F6_R2_HRremoved_raw.fastq.gz,true +RR23_FCS_FLT_F7,/path/to/Raw_Sequence_Data/RR23_FCS_FLT_F7_R1_HRremoved_raw.fastq.gz,/path/to/Raw_Sequence_Data/RR23_FCS_FLT_F7_R2_HRremoved_raw.fastq.gz,true +RR23_FCS_FLT_F8,/path/to/Raw_Sequence_Data/RR23_FCS_FLT_F8_R1_HRremoved_raw.fastq.gz,/path/to/Raw_Sequence_Data/RR23_FCS_FLT_F8_R2_HRremoved_raw.fastq.gz,true +RR23_FCS_FLT_F9,/path/to/Raw_Sequence_Data/RR23_FCS_FLT_F9_R1_HRremoved_raw.fastq.gz,/path/to/Raw_Sequence_Data/RR23_FCS_FLT_F9_R2_HRremoved_raw.fastq.gz,true +RR23_FCS_GC_G1,/path/to/Raw_Sequence_Data/RR23_FCS_GC_G1_R1_HRremoved_raw.fastq.gz,/path/to/Raw_Sequence_Data/RR23_FCS_GC_G1_R2_HRremoved_raw.fastq.gz,true +RR23_FCS_GC_G2,/path/to/Raw_Sequence_Data/RR23_FCS_GC_G2_R1_HRremoved_raw.fastq.gz,/path/to/Raw_Sequence_Data/RR23_FCS_GC_G2_R2_HRremoved_raw.fastq.gz,true +RR23_FCS_GC_G3,/path/to/Raw_Sequence_Data/RR23_FCS_GC_G3_R1_HRremoved_raw.fastq.gz,/path/to/Raw_Sequence_Data/RR23_FCS_GC_G3_R2_HRremoved_raw.fastq.gz,true +RR23_FCS_GC_G4,/path/to/Raw_Sequence_Data/RR23_FCS_GC_G4_R1_HRremoved_raw.fastq.gz,/path/to/Raw_Sequence_Data/RR23_FCS_GC_G4_R2_HRremoved_raw.fastq.gz,true +RR23_FCS_GC_G5,/path/to/Raw_Sequence_Data/RR23_FCS_GC_G5_R1_HRremoved_raw.fastq.gz,/path/to/Raw_Sequence_Data/RR23_FCS_GC_G5_R2_HRremoved_raw.fastq.gz,true +RR23_FCS_GC_G6,/path/to/Raw_Sequence_Data/RR23_FCS_GC_G6_R1_HRremoved_raw.fastq.gz,/path/to/Raw_Sequence_Data/RR23_FCS_GC_G6_R2_HRremoved_raw.fastq.gz,true +RR23_FCS_GC_G7,/path/to/Raw_Sequence_Data/RR23_FCS_GC_G7_R1_HRremoved_raw.fastq.gz,/path/to/Raw_Sequence_Data/RR23_FCS_GC_G7_R2_HRremoved_raw.fastq.gz,true +RR23_FCS_GC_G8,/path/to/Raw_Sequence_Data/RR23_FCS_GC_G8_R1_HRremoved_raw.fastq.gz,/path/to/Raw_Sequence_Data/RR23_FCS_GC_G8_R2_HRremoved_raw.fastq.gz,true 
+RR23_FCS_GC_G9,/path/to/Raw_Sequence_Data/RR23_FCS_GC_G9_R1_HRremoved_raw.fastq.gz,/path/to/Raw_Sequence_Data/RR23_FCS_GC_G9_R2_HRremoved_raw.fastq.gz,true +RR23_FCS_VIV_V1,/path/to/Raw_Sequence_Data/RR23_FCS_VIV_V1_R1_HRremoved_raw.fastq.gz,/path/to/Raw_Sequence_Data/RR23_FCS_VIV_V1_R2_HRremoved_raw.fastq.gz,true +RR23_FCS_VIV_V2,/path/to/Raw_Sequence_Data/RR23_FCS_VIV_V2_R1_HRremoved_raw.fastq.gz,/path/to/Raw_Sequence_Data/RR23_FCS_VIV_V2_R2_HRremoved_raw.fastq.gz,true +RR23_FCS_VIV_V3,/path/to/Raw_Sequence_Data/RR23_FCS_VIV_V3_R1_HRremoved_raw.fastq.gz,/path/to/Raw_Sequence_Data/RR23_FCS_VIV_V3_R2_HRremoved_raw.fastq.gz,true +RR23_FCS_VIV_V4,/path/to/Raw_Sequence_Data/RR23_FCS_VIV_V4_R1_HRremoved_raw.fastq.gz,/path/to/Raw_Sequence_Data/RR23_FCS_VIV_V4_R2_HRremoved_raw.fastq.gz,true +RR23_FCS_VIV_V5,/path/to/Raw_Sequence_Data/RR23_FCS_VIV_V5_R1_HRremoved_raw.fastq.gz,/path/to/Raw_Sequence_Data/RR23_FCS_VIV_V5_R2_HRremoved_raw.fastq.gz,true +RR23_FCS_VIV_V6,/path/to/Raw_Sequence_Data/RR23_FCS_VIV_V6_R1_HRremoved_raw.fastq.gz,/path/to/Raw_Sequence_Data/RR23_FCS_VIV_V6_R2_HRremoved_raw.fastq.gz,true +RR23_FCS_VIV_V7,/path/to/Raw_Sequence_Data/RR23_FCS_VIV_V7_R1_HRremoved_raw.fastq.gz,/path/to/Raw_Sequence_Data/RR23_FCS_VIV_V7_R2_HRremoved_raw.fastq.gz,true +RR23_FCS_VIV_V8,/path/to/Raw_Sequence_Data/RR23_FCS_VIV_V8_R1_HRremoved_raw.fastq.gz,/path/to/Raw_Sequence_Data/RR23_FCS_VIV_V8_R2_HRremoved_raw.fastq.gz,true +RR23_FCS_VIV_V9,/path/to/Raw_Sequence_Data/RR23_FCS_VIV_V9_R1_HRremoved_raw.fastq.gz,/path/to/Raw_Sequence_Data/RR23_FCS_VIV_V9_R2_HRremoved_raw.fastq.gz,true diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/SE_file.csv b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/SE_file.csv index 2da86a1f..99b6e25f 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/SE_file.csv +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/SE_file.csv @@ -1,3 +1,28 @@ -sample_id,forward,paired -Sample-1,/path/to/Sample-1_R1_raw.fastq.gz,false -Sample-2,/path/to/Sample-2_R1_raw.fastq.gz,false +sample_id,forward,reverse,paired +RR23_FCS_FLT_F1,/path/to/Raw_Sequence_Data/RR23_FCS_FLT_F1_R1_HRremoved_raw.fastq.gz,false +RR23_FCS_FLT_F2,/path/to/Raw_Sequence_Data/RR23_FCS_FLT_F2_R1_HRremoved_raw.fastq.gz,false +RR23_FCS_FLT_F3,/path/to/Raw_Sequence_Data/RR23_FCS_FLT_F3_R1_HRremoved_raw.fastq.gz,false +RR23_FCS_FLT_F4,/path/to/Raw_Sequence_Data/RR23_FCS_FLT_F4_R1_HRremoved_raw.fastq.gz,false +RR23_FCS_FLT_F5,/path/to/Raw_Sequence_Data/RR23_FCS_FLT_F5_R1_HRremoved_raw.fastq.gz,false +RR23_FCS_FLT_F6,/path/to/Raw_Sequence_Data/RR23_FCS_FLT_F6_R1_HRremoved_raw.fastq.gz,false +RR23_FCS_FLT_F7,/path/to/Raw_Sequence_Data/RR23_FCS_FLT_F7_R1_HRremoved_raw.fastq.gz,false +RR23_FCS_FLT_F8,/path/to/Raw_Sequence_Data/RR23_FCS_FLT_F8_R1_HRremoved_raw.fastq.gz,false +RR23_FCS_FLT_F9,/path/to/Raw_Sequence_Data/RR23_FCS_FLT_F9_R1_HRremoved_raw.fastq.gz,false +RR23_FCS_GC_G1,/path/to/Raw_Sequence_Data/RR23_FCS_GC_G1_R1_HRremoved_raw.fastq.gz,false +RR23_FCS_GC_G2,/path/to/Raw_Sequence_Data/RR23_FCS_GC_G2_R1_HRremoved_raw.fastq.gz,false +RR23_FCS_GC_G3,/path/to/Raw_Sequence_Data/RR23_FCS_GC_G3_R1_HRremoved_raw.fastq.gz,false +RR23_FCS_GC_G4,/path/to/Raw_Sequence_Data/RR23_FCS_GC_G4_R1_HRremoved_raw.fastq.gz,false +RR23_FCS_GC_G5,/path/to/Raw_Sequence_Data/RR23_FCS_GC_G5_R1_HRremoved_raw.fastq.gz,false +RR23_FCS_GC_G6,/path/to/Raw_Sequence_Data/RR23_FCS_GC_G6_R1_HRremoved_raw.fastq.gz,false 
+RR23_FCS_GC_G7,/path/to/Raw_Sequence_Data/RR23_FCS_GC_G7_R1_HRremoved_raw.fastq.gz,false +RR23_FCS_GC_G8,/path/to/Raw_Sequence_Data/RR23_FCS_GC_G8_R1_HRremoved_raw.fastq.gz,false +RR23_FCS_GC_G9,/path/to/Raw_Sequence_Data/RR23_FCS_GC_G9_R1_HRremoved_raw.fastq.gz,false +RR23_FCS_VIV_V1,/path/to/Raw_Sequence_Data/RR23_FCS_VIV_V1_R1_HRremoved_raw.fastq.gz,false +RR23_FCS_VIV_V2,/path/to/Raw_Sequence_Data/RR23_FCS_VIV_V2_R1_HRremoved_raw.fastq.gz,false +RR23_FCS_VIV_V3,/path/to/Raw_Sequence_Data/RR23_FCS_VIV_V3_R1_HRremoved_raw.fastq.gz,false +RR23_FCS_VIV_V4,/path/to/Raw_Sequence_Data/RR23_FCS_VIV_V4_R1_HRremoved_raw.fastq.gz,false +RR23_FCS_VIV_V5,/path/to/Raw_Sequence_Data/RR23_FCS_VIV_V5_R1_HRremoved_raw.fastq.gz,false +RR23_FCS_VIV_V6,/path/to/Raw_Sequence_Data/RR23_FCS_VIV_V6_R1_HRremoved_raw.fastq.gz,false +RR23_FCS_VIV_V7,/path/to/Raw_Sequence_Data/RR23_FCS_VIV_V7_R1_HRremoved_raw.fastq.gz,false +RR23_FCS_VIV_V8,/path/to/Raw_Sequence_Data/RR23_FCS_VIV_V8_R1_HRremoved_raw.fastq.gz,false +RR23_FCS_VIV_V9,/path/to/Raw_Sequence_Data/RR23_FCS_VIV_V9_R1_HRremoved_raw.fastq.gz,false diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/create_runsheet.py b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/create_runsheet.py deleted file mode 100755 index b0b4a3cb..00000000 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/create_runsheet.py +++ /dev/null @@ -1,513 +0,0 @@ -#!/usr/bin/env python - -import argparse -import subprocess -import os -import sys -import tempfile -import re -import shutil -import pandas as pd -import requests - - -#################### -## 1. For OSD ARG # -#################### -# 1. Process the OSD arg to proper format -# 2. Download the ISA file -# 3. Convert to runsheet(s) -# 4. Select which runsheet to use - -######################## -## 1. For runsheet arg # -######################## -# 1. Select which runsheet to use - -########################## -## 2. Neutral flow after # -########################## -# 1. Validate schema of runsheet -# 2. Check if read_paths are URLs, prompt for download - - -# Process OSD arg: if numeric, append OSD-, if OSD-# or GLDS-#, leave it -def process_osd_argument(osd_arg): - # Check if the argument is just numeric - if osd_arg.isdigit(): - return f"OSD-{osd_arg}" - # Check if it's already in the correct format (OSD-numeric or GLDS-numeric) - elif re.match(r'^(OSD|GLDS)-\d+$', osd_arg): - return osd_arg - else: - print("Invalid format for --OSD argument. Use 'numeric', 'OSD-numeric', or 'GLDS-numeric'.") - sys.exit(1) - -# Check provided OSD/GLDS is not on the list of those that can't be autoprocessed -def check_provided_osd_or_glds(osd_arg): - # dictionaries of OSD/GLDS accessions and reason for not running, key = ID: value = reason - # there are 3 because ID can be provided prefixed with "OSD-", "GLDS-", or nothing - not the most efficient here, but ¯\_(ツ)_/¯ - not_autoprocessable_OSD_dict = { - "OSD-65": "This dataset has multiple different primers mixed in different orientations in each individual sample, and the workflow is unable to handle it in an automated fashion.", - "OSD-66": "This dataset is not a standard amplicon dataset. It is comprised of hundreds of different primers targeting different regions of specific organisms, and the workflow is unable to handle it.", - "OSD-82": "This dataset is still multiplexed, and we don't yet have the mapping information to split the samples apart appropriately." 
- } - - not_autoprocessable_GLDS_dict = { - "GLDS-65": "This dataset has multiple different primers mixed in different orientations in each individual sample, and the workflow is unable to handle it in an automated fashion.", - "GLDS-66": "This dataset is not a standard amplicon dataset. It is comprised of hundreds of different primers targeting different regions of specific organisms, and the workflow is unable to handle it.", - "GLDS-82": "This dataset is still multiplexed, and we don't yet have the mapping information to split the samples apart appropriately." - } - - not_autoprocessable_dict = { - "65": "This dataset has multiple different primers mixed in different orientations in each individual sample, and the workflow is unable to handle it in an automated fashion.", - "66": "This dataset is not a standard amplicon dataset. It is comprised of hundreds of different primers targeting different regions of specific organisms, and the workflow is unable to handle it.", - "82": "This dataset is still multiplexed, and we don't yet have the mapping information to split the samples apart appropriately." - } - - # Checking based on OSD IDs - if osd_arg in not_autoprocessable_OSD_dict: - print(f"\nThe specified dataset {osd_arg} is unable to be processed with this workflow.") - print(f" Reason: {not_autoprocessable_OSD_dict[osd_arg]}\n") - sys.exit(1) - - # checking based on GLDS IDs - if osd_arg in not_autoprocessable_GLDS_dict: - print(f"\n The specified dataset {osd_arg} is unable to be processed with this workflow.") - print(f" Reason: {not_autoprocessable_GLDS_dict[osd_arg]}\n") - sys.exit(1) - - # checking based on plain IDs - if osd_arg in not_autoprocessable_dict: - print(f"\n The specified dataset {osd_arg} is unable to be processed with this workflow.") - print(f" Reason: {not_autoprocessable_dict[osd_arg]}\n") - sys.exit(1) - -# Run dpt-get-isa-archive in a temp folder, move it back to cd, return the filename -def download_isa_archive(accession_number): - with tempfile.TemporaryDirectory() as temp_dir: - try: - # Run the command in the temporary directory - subprocess.run( - ["dpt-get-isa-archive", "--accession", str(accession_number)], - check=True, - text=True, - cwd=temp_dir - ) - - # Find the downloaded zip file in the temp directory - downloaded_files = [f for f in os.listdir(temp_dir) if f.endswith('.zip')] - if not downloaded_files: - print("No ISA archive file was downloaded.", file=sys.stderr) - return None - - # Assuming there's only one file, get its name - downloaded_file = downloaded_files[0] - - # Move the file back to the current directory - shutil.move(os.path.join(temp_dir, downloaded_file), downloaded_file) - - full_path = os.path.abspath(downloaded_file) - return full_path - - except subprocess.CalledProcessError as e: - print("An error occurred while downloading ISA archive.", file=sys.stderr) - sys.exit(1) - -# Run dpt-isa-to-runsheet in a temp folder, move runsheet(s) back to cd, return list of runsheet(s) -def convert_isa_to_runsheet(accession_number, isa_zip): - with tempfile.TemporaryDirectory() as temp_dir: - # Copy the ISA archive to the temporary directory - temp_isa_zip_path = shutil.copy(isa_zip, temp_dir) - - try: - # Run the dpt-isa-to-runsheet command in the temporary directory - subprocess.run( - ["dpt-isa-to-runsheet", "--accession", accession_number, "--config-type", "amplicon", "--config-version", "Latest", "--isa-archive", os.path.basename(temp_isa_zip_path)], - check=True, - cwd=temp_dir, - stdout=sys.stdout, - stderr=sys.stderr - ) - - # Get 
the list of created files in the temp directory - created_files = [f for f in os.listdir(temp_dir) if os.path.isfile(os.path.join(temp_dir, f)) and f != os.path.basename(temp_isa_zip_path)] - - # Move the created files back to the current directory - moved_files = [] - for file in created_files: - shutil.move(os.path.join(temp_dir, file), file) - moved_files.append(file) - - return moved_files - - except subprocess.CalledProcessError as e: - print("An error occurred while converting ISA archive to runsheet.", file=sys.stderr) - sys.exit(1) - - -def handle_runsheet_selection(runsheet_files, target=None, specified_runsheet=None): - selected_runsheet = None - - # Use the specified runsheet if provided - if specified_runsheet and specified_runsheet in runsheet_files: - selected_runsheet = specified_runsheet - print(f"Using specified runsheet: {selected_runsheet}") - return selected_runsheet - - if len(runsheet_files) == 1: - if target: - runsheet = runsheet_files[0] - try: - runsheet_df = pd.read_csv(runsheet) - target_region = runsheet_df['Parameter Value[Library Selection]'].unique()[0] - if target.lower() == target_region.lower(): - selected_runsheet = runsheet - except Exception as e: - print(f"Error reading {runsheet}: {e}") - print(f"Using runsheet: {selected_runsheet}") - - elif len(runsheet_files) > 1: - if target: - matching_runsheets = [] - for runsheet in runsheet_files: - try: - runsheet_df = pd.read_csv(runsheet) - target_region = runsheet_df['Parameter Value[Library Selection]'].unique()[0] - if target.lower() == target_region.lower(): - matching_runsheets.append(runsheet) - except Exception as e: - print(f"Error reading {runsheet}: {e}") - - if len(matching_runsheets) == 1: - # One matching runsheet found - selected_runsheet = matching_runsheets[0] - print(f"Using runsheet: {selected_runsheet}") - - elif len(matching_runsheets) > 1: - # Multiple matching runsheets found - print("The study contains multiple assays with the same target. Please specify one of the following runsheet names as a parameter for the --specify-runsheet argument:") - for rs in matching_runsheets: - print(rs) - return None - - else: - # No matching runsheets found - print("No runsheet matches the specified genomic target. Please check the target or specify a runsheet using --specify-runsheet.") - return None - - else: - # No target specified and multiple runsheets are available - print("Multiple runsheets found but no genomic target specified. Cannot proceed. 
Use -t {16S, 18S, ITS} or --target {16S, 18S, ITS} to specify which assay/dataset to use.") - return None - - # Remove unselected runsheet files if a runsheet was selected - if selected_runsheet: - unselected_runsheets = [file for file in runsheet_files if file != selected_runsheet] - for file in unselected_runsheets: - try: - os.remove(file) - except Exception as e: - pass - - return selected_runsheet - -def check_runsheet_read_paths(runsheet_df): - # Check if a string is a URL / genelab URL - def is_url(s): - return "http://" in s or "https://" in s or "genelab-data.ndc.nasa.gov" in s - - - # Check if 'read2_path' column exists - paired_end = runsheet_df['paired_end'].eq(True).all() - - # Check the first row to determine if the paths are URLs or local paths - first_row = runsheet_df.iloc[0] - - uses_url = is_url(first_row['read1_path']) - if uses_url: - print("Runsheet references URLs.") - else: - print("Runsheet references local read files.") - - return uses_url - -def sample_IDs_from_local(runsheet_df, output_file='unique-sample-IDs.txt'): - # Check if the DataFrame is paired-end - paired_end = runsheet_df['paired_end'].eq(True).all() - - with open(output_file, 'w') as file: - for index, row in runsheet_df.iterrows(): - # Extract base names minus the suffixes - base_read1 = os.path.basename(row['read1_path']).replace(row['raw_R1_suffix'], '') - - if paired_end: - base_read2 = os.path.basename(row['read2_path']).replace(row['raw_R2_suffix'], '') - # Check if base names match for paired-end data, necessary for snakemake arg expansion - if base_read1 != base_read2: - print(f"Mismatch in sample IDs in row {index}: {base_read1} vs {base_read2}") - sys.exit(1) - - # Write the base name to the file - file.write(f"{base_read1}\n") - - print(f"Unique sample IDs written to {output_file}") - -def handle_url_downloads(runsheet_df, output_file='unique-sample-IDs.txt'): - print("Downloading read files...") - # Check if the DataFrame is paired-end - paired_end = runsheet_df['paired_end'].eq(True).all() - # Write 'Sample Name' into unique-sample-IDs.txt - with open(output_file, 'w') as file: - for sample_name in runsheet_df['Sample Name']: - file.write(sample_name + '\n') - - # Create ./raw_reads/ directory if it does not exist - raw_reads_dir = os.path.abspath('./raw_reads/') - if not os.path.exists(raw_reads_dir): - os.makedirs(raw_reads_dir) - - # Initialize count for skipped downloads - skipped_downloads_count = 0 - # Iterate over each row and download files if they don't exist - for _, row in runsheet_df.iterrows(): - sample_id = row['Sample Name'] - read1_path = os.path.join(raw_reads_dir, sample_id + row['raw_R1_suffix']) - read2_path = os.path.join(raw_reads_dir, sample_id + row['raw_R2_suffix']) if paired_end else None - - # Download Read 1 if it doesn't exist - if not os.path.exists(read1_path): - download_url_to_file(row['read1_path'], read1_path) - else: - skipped_downloads_count += 1 - - # Download Read 2 if it doesn't exist and if paired_end - if paired_end and read2_path and not os.path.exists(read2_path): - download_url_to_file(row['read2_path'], read2_path) - elif paired_end and read2_path: - skipped_downloads_count += 1 - - # Print the number of skipped downloads - if skipped_downloads_count > 0: - print(f"{skipped_downloads_count} read file(s) were already present and were not downloaded.") - -def download_url_to_file(url, file_path, max_retries=3, timeout_seconds=120): - retries = 0 - success = False - - while retries < max_retries and not success: - try: - response = 
requests.get(url, stream=True, timeout=timeout_seconds) - response.raise_for_status() # Raises an HTTPError for bad status codes - - with open(file_path, 'wb') as file: - shutil.copyfileobj(response.raw, file) - success = True - - except (requests.exceptions.HTTPError, requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e: - retries += 1 - print(f"Attempt {retries}: Error occurred: {e}") - - except requests.exceptions.RequestException as e: - print(f"An unexpected error occurred: {e}") - break - - if not success: - print("Failed to download the read files.") - - -def write_params(runsheet_df, uses_urls): - - # Extract necessary variables from runsheet_df - data_type = "PE" if runsheet_df['paired_end'].eq(True).all() else "SE" - raw_R1_suffix = runsheet_df['raw_R1_suffix'].unique()[0] - raw_R2_suffix = runsheet_df['raw_R2_suffix'].unique()[0] if data_type == "PE" else "" - f_primer = runsheet_df['F_Primer'].unique()[0] - r_primer = runsheet_df['R_Primer'].unique()[0] if data_type == "PE" else "" - target_region = runsheet_df['Parameter Value[Library Selection]'].unique()[0] - - # Determine raw_reads_directory - if uses_urls: - raw_reads_directory = os.path.abspath('./raw_reads/') + '/' - else: - read1_path_dir = os.path.dirname(runsheet_df['read1_path'].iloc[0]) - raw_reads_directory = os.path.abspath(read1_path_dir) + '/' if read1_path_dir else "./" - - with open("GLparams_file.csv", "w") as f: - f.write("raw_reads_directory,raw_R1_suffix,raw_R2_suffix,f_primer,r_primer,target_region,data_type\n") - if data_type == "PE": - f.write(f"{raw_reads_directory},{raw_R1_suffix},{raw_R2_suffix},{f_primer},{r_primer},{target_region},{data_type}\n") - else: - f.write(f"{raw_reads_directory},{raw_R1_suffix},{f_primer},{r_primer},{target_region},{data_type}\n") - - - -def write_input_file(runsheet_df): - """ Write input file for the workflow...""" - - print("writing out GLfile.csv...") - # Check if the DataFrame is paired-end - paired_end = runsheet_df['paired_end'].eq(True).all() - - # Create ./raw_reads/ directory if it does not exist - raw_reads_dir = os.path.abspath('./raw_reads/') - if not os.path.exists(raw_reads_dir): - os.makedirs(raw_reads_dir) - - # Create input file - with open("GLfile.csv", 'w') as file: - - if paired_end: - file.write(f"sample_id,forward,reverse,paired\n") - # Iterate over each row and download files if they don't exist - for _, row in runsheet_df.iterrows(): - sample_id = row['Sample Name'] - read1_path = os.path.join(raw_reads_dir, sample_id + row['raw_R1_suffix']) - read2_path = os.path.join(raw_reads_dir, sample_id + row['raw_R2_suffix']) - file.write(f"{sample_id},{read1_path},{read2_path},true\n") - else: - file.write(f"sample_id,forward,paired\n") - for _, row in runsheet_df.iterrows(): - sample_id = row['Sample Name'] - read1_path = os.path.join(raw_reads_dir, sample_id + row['raw_R1_suffix']) - file.write(f"{sample_id},{read1_path},false\n") - - -# Check for single primer set, also check for invalid characters in primers used, exit if either -def validate_primer_sequences(runsheet_df): - errors = [] - - # Check that there is only 1 entry in each primer column - if len(runsheet_df['F_Primer'].unique()) > 1: - errors.append(f"Multiple primer sequences present in F_Primer: {runsheet_df['F_Primer'].unique()}.") - - if len(runsheet_df['R_Primer'].unique()) > 1: - errors.append(f"Multiple primer sequences present in R_primer: {runsheet_df['R_Primer'].unique()}.") - - - # Check for non-letter characters in primer sequences - def 
has_non_letter_characters(primer): - # Pattern to find any character that is not a letter - non_letter_pattern = re.compile(r'[^A-Za-z]') - return non_letter_pattern.search(primer) - - # Check each unique primer in the F_Primer and R_Primer columns - for f_primer in runsheet_df['F_Primer'].unique(): - if has_non_letter_characters(f_primer): - errors.append(f"Non-letter characters detected in F_Primer: '{f_primer}'") - - for r_primer in runsheet_df['R_Primer'].unique(): - if has_non_letter_characters(r_primer): - errors.append(f"Non-letter characters detected in R_Primer: '{r_primer}'") - - if errors: - print("Error: Invalid primer sequence(s) detected in the runsheet.") - for error in errors: - print(f" - {error}") - print("Correct the primer sequences in the runsheet and rerun the workflow from the runsheet using the --runsheetPath argument.") - sys.exit(1) - - -def main(): - # Argument parser setup with short argument names and an automatic help option - parser = argparse.ArgumentParser( - description='Create Runsheet from Genelab ID.', - add_help=True, - usage='%(prog)s [options]' # Custom usage message - ) - - parser.add_argument('-o', '--OSD', - metavar='osd_number', - help='A GeneLab OSD dataset accession number to pull its read files and associated metadata. Acceptable formats: ###, OSD-###, GLDS-###', - type=str) - - parser.add_argument('-t', '--target', - choices=['16S', '18S', 'ITS'], - help='Specify the amplicon target for the assay. Options: 16S, 18S, ITS. This is used to select the appropriate dataset from an OSD study when multiple options are available.', - type=str) - - parser.add_argument('-r', '--runsheetPath', - metavar='/path/to/runsheet.csv', - help='Set up the Snakemake workflow using a specified runsheet file.', - type=str) - - - parser.add_argument('--specify-runsheet', - help='Specifies the runsheet for an OSD dataset by name. Only used if there are multiple datasets with the same target in the study.', - metavar='runsheet_name', - type=str) - - - # Check if no arguments were provided - if len(sys.argv) == 1: - parser.print_help() - sys.exit(1) - - try: - args = parser.parse_args() - except SystemExit: - parser.print_help() - sys.exit(1) - - target = args.target - isa_zip = "" - - # If OSD is used, pull ISA metadata for the study, create and select the runsheet - if args.OSD: - accession_number = process_osd_argument(args.OSD) - - # checking OSD/GLDS ID is not on the list of those the workflow definitely can't handle - check_provided_osd_or_glds(args.OSD) - - isa_zip = download_isa_archive(accession_number) - if isa_zip: - runsheet_files = convert_isa_to_runsheet(accession_number, isa_zip) - if runsheet_files: - runsheet_file = handle_runsheet_selection(runsheet_files, target, args.specify_runsheet) - if runsheet_file is None: - sys.exit() - else: - print("No runsheet files were created.") - else: - print("No ISA archive was downloaded. 
Cannot proceed to runsheet conversion.", file=sys.stderr) - sys.exit(1) - - # If a runsheet is specified, use that runsheet - elif args.runsheetPath: - runsheet_file = args.runsheetPath - - # Load the runsheet if a file is specified - # Create unique-sample-IDs.txt based on filenames or 'Sample Name' if URLs - # Download files if necessary - if args.OSD or args.runsheetPath: - if runsheet_file: - #runsheet_df = validate_runsheet_schema(runsheet_file) - runsheet_df = pd.read_csv(runsheet_file) - if runsheet_df is not None: - uses_urls = check_runsheet_read_paths(runsheet_df) - - # Check for primer file / invalid primers - validate_primer_sequences(runsheet_df) - - # Create the 'unique-sample-IDs.txt' file and download read files if necessary - if uses_urls: - handle_url_downloads(runsheet_df, output_file='unique-sample-IDs.txt') - else: - sample_IDs_from_local(runsheet_df, output_file='unique-sample-IDs.txt') - - # Create the config.yaml file - write_params(runsheet_df=runsheet_df, uses_urls=uses_urls) - # Create input file required by the workflow - write_input_file(runsheet_df=runsheet_df) - else: - print("Failed to validate the runsheet file.", file=sys.stderr) - sys.exit(1) - else: - print("No runsheet file specified.", file=sys.stderr) - sys.exit(1) - - - - -if __name__ == "__main__": - main() diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/create_runsheet.sh b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/create_runsheet.sh new file mode 100755 index 00000000..dc66a1e3 --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/create_runsheet.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash + +# A script to the the input csv file rtequired by this pipeline when a GLDS accession is provided +# Rather than the required input csv file + +ASSAY_TABLE=$1 # a_GLDS-466_metagenome-sequencing_whole-genome-shotgun-sequencing_illumina.txt +awk -v PWD=$PWD -F "\t" ' + BEGIN{print "sample_id,forward,reverse,paired"} \ + NR==1{for (i=1; i<=NF; i++) {ix[$i] = i}} \ + NR>1{gsub(" ", "", $ix["Raw Data File"]); \ + split($ix["Raw Data File"], reads_path, ","); \ + gsub("PAIRED","true",$ix["Parameter Value[Library Layout]"]); \ + gsub("SINGLE","false",$ix["Parameter Value[Library Layout]"]); \ + printf "%s,%s/Raw_Sequence_Data/%s,%s/Raw_Sequence_Data/%s,%s\n",$ix["Sample Name"], PWD, reads_path[1], PWD,reads_path[2],$ix["Parameter Value[Library Layout]"]} + ' ${ASSAY_TABLE} diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/download-GTDBTK-db.sh b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/download-GTDBTK-db.sh new file mode 100755 index 00000000..943f1fb4 --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/download-GTDBTK-db.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash + +set -e + +echo "Downloading the GTDB-Tk database to ${GTDBTK_DATA_PATH}..." + +# GTDBTK_DB_PATH is defined in build.sh, store the db there + + +DB_URL=$1 +TAR_FILE=$(basename ${db_url}) + + +downloadFile=true + +while ${downloadFile};do + + wget --timeout=3600 --tries=0 --continue ${DB_URL} -P ${GTDBTK_DATA_PATH} && downloadFile=false + +done + +tar xvzf ${GTDBTK_DATA_PATH}/${TAR_FILE} -C ${GTDBTK_DATA_PATH} --strip 1 +rm ${GTDBTK_DATA_PATH}/${TAR_FILE} + +echo "GTDB-Tk database has been successfully downloaded." 
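# A usage sketch for this helper (illustrative only: the database path below is a
# placeholder, the URL is the params.database.GTDBTK_LINK default described in the
# main.nf help text, and GTDBTK_DATA_PATH is assumed to already be exported in the
# calling environment before the script runs):
#
#   export GTDBTK_DATA_PATH=/path/to/Reference_DBs/GTDB-tk-ref-db
#   bash bin/download-GTDBTK-db.sh \
#       https://data.gtdb.ecogenomic.org/releases/release220/220.0/auxillary_files/gtdbtk_package/full_package/gtdbtk_r220_data.tar.gz
#
# The while loop above simply re-invokes wget (with --continue, so a partial
# download is resumed rather than restarted) until it exits successfully, at which
# point downloadFile is set to false and the loop stops; the tar and rm lines then
# unpack the archive into GTDBTK_DATA_PATH and delete the downloaded tarball.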
+ +exit 0 diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/download-db.sh b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/download-db.sh deleted file mode 100644 index 53a711e4..00000000 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/download-db.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/usr/bin/env bash - -set -e - -echo "Downloading the GTDB-Tk database to ${GTDBTK_DATA_PATH}..." - -# GTDBTK_DB_PATH is defined in build.sh, store the db there - - -db_url=https://data.gtdb.ecogenomic.org/releases/release202/202.0/auxillary_files/gtdbtk_r202_data.tar.gz - - -wget $db_url -P ${GTDBTK_DATA_PATH} -tar xvzf ${GTDBTK_DATA_PATH}/gtdbtk_r202_data.tar.gz -C ${GTDBTK_DATA_PATH} --strip 1 -rm ${GTDBTK_DATA_PATH}/gtdbtk_r202_data.tar.gz - -echo "GTDB-Tk database has been successfully downloaded." - -exit 0 diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/envs/genelab.yaml b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/envs/genelab.yaml new file mode 100644 index 00000000..9f2ec80b --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/envs/genelab.yaml @@ -0,0 +1,8 @@ +name: genelab-utils +channels: + - conda-forge + - bioconda + - defaults + - astrobiomike +dependencies: + - genelab-utils==1.3.22=py312_1 \ No newline at end of file diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/main.nf b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/main.nf index 17268b92..8ea49b02 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/main.nf +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/main.nf @@ -8,26 +8,27 @@ c_blue = "\033[0;34m"; c_reset = "\033[0m"; params.help = false +params.debug = false /************************************************** * HELP MENU ************************************** **************************************************/ if (params.help) { println() - println("Nextflow MGIllumina Consensus Pipeline: $workflow.manifest.version") + println("Nextflow Metagenomics Illumina Consensus Pipeline: $workflow.manifest.version") println("USAGE:") println("Example 1: Submit and run jobs with slurm in singularity containers.") - println(" > nextflow run main.nf -resume -profile slurm_sing --csv_file PE_file.csv") + println(" > nextflow run main.nf -resume -profile slurm,singularity --csv_file PE_file.csv") println() println("Example 2: : Submit and run jobs with slurm in conda environments.") - println(" > nextflow run main.nf -resume -profile slurm_conda --csv_file SE_file.csv") + println(" > nextflow run main.nf -resume -profile slurm,conda --csv_file SE_file.csv") println() println("Example 3: Run jobs locally in conda environments, supply a GLDS accession, and specify the path to an existing conda environment.") - println(" > nextflow run main.nf -resume -profile conda --GLDS_accession OSD-456 --conda.qc ") + println(" > nextflow run main.nf -resume -profile conda --GLDS_accession OSD-574 --conda.qc ") println() println("Required arguments:") - println("""-profile [STRING] What profile should be used to run the workflow. Options are [singularity, docker, conda, slurm_sing, slurm_conda]. + println("""-profile [STRING] What profile should be used to run the workflow. Options are [slurm, singularity, docker, and conda]. 
singularity, docker and conda will run the pipeline locally using singularity, docker, and conda, respectively. - slurm_sing and slurm_conda will submit and run jobs using slurm in singularity containers and conda environments, respectively. """) + To combine profiles, separate them comma. For example for to combine slurm and singularity profiels, pass 'slurm,singularity' as arguement. """) println("--csv_file [PATH] A 3-column (single-end) or 4-column (paired-end) input file (sample_id, forward, [reverse,] paired). Mandatory if a GLDS accession is not provided.") println(" Please see the files: SE_file.csv and PE_file.csv for single-end and paired-end examples, respectively.") println(" The sample_id column should contain unique sample ids.") @@ -59,7 +60,7 @@ if (params.help) { println(" 'True' for yes, anything else will be considered 'False' and the default full tree will be used. Default: 'True'. ") println(" --max_mem [INT] Maximum memory allowed passed to megahit assembler. Can be set either by proportion of available on system, e.g. 0.5") println(" or by absolute value in bytes, e.g. 100e9 would be 100 GB. Default: 100e9.") - println() + println(" --pileup_mem [STRING] pileup.sh paramater for calculating contig coverage and depth. Memory used by bbmap's pileup.sh (within the GET_COV_AND_DET process). ") println(" passed as the -Xmx parameter, 20g means 20 gigs of RAM, 20m means 20 megabytes.") println(" 5g should be sufficient for most assemblies, but if that rule is failing, this may need to be increased.Default: '5g' .") @@ -86,7 +87,11 @@ if (params.help) { println() println("Genelab specific arguements:") println(" --GLDS_accession [STRING] A Genelab accession number if the --csv_file parameter is not set. If this parameter is set, it will ignore the --csv_file parameter.") - println(" --assay_suffix [STRING] Genelabs assay suffix. Default: _GLmetagenomics.") + println(" --RawFilePattern [STRING] If we do not want to download all files (which we often won't), we can specify a pattern here to subset the total files.") + println(" For example, if we know we want to download just the fastq.gz files, we can say 'fastq.gz'. We can also provide multiple patterns") + println(" as a comma-separated list. For example, If we want to download the fastq.gz files that also have 'NxtaFlex', 'metagenomics', and 'raw' in") + println(" their filenames, we can provide '-p fastq.gz,NxtaFlex,metagenomics,raw'. Default: null.") + println(" --assay_suffix [STRING] Genelab's assay suffix. Default: _GLmetagenomics.") println(" --additional_filename_prefix [STRING] additional prefix to add to output files that describe more than one sample (to make them unique compared to other datasets).") println(" include separator at end if adding one, e.g. Swift1S_ if wanted. Default: '' .") println() @@ -95,13 +100,14 @@ if (params.help) { println("The strings below will be added to the end of the --database.cat_db path arguement provided below.") println(" --cat_taxonomy_dir [PATH] CAT taxonomy database directory. Default: 2021-01-07_taxonomy/.") println(" --cat_db_sub_dir [PATH] CAT database sub directory. Default: 2021-01-07_CAT_database/.") - println(" --CAT_DB_LINK [URL] CAT database online download link. Default: https://tbb.bio.uu.nl/bastiaan/CAT_prepare/CAT_prepare_20210107.tar.gz.") + println(" --database.CAT_DB_LINK [URL] CAT database online download link. Default: https://tbb.bio.uu.nl/bastiaan/CAT_prepare/CAT_prepare_20210107.tar.gz.") println(" --database.cat_db [PATH] Path to CAT database. 
Example, /path/to/Reference_DBs/CAT_prepare_20210107/. Default: null.") println(" --database.ko_db_dir [PATH] Path to kofam scan database. Example, /path/to/Reference_DBs/kofamscan_db/. Default: null.") println(" --database.metaphlan_db_dir [PATH] Path to metaphlan database. Example, /path/to/Reference_DBs/metaphlan4-db/. Default: null.") println(" --database.chocophlan_dir [PATH] Path to Humann's chocophlan nucleotide database. Example, /path/to/Reference_DBs/humann3-db/chocophlan/. Default: null.") println(" --database.uniref_dir [PATH] Path to Humann's Uniref protein database. Example, /path/to/Reference_DBs/humann3-db/uniref/. Default: null.") println(" --database.utilities_dir [PATH] Path to Humann's untilities database. Example, /path/to/Reference_DBs/humann3-db/utility_mapping/. Default: null.") + println(" --database.GTDBTK_LINK [URL] GTDBTK database online download link. Default: https://data.gtdb.ecogenomic.org/releases/release220/220.0/auxillary_files/gtdbtk_package/full_package/gtdbtk_r220_data.tar.gz.") println(" --database.gtdbtk_db_dir [PATH] Path to GTDBTK database. Example, /path/Reference_DBs/GTDB-tk-ref-db/. Default: null.") println() println("Paths to existing conda environments to use otherwise a new one will be created using the yaml file in envs/.") @@ -121,13 +127,19 @@ if (params.help) { exit 0 } +/************************************************ +*********** Show pipeline parameters ************ +*************************************************/ + +if (params.debug) { log.info """ - Nextflow MGIllumina Consensus Pipeline: $workflow.manifest.version + Nextflow Metagenomics Illumina Consensus Pipeline: $workflow.manifest.version You have set the following parameters: Profile: ${workflow.profile} Input csv file : ${params.csv_file} GLDS Accession : ${params.GLDS_accession} + GLDS Raw File Pattern: ${params.RawFilePattern} Workflow : ${params.workflow} Nextflow Directory publishing mode: ${params.publishDir_mode} Swift 1S Libraries: ${params.swift_1S} @@ -191,13 +203,20 @@ log.info """ Chocophlan: ${params.database.chocophlan_dir} Uniref: ${params.database.uniref_dir} Utilities: ${params.database.utilities_dir} - GTDBTK: ${params.database.gtdbtk_db_dir} + GTDBTK URL: ${params.database.GTDBTK_LINK} + GTDBTK DB: ${params.database.gtdbtk_db_dir} """.stripIndent() +} + +// Create GLDS runsheet +include { GET_RUNSHEET } from "./modules/create_runsheet.nf" // Processes to create the required database(s) if not provided +/* include { SETUP_CAT_DB; SETUP_KOFAMSCAN_DB; SETUP_GTDBTK_DB; SETUP_CHOCOPHLAN; SETUP_UNIREF; SETUP_UTILITY_MAPPING; SETUP_METAPHLAN } from "./modules/database_creation.nf" +*/ include { make_humann_db } from "./modules/database_creation.nf" // Read quality check and filtering @@ -235,6 +254,11 @@ workflow run_read_based_analysis { make_humann_db.out.uniref_dir, make_humann_db.out.metaphlan_db_dir, make_humann_db.out.utilities_dir) + + software_versions_ch = Channel.empty() + make_humann_db.out.versions | mix(software_versions_ch) | set{software_versions_ch} + read_based.out.versions | mix(software_versions_ch) | set{software_versions_ch} + }else{ read_based(filtered_ch, @@ -242,8 +266,13 @@ workflow run_read_based_analysis { params.database.uniref_dir, params.database.metaphlan_db_dir, params.database.utilities_dir) + + software_versions_ch = read_based.out.versions } + emit: + versions = software_versions_ch + } // Workflow to perform assembly-based analysis @@ -255,29 +284,23 @@ workflow run_assembly_based_analysis { main: - kofam_db = 
params.database.ko_db_dir - if(params.database.ko_db_dir == null) { - SETUP_KOFAMSCAN_DB() - kofam_db = SETUP_KOFAMSCAN_DB.out.ko_db_dir - } - - cat_db = params.database.cat_db - if(params.database.cat_db == null){ + software_versions_ch = Channel.empty() - SETUP_CAT_DB(params.dataase.CAT_DB_LINK) - cat_db = SETUP_CAT_DB.out.cat_db - } - - gtdbtk_db_dir = params.database.gtdbtk_db_dir - if(params.database.gtdbtk_db_dir == null){ - SETUP_GTDBTK_DB() - gtdbtk_db_dir = SETUP_GTDBTK_DB.out.gtdbtk_db_dir - } + kofam_db = params.database.ko_db_dir + cat_db = params.database.cat_db + gtdbtk_db_dir = params.database.gtdbtk_db_dir // Run assembly based workflow assembly_based(file_ch, filtered_ch, kofam_db, cat_db, gtdbtk_db_dir, params.use_gtdbtk_scratch_location) + + assembly_based.out.versions | mix(software_versions_ch) | set{software_versions_ch} + + + emit: + versions = software_versions_ch + } // A function to delete white spaces from an input string and covert it to lower case @@ -293,15 +316,11 @@ workflow { // Parse file input if(params.GLDS_accession){ - GET_RUNSHEET() + GET_RUNSHEET(params.GLDS_accession) GET_RUNSHEET.out.input_file .splitCsv(header:true) .set{file_ch} - GET_RUNSHEET.out.params_file - .splitCsv(header:true) - .set{params_ch} - }else{ Channel.fromPath(params.csv_file, checkIfExists: true) @@ -311,26 +330,59 @@ workflow { file_ch.map{ - row -> deleteWS(row.paired) == 'true' ? tuple( "${row.sample_id}", [file("${row.forward}"), file("${row.reverse}")], deleteWS(row.paired)) : - tuple( "${row.sample_id}", [file("${row.forward}")], deleteWS(row.paired)) + row -> deleteWS(row.paired) == 'true' ? tuple( "${row.sample_id}", [file("${row.forward}", checkIfExists: true), file("${row.reverse}", checkIfExists: true)], deleteWS(row.paired)) : + tuple( "${row.sample_id}", [file("${row.forward}", checkIfExists: true)], deleteWS(row.paired)) }.set{reads_ch} - // Qality check and trim the input reads + + // Software Version Capturing - runsheet + software_versions_ch = Channel.empty() + GET_RUNSHEET.out.version | mix(software_versions_ch) | set{software_versions_ch} + + // Quality check and trim the input reads raw_qc(Channel.of("raw"), params.multiqc_config,reads_ch) - filtered_ch = BBDUK(reads_ch, params.adapters) + BBDUK(reads_ch, params.adapters) + filtered_ch = BBDUK.out.reads filtered_qc(Channel.of("filtered"), params.multiqc_config, filtered_ch) - // Run the analysis based on selection i.e, read-based, assembly-based or both + // Quality check software capturing + raw_qc.out.versions | mix(software_versions_ch) | set{software_versions_ch} + BBDUK.out.version | mix(software_versions_ch) | set{software_versions_ch} + filtered_qc.out.versions | mix(software_versions_ch) | set{software_versions_ch} + + // Run the analysis based on selection i.e, read-based, assembly-based or both // it will run both by default if(params.workflow == 'read-based'){ + run_read_based_analysis(filtered_ch) + run_read_based_analysis.out.versions | mix(software_versions_ch) | set{software_versions_ch} + }else if(params.workflow == 'assembly-based') { + run_assembly_based_analysis(file_ch,filtered_ch) + run_assembly_based_analysis.out.versions | mix(software_versions_ch) | set{software_versions_ch} + }else{ + run_read_based_analysis(filtered_ch) run_assembly_based_analysis(file_ch, filtered_ch) + + run_read_based_analysis.out.versions | mix(software_versions_ch) | set{software_versions_ch} + run_assembly_based_analysis.out.versions | mix(software_versions_ch) | set{software_versions_ch} } + + // Software 
Version Capturing - combining all captured sofware versions + nf_version = "Nextflow Version:".concat("${nextflow.version}\n<><><>\n") + nextflow_version_ch = Channel.value(nf_version) + + // Write software versions to file + software_versions_ch | map { it.text + "\n<><><>\n"} + | unique + | mix(nextflow_version_ch) + | collectFile(name: "${params.metadata_dir}/software_versions.txt", newLine: true, cache: false) + | set{final_software_versions_ch} + } workflow.onComplete { diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/assembly.nf b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/assembly.nf index af87331a..b1fc59d1 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/assembly.nf +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/assembly.nf @@ -15,7 +15,9 @@ process ASSEMBLE { input: tuple val(sample_id), path(reads), val(isPaired) output: - tuple val(sample_id), path("${sample_id}_final.contigs.fa") + tuple val(sample_id), path("${sample_id}_final.contigs.fa"), emit: contigs + path("${sample_id}-assembly.log"), emit: log + path("versions.txt"), emit: version script: """ # Removing output directory if exists already but process still needs to be @@ -32,17 +34,18 @@ process ASSEMBLE { megahit -1 \${BASENAME_FORWARD} -2 \${BASENAME_REVERSE} \\ -m ${params.max_mem} -t ${task.cpus} \\ - --min-contig-len 500 -o ${sample_id}-megahit-out - + --min-contig-len 500 -o ${sample_id}-megahit-out > ${sample_id}-assembly.log 2>&1 + else BASENAME=`basename -s '.gz' ${reads[0]}` zcat ${reads[0]} > \${BASENAME} megahit -r \${BASENAME} -m ${params.max_mem} -t ${task.cpus} \\ - --min-contig-len 500 -o ${sample_id}-megahit-out + --min-contig-len 500 -o ${sample_id}-megahit-out > ${sample_id}-assembly.log 2>&1 fi mv ${sample_id}-megahit-out/final.contigs.fa ${sample_id}_final.contigs.fa + megahit -v > versions.txt """ } @@ -56,7 +59,8 @@ process RENAME_HEADERS { input: tuple val(sample_id), path(assembly) output: - tuple val(sample_id), path("${sample_id}-assembly.fasta") + tuple val(sample_id), path("${sample_id}-assembly.fasta"), emit: contigs + path("versions.txt"), emit: version script: """ bit-rename-fasta-headers -i ${assembly} \\ @@ -68,6 +72,7 @@ process RENAME_HEADERS { if [ ! 
-s ${sample_id}-assembly.fasta ]; then printf "${sample_id}\\tNo contigs assembled\\n" > Failed-assemblies.tsv fi + bit-version |grep "Bioinformatics Tools"|sed -E 's/^\\s+//' > versions.txt """ } @@ -82,11 +87,13 @@ process SUMMARIZE_ASSEMBLIES { input: path(assemblies) output: - path("${params.additional_filename_prefix}assembly-summaries${params.assay_suffix}.tsv") + path("${params.additional_filename_prefix}assembly-summaries${params.assay_suffix}.tsv"), emit: summary + path("versions.txt"), emit: version script: """ bit-summarize-assembly \\ -o ${params.additional_filename_prefix}assembly-summaries${params.assay_suffix}.tsv \\ ${assemblies} + bit-version |grep "Bioinformatics Tools"|sed -E 's/^\\s+//' > versions.txt """ } diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/assembly_annotation.nf b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/assembly_annotation.nf index 62352746..b05ec015 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/assembly_annotation.nf +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/assembly_annotation.nf @@ -17,7 +17,8 @@ process CALL_GENES { tuple val(sample_id), path(assembly) output: // Amino acids, nucleotides and gff - tuple val(sample_id), path("${sample_id}-genes.faa"), path("${sample_id}-genes.fasta"), path("${sample_id}-genes.gff") + tuple val(sample_id), path("${sample_id}-genes.faa"), path("${sample_id}-genes.fasta"), path("${sample_id}-genes.gff"), emit: genes + path("versions.txt"), emit: version script: """ # Only running if assembly produced any contigs @@ -33,6 +34,7 @@ process CALL_GENES { printf "Gene-calling not performed because the assembly didn't produce anything.\\n" fi + prodigal -v 2>&1 | grep Prodigal > versions.txt """ } @@ -47,8 +49,8 @@ process REMOVE_LINEWRAPS { tuple val(sample_id), path(aa), path(nt), path(gff) output: - tuple val(sample_id), path("${sample_id}-genes.faa"), path("${sample_id}-genes.fasta"), path(gff) - + tuple val(sample_id), path("${sample_id}-genes.faa"), path("${sample_id}-genes.fasta"), emit: genes + path("versions.txt"), emit: version script: """ if [ -s ${aa} ] && [ -s ${nt} ]; then @@ -63,6 +65,7 @@ process REMOVE_LINEWRAPS { touch ${sample_id}-genes.faa ${sample_id}-genes.fasta printf "Line wrapping not performed because gene-calling wasn't performed on ${sample_id}.\\n" fi + bit-version |grep "Bioinformatics Tools"|sed -E 's/^\\s+//' > versions.txt """ } @@ -75,11 +78,11 @@ process KO_ANNOTATION { //label "contig_annotation" input: - tuple val(sample_id), path(assembly), path(aa), path(nt), path(gff) + tuple val(sample_id), path(assembly), path(aa), path(nt) path(ko_db_dir) output: - tuple val(sample_id), path("${sample_id}-KO-tab.tmp") - + tuple val(sample_id), path("${sample_id}-KO-tab.tmp"), emit: temp_table + path("versions.txt"), emit: version script: """ # only running if assembly produced any contigs and genes were identified (they are required for this) @@ -97,6 +100,7 @@ process KO_ANNOTATION { printf "Functional annotations not performed because the assembly didn't produce anything and/or no genes were identified.\\n" fi + exec_annotation -v > versions.txt """ } @@ -110,8 +114,8 @@ process FILTER_KFAMSCAN { input: tuple val(sample_id), path(KO_tab_tmp) output: - tuple val(sample_id), path("${sample_id}-annotations.tsv") - + tuple val(sample_id), path("${sample_id}-annotations.tsv"), emit: ko_annotation + path("versions.txt"), emit: 
version script: """ if [ -s ${KO_tab_tmp} ]; then @@ -124,6 +128,7 @@ process FILTER_KFAMSCAN { printf "Nothing to filter since functional annotation was not performed.\\n" fi + bit-version |grep "Bioinformatics Tools"|sed -E 's/^\\s+//' > versions.txt """ } @@ -135,11 +140,12 @@ process TAX_CLASSIFICATION { label "contig_annotation" input: - tuple val(sample_id), path(assembly), path(aa), path(nt), path(gff) + tuple val(sample_id), path(assembly), path(aa), path(nt) path(cat_db) output: // Gene and contig taxonomy - tuple val(sample_id), path("${sample_id}-gene-tax.tsv"), path("${sample_id}-contig-tax.tsv") + tuple val(sample_id), path("${sample_id}-gene-tax.tsv"), path("${sample_id}-contig-tax.tsv"), emit: taxonomy + path("versions.txt"), emit: version script: """ # Only running if assembly produced any contigs and @@ -177,6 +183,7 @@ process TAX_CLASSIFICATION { printf "Assembly-based taxonomic classification not performed because the assembly didn't produce anything and/or no genes were identified.\\n" fi + CAT --version | sed -E 's/(CAT v.+)\\s\\(.+/\\1/' > versions.txt """ } @@ -187,9 +194,10 @@ workflow annotate_assembly { cat_db main: - genes_ch = CALL_GENES(assembly_ch) | REMOVE_LINEWRAPS - - KO_ANNOTATION(assembly_ch.join(genes_ch) ko_db_dir) | FILTER_KFAMSCAN - TAX_CLASSIFICATION(assembly_ch, genes_ch, cat_db) + CALL_GENES(assembly_ch) + genes_ch = CALL_GENES.out.genes | REMOVE_LINEWRAPS.out.genes + KO_ANNOTATION(assembly_ch.join(genes_ch) ko_db_dir) + KO_ANNOTATION.out.temp_table | FILTER_KFAMSCAN.out.ko_annotation + TAX_CLASSIFICATION(assembly_ch, genes_ch, cat_db) } diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/assembly_based_processing.nf b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/assembly_based_processing.nf index 2e8287e1..a4d54ab2 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/assembly_based_processing.nf +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/assembly_based_processing.nf @@ -6,6 +6,10 @@ nextflow.enable.dsl = 2 ****************************************************************************************/ // Assembly-based workflow +// Processes to create the required database(s) if not provided +include { SETUP_CAT_DB; SETUP_KOFAMSCAN_DB; SETUP_GTDBTK_DB; + SETUP_CHOCOPHLAN} from "./database_creation.nf" + include { ASSEMBLE; RENAME_HEADERS; SUMMARIZE_ASSEMBLIES } from "./assembly.nf" include { MAPPING; SAM_TO_BAM } from "./read_mapping.nf" include { CALL_GENES; REMOVE_LINEWRAPS } from "./assembly_annotation.nf" @@ -33,37 +37,62 @@ workflow assembly_based { /***************************************************** ************* Assembly-based analysis **************** *****************************************************/ + + software_versions_ch = Channel.empty() + // Assemble reads to contigs - assembly_ch = ASSEMBLE(filtered_ch) | RENAME_HEADERS + ASSEMBLE(filtered_ch) + ASSEMBLE.out.contigs | RENAME_HEADERS + assembly_ch = RENAME_HEADERS.out.contigs assemblies_ch = assembly_ch.map{ sample_id, assembly -> file("${assembly}") }.collect() SUMMARIZE_ASSEMBLIES(assemblies_ch) // Map reads to assembly - read_mapping_ch = MAPPING(assembly_ch.join(filtered_ch)) | SAM_TO_BAM + MAPPING(assembly_ch.join(filtered_ch)) + MAPPING.out.sam | SAM_TO_BAM + read_mapping_ch = SAM_TO_BAM.out.bam // Annotate assembly - genes_ch = CALL_GENES(assembly_ch) | REMOVE_LINEWRAPS - if (ko_db_dir){ - annotations_ch = 
KO_ANNOTATION(assembly_ch.join(genes_ch), ko_db_dir) | FILTER_KFAMSCAN + CALL_GENES(assembly_ch) + CALL_GENES.out.genes | REMOVE_LINEWRAPS + genes_ch = REMOVE_LINEWRAPS.out.genes + + if (ko_db_dir != null){ + + KO_ANNOTATION(assembly_ch.join(genes_ch), ko_db_dir) + KO_ANNOTATION.out.temp_table | FILTER_KFAMSCAN + annotations_ch = FILTER_KFAMSCAN.out.ko_annotation + }else{ + SETUP_KOFAMSCAN_DB() - annotations_ch = KO_ANNOTATION(assembly_ch.join(genes_ch), - SETUP_KOFAMSCAN_DB.out.ko_db_dir) | FILTER_KFAMSCAN + SETUP_KOFAMSCAN_DB.out.version | mix(software_versions_ch) | set{software_versions_ch} + KO_ANNOTATION(assembly_ch.join(genes_ch), SETUP_KOFAMSCAN_DB.out.ko_db_dir) + KO_ANNOTATION.out.temp_table | FILTER_KFAMSCAN + annotations_ch = FILTER_KFAMSCAN.out.ko_annotation + } - if (cat_db){ - taxonomy_ch = TAX_CLASSIFICATION(assembly_ch.join(genes_ch), cat_db) + if (cat_db != null){ + + TAX_CLASSIFICATION(assembly_ch.join(genes_ch), cat_db) + taxonomy_ch = TAX_CLASSIFICATION.out.taxonomy + }else{ + SETUP_CAT_DB(params.database.CAT_DB_LINK) - taxonomy_ch = TAX_CLASSIFICATION(assembly_ch.join(genes_ch), SETUP_CAT_DB.out.cat_db) + SETUP_CAT_DB.out.version | mix(software_versions_ch) | set{software_versions_ch} + TAX_CLASSIFICATION(assembly_ch.join(genes_ch), SETUP_CAT_DB.out.cat_db) + taxonomy_ch = TAX_CLASSIFICATION.out.taxonomy } // Calculate gene coverage and depth - coverage_ch = GET_COV_AND_DET(read_mapping_ch - .join(assembly_ch) - .join(genes_ch)) + GET_COV_AND_DET(read_mapping_ch + .join(assembly_ch) + .join(genes_ch)) + coverage_ch = GET_COV_AND_DET.out.coverages // Combine contig annotation tax_and_cov_ch = COMBINE_GENE_ANNOTS_TAX_AND_COVERAGE(coverage_ch @@ -88,24 +117,27 @@ workflow assembly_based { }.collect()) // Assembly binning - binning_ch = METABAT_BINNING(assembly_ch.join(read_mapping_ch)) + METABAT_BINNING(assembly_ch.join(read_mapping_ch)) + binning_ch = METABAT_BINNING.out.bins binning_ch | summarize_bins - metabat_assembly_depth_files_ch = binning_ch.map{ - sample_id, depth, bins -> file("${depth}") + depth_ch = METABAT_BINNING.out.depth + metabat_assembly_depth_files_ch = depth_ch.map{ + sample_id, depth -> file("${depth}") }.collect() bins_ch = binning_ch.map{ - sample_id, depth, bins -> bins instanceof List ? bins.each{it}: bins + sample_id, bins -> bins instanceof List ? 
bins.each{it}: bins }.flatten().collect() // Check Bins and Summarize MAGs - if(gtdbtk_db_dir){ + if(gtdbtk_db_dir != null){ summarize_mags(summarize_bins.out.bins_checkm_results, bins_ch, gtdbtk_db_dir, use_gtdbtk_scratch_location, gene_coverage_annotation_and_tax_files_ch) }else{ - SETUP_GTDBTK_DB() + SETUP_GTDBTK_DB(params.database.GTDBTK_LINK) + SETUP_GTDBTK_DB.out.version | mix(software_versions_ch) | set{software_versions_ch} summarize_mags(summarize_bins.out.bins_checkm_results, bins_ch, SETUP_GTDBTK_DB.out.gtdbtk_db_dir, use_gtdbtk_scratch_location, @@ -113,7 +145,7 @@ workflow assembly_based { } // Get the predicted amino acids for all the samples - genes_aa_ch = genes_ch.map{sample_id, aa, nt, gff -> file("${aa}")}.collect() + genes_aa_ch = genes_ch.map{sample_id, aa, nt -> file("${aa}")}.collect() // Generating a file with sample ids on a new line file_ch.map{row -> "${row.sample_id}"} @@ -129,4 +161,27 @@ workflow assembly_based { metabat_assembly_depth_files_ch, bins_ch, bam_files) + + // Capture software versions + ASSEMBLE.out.version | mix(software_versions_ch) | set{software_versions_ch} + RENAME_HEADERS.out.version | mix(software_versions_ch) | set{software_versions_ch} + SUMMARIZE_ASSEMBLIES.out.version | mix(software_versions_ch) | set{software_versions_ch} + MAPPING.out.version | mix(software_versions_ch) | set{software_versions_ch} + SAM_TO_BAM.out.version | mix(software_versions_ch) | set{software_versions_ch} + CALL_GENES.out.version | mix(software_versions_ch) | set{software_versions_ch} + REMOVE_LINEWRAPS.out.version | mix(software_versions_ch) | set{software_versions_ch} + KO_ANNOTATION.out.version | mix(software_versions_ch) | set{software_versions_ch} + FILTER_KFAMSCAN.out.version | mix(software_versions_ch) | set{software_versions_ch} + TAX_CLASSIFICATION.out.version | mix(software_versions_ch) | set{software_versions_ch} + GET_COV_AND_DET.out.version | mix(software_versions_ch) | set{software_versions_ch} + MAKE_COMBINED_GENE_LEVEL_TABLES.out.version | mix(software_versions_ch) | set{software_versions_ch} + MAKE_COMBINED_CONTIG_TAX_TABLES.out.version | mix(software_versions_ch) | set{software_versions_ch} + METABAT_BINNING.out.version | mix(software_versions_ch) | set{software_versions_ch} + summarize_bins.out.versions | mix(software_versions_ch) | set{software_versions_ch} + summarize_mags.out.versions | mix(software_versions_ch) | set{software_versions_ch} + + + emit: + versions = software_versions_ch + } diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/binning.nf b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/binning.nf index db175ce0..c851b3dd 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/binning.nf +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/binning.nf @@ -15,8 +15,9 @@ process METABAT_BINNING { input: tuple val(sample_id), path(assembly), path(bam) output: - tuple val(sample_id), path("${sample_id}-metabat-assembly-depth.tsv"), path("${sample_id}-bin*") - + tuple val(sample_id), path("${sample_id}-metabat-assembly-depth.tsv"), emit: depth + tuple val(sample_id), path("${sample_id}-bin*"), emit: bins, optional: true + path("versions.txt"), emit: version script: """ # Only running if the assembly produced anything @@ -62,6 +63,7 @@ process METABAT_BINNING { touch ${sample_id}-metabat-assembly-depth.tsv printf "Binning not performed because the assembly didn't produce anything.\\n" fi + 
echo metabat2 \$(metabat2 --help 2>&1 | head -n 2 | tail -n 1| sed 's/.*\\:\\([0-9]*\\.[0-9]*\\).*/\\1/') > versions.txt """ } @@ -79,7 +81,7 @@ workflow binning { emit: - binning_results = binning_ch + binning_results = binning_ch.out.bins } diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/combine_contig_annotation.nf b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/combine_contig_annotation.nf index 22897d57..9d7893c0 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/combine_contig_annotation.nf +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/combine_contig_annotation.nf @@ -20,7 +20,7 @@ process COMBINE_GENE_ANNOTS_TAX_AND_COVERAGE { input: tuple val(sample_id), path(gene_coverages), path(contig_coverages), path(annotations), path(gene_tax), path(contig_tax), - path(aa), path(nt), path(gff), path(assembly) + path(aa), path(nt), path(assembly) output: tuple val(sample_id), path("${sample_id}-gene-coverage-annotation-and-tax.tsv") script: @@ -65,10 +65,11 @@ process MAKE_COMBINED_GENE_LEVEL_TABLES { input: path(gene_coverage_annotation_and_tax_files) output: - path("${params.additional_filename_prefix}Combined-gene-level-KO-function-coverages${params.assay_suffix}.tsv") - path("${params.additional_filename_prefix}Combined-gene-level-KO-function-coverages-CPM${params.assay_suffix}.tsv") - path("${params.additional_filename_prefix}Combined-gene-level-taxonomy-coverages${params.assay_suffix}.tsv") - path("${params.additional_filename_prefix}Combined-gene-level-taxonomy-coverages-CPM${params.assay_suffix}.tsv") + path("${params.additional_filename_prefix}Combined-gene-level-KO-function-coverages${params.assay_suffix}.tsv"), emit: raw_function_coverages + path("${params.additional_filename_prefix}Combined-gene-level-KO-function-coverages-CPM${params.assay_suffix}.tsv"), emit: norm_function_coverages + path("${params.additional_filename_prefix}Combined-gene-level-taxonomy-coverages${params.assay_suffix}.tsv"), emit: raw_taxonomy_coverages + path("${params.additional_filename_prefix}Combined-gene-level-taxonomy-coverages-CPM${params.assay_suffix}.tsv"), emit: norm_taxonomy_coverages + path("versions.txt"), emit: version script: """ bit-GL-combine-KO-and-tax-tables ${gene_coverage_annotation_and_tax_files} -o ${params.additional_filename_prefix}Combined @@ -85,6 +86,7 @@ process MAKE_COMBINED_GENE_LEVEL_TABLES { mv "${params.additional_filename_prefix}Combined-gene-level-taxonomy-coverages-CPM.tsv" \\ "${params.additional_filename_prefix}Combined-gene-level-taxonomy-coverages-CPM${params.assay_suffix}.tsv" + bit-version |grep "Bioinformatics Tools"|sed -E 's/^\\s+//' > versions.txt """ } @@ -99,7 +101,7 @@ process COMBINE_CONTIG_TAX_AND_COVERAGE { input: tuple val(sample_id), path(gene_coverages), path(contig_coverages), path(gene_tax), path(contig_tax), - path(aa), path(nt), path(gff), path(assembly) + path(aa), path(nt), path(assembly) output: tuple val(sample_id), path("${sample_id}-contig-coverage-and-tax.tsv") script: @@ -157,8 +159,9 @@ process MAKE_COMBINED_CONTIG_TAX_TABLES { input: path(contig_coverage_and_tax_files) output: - path("${params.additional_filename_prefix}Combined-contig-level-taxonomy-coverages${params.assay_suffix}.tsv") - path("${params.additional_filename_prefix}Combined-contig-level-taxonomy-coverages-CPM${params.assay_suffix}.tsv") + 
path("${params.additional_filename_prefix}Combined-contig-level-taxonomy-coverages${params.assay_suffix}.tsv"), emit: raw_taxonomy + path("${params.additional_filename_prefix}Combined-contig-level-taxonomy-coverages-CPM${params.assay_suffix}.tsv"), emit: norm_taxonomy + path("versions.txt"), emit: version script: """ bit-GL-combine-contig-tax-tables ${contig_coverage_and_tax_files} -o ${params.additional_filename_prefix}Combined @@ -169,6 +172,7 @@ process MAKE_COMBINED_CONTIG_TAX_TABLES { mv "${params.additional_filename_prefix}Combined-contig-level-taxonomy-coverages-CPM.tsv" \\ "${params.additional_filename_prefix}Combined-contig-level-taxonomy-coverages-CPM${params.assay_suffix}.tsv" + bit-version |grep "Bioinformatics Tools"|sed -E 's/^\\s+//' > versions.txt """ } diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/coverage.nf b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/coverage.nf index 1c34c086..0eb7ab42 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/coverage.nf +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/coverage.nf @@ -12,10 +12,11 @@ process GET_COV_AND_DET { tag "Calculating gene and contig coverage for ${sample_id}..." input: - tuple val(sample_id), path(bam), path(assembly), path(aa), path(nt), path(gff) + tuple val(sample_id), path(bam), path(assembly), path(aa), path(nt) output: // Gene_covs and contig_covs - tuple val(sample_id), path("${sample_id}-gene-coverages.tsv"), path("${sample_id}-contig-coverages.tsv") + tuple val(sample_id), path("${sample_id}-gene-coverages.tsv"), path("${sample_id}-contig-coverages.tsv"), emit: coverages + path("versions.txt"), emit: version script: """ # get-cov-and-depth.sh ${sample_id} ${assembly} ${nt} ${bam} ${params.pileup_mem} @@ -76,5 +77,7 @@ process GET_COV_AND_DET { printf "Coverage info not recovered because the assembly didn't produce anything.\\n" fi + VERSION=`bbversion.sh` + echo "bbtools \${VERSION}" > versions.txt """ } diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/create_runsheet.nf b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/create_runsheet.nf index 5883b490..8dfe2a08 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/create_runsheet.nf +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/create_runsheet.nf @@ -1,32 +1,59 @@ #!/usr/bin/env nextflow nextflow.enable.dsl = 2 -params.GLDS_accession = "OSS-466" +params.GLDS_accession = "OSD-574" +params.RawFilePattern = null // Pattern of files on OSDR for the OSD accession you want to process process GET_RUNSHEET { - beforeScript "chmod +x ${baseDir}/bin/create_runsheet.py" + beforeScript "chmod +x ${baseDir}/bin/create_runsheet.sh" + input: + val(GLDS_accession) output: - path("*_runsheet.csv"), emit: runsheet + path("a_*metagenomic*.txt"), emit: assay_TABLE path("*.zip"), emit: zip - path("GLparams_file.csv"), emit: params_file path("GLfile.csv"), emit: input_file - + path("versions.txt"), emit: version script: """ - create_runsheet.py --OSD ${params.GLDS_accession} + # Download ISA zip file for the GLDS_accession then unzip it + GL-download-GLDS-data -g ${GLDS_accession} -p ISA -f && unzip *-ISA.zip + + if [ ${params.RawFilePattern} == null ];then + + # Attempt to download the sequences using the assay table, if that fails then + # attempt 
retrieving all fastq.gz files + GL-download-GLDS-data -f -g ${GLDS_accession} -a a_*metagenomic*.txt -o Raw_Sequence_Data || \\ + GL-download-GLDS-data -f -g ${GLDS_accession} -p ".fastq.gz" -o Raw_Sequence_Data + + else + + + GL-download-GLDS-data -f -g ${GLDS_accession} -p ${params.RawFilePattern} -o Raw_Sequence_Data + + fi + + # Handle case where URLs contain the "+" sign and replaces it with %2B + if grep -q '+' *wanted-file-download-commands.sh;then + grep '+' *wanted-file-download-commands.sh | \\ + sort -u | \\ + awk '{gsub(/\\+/,"%2B", \$NF);print}' \\ + > plus_containing_${GLDS_accession}-wanted-file-download-commands.sh + cat plus_containing_${GLDS_accession}-wanted-file-download-commands.sh | parallel -j $task.cpus + fi + + # Create runsheet from the assay table + create_runsheet.sh a_*metagenomic*.txt > GLfile.csv + GL-version | grep "GeneLab utils"| sed -E 's/^\\s+//' > versions.txt """ } workflow { - GET_RUNSHEET() + GET_RUNSHEET(params.GLDS_accession) file_ch = GET_RUNSHEET.out.input_file - .splitCsv() - - params_ch = GET_RUNSHEET.out.params_file .splitCsv(header:true) } diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/database_creation.nf b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/database_creation.nf index e77e6965..6a2b1693 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/database_creation.nf +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/database_creation.nf @@ -18,6 +18,7 @@ process SETUP_CAT_DB { output: path("CAT_prepare_20210107/"), emit: cat_db path("CAT_prepare_20210107/CAT_DB_SETUP"), emit: completion_indicator + path("versions.txt"), emit: version script: """ printf "### Setting up CAT's reference database ###\\n\\n" @@ -29,6 +30,7 @@ process SETUP_CAT_DB { rm CAT_prepare_20210107.tar.gz CAT_prepare_20210107/2021-01-07_CAT_database/2021-01-07.nr.gz touch CAT_prepare_20210107/CAT_DB_SETUP + curl --version |head -n 1 | sed -E 's/(curl\\s.+)\\s\\(.+/\\1/' > versions.txt printf "### Set up completed successfully ###\\n\\n" """ } @@ -42,6 +44,7 @@ process SETUP_KOFAMSCAN_DB { output: path("kofamscan_db/"), emit: ko_db_dir path("kofamscan_db/KO_DB_SETUP"), emit: completion_indicator + path("versions.txt"), emit: version script: """ printf "### Setting up KOFamScan reference database ###\\n\\n" @@ -73,6 +76,7 @@ process SETUP_KOFAMSCAN_DB { gunzip ko_list.gz && \\ mv ko_list kofamscan_db/ && \\ touch kofamscan_db/KO_DB_SETUP + curl --version |head -n 1 | sed -E 's/(curl\\s.+)\\s\\(.+/\\1/' > versions.txt printf "### Set up completed successfully ###\\n\\n" """ } @@ -83,21 +87,21 @@ process SETUP_GTDBTK_DB { tag "Downloading and setting up genome taxonomy database toolkit-s (GTDBTK) database..." 
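For illustration only: the per-process versions.txt convention these setup processes (and the rest of the patch) follow can be reduced to a small standalone DSL2 sketch. The process and tool names below are hypothetical and not part of the pipeline; the point is that each task writes its own versions.txt and emits it, the workflow mixes those single-file channels into one accumulator, and collectFile concatenates the unique entries, which is the same mechanism main.nf uses to build software_versions.txt.

    #!/usr/bin/env nextflow
    nextflow.enable.dsl = 2

    // Hypothetical process: does its work and also records its tool version
    process TOOL_A {
        input:
        val(x)
        output:
        val(x), emit: result
        path("versions.txt"), emit: version
        script:
        """
        echo "tool-a v1.0" > versions.txt
        """
    }

    workflow {
        software_versions_ch = Channel.empty()

        TOOL_A(Channel.of(1, 2, 3))
        TOOL_A.out.version | mix(software_versions_ch) | set{ software_versions_ch }

        // Read each versions.txt, drop duplicates, and write one combined summary file
        software_versions_ch
            | map { it.text }
            | unique
            | collectFile(name: "software_versions.txt", newLine: true)
            | view { "Versions written to: ${it}" }
    }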
label "db_setup" + input: + val(GTDBTK_URL) output: path("GTDB-tk-ref-db/"), emit: gtdbtk_db_dir path("GTDB-tk-ref-db/SETUP_GTDBTK_DB_SETUP"), emit: completion_indicator + path("versions.txt"), emit: version shell: ''' [ -d GTDB-tk-ref-db/ ] || mkdir -p GTDB-tk-ref-db/ # But still needs to be set for this particular session that is downloading and setting up the db export GTDBTK_DATA_PATH=GTDB-tk-ref-db/ - - # Make a copy of the download script to edit wget's timeout duration - cat `which download-db.sh` |sed 's/\$db_url/--timeout=3600 $db_url/' > download-db.sh && \ - chmod +x ./download-db.sh # Downloading - ./download-db.sh && touch GTDB-tk-ref-db/SETUP_GTDBTK_DB_SETUP + download-GTDBTK-db.sh ${GTDBTK_URL} && touch GTDB-tk-ref-db/SETUP_GTDBTK_DB_SETUP + gtdbtk -h |grep "GTDB-Tk" | sed -E 's/.+\\s+(GTDB-Tk v.+)\\s+.+/\\1/' > versions.txt printf "### Set up completed successfully ###\\n\\n" ''' } @@ -112,6 +116,7 @@ process SETUP_CHOCOPHLAN { output: path("humann3-db/chocophlan"), emit: chocophlan_dir path("humann3-db/CHOCOPHLAN_DB_SETUP"), emit: completion_indicator + path("versions.txt"), emit: version script: """ [ -d humann3-db/ ] || mkdir -p humann3-db/ @@ -122,8 +127,10 @@ process SETUP_CHOCOPHLAN { # No need to update locations since I pass them as arguaments to the script humann3_databases --update-config no --download chocophlan full humann3-db/ && \\ touch humann3-db/CHOCOPHLAN_DB_SETUP + humann3 --version > versions.txt printf "### Set up completed successfully ###\\n\\n" fi + """ } @@ -137,6 +144,7 @@ process SETUP_UNIREF { output: path("humann3-db/uniref/"), emit: uniref_dir path("humann3-db/UNIREF_DB_SETUP"), emit: completion_indicator + path("versions.txt"), emit: version script: """ [ -d humann3-db/ ] || mkdir -p humann3-db/ @@ -146,6 +154,7 @@ process SETUP_UNIREF { # No need to update locations since I pass them as arguaments to the script humann3_databases --update-config no --download uniref uniref90_ec_filtered_diamond humann3-db/ && \\ touch humann3-db/UNIREF_DB_SETUP + humann3 --version > versions.txt printf "### Set up completed successfully ###\\n\\n" fi """ @@ -160,6 +169,7 @@ process SETUP_UTILITY_MAPPING { output: path("humann3-db/utility_mapping/"), emit: utilities_dir path("humann3-db/UTILITY_MAPPING_SETUP"), emit: completion_indicator + path("versions.txt"), emit: version script: """ [ -d humann3-db/ ] || mkdir -p humann3-db/ @@ -179,6 +189,7 @@ process SETUP_UTILITY_MAPPING { touch humann3-db/UTILITY_MAPPING_SETUP fi + humann3 --version > versions.txt printf "### Set up completed successfully ###\\n\\n" fi """ @@ -193,7 +204,7 @@ process SETUP_METAPHLAN { output: path("metaphlan4-db/"), emit: metaphlan_db_dir path("metaphlan4-db/METAPHLAN4_DB_SETUP"), emit: completion_indicator - + path("versions.txt"), emit: version script: """ [ -d metaphlan4-db/ ] || mkdir -p metaphlan4-db/ @@ -203,6 +214,7 @@ process SETUP_METAPHLAN { printf "\\n\\n Downloading metaphlan db:\\n\\n" metaphlan --install --bowtie2db metaphlan4-db/ && \\ touch metaphlan4-db/METAPHLAN4_DB_SETUP + metaphlan --version > versions.txt printf "### Set up completed successfully ###\\n\\n" fi """ @@ -217,11 +229,18 @@ workflow make_humann_db { SETUP_UTILITY_MAPPING() SETUP_METAPHLAN() + software_versions_ch = Channel.empty() + SETUP_CHOCOPHLAN.out.version | mix(software_versions_ch) | set{software_versions_ch} + SETUP_UNIREF.out.version | mix(software_versions_ch) | set{software_versions_ch} + SETUP_METAPHLAN.out.version | mix(software_versions_ch) | set{software_versions_ch} + 
SETUP_UTILITY_MAPPING.out.version | mix(software_versions_ch) | set{software_versions_ch} + emit: chocophlan_dir = SETUP_CHOCOPHLAN.out.chocophlan_dir uniref_dir = SETUP_UNIREF.out.uniref_dir metaphlan_db_dir = SETUP_METAPHLAN.out.metaphlan_db_dir utilities_dir = SETUP_UTILITY_MAPPING.out.utilities_dir + versions = software_versions_ch } @@ -229,12 +248,21 @@ workflow make_databases { take: CAT_DB_LINK + GTDBTK_URL main: SETUP_CAT_DB(CAT_DB_LINK) SETUP_KOFAMSCAN_DB() - SETUP_GTDBTK_DB() + SETUP_GTDBTK_DB(GTDBTK_URL) make_humann_db() + + software_versions_ch = Channel.empty() + SETUP_CAT_DB.out.version | mix(software_versions_ch) | set{software_versions_ch} + SETUP_KOFAMSCAN_DB.out.version | mix(software_versions_ch) | set{software_versions_ch} + SETUP_GTDBTK_DB.out.version | mix(software_versions_ch) | set{software_versions_ch} + make_humann_db.out.versions | mix(software_versions_ch) | set{software_versions_ch} + + emit: cat_db = SETUP_CAT_DB.out.cat_db @@ -244,11 +272,12 @@ workflow make_databases { uniref_dir = make_humann_db.out.uniref_dir metaphlan_db_dir = make_humann_db.out.metaphlan_db_dir utilities_dir = make_humann_db.out.utilities_dir + versions = software_versions_ch } workflow { - make_databases(Channel.of(params.CAT_DB_LINK)) + make_databases(Channel.of(params.database.CAT_DB_LINK), Channel.of(params.database.GTDBTK_LINK)) } diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/quality_assessment.nf b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/quality_assessment.nf index c4371eb7..919da2d8 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/quality_assessment.nf +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/quality_assessment.nf @@ -19,12 +19,15 @@ process FASTQC { input: tuple val(sample_id), path(reads), val(isPaired) output: - tuple path("*.html"), path("*.zip") + tuple path("*.html"), path("*.zip"), emit: html + path("versions.txt"), emit: version script: """ fastqc -o . \\ -t ${task.cpus} \\ ${reads} + + fastqc --version > versions.txt """ } @@ -37,7 +40,8 @@ process MULTIQC { path(multiqc_config) path(files) output: - path("${params.additional_filename_prefix}${prefix}_multiqc${params.assay_suffix}_report.zip") + path("${params.additional_filename_prefix}${prefix}_multiqc${params.assay_suffix}_report.zip"), emit: report + path("versions.txt"), emit: version script: """ multiqc -q --filename ${params.additional_filename_prefix}${prefix}_multiqc \\ @@ -51,6 +55,7 @@ process MULTIQC { ${params.additional_filename_prefix}${prefix}_multiqc${params.assay_suffix}_report.zip \\ ${params.additional_filename_prefix}${prefix}_multiqc_report + multiqc --version > versions.txt """ } @@ -66,7 +71,9 @@ process BBDUK { tuple val(sample_id), path(reads), val(isPaired) path(adapters) output: - tuple val(sample_id), path("*${params.filtered_suffix}"), val(isPaired) + tuple val(sample_id), path("*${params.filtered_suffix}"), val(isPaired), emit: reads + path("${sample_id}-bbduk.log"), emit: log + path("versions.txt"), emit: version script: def isSwift = params.swift_1S ? 
't' : 'f' """ @@ -77,15 +84,18 @@ process BBDUK { out2=${sample_id}${params.filtered_R2_suffix} \\ ref=${adapters} \\ ktrim=l k=17 ftm=5 qtrim=rl \\ - trimq=10 mlf=0.5 maxns=0 swift=${isSwift} + trimq=10 mlf=0.5 maxns=0 swift=${isSwift} > ${sample_id}-bbduk.log 2>&1 else - + bbduk.sh in=${reads[0]} out1=${sample_id}${params.filtered_suffix} \\ ref=${adapters} \\ ktrim=l k=17 ftm=5 qtrim=rl \\ - trimq=10 mlf=0.5 maxns=0 swift=${isSwift} + trimq=10 mlf=0.5 maxns=0 swift=${isSwift} > ${sample_id}-bbduk.log 2>&1 fi + + VERSION=`bbversion.sh` + echo "bbtools \${VERSION}" > versions.txt """ } @@ -93,24 +103,32 @@ process BBDUK { workflow quality_check { take: - prefix_ch - multiqc_config - reads_ch + prefix_ch + multiqc_config + reads_ch main: - fastqc_ch = FASTQC(reads_ch).flatten().collect() - MULTIQC(prefix_ch, multiqc_config, fastqc_ch) + FASTQC(reads_ch) + fastqc_ch = FASTQC.out.html.flatten().collect() + MULTIQC(prefix_ch, multiqc_config, fastqc_ch) + + software_versions_ch = Channel.empty() + FASTQC.out.version | mix(software_versions_ch) | set{software_versions_ch} + MULTIQC.out.version | mix(software_versions_ch) | set{software_versions_ch} + + emit: + versions = software_versions_ch } workflow { Channel.fromPath(params.csv_file) .splitCsv() - .map{ row -> row.paired ? tuple( "${row.sample_id}", [file("${row.forward}"), file("${row.reverse}")], row.paired) : - tuple( "${row.sample_id}", [file("${row.forward}")], row.paired)} + .map{ row -> row.paired == 'true' ? tuple( "${row.sample_id}", [file("${row.forward}", checkIfExists: true), file("${row.reverse}", checkIfExists: true)], row.paired) : + tuple( "${row.sample_id}", [file("${row.forward}", checkIfExists: true)], row.paired)} .set{reads_ch} res_ch = quality_check(Channel.of(params.prefix), params.multiqc_config, reads_ch) - BBDUK(reads_ch) + BBDUK(reads_ch, params.adapters) } diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/read_based_processing.nf b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/read_based_processing.nf index 84c02acc..92368b6a 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/read_based_processing.nf +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/read_based_processing.nf @@ -17,6 +17,7 @@ process HUMANN { tag "Running humann on ${sample_id}-s reads..." 
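A compact sketch of the runsheet-to-channel mapping added above may help: splitCsv(header: true) turns each CSV row into a map, and the ternary builds a two-file or one-file read list depending on the paired column, with checkIfExists failing early on missing paths. The CSV columns match the patch; the trim()/toLowerCase() call stands in for the pipeline's deleteWS helper, and the CSV file name here is a placeholder.

    #!/usr/bin/env nextflow
    nextflow.enable.dsl = 2

    workflow {
        Channel.fromPath("PE_file.csv", checkIfExists: true)
            | splitCsv(header: true)
            | map { row ->
                  // normalize the 'paired' flag roughly the way deleteWS() does
                  def paired = row.paired.trim().toLowerCase()
                  paired == 'true'
                      ? tuple(row.sample_id,
                              [file(row.forward, checkIfExists: true),
                               file(row.reverse, checkIfExists: true)],
                              paired)
                      : tuple(row.sample_id,
                              [file(row.forward, checkIfExists: true)],
                              paired)
              }
            | view()
    }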
label "read_based" + input: tuple val(sample_id), path(reads), val(isPaired) @@ -29,6 +30,7 @@ process HUMANN { path("${sample_id}-humann3-out-dir/${sample_id}_pathabundance.tsv"), emit: pathabundance path("${sample_id}-humann3-out-dir/${sample_id}_pathcoverage.tsv"), emit: pathcoverage path("${sample_id}-humann3-out-dir/${sample_id}_metaphlan_bugs_list.tsv"), emit: metaphlan_bugs_list + path("versions.txt"), emit: version script: """ zcat ${reads} > ${sample_id}-reads.tmp.fq @@ -40,11 +42,11 @@ process HUMANN { --metaphlan-options "--bowtie2db ${metaphlan_dir} --unclassified_estimation --add_viruses --sample_id ${sample_id}" \\ --nucleotide-database ${chocophlan_dir} \\ --protein-database ${uniref_dir} \\ - --bowtie-options "--sensitive --mm" - + --bowtie-options "--sensitive --mm" && \\ mv ${sample_id}-humann3-out-dir/${sample_id}_humann_temp/${sample_id}_metaphlan_bugs_list.tsv \\ ${sample_id}-humann3-out-dir/${sample_id}_metaphlan_bugs_list.tsv + humann3 --version > versions.txt """ } @@ -68,6 +70,7 @@ process COMBINE_READ_BASED_PROCESSING_TABLES { path("${params.additional_filename_prefix}gene-families-initial.tsv"), emit: gene_families path("${params.additional_filename_prefix}pathway-abundances-initial.tsv"), emit: path_abundances path("${params.additional_filename_prefix}pathway-coverages-initial.tsv"), emit: path_coverages + path("versions.txt"), emit: version script: """ if [ ${params.use_conda} == true ]; then @@ -85,6 +88,8 @@ process COMBINE_READ_BASED_PROCESSING_TABLES { humann_join_tables -i gene-family-results/ -o ${params.additional_filename_prefix}gene-families-initial.tsv > /dev/null 2>&1 humann_join_tables -i path-abundance-results/ -o ${params.additional_filename_prefix}pathway-abundances-initial.tsv > /dev/null 2>&1 humann_join_tables -i path-coverage-results/ -o ${params.additional_filename_prefix}pathway-coverages-initial.tsv > /dev/null 2>&1 + + humann3 --version > versions.txt """ } @@ -99,18 +104,20 @@ process SPLIT_READ_BASED_PROCESSING_TABLES { tag "Splitting humann stratified tables..." 
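The table-joining step above only works because every sample's output is gathered into a single emission before the combine process runs. A stripped-down sketch of that collect() idiom, with hypothetical process names and file contents that are not from the pipeline:

    #!/usr/bin/env nextflow
    nextflow.enable.dsl = 2

    // Hypothetical per-sample process producing one table per sample
    process PER_SAMPLE_TABLE {
        input:
        val(sample_id)
        output:
        path("${sample_id}.tsv")
        script:
        """
        printf "feature\\t${sample_id}\\n" > ${sample_id}.tsv
        """
    }

    // Hypothetical combiner that receives all per-sample tables at once
    process COMBINE_TABLES {
        input:
        path(tables)
        output:
        path("combined.tsv")
        script:
        """
        cat ${tables} > combined.tsv
        """
    }

    workflow {
        PER_SAMPLE_TABLE(Channel.of("Sample-1", "Sample-2"))
        // collect() turns the per-sample channel into a single list emission,
        // so COMBINE_TABLES runs exactly once over all samples
        COMBINE_TABLES(PER_SAMPLE_TABLE.out.collect())
    }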
label "read_based" + label "read_based_outputs" input: path(gene_families) path(path_abundances) path(path_coverages) output: - path("${params.additional_filename_prefix}Gene-families.tsv"), emit: gene_families - path("${params.additional_filename_prefix}Gene-families-grouped-by-taxa.tsv"), emit: gene_families_grouped - path("${params.additional_filename_prefix}Pathway-abundances.tsv"), emit: path_abundances - path("${params.additional_filename_prefix}Pathway-abundances-grouped-by-taxa.tsv"), emit: path_abundances_grouped - path("${params.additional_filename_prefix}Pathway-coverages.tsv"), emit: path_coverages - path("${params.additional_filename_prefix}Pathway-coverages-grouped-by-taxa.tsv"), emit: path_coverages_grouped + path("${params.additional_filename_prefix}Gene-families${params.assay_suffix}.tsv"), emit: gene_families + path("${params.additional_filename_prefix}Gene-families-grouped-by-taxa${params.assay_suffix}.tsv"), emit: gene_families_grouped + path("${params.additional_filename_prefix}Pathway-abundances${params.assay_suffix}.tsv"), emit: path_abundances + path("${params.additional_filename_prefix}Pathway-abundances-grouped-by-taxa${params.assay_suffix}.tsv"), emit: path_abundances_grouped + path("${params.additional_filename_prefix}Pathway-coverages${params.assay_suffix}.tsv"), emit: path_coverages + path("${params.additional_filename_prefix}Pathway-coverages-grouped-by-taxa${params.assay_suffix}.tsv"), emit: path_coverages_grouped + path("versions.txt"), emit: version script: """ [ -d temp_processing/ ] && rm -rf temp_processing/ @@ -118,18 +125,29 @@ process SPLIT_READ_BASED_PROCESSING_TABLES { # Gene Families humann_split_stratified_table -i ${gene_families} -o temp_processing/ > /dev/null 2>&1 - mv temp_processing/${params.additional_filename_prefix}gene-families-initial_stratified.tsv ${params.additional_filename_prefix}Gene-families-grouped-by-taxa.tsv - mv temp_processing/${params.additional_filename_prefix}gene-families-initial_unstratified.tsv ${params.additional_filename_prefix}Gene-families.tsv + mv temp_processing/${params.additional_filename_prefix}gene-families-initial_stratified.tsv \\ + ${params.additional_filename_prefix}Gene-families-grouped-by-taxa${params.assay_suffix}.tsv + + mv temp_processing/${params.additional_filename_prefix}gene-families-initial_unstratified.tsv \\ + ${params.additional_filename_prefix}Gene-families${params.assay_suffix}.tsv # Pathway Abundance humann_split_stratified_table -i ${path_abundances} -o temp_processing/ > /dev/null 2>&1 - mv temp_processing/${params.additional_filename_prefix}pathway-abundances-initial_stratified.tsv ${params.additional_filename_prefix}Pathway-abundances-grouped-by-taxa.tsv - mv temp_processing/${params.additional_filename_prefix}pathway-abundances-initial_unstratified.tsv ${params.additional_filename_prefix}Pathway-abundances.tsv + mv temp_processing/${params.additional_filename_prefix}pathway-abundances-initial_stratified.tsv \\ + ${params.additional_filename_prefix}Pathway-abundances-grouped-by-taxa${params.assay_suffix}.tsv + + mv temp_processing/${params.additional_filename_prefix}pathway-abundances-initial_unstratified.tsv \\ + ${params.additional_filename_prefix}Pathway-abundances${params.assay_suffix}.tsv # Pathway Coverage humann_split_stratified_table -i ${path_coverages} -o temp_processing/ > /dev/null 2>&1 - mv temp_processing/${params.additional_filename_prefix}pathway-coverages-initial_stratified.tsv ${params.additional_filename_prefix}Pathway-coverages-grouped-by-taxa.tsv - mv 
temp_processing/${params.additional_filename_prefix}pathway-coverages-initial_unstratified.tsv ${params.additional_filename_prefix}Pathway-coverages.tsv + mv temp_processing/${params.additional_filename_prefix}pathway-coverages-initial_stratified.tsv \\ + ${params.additional_filename_prefix}Pathway-coverages-grouped-by-taxa${params.assay_suffix}.tsv + + mv temp_processing/${params.additional_filename_prefix}pathway-coverages-initial_unstratified.tsv \\ + ${params.additional_filename_prefix}Pathway-coverages${params.assay_suffix}.tsv + + humann3 --version > versions.txt """ } @@ -143,6 +161,7 @@ process GEN_NORMALIZED_READ_BASED_PROCESSING_TABLES { tag "Generating normalized humann tables..." label "read_based" + label "read_based_outputs" input: path(gene_families) @@ -150,7 +169,7 @@ process GEN_NORMALIZED_READ_BASED_PROCESSING_TABLES { output: path("${params.additional_filename_prefix}Gene-families-cpm${params.assay_suffix}.tsv"), emit: gene_families path("${params.additional_filename_prefix}Pathway-abundances-cpm${params.assay_suffix}.tsv"), emit: path_abundances - + path("versions.txt"), emit: version script: """ humann_renorm_table \\ @@ -162,6 +181,8 @@ process GEN_NORMALIZED_READ_BASED_PROCESSING_TABLES { -i ${path_abundances} \\ -o ${params.additional_filename_prefix}Pathway-abundances-cpm${params.assay_suffix}.tsv \\ --update-snames > /dev/null 2>&1 + + humann3 --version > versions.txt """ } @@ -174,12 +195,13 @@ process GEN_READ_BASED_PROCESSING_KO_TABLE { tag "Retrieving Kegg Orthologs..." label "read_based" + label "read_based_outputs" input: path(gene_families) output: - path("${params.additional_filename_prefix}Gene-families-KO-cpm${params.assay_suffix}.tsv") - + path("${params.additional_filename_prefix}Gene-families-KO-cpm${params.assay_suffix}.tsv"), emit: gene_families + path("versions.txt"), emit: version script: """ humann_regroup_table \\ @@ -190,6 +212,8 @@ process GEN_READ_BASED_PROCESSING_KO_TABLE { humann_renorm_table \\ -o ${params.additional_filename_prefix}Gene-families-KO-cpm${params.assay_suffix}.tsv \\ --update-snames > /dev/null 2>&1 + + humann3 --version > versions.txt """ } @@ -200,11 +224,13 @@ process COMBINE_READ_BASED_PROCESSING_TAXONOMY { tag "Merging metaphlan taxonomy tables..." 
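Nearly every hunk in this file makes the same structural change: positional process outputs become named ones, so downstream code can read PROCESS.out.<name> instead of relying on output order. A minimal sketch of the pattern, using a hypothetical process and input file that are not part of the pipeline:

    #!/usr/bin/env nextflow
    nextflow.enable.dsl = 2

    process SUMMARIZE {
        input:
        path(infile)
        output:
        path("summary.tsv"), emit: summary      // addressable as SUMMARIZE.out.summary
        path("versions.txt"), emit: version     // addressable as SUMMARIZE.out.version
        script:
        """
        wc -l ${infile} > summary.tsv
        echo "coreutils wc (version string illustrative)" > versions.txt
        """
    }

    workflow {
        // 'example.tsv' is a placeholder input file
        SUMMARIZE(Channel.fromPath("example.tsv", checkIfExists: true))
        SUMMARIZE.out.summary | view()   // named emits replace positional indexing such as .out[0]
    }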
label "read_based" + label "read_based_outputs" input: path(metaphlan_bugs_list_files) output: - path("${params.additional_filename_prefix}Metaphlan-taxonomy${params.assay_suffix}.tsv") + path("${params.additional_filename_prefix}Metaphlan-taxonomy${params.assay_suffix}.tsv"), emit: taxonomy + path("versions.txt"), emit: version script: """ merge_metaphlan_tables.py ${metaphlan_bugs_list_files} \\ @@ -212,6 +238,8 @@ process COMBINE_READ_BASED_PROCESSING_TAXONOMY { # Removing redundant text from headers sed -i 's/_metaphlan_bugs_list//g' ${params.additional_filename_prefix}Metaphlan-taxonomy${params.assay_suffix}.tsv + + metaphlan --version > versions.txt """ } @@ -245,16 +273,26 @@ workflow read_based { GEN_NORMALIZED_READ_BASED_PROCESSING_TABLES(SPLIT_READ_BASED_PROCESSING_TABLES.out.gene_families, SPLIT_READ_BASED_PROCESSING_TABLES.out.path_abundances) - ko_table_ch = GEN_READ_BASED_PROCESSING_KO_TABLE(SPLIT_READ_BASED_PROCESSING_TABLES.out.gene_families) + GEN_READ_BASED_PROCESSING_KO_TABLE(SPLIT_READ_BASED_PROCESSING_TABLES.out.gene_families) + ko_table_ch = GEN_READ_BASED_PROCESSING_KO_TABLE.out.gene_families - taxonomy_ch = COMBINE_READ_BASED_PROCESSING_TAXONOMY(metaphlan_bugs_list_ch) + COMBINE_READ_BASED_PROCESSING_TAXONOMY(metaphlan_bugs_list_ch) + taxonomy_ch = COMBINE_READ_BASED_PROCESSING_TAXONOMY.out.taxonomy + + software_versions_ch = Channel.empty() + HUMANN.out.version | mix(software_versions_ch) | set{software_versions_ch} + COMBINE_READ_BASED_PROCESSING_TABLES.out.version | mix(software_versions_ch) | set{software_versions_ch} + SPLIT_READ_BASED_PROCESSING_TABLES.out.version | mix(software_versions_ch) | set{software_versions_ch} + GEN_NORMALIZED_READ_BASED_PROCESSING_TABLES.out.version | mix(software_versions_ch) | set{software_versions_ch} + GEN_READ_BASED_PROCESSING_KO_TABLE.out.version | mix(software_versions_ch) | set{software_versions_ch} + COMBINE_READ_BASED_PROCESSING_TAXONOMY.out.version | mix(software_versions_ch) | set{software_versions_ch} emit: gene_families = GEN_NORMALIZED_READ_BASED_PROCESSING_TABLES.out.gene_families path_abundances = GEN_NORMALIZED_READ_BASED_PROCESSING_TABLES.out.path_abundances ko_table = ko_table_ch taxonomy = taxonomy_ch - + versions = software_versions_ch } diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/read_mapping.nf b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/read_mapping.nf index 0fb61d8a..06427714 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/read_mapping.nf +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/read_mapping.nf @@ -14,7 +14,8 @@ process MAPPING { input: tuple val(sample_id), path(assembly), path(reads), val(isPaired) output: - tuple val(sample_id), path("${sample_id}.sam") + tuple val(sample_id), path("${sample_id}.sam"), path("${sample_id}-mapping-info.txt"), emit: sam + path("versions.txt"), emit: version script: """ if [ ${isPaired} == 'true' ]; then @@ -52,6 +53,7 @@ process MAPPING { fi fi + bowtie2 --version | head -n 1 | sed -E 's/.*(bowtie2-align-s version.+)/\\1/' > versions.txt """ } @@ -64,9 +66,10 @@ process SAM_TO_BAM { label "mapping" input: - tuple val(sample_id), path(sam) + tuple val(sample_id), path(sam), path(mapping_info) output: - tuple val(sample_id), path("${sample_id}.bam") + tuple val(sample_id), path("${sample_id}.bam"), emit: bam + path("versions.txt"), emit: version script: """ # Only running if the assembly produced 
anything @@ -80,5 +83,6 @@ process SAM_TO_BAM { printf "Sorting and converting not performed for ${sample_id} because read mapping didn't produce anything.\\n" fi + samtools --version | head -n1 > versions.txt """ } diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/summarize_MAG.nf b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/summarize_MAG.nf index ea87c6dc..176115ee 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/summarize_MAG.nf +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/summarize_MAG.nf @@ -4,7 +4,7 @@ nextflow.enable.dsl = 2 /**************************************************************************************** ********************* Summarize Meta assembled genomes (MAGs) ************************** ****************************************************************************************/ - +include { ZIP_FASTA as ZIP_MAGS } from "./zip_fasta.nf" params.min_est_comp = 90 params.max_est_redund = 10 @@ -19,7 +19,7 @@ already exists if wanting to use disk space params.gtdb_tk_scratch_location = "" -/* +/* Retrieve MAGS. Filters checkm results based on estimate completion, redundancy, and strain heterogeneity. Defaults are conservatively 90, 10, and 50 */ @@ -27,7 +27,6 @@ params.gtdb_tk_scratch_location = "" process FILTER_CHECKM_RESULTS_AND_COPY_MAGS { tag "Filtering checkm-s results..." - label "mags" label "bit" input: @@ -62,14 +61,11 @@ process FILTER_CHECKM_RESULTS_AND_COPY_MAGS { """ } - - // Assign taxonomy to MAGs with gtdb-tk process GTDBTK_ON_MAGS { - tag "Assigning taxonomy to your MAGs with gtdb-tk..." - label "mags" + tag "Assigning taxonomy to your MAGs with gtdb-tk..." input: path(MAGs_checkm_out) @@ -79,7 +75,8 @@ process GTDBTK_ON_MAGS { env(GTDBTK_DATA_PATH) output: - path("gtdbtk-out/") + path("gtdbtk-out/"), emit: gtdbtk_out + path("versions.txt"), emit: version script: """ # Only running if any MAGs were recovered @@ -117,6 +114,7 @@ process GTDBTK_ON_MAGS { printf "\\n\\nThere were no MAGs recovered, so GTDB-tk was not run.\\n\\n" fi + gtdbtk -h |grep "GTDB-Tk" | sed -E 's/.+\\s+(GTDB-Tk v.+)\\s+.+/\\1/' > versions.txt """ } @@ -126,19 +124,19 @@ process GTDBTK_ON_MAGS { process SUMMARIZE_MAG_ASSEMBLIES { tag "Summarizing MAG assemblies..." 
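The zip_fasta.nf module introduced by this patch is imported under two different aliases, ZIP_MAGS in this file and ZIP_BINS in summarize_bins.nf, so one generic process serves both output types. A small sketch of that include ... as ... aliasing, assuming the module path and the (val, path) inputs defined in the patch; the directory names passed in below are placeholders:

    #!/usr/bin/env nextflow
    nextflow.enable.dsl = 2

    // The same process definition can be included twice under different names,
    // so each call site appears separately in logs, reports, and the DAG
    include { ZIP_FASTA as ZIP_MAGS } from "./modules/zip_fasta.nf"
    include { ZIP_FASTA as ZIP_BINS } from "./modules/zip_fasta.nf"

    workflow {
        ZIP_MAGS(Channel.of("MAG"), Channel.fromPath("MAGs_dir", type: 'dir'))
        ZIP_BINS(Channel.of("bin"), Channel.fromPath("bins_dir", type: 'dir'))
    }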
- label "mags" label "bit" input: path(MAGs_dir) output: - path("${params.additional_filename_prefix}MAG-assembly-summaries.tsv") + path("${params.additional_filename_prefix}MAG-assembly-summaries.tsv"), emit: summary + path("versions.txt"), emit: version script: """ # Only running if any MAGs were recovered if [ `find -L ${MAGs_dir} -name '*.fasta' | wc -l | sed 's/^ *//'` -gt 0 ]; then - - # Remove fasta index if it exists + + # Remove fasta index if already exists rm -rf ${MAGs_dir}/*.fxi bit-summarize-assembly ${MAGs_dir}/*.fasta -o MAG-summaries.tmp -t @@ -152,6 +150,7 @@ process SUMMARIZE_MAG_ASSEMBLIES { > ${params.additional_filename_prefix}MAG-assembly-summaries.tsv fi + bit-version |grep "Bioinformatics Tools"|sed -E 's/^\\s+//' > versions.txt """ } @@ -220,8 +219,8 @@ process SUMMARIZE_MAG_LEVEL_KO_ANNOTATIONS { path(gene_coverage_annotation_and_tax_files) path(MAGs_dir) output: - path("${params.additional_filename_prefix}MAG-level-KO-annotations${params.assay_suffix}.tsv") - + path("${params.additional_filename_prefix}MAG-level-KO-annotations${params.assay_suffix}.tsv"), emit: summary + path("versions.txt"), emit: version script: """ # Only running if any MAGs were recovered @@ -247,6 +246,7 @@ process SUMMARIZE_MAG_LEVEL_KO_ANNOTATIONS { > ${params.additional_filename_prefix}MAG-level-KO-annotations${params.assay_suffix}.tsv fi + python --version > versions.txt """ } @@ -261,8 +261,8 @@ process SUMMARIZE_MAG_KO_ANNOTS_WITH_KEGG_DECODER { path(MAG_level_KO_annotations) path(MAGs_dir) output: - path("${params.additional_filename_prefix}MAG-KEGG-Decoder-out${params.assay_suffix}.tsv") - + path("${params.additional_filename_prefix}MAG-KEGG-Decoder-out${params.assay_suffix}.*"), emit: summary + path("versions.txt"), emit: version script: """ # Getting number of MAGs recovered @@ -288,6 +288,11 @@ process SUMMARIZE_MAG_KO_ANNOTS_WITH_KEGG_DECODER { # can only create html output if there are more than 1 if [ \$num_mags_recovered -gt 1 ]; then KEGG-decoder -v interactive -i mod-MAG-level-KO-annotations.tmp -o MAG-KEGG-Decoder-out.tmp + + ## adding additional prefix to html output if there is one + [ -f MAG-KEGG-Decoder-out.html ] && \\ + mv MAG-KEGG-Decoder-out.html ${params.additional_filename_prefix}MAG-KEGG-Decoder-out${params.assay_suffix}.html + else KEGG-decoder -i mod-MAG-level-KO-annotations.tmp -o MAG-KEGG-Decoder-out.tmp fi @@ -304,6 +309,7 @@ process SUMMARIZE_MAG_KO_ANNOTS_WITH_KEGG_DECODER { > ${params.additional_filename_prefix}MAG-KEGG-Decoder-out${params.assay_suffix}.tsv fi + python --version > versions.txt """ } @@ -322,24 +328,37 @@ workflow summarize_mags { FILTER_CHECKM_RESULTS_AND_COPY_MAGS(bins_checkm_results_ch, bins_ch) MAGs_checkm_out_ch = FILTER_CHECKM_RESULTS_AND_COPY_MAGS.out.MAGs_checkm_out MAGs_dir_ch = FILTER_CHECKM_RESULTS_AND_COPY_MAGS.out.MAGs_dir + ZIP_MAGS(Channel.of("MAG"), MAGs_dir_ch) - gtdbtk_out_ch = GTDBTK_ON_MAGS(MAGs_checkm_out_ch, MAGs_dir_ch, gtdbtk_db_dir, use_gtdbtk_scratch_location, gtdbtk_db_dir) + GTDBTK_ON_MAGS(MAGs_checkm_out_ch, MAGs_dir_ch, gtdbtk_db_dir, use_gtdbtk_scratch_location, gtdbtk_db_dir) + gtdbtk_out_ch = GTDBTK_ON_MAGS.out.gtdbtk_out - MAG_assembly_summaries_ch = SUMMARIZE_MAG_ASSEMBLIES(MAGs_dir_ch) + SUMMARIZE_MAG_ASSEMBLIES(MAGs_dir_ch) + MAG_assembly_summaries_ch = SUMMARIZE_MAG_ASSEMBLIES.out.summary MAGs_overview_ch = GENERATE_MAGS_OVERVIEW_TABLE(MAG_assembly_summaries_ch, MAGs_checkm_out_ch, gtdbtk_out_ch, MAGs_dir_ch) - MAG_level_KO_annotations_ch = SUMMARIZE_MAG_LEVEL_KO_ANNOTATIONS(MAGs_overview_ch, + 
SUMMARIZE_MAG_LEVEL_KO_ANNOTATIONS(MAGs_overview_ch, gene_coverage_annotation_and_tax_files_ch, MAGs_dir_ch) + MAG_level_KO_annotations_ch = SUMMARIZE_MAG_LEVEL_KO_ANNOTATIONS.out.summary SUMMARIZE_MAG_KO_ANNOTS_WITH_KEGG_DECODER(MAG_level_KO_annotations_ch, MAGs_dir_ch) + // Capture software versions + software_versions_ch = Channel.empty() + ZIP_MAGS.out.version | mix(software_versions_ch) | set{software_versions_ch} + GTDBTK_ON_MAGS.out.version | mix(software_versions_ch) | set{software_versions_ch} + SUMMARIZE_MAG_ASSEMBLIES.out.version | mix(software_versions_ch) | set{software_versions_ch} + SUMMARIZE_MAG_LEVEL_KO_ANNOTATIONS.out.version | mix(software_versions_ch) | set{software_versions_ch} + SUMMARIZE_MAG_KO_ANNOTS_WITH_KEGG_DECODER.out.version | mix(software_versions_ch) | set{software_versions_ch} + emit: MAGs_overview = MAGs_overview_ch MAGs_dir = MAGs_dir_ch + versions = software_versions_ch } diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/summarize_bins.nf b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/summarize_bins.nf index 24a790ed..5bbbe48f 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/summarize_bins.nf +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/summarize_bins.nf @@ -1,12 +1,14 @@ #!/usr/bin/env nextflow nextflow.enable.dsl = 2 -params.reduced_tree = "True" - /**************************************************************************************** ********************* Bin check and summary ******************************************** ****************************************************************************************/ +include { ZIP_FASTA as ZIP_BINS } from "./zip_fasta.nf" + +params.reduced_tree = "True" + // Summarize bin assemblies process SUMMARIZE_BIN_ASSEMBLIES { @@ -17,8 +19,8 @@ process SUMMARIZE_BIN_ASSEMBLIES { input: path(bins) output: - path("${params.additional_filename_prefix}bin-assembly-summaries.tsv") - + path("${params.additional_filename_prefix}bin-assembly-summaries.tsv"), emit: summary + path("versions.txt"), emit: version script: """ # Only running if any bins were recovered @@ -36,6 +38,7 @@ process SUMMARIZE_BIN_ASSEMBLIES { > ${params.additional_filename_prefix}bin-assembly-summaries.tsv fi + bit-version |grep "Bioinformatics Tools"|sed -E 's/^\\s+//' > versions.txt """ } @@ -49,11 +52,11 @@ process CHECKM_ON_BINS { input: path(bins) output: - path("${params.additional_filename_prefix}bins-checkm-out.tsv") - + path("${params.additional_filename_prefix}bins-checkm-out.tsv"), emit: checkm_output + path("versions.txt"), emit: version script: """ - # only running if there were bins recovered + # Only running if there were bins recovered if [ `find -L . -name '*fasta' | wc -l | sed 's/^ *//'` -gt 0 ]; then mkdir -p checkm-working-tmp/ @@ -87,6 +90,7 @@ process CHECKM_ON_BINS { > ${params.additional_filename_prefix}bins-checkm-out.tsv fi + checkm | grep CheckM | head -n 1 | sed -E 's/.+(CheckM\\sv.+)\\s.+/\\1/' > versions.txt """ } @@ -144,12 +148,23 @@ workflow summarize_bins { binning_ch main: - bins = binning_ch.map{ sample_id, depth, bins -> bins instanceof List ? bins.each{it}: bins }.flatten().collect() - bin_assembly_summaries_ch = SUMMARIZE_BIN_ASSEMBLIES(bins) - bins_checkm_results_ch = CHECKM_ON_BINS(bins) + bins = binning_ch.map{ sample_id, bins -> bins instanceof List ? 
bins.each{it}: bins }.flatten().collect() + ZIP_BINS(Channel.of("bin"), bins) + SUMMARIZE_BIN_ASSEMBLIES(bins) + bin_assembly_summaries_ch = SUMMARIZE_BIN_ASSEMBLIES.out.summary + + CHECKM_ON_BINS(bins) + bins_checkm_results_ch = CHECKM_ON_BINS.out.checkm_output + table = GENERATE_BINS_OVERVIEW_TABLE(bin_assembly_summaries_ch, bins_checkm_results_ch, bins) - + + software_versions_ch = Channel.empty() + ZIP_BINS.out.version | mix(software_versions_ch) | set{software_versions_ch} + SUMMARIZE_BIN_ASSEMBLIES.out.version | mix(software_versions_ch) | set{software_versions_ch} + CHECKM_ON_BINS.out.version | mix(software_versions_ch) | set{software_versions_ch} + emit: bins_checkm_results = bins_checkm_results_ch overview_table = table + versions = software_versions_ch } diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/zip_fasta.nf b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/zip_fasta.nf new file mode 100644 index 00000000..f1fc6a40 --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/zip_fasta.nf @@ -0,0 +1,63 @@ +#!/usr/bin/env nextflow +nextflow.enable.dsl=2 + +process ZIP_FASTA { + + tag "Zipping up your ${TYPE}s..." + label "genelab" + + + input: + val(TYPE) + path(DIR) + + output: + path("*.zip"), emit: zip_files, optional: true + path("versions.txt"), emit: version + + script: + """ + function zip_sample() { + + local SAMPLE=\$1 + local TYPE=\$2 + + mkdir -p \${SAMPLE}-\${TYPE}s && \\ + cp -f \${SAMPLE}-\${TYPE}*.fasta \${SAMPLE}-\${TYPE}s && \\ + zip -r \${SAMPLE}-\${TYPE}s.zip \${SAMPLE}-\${TYPE}s + + } + + + export -f zip_sample + + if [ ${TYPE} == 'bin' ]; then + + WORKDIR=`pwd` + else + + WORKDIR=${DIR} + fi + + if [ `find -L \${WORKDIR} -name '*.fasta' | wc -l | sed 's/^ *//'` -gt 0 ]; then + + + if [ ${TYPE} == 'MAG' ]; then + + find -L \${WORKDIR} -name '*.fasta' | xargs -I {} cp {} . + + fi + + SAMPLES=(`ls -1 *.fasta | sed -E 's/(.+)-${TYPE}.*.fasta/\\1/g'`) + + for SAMPLE in \${SAMPLES[*]};do + + zip_sample \${SAMPLE} ${TYPE} + + done + + fi + GL-version | grep "GeneLab utils"| sed -E 's/^\\s+//' > versions.txt + """ + +} diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/nextflow.config b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/nextflow.config index b321090c..69e98c3f 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/nextflow.config +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/nextflow.config @@ -4,15 +4,19 @@ params { // input file // a 3-column (single-end) or 4-column (paired-end) file csv_file = "${baseDir}/PE_file.csv" + /* Run assembly-based workflow, read-based, or both (values need to be one of: "assembly-based", "read-based", or "both") It runs both by default */ workflow = "both" + assay_suffix = "_GLmetagenomics" + // additional prefix to add to output files that describe more than one sample (to make them unique compared to other datasets) // leave as empty, i.e. "", if not wanted, include separator at end if adding one, e.g. 
"Swift1S_" additional_filename_prefix = "" + publishDir_mode = "link" // "copy", "link", "symlink" // Quality trimmed/filtered suffixes @@ -25,30 +29,35 @@ params { // Directories // Raw reads directory (can be relative to workflow directory, or needs to be full path) - raw_reads_dir = "${baseDir}/Raw_Sequence_Data/" + raw_reads_dir = "../Raw_Sequence_Data/" // output directories (all relative to processing directory, will be created) - fastqc_out_dir = "${baseDir}/FastQC_Outputs/" - filtered_reads_dir = "${baseDir}/Filtered_Sequence_Data/" - assembly_based_dir = "${baseDir}/Assembly-based_Processing/" - assemblies_dir = "${baseDir}/Assembly-based_Processing/assemblies/" - genes_dir = "${baseDir}/Assembly-based_Processing/predicted-genes/" - annotations_and_tax_dir = "${baseDir}/Assembly-based_Processing/annotations-and-taxonomy/" - mapping_dir = "${baseDir}/Assembly-based_Processing/read-mapping/" - combined_output_dir = "${baseDir}/Assembly-based_Processing/combined-outputs/" - bins_dir = "${baseDir}/Assembly-based_Processing/bins/" - MAGs_dir = "${baseDir}/Assembly-based_Processing/MAGs/" - read_based_dir = "${baseDir}/Read-based_Processing/" + fastqc_out_dir = "../FastQC_Outputs/" + filtered_reads_dir = "../Filtered_Sequence_Data/" + assembly_based_dir = "../Assembly-based_Processing/" + assemblies_dir = "../Assembly-based_Processing/assemblies/" + genes_dir = "../Assembly-based_Processing/predicted-genes/" + annotations_and_tax_dir = "../Assembly-based_Processing/annotations-and-taxonomy/" + mapping_dir = "../Assembly-based_Processing/read-mapping/" + combined_output_dir = "../Assembly-based_Processing/combined-outputs/" + bins_dir = "../Assembly-based_Processing/bins/" + MAGs_dir = "../Assembly-based_Processing/MAGs/" + read_based_dir = "../Read-based_Processing/" + genelab_dir = "../GeneLab/" + logs_dir = "../Logs/" + metadata_dir = "../Metadata/" // Database creation database { CAT_DB_LINK = "https://tbb.bio.uu.nl/bastiaan/CAT_prepare/CAT_prepare_20210107.tar.gz" - cat_db = null // "/path/to/Reference_DBs/CAT_prepare_20210107/" - ko_db_dir = null // "/path/to/Reference_DBs/kofamscan_db/" + // Old link - https://data.gtdb.ecogenomic.org/releases/release202/202.0/auxillary_files/gtdbtk_r202_data.tar.gz + GTDBTK_LINK = "https://data.gtdb.ecogenomic.org/releases/release220/220.0/auxillary_files/gtdbtk_package/full_package/gtdbtk_r220_data.tar.gz" + cat_db = null // "/path/to/Reference_DBs/CAT_prepare_20210107/" + ko_db_dir = null // "/path/to/Reference_DBs/kofamscan_db/" metaphlan_db_dir = null // "/path/to/Reference_DBs/metaphlan4-db/" chocophlan_dir = null // "/path/to/Reference_DBs/humann3-db/chocophlan/" uniref_dir = null // "/path/to/Reference_DBs/humann3-db/uniref/" - utilities_dir = null // "/path/to/Reference_DBs/humann3-db/utility_mapping/" - gtdbtk_db_dir = null // "/path/Reference_DBs/GTDB-tk-ref-db/" + utilities_dir = null // "/path/to/Reference_DBs/humann3-db/utility_mapping/" + gtdbtk_db_dir = null // "/path/to/Reference_DBs/GTDB-tk-ref-db/" } // Quality assessment parameters @@ -90,9 +99,10 @@ params { conda{ // Specify the paths to your existing conda environments + genelab = null // "/path/to/envs/genelab-utils" qc = null // "/path/to/envs/qc" - humann3 = null //"/path/to/envs/humann3" - cat = null // "/path/to/envs/CAT" + humann3 = null // "/path/to/envs/humann3" + cat = null // "/path/to/envs/genelab-utils/envs/CAT" prodigal = null // "/path/to/envs/prodigal" metabat = null // "/path/to/envs/metabat" gtdbtk = null // "/path/to/envs/gtdbtk" @@ -104,8 +114,11 @@ params 
{ checkm = null // "/path/to/envs/checkm" } - GLDS_accession = false + GLDS_accession = false // OSD acession number for the data to be processed + // Pattern of files on OSDR for the GLDS_accession you want to process. + RawFilePattern = null // "_metaG", "_HRremoved" errorStrategy = "terminate" + debug = false // should info about the parameters set by the user be shown when the pipeline starts. } // Setting the default container engine as singularity @@ -114,26 +127,17 @@ params.containerEngine = "singularity" // i.e., slurm_conda and conda params.use_conda = false - profiles { - slurm_conda { - process.executor = 'slurm' - conda.enabled = true - params.use_conda = true - } + slurm { + process.executor = 'slurm' + } + conda { conda.enabled = true params.use_conda = true } - slurm_sing { - process.executor = 'slurm' - singularity.enabled = true - singularity.autoMounts = true - singularity.cacheDir = "singularity/" // local singularity images location - params.containerEngine = "singularity" - } singularity { singularity.enabled = true singularity.autoMounts = true @@ -150,9 +154,9 @@ profiles { } +// Maximum number of jobs to submit in parallel +executor.queueSize = 20 params.DB_ROOT = "${baseDir}/Reference_DBs" -// Number of jobs to run in parallel -executor.queueSize = 10 chocophlanDirExists = {params.database.chocophlan_dir != null} unirefDirExists = {params.database.uniref_dir != null} @@ -182,24 +186,37 @@ if(!chocophlanDirExists ||!unirefDirExists || !metaphlanDirExists || !utilitiesD process { errorStrategy = { params.errorStrategy ? params.errorStrategy : "ignore"} - //queue = "normal,priority" + queue = "normal,priority" maxRetries = 2 memory = '5 GB' cache = 'lenient' cpus = 8 - //debug = true + + + withLabel: genelab { + + conda = {params.conda.genelab != null ? params.conda.genelab : "envs/genelab.yaml"} + container = "olabiyi/genelab-utils:1.3.22" + } + + withName: GET_RUNSHEET { + cpus = 10 + conda = {params.conda.genelab != null ? params.conda.genelab : "envs/genelab.yaml"} + container = "olabiyi/genelab-utils:1.3.22" + publishDir = [path: params.genelab_dir , mode: params.publishDir_mode] + } withLabel: bit { cpus = 2 - conda = {params.conda.bit ? params.conda.bit : "envs/bit.yaml"} + conda = {params.conda.bit != null ? params.conda.bit : "envs/bit.yaml"} container = "olabiyi/bit-astrobiomike:1.0" memory = "5 GB" } // Database set-up withLabel: humann_setup { - conda = {params.conda.humann3 ? params.conda.humann3 : "envs/humann3.yaml"} + conda = {params.conda.humann3 != null ? params.conda.humann3 : "envs/humann3.yaml"} container = "biobakery/humann:3.9" } @@ -213,56 +230,59 @@ process { } withName: SETUP_CAT_DB { - conda = {params.conda.cat ? params.conda.cat : "envs/cat.yaml"} + conda = {params.conda.cat != null ? params.conda.cat : "envs/cat.yaml"} container = "olabiyi/bit-astrobiomike:1.0" } withName: SETUP_KOFAMSCAN_DB { - conda = {params.conda.kofamscan ? params.conda.kofamscan : "envs/kofamscan.yaml"} + conda = {params.conda.kofamscan != null ? params.conda.kofamscan : "envs/kofamscan.yaml"} container = "olabiyi/bit-astrobiomike:1.0" } withName: SETUP_GTDBTK_DB { - conda = {params.conda.gtdbtk ? params.conda.gtdbtk : "envs/gtdb-tk.yaml.yaml"} + conda = {params.conda.gtdbtk != null ? params.conda.gtdbtk : "envs/gtdb-tk.yaml.yaml"} container = "quay.io/biocontainers/gtdbtk:1.5.0--pyhdfd78af_0" } // Quality control and assesment withName: FASTQC { - conda = {params.conda.qc ? params.conda.qc : "envs/qc.yaml"} + conda = {params.conda.qc != null ? 
params.conda.qc : "envs/qc.yaml"} container = "staphb/fastqc:0.12.1" cpus = 2 publishDir = [path: params.raw_reads_dir, mode: params.publishDir_mode] } withName: MULTIQC { - conda = {params.conda.qc ? params.conda.qc: "envs/qc.yaml"} + conda = {params.conda.qc != null ? params.conda.qc: "envs/qc.yaml"} container = "staphb/multiqc:1.19" cpus = 2 publishDir = [path: params.fastqc_out_dir, mode: params.publishDir_mode] } withName: BBDUK { - conda = {params.conda.qc ? params.conda.qc: "envs/qc.yaml"} + conda = {params.conda.qc != null ? params.conda.qc: "envs/qc.yaml"} container = "staphb/bbtools:38.86" cpus = 5 - errorStrategy = "retry" - maxRetries = 2 - memory = {50.GB * task.attempt} - publishDir = [path: params.filtered_reads_dir, mode: params.publishDir_mode] + memory = "40 GB" + publishDir = publishDir = [[path: params.filtered_reads_dir, pattern: "*${params.filtered_suffix}" , mode: params.publishDir_mode], + [path: params.logs_dir, pattern: "*-bbduk.log" , mode: params.publishDir_mode]] } // Read-based processing withLabel: read_based { - conda = {params.conda.humann3 ? params.conda.humann3 : "envs/humann3.yaml"} + conda = {params.conda.humann3 != null ? params.conda.humann3 : "envs/humann3.yaml"} // this -> "biobakery/humann:3.9" is the latest version container = "biobakery/humann:3.9" - publishDir = [path: params.read_based_dir, mode: params.publishDir_mode] } + withLabel: read_based_outputs { + publishDir = [path: params.read_based_dir, mode: params.publishDir_mode] + } + + withName: HUMANN { cpus = 8 memory = "100 GB" @@ -280,22 +300,24 @@ process { } withName: ASSEMBLE { - conda = {params.conda.megahit ? params.conda.megahit : "envs/megahit.yaml"} + conda = {params.conda.megahit != null ? params.conda.megahit : "envs/megahit.yaml"} container = "biocontainers/megahit:1.2.9_cv1" cpus = 8 memory = "20 GB" + publishDir = [path: params.logs_dir, pattern: "*-assembly.log", mode: params.publishDir_mode] } withLabel: mapping { - conda = {params.conda.mapping ? params.conda.mapping : "envs/mapping.yaml"} + conda = {params.conda.mapping != null ? params.conda.mapping : "envs/mapping.yaml"} cpus = 8 - errorStrategy = "retry" - maxRetries = 2 - memory = {20.GB * task.attempt} + //errorStrategy = 'retry' + //maxRetries = 2 + memory = "20 GB" // {20.GB * task.attempt} } withName: MAPPING { container = "biocontainers/bowtie2:v2.4.1_cv1" + publishDir = [path: params.mapping_dir, pattern: "*-mapping-info.txt", mode: params.publishDir_mode] } withName: SAM_TO_BAM { @@ -304,9 +326,10 @@ process { } withName: CALL_GENES { - conda = {params.conda.prodigal ? params.conda.prodigal : "envs/prodigal.yaml"} + conda = {params.conda.prodigal != null ? params.conda.prodigal : "envs/prodigal.yaml"} container = "quay.io/biocontainers/prodigal:2.6.3--h031d066_8" cpus = 8 + publishDir = [path: params.genes_dir, pattern: "*-genes.gff", mode: params.publishDir_mode] } withLabel: call_genes { @@ -318,18 +341,16 @@ process { } withName: KO_ANNOTATION { - conda = {params.conda.kofamscan ? params.conda.kofamscan : "envs/kofamscan.yaml"} + conda = {params.conda.kofamscan != null ? 
params.conda.kofamscan : "envs/kofamscan.yaml"} container = "quay.io/biocontainers/kofamscan:1.3.0--hdfd78af_2" cpus = 8 - errorStrategy = "retry" - maxRetries = 2 - memory = {20.GB * task.attempt} - disk = {30.GB * task.attempt} - publishDir = [path: params.annotations_and_tax_dir, mode: params.publishDir_mode] + memory = "10 GB" + disk = "20 GB" + //publishDir = [path: params.annotations_and_tax_dir, mode: params.publishDir_mode] } withName: TAX_CLASSIFICATION { - conda = {params.conda.cat ? params.conda.cat : "envs/cat.yaml"} + conda = {params.conda.cat != null ? params.conda.cat : "envs/cat.yaml"} container = "nanozoo/catbat:5.2.3--e9c0a44" cpus = 8 memory = "50 GB" @@ -337,7 +358,7 @@ process { } withName: GET_COV_AND_DET { - conda = {params.conda.mapping ? params.conda.mapping : "envs/mapping.yaml"} + conda = {params.conda.mapping != null ? params.conda.mapping : "envs/mapping.yaml"} container = "staphb/bbtools:38.86" cpus = 8 memory = "20 GB" @@ -350,19 +371,22 @@ process { withName: METABAT_BINNING { - conda = {params.conda.metabat ? params.conda.metabat : "envs/metabat.yaml"} + conda = {params.conda.metabat != null ? params.conda.metabat : "envs/metabat.yaml"} container = "nanozoo/metabat2:2.15--c1941c7" cpus = 8 - publishDir = [[path: params.mapping_dir, mode: params.publishDir_mode, pattern: "*-metabat-assembly-depth.tsv"], - [path: params.bins_dir, mode: params.publishDir_mode, pattern: "*-bin*"]] + publishDir = [path: params.mapping_dir, mode: params.publishDir_mode, pattern: "*-metabat-assembly-depth.tsv"] } withLabel: bins { publishDir = [path: params.bins_dir, mode: params.publishDir_mode] } - withName: CHECKM_ON_BINS { - conda = {params.conda.checkm ? params.conda.checkm : "envs/checkm.yaml"} + withName: ZIP_BINS { + publishDir = [path: params.bins_dir, mode: params.publishDir_mode] + } + + withName: CHECKM_ON_BINS { + conda = {params.conda.checkm != null ? params.conda.checkm : "envs/checkm.yaml"} container = "nanozoo/checkm:1.1.3--c79a047" cpus = 8 memory = "50 GB" @@ -373,8 +397,13 @@ process { publishDir = [path: params.MAGs_dir, mode: params.publishDir_mode] } + withName: ZIP_MAGS { + publishDir = [path: params.MAGs_dir, mode: params.publishDir_mode] + } + + withName: GTDBTK_ON_MAGS { - conda = {params.conda.gtdbtk ? params.conda.gtdbtk : "envs/gtdb-tk.yaml.yaml"} + conda = {params.conda.gtdbtk != null ? params.conda.gtdbtk : "envs/gtdb-tk.yaml.yaml"} container = "quay.io/biocontainers/gtdbtk:1.5.0--pyhdfd78af_0" containerOptions = { params.containerEngine == "singularity" ? "-B \${PWD}:/data -B ${gtdbtk_db_dir}:/refdata" : "-v \${PWD}:/data -v ${gtdbtk_db_dir}:/refdata" } cpus = 8 @@ -383,7 +412,7 @@ process { } withName: SUMMARIZE_MAG_KO_ANNOTS_WITH_KEGG_DECODER { - conda = {params.conda.kegg_decoder ? params.conda.kegg_decoder : "envs/keggdecoder.yaml"} + conda = {params.conda.kegg_decoder != null ? 
params.conda.kegg_decoder : "envs/keggdecoder.yaml"} container = "fmalmeida/keggdecoder:latest" cpus = 8 } @@ -416,6 +445,6 @@ manifest { description = 'GeneLab bioinformatics processing pipelines for metagenomics sequencing data' mainScript = 'main.nf' defaultBranch = 'main' - nextflowVersion = '>=22.10.1' - version = '1.0.0' + nextflowVersion = '>=22.10.6' + version = 'GL-DPPD-7107' } diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/slurm_submit.slurm b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/slurm_submit.slurm index beb00294..62c22964 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/slurm_submit.slurm +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/slurm_submit.slurm @@ -4,9 +4,9 @@ #SBATCH --output=nf_master.o.%j ## Replace job_name with the name of the job you are running ## #SBATCH --error=nf_master.e.%j ## Replace job_name with the name of the job you are running ## #SBATCH --partition=normal ## Specifies the job queue to use, for urgent jobs change normal to priority ## -#SBATCH --mem=2G ## Memory required to run the job in MB, this example is showing 10,000 MB or 10GB, change this number based on how much RAM you need ## +#SBATCH --mem=10G ## Memory required to run the job in MB, this example is showing 10,000 MB or 10GB, change this number based on how much RAM you need ## #SBATCH --cpus-per-task=1 ## Number of CPUs to run the job, this example is showing 5 CPUs, change this number based on how many CPUs you need ## -#SBATCH --mail-user=email@domain.com ## Specifies the e-mail address to e-mail when the job is complete, replace this e-mail address with your NASA e-mail address ## +#SBATCH --mail-user=name@domain.com ## Specifies the e-mail address to e-mail when the job is complete, replace this e-mail address with your NASA e-mail address ## #SBATCH --mail-type=END ## Tells slurm to e-mail the address above when the job has completed ## . 
~/.profile @@ -28,7 +28,7 @@ echo $HOSTNAME ## You can see a list of all available environments by running the command: conda env list ## ## If you need a conda envrionment installed request it using JIRA ## -source activate /path/to/envs/nextflow ## Replace conda_env_name with the name of the environment ## +source activate /path/to/envs/genelab-utils ## Replace conda_env_name with the name of the environment ## ## Print the version of the tool you are using to ensure the tool version is recorded ## @@ -40,7 +40,8 @@ echo "" ## The command(s) that you want to run in this slurm job ## export NXF_SINGULARITY_CACHEDIR=singularity/ -nextflow run main.nf -profile slurm_sing -resume --csv_file PE_file.csv ## Replace command with the command(s) you want to run ## +#nextflow run main.nf -profile slurm,singularity -resume --csv_file PE_file.csv ## Replace command with the command(s) you want to run ## +nextflow run main.nf -profile slurm,singularity --GLDS_accession OSD-574 -resume ## Add a time-stamp at the end of the job then calculate how long the job took to run in seconds, minutes, and hours ## From e409a28ca5883babd32c0cfb36286aeff8572e9f Mon Sep 17 00:00:00 2001 From: olabiyi Date: Mon, 24 Jun 2024 16:17:06 -0700 Subject: [PATCH 10/48] fix GTDBTK download --- .../SW_MGIllumina/workflow_code/bin/download-GTDBTK-db.sh | 2 +- .../workflow_code/modules/database_creation.nf | 6 +++--- .../SW_MGIllumina/workflow_code/nextflow.config | 8 ++++---- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/download-GTDBTK-db.sh b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/download-GTDBTK-db.sh index 943f1fb4..60186c16 100755 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/download-GTDBTK-db.sh +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/download-GTDBTK-db.sh @@ -8,7 +8,7 @@ echo "Downloading the GTDB-Tk database to ${GTDBTK_DATA_PATH}..." DB_URL=$1 -TAR_FILE=$(basename ${db_url}) +TAR_FILE=$(basename ${DB_URL}) downloadFile=true diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/database_creation.nf b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/database_creation.nf index 6a2b1693..3b3f67cd 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/database_creation.nf +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/database_creation.nf @@ -93,8 +93,8 @@ process SETUP_GTDBTK_DB { path("GTDB-tk-ref-db/"), emit: gtdbtk_db_dir path("GTDB-tk-ref-db/SETUP_GTDBTK_DB_SETUP"), emit: completion_indicator path("versions.txt"), emit: version - shell: - ''' + script: + """ [ -d GTDB-tk-ref-db/ ] || mkdir -p GTDB-tk-ref-db/ # But still needs to be set for this particular session that is downloading and setting up the db @@ -103,7 +103,7 @@ process SETUP_GTDBTK_DB { download-GTDBTK-db.sh ${GTDBTK_URL} && touch GTDB-tk-ref-db/SETUP_GTDBTK_DB_SETUP gtdbtk -h |grep "GTDB-Tk" | sed -E 's/.+\\s+(GTDB-Tk v.+)\\s+.+/\\1/' > versions.txt printf "### Set up completed successfully ###\\n\\n" - ''' + """ } // The processes below download the databases required by humann3. 
diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/nextflow.config b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/nextflow.config index 69e98c3f..d623157f 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/nextflow.config +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/nextflow.config @@ -156,7 +156,7 @@ profiles { // Maximum number of jobs to submit in parallel executor.queueSize = 20 -params.DB_ROOT = "${baseDir}/Reference_DBs" +params.DB_ROOT = "../Reference_DBs" chocophlanDirExists = {params.database.chocophlan_dir != null} unirefDirExists = {params.database.uniref_dir != null} @@ -428,15 +428,15 @@ process { def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') timeline { enabled = true - file = "${baseDir}/Resource_Usage/execution_timeline_${trace_timestamp}.html" + file = "../Resource_Usage/execution_timeline_${trace_timestamp}.html" } report { enabled = true - file = "${baseDir}/Resource_Usage/execution_report_${trace_timestamp}.html" + file = "../Resource_Usage/execution_report_${trace_timestamp}.html" } trace { enabled = true - file = "${baseDir}/Resource_Usage/execution_trace_${trace_timestamp}.txt" + file = "../Resource_Usage/execution_trace_${trace_timestamp}.txt" } manifest { From 72b0601744313c04f91bd6d643eee42c046a3d25 Mon Sep 17 00:00:00 2001 From: olabiyi Date: Tue, 25 Jun 2024 10:18:04 -0700 Subject: [PATCH 11/48] Fixed humann utilities mounting issue --- .../SW_MGIllumina/workflow_code/nextflow.config | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/nextflow.config b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/nextflow.config index d623157f..05085d2b 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/nextflow.config +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/nextflow.config @@ -3,7 +3,7 @@ params { // input file // a 3-column (single-end) or 4-column (paired-end) file - csv_file = "${baseDir}/PE_file.csv" + csv_file = "PE_file.csv" /* Run assembly-based workflow, read-based, or both (values need to be one of: "assembly-based", "read-based", or "both") @@ -158,14 +158,12 @@ profiles { executor.queueSize = 20 params.DB_ROOT = "../Reference_DBs" -chocophlanDirExists = {params.database.chocophlan_dir != null} -unirefDirExists = {params.database.uniref_dir != null} -metaphlanDirExists = {params.database.metaphlan_db_dir != null} -utilitiesDirExists = {params.database.utilities_dir != null} - // Mount the databases to their predefined locations in the Biobakery container -if(!chocophlanDirExists ||!unirefDirExists || !metaphlanDirExists || !utilitiesDirExists) { +if(params.database.chocophlan_dir == null || + params.database.uniref_dir == null || + params.database.metaphlan_db_dir == null || + params.database.utilities_dir == null) { //biobakery/humann:3.6 - replace /usr/local/lib/python3.6/dist-packages/humann/data/ //chocophlan = "${params.DB_ROOT}/humann3-db/chocophlan/:/opt/conda/envs/humann3/lib/python3.10/site-packages/humann/data/chocophlan_DEMO" From a4df49d330d84dbcbb0eab63e88aac257bfbb701 Mon Sep 17 00:00:00 2001 From: olabiyi Date: Tue, 25 Jun 2024 15:38:14 -0700 Subject: [PATCH 12/48] Edited software collation --- .../SW_MGIllumina/workflow_code/main.nf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff 
--git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/main.nf b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/main.nf index 8ea49b02..0ae59999 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/main.nf +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/main.nf @@ -313,7 +313,9 @@ def deleteWS(string){ // Main workflow workflow { - // Parse file input + // Software Version Capturing - runsheet + software_versions_ch = Channel.empty() + // Parse file input if(params.GLDS_accession){ GET_RUNSHEET(params.GLDS_accession) @@ -321,6 +323,7 @@ workflow { .splitCsv(header:true) .set{file_ch} + GET_RUNSHEET.out.version | mix(software_versions_ch) | set{software_versions_ch} }else{ Channel.fromPath(params.csv_file, checkIfExists: true) @@ -335,9 +338,6 @@ workflow { }.set{reads_ch} - // Software Version Capturing - runsheet - software_versions_ch = Channel.empty() - GET_RUNSHEET.out.version | mix(software_versions_ch) | set{software_versions_ch} // Quality check and trim the input reads raw_qc(Channel.of("raw"), params.multiqc_config,reads_ch) From d2cfbdf263f84be18990d9b6d5558424f4fa1cea Mon Sep 17 00:00:00 2001 From: olabiyi Date: Wed, 26 Jun 2024 17:45:57 -0700 Subject: [PATCH 13/48] Updated GTDBTK --- .../SW_MGIllumina/workflow_code/modules/summarize_MAG.nf | 4 ++-- .../SW_MGIllumina/workflow_code/nextflow.config | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/summarize_MAG.nf b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/summarize_MAG.nf index 176115ee..3c3c0f7c 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/summarize_MAG.nf +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/summarize_MAG.nf @@ -92,7 +92,7 @@ process GTDBTK_ON_MAGS { -x fasta \\ --out_dir gtdbtk-out/ \\ --cpus ${task.cpus} \\ - --pplacer_cpus 1 + --skip_ani_screen else @@ -101,7 +101,7 @@ process GTDBTK_ON_MAGS { -x fasta \\ --out_dir gtdbtk-out/ \\ --cpus ${task.cpus} \\ - --pplacer_cpus 1 + --skip_ani_screen fi diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/nextflow.config b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/nextflow.config index 05085d2b..034886ad 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/nextflow.config +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/nextflow.config @@ -156,7 +156,7 @@ profiles { // Maximum number of jobs to submit in parallel executor.queueSize = 20 -params.DB_ROOT = "../Reference_DBs" +params.DB_ROOT = "/full/path/to/Reference_DBs" // Mount the databases to their predefined locations in the Biobakery container @@ -402,7 +402,7 @@ process { withName: GTDBTK_ON_MAGS { conda = {params.conda.gtdbtk != null ? params.conda.gtdbtk : "envs/gtdb-tk.yaml.yaml"} - container = "quay.io/biocontainers/gtdbtk:1.5.0--pyhdfd78af_0" + container = "quay.io/biocontainers/gtdbtk:2.4.0--pyhdfd78af_1" // "quay.io/biocontainers/gtdbtk:1.5.0--pyhdfd78af_0" containerOptions = { params.containerEngine == "singularity" ? 
"-B \${PWD}:/data -B ${gtdbtk_db_dir}:/refdata" : "-v \${PWD}:/data -v ${gtdbtk_db_dir}:/refdata" } cpus = 8 memory = "600 GB" From 5b5cfa360a85d0e4ef0625a3c0e56d3e78439cd5 Mon Sep 17 00:00:00 2001 From: olabiyi Date: Fri, 28 Jun 2024 14:12:59 -0500 Subject: [PATCH 14/48] Added README.md --- .../SW_MGIllumina/README.md | 172 +++++++++++++----- .../SW_MGIllumina/workflow_code/main.nf | 77 ++++---- .../workflow_code/nextflow.config | 164 ++++++++++------- 3 files changed, 260 insertions(+), 153 deletions(-) diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/README.md b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/README.md index 131ca570..5c319fe5 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/README.md +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/README.md @@ -1,101 +1,173 @@ -# SW_MGIllumina Workflow Information and Usage Instructions +# Workflow Information and Usage Instructions +## General Workflow Info -## General workflow info -The current GeneLab Illumina metagenomics sequencing data processing pipeline (MGIllumina), [GL-DPPD-7107.md](../../Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107.md), is implemented as a [Snakemake](https://snakemake.readthedocs.io/en/stable/) workflow and utilizes [conda](https://docs.conda.io/en/latest/) environments to install/run all tools. This workflow (SW_MGIllumina) is run using the command line interface (CLI) of any unix-based system. The workflow can be used even if you are unfamiliar with Snakemake and conda, but if you want to learn more about those, [this Snakemake tutorial](https://snakemake.readthedocs.io/en/stable/tutorial/tutorial.html) within [Snakemake's documentation](https://snakemake.readthedocs.io/en/stable/) is a good place to start for that, and an introduction to conda with installation help and links to other resources can be found [here at Happy Belly Bioinformatics](https://astrobiomike.github.io/unix/conda-intro). +### Implementation Tools +The current GeneLab Illumina metagenomics sequencing data processing pipeline (MGIllumina), [GL-DPPD-7107.md](../../Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107.md), is implemented as a [Nextflow](https://nextflow.io/) DSL2 workflow and utilizes [Singularity](https://docs.sylabs.io/guides/3.10/user-guide/introduction.html) containers or [conda](https://docs.conda.io/en/latest/) environments to install/run all tools. This workflow is run using the command line interface (CLI) of any unix-based system. While knowledge of creating workflows in nextflow is not required to run the workflow as is, [the Nextflow documentation](https://nextflow.io/docs/latest/index.html) is a useful resource for users who want to modify and/or extend this workflow. > **Note on reference databases** -> Many reference databases are relied upon throughout this workflow. They will be installed and setup automatically the first time the workflow is run. All together, after installed and unpacked, they will take up about 240 GB of storage, but they may also require up to 500GB during installation and initial un-packing, so be sure there is enough room on your system before running the workflow. +> Many reference databases are relied upon throughout this workflow. They will be installed and setup automatically the first time the workflow is run. 
All together, after installed and unpacked, they will take up about about 340 GB of storage, but they may also require up to 500GB during installation and initial un-packing, so be sure there is enough room on your system before running the workflow. -## Utilizing the workflow -1. [Install conda, mamba, and `genelab-utils` package](#1-install-conda-mamba-and-genelab-utils-package) -2. [Download the workflow template files](#2-download-the-workflow-template-files) -3. [Modify the variables in the config.yaml file](#3-modify-the-variables-in-the-configyaml-file) -4. [Run the workflow](#4-run-the-workflow) +## Utilizing the Workflow -### 1. Install conda, mamba, and `genelab-utils` package -We recommend installing a Miniconda, Python3 version appropriate for your system, as exemplified in [the above link](https://astrobiomike.github.io/unix/conda-intro#getting-and-installing-conda). +1. [Install nextflow, conda and singularity](#1-install-nextflow-conda-and-singularity) + 1a. [Install nextflow and conda](#1a-install-nextflow-and-conda) + 1b. [Install singularity](#1b-install-singularity) -Once conda is installed on your system, we recommend installing [mamba](https://github.com/mamba-org/mamba#mamba), as it generally allows for much faster conda installations: +2. [Download the workflow files](#2-download-the-workflow-files) + +3. [Run the workflow](#3-run-the-workflow) + 3a. [Approach 1: Run slurm jobs in singularity containers with OSD accession as input](#3a-approach-1-run-slurm-jobs-in-singularity-containers-with-osd-accession-as-input) + 3b. [Approach 2: Run slurm jobs in singularity containers with a csv file as input](#3b-approach-2-run-slurm-jobs-in-singularity-containers-with-a-csv-file-as-input) + 3c. [Approach 3: Run jobs locally in conda environments and specify the path to one or more existing conda environments](#3c-approach-run-jobs-locally-in-conda-environments-and-specify-the-path-to-one-or-more-existing-conda-environments) + 3d. [Modify parameters and cpu resources in the nextflow config file](#3d-modify-parameters-and-cpu-resources-in-the-nextflow-config-file) + +4. [Workflow outputs](#4-workflow-outputs) + 4a. [Main outputs](#4a-main-outputs) + 4b. [Resource logs](#4b-resource-logs) + +
+ +### 1. Install nextflow, conda and singularity + + + +#### 1a. Install nextflow and conda + +Nextflow can be installed either through [Anaconda](https://anaconda.org/bioconda/nextflow) or as documented on the [Nextflow documentation page](https://www.nextflow.io/docs/latest/getstarted.html). + +> Note: If you want to install anaconda, we recommend installing a miniconda, python3 version appropriate for your system, as instructed by [Happy Belly Bioinformatics](https://astrobiomike.github.io/unix/conda-intro#getting-and-installing-conda). + +We recommend installing a miniconda, python3 version appropriate for your system, as exemplified in [the above link](https://astrobiomike.github.io/unix/conda-intro#getting-and-installing-conda). + +Once conda is installed on your system, we recommend installing [mamba](https://github.com/mamba-org/mamba#mamba), as it generally allows for much faster conda installations. ```bash conda install -n base -c conda-forge mamba ``` -> You can read a quick intro to mamba [here](https://astrobiomike.github.io/unix/conda-intro#bonus-mamba-no-5) if wanted. +> You can read a quick intro to mamba [here](https://astrobiomike.github.io/unix/conda-intro#bonus-mamba-no-5). -Once mamba is installed, you can install the genelab-utils conda package in a new environment with the following command: +Once mamba is installed, you can install the genelab-utils conda package which contains nextflow with the following command: ```bash -mamba create -n genelab-utils -c conda-forge -c bioconda -c defaults -c astrobiomike 'genelab-utils>=1.1.02' +mamba create -n genelab-utils -c conda-forge -c bioconda -c defaults -c astrobiomike genelab-utils ``` - The environment then needs to be activated: ```bash conda activate genelab-utils -``` -### 2. Download the workflow template files -The workflow files for processing Illumina metagenomics sequencing data are in the [workflow_code](workflow_code) directory. To get a copy of the latest SW_MGIllumina version on to your system, run the following command: +# Test that nextflow is installed +nextflow -h -```bash -GL-get-workflow MG-Illumina +# Update nextflow +nextflow self-update ``` -This downloaded the workflow into a directory called `SW_MGIllumina_*/`, with the workflow version number at the end. +
-> Note: If wanting an earlier version, the wanted version can be provided as an optional argument like so: -> ```bash -> GL-get-workflow MG-Illumina --wanted-version 2.0.0 -> ``` +#### 1b. Install singularity -### 3. Modify the variables in the config.yaml file -Once you've downlonaded the workflow template, you can modify the variables in your downloaded version of the [config.yaml](workflow_code/config.yaml) file as needed in order to match your dataset and system setup. For example, you will have to provide a text file containing a single-column list of unique sample identifiers (see an example of how to set this up below). You will also need to indicate the paths to your input data (raw reads) and the root directory for where the reference databases should be stored (they will be setup automatically). Additionally, if necessary, you'll need to modify each variable in the config.yaml file to be consistent with the study you want to process and the machine you're using. +Singularity is a container platform that allows usage of containerized software. This enables the GeneLab workflow to retrieve and use all software required for processing without the need to install the software directly on the user's system. -> Note: If you are unfamiliar with how to specify paths, one place you can learn more is [here](https://astrobiomike.github.io/unix/getting-started#the-unix-file-system-structure). +We recommend installing singularity on a system wide level as per the associated [documentation](https://docs.sylabs.io/guides/3.10/admin-guide/admin_quickstart.html). -**Example for how to create a single-column list of unique sample identifiers from your raw data file names** +
-For example, if you have paired-end read data for 2 samples located in `../Raw_Data/` relative to your workflow directory, that would look like this: +### 2. Download the workflow files + +All files required for utilizing the NF_XXX GeneLab workflow for processing metagenomics illumina data are in the [workflow_code](workflow_code) directory. To get a copy of latest *NF_XXX* version on to your system, the code can be downloaded as a zip file from the release page then unzipped after downloading by running the following commands: ```bash -ls ../Raw_Data/ +wget https://github.com/nasa/GeneLab_Data_Processing/releases/download/NF_MGIllumina/NF_MGIllumina.zip +unzip NF_MGIllumina.zip && cd NF_XXX-X_X.X.X ``` +OR by using the genelab-utils conda package + +```bash +GL-get-workflow MG-Illumina ``` -Sample-1_R1_raw.fastq.gz -Sample-1_R2_raw.fastq.gz -Sample-2_R1_raw.fastq.gz -Sample-2_R2_raw.fastq.gz -``` -You would set up your `unique-sample-IDs.txt` file as follows: +
+ +### 3. Run the Workflow + +For options and detailed help on how to run the workflow, run the following command: ```bash -cat unique-sample-IDs.txt +nextflow run main.nf --help ``` +> Note: Nextflow commands use both single hyphen arguments (e.g. -help) that denote general nextflow arguments and double hyphen arguments (e.g. --csv_file) that denote workflow specific parameters. Take care to use the proper number of hyphens for each argument. + +
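+
+For example, a single command can mix both kinds of arguments; the values below are placeholders used only to illustrate the convention and should be adjusted to your own setup:
+
+```bash
+# -resume and -profile are nextflow options (single hyphen);
+# --csv_file and --workflow are parameters of this workflow (double hyphen)
+nextflow run main.nf -resume -profile slurm,singularity --csv_file PE_file.csv --workflow read-based
+```
+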
+ +#### 3a. Approach 1: Run slurm jobs in singularity containers with OSD accession as input + +```bash +nextflow run main.nf -resume -profile slurm,singularity --GLDS_accession OSD-574 ``` -Sample-1 -Sample-2 + +
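+
+> Note: when pulling data directly from OSDR, the optional `--RawFilePattern` parameter (described in the help menu) can be added to restrict which raw files are downloaded for the accession; the pattern below is only an example:
+
+```bash
+nextflow run main.nf -resume -profile slurm,singularity --GLDS_accession OSD-574 --RawFilePattern _metaG
+```
+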
+ +#### 3b. Approach 2: Run slurm jobs in singularity containers with a csv file as input + +```bash +nextflow run main.nf -resume -profile slurm,singularity --csv_file PE_file.csv ``` -### 4. Run the workflow +
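+
+> Note: other parameters listed in the help menu can be appended to any of these commands in the same way. For example, to add a prefix to output files that describe more than one sample (the prefix shown is only illustrative):
+
+```bash
+nextflow run main.nf -resume -profile slurm,singularity --csv_file PE_file.csv --additional_filename_prefix Swift1S_
+```
+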
-While in the directory holding the Snakefile, config.yaml, and other workflow files that you downloaded in [step 2](#2-download-the-workflow-template-files), here is one example command of how to run the workflow: +#### 3c. Approach 3: Run jobs locally in conda environments and specify the path to one or more existing conda environment(s) ```bash -snakemake --use-conda --conda-prefix ${CONDA_PREFIX}/envs -j 2 -p +nextflow run main.nf -resume -profile conda --csv_file SE_file.csv --conda.qc ``` -* `--use-conda` – specifies to use the conda environments included in the workflow (these are specified in the files in the workflow [envs/](workflow_code/envs) directory) -* `--conda-prefix` – indicates where the needed conda environments will be stored. Adding this option will also allow the same conda environments to be re-used when processing additional datasets, rather than making new environments each time you run the workflow. The value listed for this option, `${CONDA_PREFIX}/envs`, points to the default location for conda environments (note: the variable `${CONDA_PREFIX}` will be expanded to the appropriate location on whichever system it is run on). -* `-j` – assigns the number of jobs Snakemake should run concurrently (keep in mind that many of the thread and cpu parameters set in the config.yaml file will be multiplied by this) -* `-p` – specifies to print out each command being run to the screen +
+ +**Required Parameters For All Approaches:** + +* `-run main.nf` - Instructs nextflow to run the NF_XXX workflow +* `-resume` - Resumes workflow execution using previously cached results +* `-profile` – Specifies the configuration profile(s) to load, `singularity` instructs nextflow to setup and use singularity for all software called in the workflow + + + *Required only if you would like to pull and process data directly from OSDR* + +* `--GLDS_accession` – A Genelab / OSD accession number e.g. OSD-574. + +*Required only if --GLDS_accession is not passed as an argument* + +* `--csv_file` – A 3-column (single-end) or 4-column (paired-end) input csv file (sample_id, forward, [reverse,] paired). Please see the sample `SE_file.csv` and `PE_file.csv` in this repository for examples on how to format this file. + +> See `nextflow run -h` and [Nextflow's CLI run command documentation](https://nextflow.io/docs/latest/cli.html#run) for more options and details on how to run nextflow. + +
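+
+**Optional database parameters:**
+
+If the required reference databases have already been downloaded, their locations can be passed with the `--database.*` parameters listed in the help menu, pointing the workflow at the existing copies (see the `database` scope in [nextflow.config](workflow_code/nextflow.config)); the paths below are placeholders:
+
+```bash
+nextflow run main.nf -resume -profile slurm,singularity --csv_file PE_file.csv \
+    --database.cat_db /path/to/Reference_DBs/CAT_prepare_20210107/ \
+    --database.ko_db_dir /path/to/Reference_DBs/kofamscan_db/ \
+    --database.gtdbtk_db_dir /path/to/Reference_DBs/GTDB-tk-ref-db/
+```
+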
+ +#### 3d. Modify parameters and cpu resources in the nextflow config file + +Additionally, the parameters and workflow resources can be directly specified in the nextflow.config file. For detailed instructions on how to modify and set parameters in the nextflow.config file, please see the [documentation here](https://www.nextflow.io/docs/latest/config.html). + +Once you've downloaded the workflow template, you can modify the parameters in the `params` scope and cpus/memory requirements in the `process` scope in your downloaded version of the [nextflow.config](workflow_code/nextflow.config) file as needed in order to match your dataset and system setup. For example, you can directly set the the full paths to available conda environments in the `conda` scope within the `params` scope. Additionally, if necessary, you'll need to modify each variable in the nexflow.config file to be consistent with the study you want to process and the machine you're using. + +### 4. Workflow outputs + +#### 4a. Main outputs + +The outputs from this pipeline are documented in the [GL-DPPD-7107](../../Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107.md) processing protocol. + +#### 4b. Resource logs + +Standard nextflow resource usage logs are also produced as follows: -See `snakemake -h` and [Snakemake's documentation](https://snakemake.readthedocs.io/en/stable/) for more options and details. +- Output: + - Resource_Usage/execution_report_{timestamp}.html (an html report that includes metrics about the workflow execution including computational resources and exact workflow process commands) + - Resource_Usage/execution_timeline_{timestamp}.html (an html timeline for all processes executed in the workflow) + - Resource_Usage/execution_trace_{timestamp}.txt (an execution tracing file that contains information about each process executed in the workflow, including: submission time, start time, completion time, cpu and memory used, machine-readable output) ---- +> Further details about these logs can also found within [this Nextflow documentation page](https://www.nextflow.io/docs/latest/tracing.html#execution-report). diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/main.nf b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/main.nf index 0ae59999..7901c028 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/main.nf +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/main.nf @@ -26,10 +26,10 @@ if (params.help) { println(" > nextflow run main.nf -resume -profile conda --GLDS_accession OSD-574 --conda.qc ") println() println("Required arguments:") - println("""-profile [STRING] What profile should be used to run the workflow. Options are [slurm, singularity, docker, and conda]. - singularity, docker and conda will run the pipeline locally using singularity, docker, and conda, respectively. - To combine profiles, separate them comma. For example for to combine slurm and singularity profiels, pass 'slurm,singularity' as arguement. """) - println("--csv_file [PATH] A 3-column (single-end) or 4-column (paired-end) input file (sample_id, forward, [reverse,] paired). Mandatory if a GLDS accession is not provided.") + println("""-profile [STRING] Specifies the profile to be used to run the workflow. Options are [slurm, singularity, docker, and conda]. + singularity, docker and conda will run the pipeline locally using singularity, docker, and conda, respectively. 
+ To combine profiles, separate two or more profiles with comma. For example, to combine slurm and singularity profiles, pass 'slurm,singularity' as argument. """) + println("--csv_file [PATH] A 3-column (single-end) or 4-column (paired-end) input file (sample_id, forward, [reverse,] paired). Required only if a GLDS accession is not provided.") println(" Please see the files: SE_file.csv and PE_file.csv for single-end and paired-end examples, respectively.") println(" The sample_id column should contain unique sample ids.") println(" The forward and reverse columns should contain the absolute or relative path to the sample's forward and reverse reads.") @@ -37,9 +37,9 @@ if (params.help) { println() println("Optional arguments:") println(" --help Print this help message and exit") - println(" --workflow [STRING] Which workflow should be run. Options are one of [read-based, assembly-based, both]. Default: both.") - println(" --publishDir_mode [STRING] How should nextflow publish file outputs. Options can be found here https://www.nextflow.io/docs/latest/process.html#publishdir Default: link.") - println(" --errorStrategy [STRING] How should nextflow handle errors. Options can be found here https://www.nextflow.io/docs/latest/process.html#errorstrategy. Default: ignore") + println(" --workflow [STRING] Specifies that workflow to be run. Options are one of [read-based, assembly-based, both]. Default: both.") + println(" --publishDir_mode [STRING] Specifies how nextflow handles output file publishing. Options can be found here https://www.nextflow.io/docs/latest/process.html#publishdir Default: link.") + println(" --errorStrategy [STRING] Specifies how nextflow handles errors. Options can be found here https://www.nextflow.io/docs/latest/process.html#errorstrategy. Default: ignore") println(" --swift_1S [BOOLEAN] Setting for trimming recommended when working with Swift 1S libraries.") println(" adds `swift=t` setting to bbduk quality trimming/filtering command. For info on this, see example, ") println(" https://swiftbiosci.com/wp-content/uploads/2019/03/16-0853-Tail-Trim-Final-442019.pdf.") @@ -58,32 +58,32 @@ if (params.help) { println(" --max_est_strain_het [INT] Minimum estimated strain heterogeneity. Default: 50.") println(" --reduced_tree [STRING] reduced_tree option for checkm, limits the RAM usage to 16GB; https://github.com/Ecogenomics/CheckM/wiki/Genome-Quality-Commands#tree.") println(" 'True' for yes, anything else will be considered 'False' and the default full tree will be used. Default: 'True'. ") - println(" --max_mem [INT] Maximum memory allowed passed to megahit assembler. Can be set either by proportion of available on system, e.g. 0.5") + println(" --max_mem [INT] Maximum memory allowed, passed to megahit assembler. Can be set either by proportion of available on system, e.g. 0.5") println(" or by absolute value in bytes, e.g. 100e9 would be 100 GB. Default: 100e9.") - + println() println(" --pileup_mem [STRING] pileup.sh paramater for calculating contig coverage and depth. Memory used by bbmap's pileup.sh (within the GET_COV_AND_DET process). ") println(" passed as the -Xmx parameter, 20g means 20 gigs of RAM, 20m means 20 megabytes.") println(" 5g should be sufficient for most assemblies, but if that rule is failing, this may need to be increased.Default: '5g' .") println(" --block_size [int] Block size variable for CAT/diamond, lower value means less RAM usage; see https://github.com/bbuchfink/diamond/wiki/3.-Command-line-options#memory--performance-options. 
Default: 4.") println() println("File Suffixes:") - println(" --filtered_suffix [STRING] Suffix to use for naming your quality filtered reads. Only applicable when input reads are single-end. Default: _filtered.fastq.gz.") - println(" --filtered_R1_suffix [STRING] Suffix to use for naming your quality filtered forward reads. Default: _R1_filtered.fastq.gz.") - println(" --filtered_R2_suffix [STRING] Suffix to use for naming your quality filtered reverse reads. Default: _R2_filtered.fastq.gz.") + println(" --filtered_suffix [STRING] Specifies the suffix for naming quality filtered reads. Only applicable when input reads are single-end. Default: _filtered.fastq.gz.") + println(" --filtered_R1_suffix [STRING] Specifies the suffix for naming quality filtered forward reads. Default: _R1_filtered.fastq.gz.") + println(" --filtered_R2_suffix [STRING] Specifies the suffix for naming quality filtered reverse reads. Default: _R2_filtered.fastq.gz.") println() println("Output directories:") - println(" --raw_reads_dir [PATH] Where should the fastqc report of the raw reads be stored. Default: Raw_Sequence_Data/.") - println(" --fastqc_out_dir [PATH] Where should multiqc outputs be stored. Default: FastQC_Outputs/.") - println(" --filtered_reads_dir [PATH] Where should your filtered reads be stored. Default: Filtered_Sequence_Data/.") - println(" --assembly_based_dir [PATH] Where should the results of assembly-based analysis be stored. Default: Assembly-based_Processing/.") - println(" --assemblies_dir [PATH] Where should your assemblies be stored. Default: Assembly-based_Processing/assemblies/.") - println(" --genes_dir [PATH] Where should the predicted genes from your assemblies be stored. Default: Assembly-based_Processing/predicted-genes/.") - println(" --annotations_and_tax_dir [PATH] Contig taxonomy and annotation directory. Default: Assembly-based_Processing/annotations-and-taxonomy/.") - println(" --mapping_dir [PATH] Read mapping to assembly directory. Default: Assembly-based_Processing/read-mapping/.") - println(" --combined_output_dir [PATH] Assembly summuries and reports across samples directory. Default: Assembly-based_Processing/combined-outputs/.") - println(" --bins_dir [PATH] Assembly bins directory. Default: Assembly-based_Processing/bins/.") - println(" --MAGs_dir [PATH] Meta assembled genomes (MAGs) directory. Default: Assembly-based_Processing/MAGs/.") - println(" --read_based_dir [PATH] Read-based analysis outputs directory. Default: Read-based_Processing/.") + println(" --raw_reads_dir [PATH] Specifies where the fastqc report of the raw reads will be published. Default: ../Raw_Sequence_Data/.") + println(" --fastqc_out_dir [PATH] Specifies where multiqc outputs will be published. Default: ../FastQC_Outputs/.") + println(" --filtered_reads_dir [PATH] Specifies where filtered reads will be published. Default: ../Filtered_Sequence_Data/.") + println(" --assembly_based_dir [PATH] Specifies where the results of assembly-based analysis will be published. Default: ../Assembly-based_Processing/.") + println(" --assemblies_dir [PATH] Specifies where assemblies will be published. Default: ../Assembly-based_Processing/assemblies/.") + println(" --genes_dir [PATH] Specifies where predicted genes from the assemblies will be published. Default: ../Assembly-based_Processing/predicted-genes/.") + println(" --annotations_and_tax_dir [PATH] Contig taxonomy and annotation directory. 
Default: ../Assembly-based_Processing/annotations-and-taxonomy/.") + println(" --mapping_dir [PATH] Read mapping to assembly directory. Default: ../Assembly-based_Processing/read-mapping/.") + println(" --combined_output_dir [PATH] Assembly summuries and reports across samples directory. Default: ../Assembly-based_Processing/combined-outputs/.") + println(" --bins_dir [PATH] Assembly bins directory. Default: ../Assembly-based_Processing/bins/.") + println(" --MAGs_dir [PATH] Meta assembled genomes (MAGs) directory. Default: ../Assembly-based_Processing/MAGs/.") + println(" --read_based_dir [PATH] Read-based analysis outputs directory. Default: ../Read-based_Processing/.") println() println("Genelab specific arguements:") println(" --GLDS_accession [STRING] A Genelab accession number if the --csv_file parameter is not set. If this parameter is set, it will ignore the --csv_file parameter.") @@ -93,24 +93,30 @@ if (params.help) { println(" their filenames, we can provide '-p fastq.gz,NxtaFlex,metagenomics,raw'. Default: null.") println(" --assay_suffix [STRING] Genelab's assay suffix. Default: _GLmetagenomics.") println(" --additional_filename_prefix [STRING] additional prefix to add to output files that describe more than one sample (to make them unique compared to other datasets).") - println(" include separator at end if adding one, e.g. Swift1S_ if wanted. Default: '' .") + println(" include separator at end if adding one, e.g. Swift1S_ if wanted. Default: empty string .") println() println("Paths to existing databases and database links.") + println(" --DB_ROOT [PATH] FULL PATH to root directory where the databases will be downloaded if they don't exist.") + println(" Relative paths such as '~/' and '../' will fail, please don't use them. Default: ./Reference_DBs/ ") println("CAT database directory strings:") - println("The strings below will be added to the end of the --database.cat_db path arguement provided below.") - println(" --cat_taxonomy_dir [PATH] CAT taxonomy database directory. Default: 2021-01-07_taxonomy/.") - println(" --cat_db_sub_dir [PATH] CAT database sub directory. Default: 2021-01-07_CAT_database/.") - println(" --database.CAT_DB_LINK [URL] CAT database online download link. Default: https://tbb.bio.uu.nl/bastiaan/CAT_prepare/CAT_prepare_20210107.tar.gz.") - println(" --database.cat_db [PATH] Path to CAT database. Example, /path/to/Reference_DBs/CAT_prepare_20210107/. Default: null.") - println(" --database.ko_db_dir [PATH] Path to kofam scan database. Example, /path/to/Reference_DBs/kofamscan_db/. Default: null.") + println(" The strings below will be added to the end of the --database.cat_db path arguement provided below.") + println(" --cat_taxonomy_dir [PATH] CAT taxonomy database directory. Default: 2021-01-07_taxonomy/.") + println(" --cat_db_sub_dir [PATH] CAT database sub directory. Default: 2021-01-07_CAT_database/.") + println(" --database.CAT_DB_LINK [URL] CAT database online download link. Default: https://tbb.bio.uu.nl/bastiaan/CAT_prepare/CAT_prepare_20210107.tar.gz.") + println("CAT database ") + println(" --database.cat_db [PATH] Path to CAT database. Example, /path/to/Reference_DBs/CAT_prepare_20210107/. Default: null.") + println("Humann database:") println(" --database.metaphlan_db_dir [PATH] Path to metaphlan database. Example, /path/to/Reference_DBs/metaphlan4-db/. Default: null.") println(" --database.chocophlan_dir [PATH] Path to Humann's chocophlan nucleotide database. Example, /path/to/Reference_DBs/humann3-db/chocophlan/. 
Default: null.") println(" --database.uniref_dir [PATH] Path to Humann's Uniref protein database. Example, /path/to/Reference_DBs/humann3-db/uniref/. Default: null.") println(" --database.utilities_dir [PATH] Path to Humann's untilities database. Example, /path/to/Reference_DBs/humann3-db/utility_mapping/. Default: null.") + println("GTDBTK database:") println(" --database.GTDBTK_LINK [URL] GTDBTK database online download link. Default: https://data.gtdb.ecogenomic.org/releases/release220/220.0/auxillary_files/gtdbtk_package/full_package/gtdbtk_r220_data.tar.gz.") println(" --database.gtdbtk_db_dir [PATH] Path to GTDBTK database. Example, /path/Reference_DBs/GTDB-tk-ref-db/. Default: null.") + println("kofam scan database database:") + println(" --database.ko_db_dir [PATH] Path to kofam scan database. Example, /path/to/Reference_DBs/kofamscan_db/. Default: null.") println() - println("Paths to existing conda environments to use otherwise a new one will be created using the yaml file in envs/.") + println("Paths to existing conda environments to use, otherwise, new ones will be created using the yaml files in envs/.") println(" --conda.qc [PATH] Path to a conda environment containing fastqc, multiqc, zip and python. Default: null.") println(" --conda.humann3 [PATH] Path to a conda environment with humann3 installed. Default: null.") println(" --conda.cat [PATH] Path to a conda environment containing CAT (Contig annotation tool). Default: null.") @@ -123,6 +129,7 @@ if (params.help) { println(" --conda.kofamscan [PATH] Path to a conda environment containing KOFAM SCAN. Default: null.") println(" --conda.mapping [PATH] Path to a conda environment with bowtie and samtools installed. Default: null.") println(" --conda.checkm [PATH] Path to a conda environment with checkm installed. 
Default: null.") + println() print("Advanced users can edit the nextflow.config file for more control over default settings such container choice, number of cpus, memory per task etc.") exit 0 } @@ -211,12 +218,6 @@ log.info """ // Create GLDS runsheet include { GET_RUNSHEET } from "./modules/create_runsheet.nf" -// Processes to create the required database(s) if not provided -/* -include { SETUP_CAT_DB; SETUP_KOFAMSCAN_DB; SETUP_GTDBTK_DB; - SETUP_CHOCOPHLAN; SETUP_UNIREF; SETUP_UTILITY_MAPPING; - SETUP_METAPHLAN } from "./modules/database_creation.nf" -*/ include { make_humann_db } from "./modules/database_creation.nf" // Read quality check and filtering diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/nextflow.config b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/nextflow.config index 034886ad..c2db18bc 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/nextflow.config +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/nextflow.config @@ -1,4 +1,4 @@ -// Global parameter +//******** Global parameters *****************// params { // input file @@ -29,35 +29,41 @@ params { // Directories // Raw reads directory (can be relative to workflow directory, or needs to be full path) - raw_reads_dir = "../Raw_Sequence_Data/" - // output directories (all relative to processing directory, will be created) - fastqc_out_dir = "../FastQC_Outputs/" - filtered_reads_dir = "../Filtered_Sequence_Data/" - assembly_based_dir = "../Assembly-based_Processing/" - assemblies_dir = "../Assembly-based_Processing/assemblies/" - genes_dir = "../Assembly-based_Processing/predicted-genes/" + raw_reads_dir = "../Raw_Sequence_Data/" + // Output directories (all relative to processing directory, will be created) + fastqc_out_dir = "../FastQC_Outputs/" + filtered_reads_dir = "../Filtered_Sequence_Data/" + assembly_based_dir = "../Assembly-based_Processing/" + assemblies_dir = "../Assembly-based_Processing/assemblies/" + genes_dir = "../Assembly-based_Processing/predicted-genes/" annotations_and_tax_dir = "../Assembly-based_Processing/annotations-and-taxonomy/" - mapping_dir = "../Assembly-based_Processing/read-mapping/" - combined_output_dir = "../Assembly-based_Processing/combined-outputs/" - bins_dir = "../Assembly-based_Processing/bins/" - MAGs_dir = "../Assembly-based_Processing/MAGs/" - read_based_dir = "../Read-based_Processing/" - genelab_dir = "../GeneLab/" - logs_dir = "../Logs/" - metadata_dir = "../Metadata/" - - // Database creation + mapping_dir = "../Assembly-based_Processing/read-mapping/" + combined_output_dir = "../Assembly-based_Processing/combined-outputs/" + bins_dir = "../Assembly-based_Processing/bins/" + MAGs_dir = "../Assembly-based_Processing/MAGs/" + read_based_dir = "../Read-based_Processing/" + genelab_dir = "../GeneLab/" + logs_dir = "../Logs/" + metadata_dir = "../Metadata/" + + //************************* Database creation **********************************// + /* Root directory where the databases will be downloaded if they don't exist. + This should be provided as a full path (starting with '/'). + Note that relative paths such as '~/' and '../' are not expanded + by nextflow's evaluation of files, so don't use that. 
+ */ + DB_ROOT = "${baseDir}/Reference_DBs" database { - CAT_DB_LINK = "https://tbb.bio.uu.nl/bastiaan/CAT_prepare/CAT_prepare_20210107.tar.gz" + CAT_DB_LINK = "https://tbb.bio.uu.nl/bastiaan/CAT_prepare/CAT_prepare_20210107.tar.gz" // Old link - https://data.gtdb.ecogenomic.org/releases/release202/202.0/auxillary_files/gtdbtk_r202_data.tar.gz - GTDBTK_LINK = "https://data.gtdb.ecogenomic.org/releases/release220/220.0/auxillary_files/gtdbtk_package/full_package/gtdbtk_r220_data.tar.gz" - cat_db = null // "/path/to/Reference_DBs/CAT_prepare_20210107/" - ko_db_dir = null // "/path/to/Reference_DBs/kofamscan_db/" - metaphlan_db_dir = null // "/path/to/Reference_DBs/metaphlan4-db/" - chocophlan_dir = null // "/path/to/Reference_DBs/humann3-db/chocophlan/" - uniref_dir = null // "/path/to/Reference_DBs/humann3-db/uniref/" - utilities_dir = null // "/path/to/Reference_DBs/humann3-db/utility_mapping/" - gtdbtk_db_dir = null // "/path/to/Reference_DBs/GTDB-tk-ref-db/" + GTDBTK_LINK = "https://data.gtdb.ecogenomic.org/releases/release220/220.0/auxillary_files/gtdbtk_package/full_package/gtdbtk_r220_data.tar.gz" + cat_db = null // "/path/to/Reference_DBs/CAT_prepare_20210107/" + ko_db_dir = null // "/path/to/Reference_DBs/kofamscan_db/" + metaphlan_db_dir = null // "/path/to/Reference_DBs/metaphlan4-db/" + chocophlan_dir = null // "/path/to/Reference_DBs/humann3-db/chocophlan/" + uniref_dir = null // "/path/to/Reference_DBs/humann3-db/uniref/" + utilities_dir = null // "/path/to/Reference_DBs/humann3-db/utility_mapping/" + gtdbtk_db_dir = null // "/path/to/Reference_DBs/GTDB-tk-ref-db/" } // Quality assessment parameters @@ -76,15 +82,15 @@ params { pileup_mem = "5g" // pileup.sh paramater for calculating contig coverage and depth block_size = 4 // CAT blocksize - // ---------- CAT database directory strings -----------------------------------------// + //******************** CAT database directory strings ************************// // The string below will be added to the end of the params.database.cat_db provided above // cat taxonomy directory with cat_db path provided above cat_taxonomy_dir = "2021-01-07_taxonomy/" - cat_db_sub_dir = "2021-01-07_CAT_database/" + cat_db_sub_dir = "2021-01-07_CAT_database/" // MAG parameters - min_est_comp = 90 - max_est_redund = 10 + min_est_comp = 90 + max_est_redund = 10 max_est_strain_het = 50 /* @@ -98,27 +104,27 @@ params { conda{ - // Specify the paths to your existing conda environments - genelab = null // "/path/to/envs/genelab-utils" - qc = null // "/path/to/envs/qc" - humann3 = null // "/path/to/envs/humann3" - cat = null // "/path/to/envs/genelab-utils/envs/CAT" - prodigal = null // "/path/to/envs/prodigal" - metabat = null // "/path/to/envs/metabat" - gtdbtk = null // "/path/to/envs/gtdbtk" - kegg_decoder = null // "/path/to/envs/kegg_decoder" - megahit = null // "/path/to/envs/megahit" - bit = null // "/path/to/envs/bit" - kofamscan = null // "/path/to/envs/kofamscan" - mapping = null // "/path/to/envs/mapping" - checkm = null // "/path/to/envs/checkm" + // Specify paths to your existing conda environments + genelab = null // "/path/to/envs/genelab-utils" + qc = null // "/path/to/envs/qc" + humann3 = null // "/path/to/envs/humann3" + cat = null // "/path/to/envs/genelab-utils/envs/CAT" + prodigal = null // "/path/to/envs/prodigal" + metabat = null // "/path/to/envs/metabat" + gtdbtk = null // "/path/to/envs/gtdbtk" + kegg_decoder = null // "/path/to/envs/kegg_decoder" + megahit = null // "/path/to/envs/megahit" + bit = null // "/path/to/envs/bit" + 
kofamscan = null // "/path/to/envs/kofamscan" + mapping = null // "/path/to/envs/mapping" + checkm = null // "/path/to/envs/checkm" } - GLDS_accession = false // OSD acession number for the data to be processed + GLDS_accession = false // GLDS or OSD acession number for the data to be processed // Pattern of files on OSDR for the GLDS_accession you want to process. RawFilePattern = null // "_metaG", "_HRremoved" errorStrategy = "terminate" - debug = false // should info about the parameters set by the user be shown when the pipeline starts. + debug = false // should info about the parameters set by the user be shown when the workflow starts. } // Setting the default container engine as singularity @@ -127,21 +133,27 @@ params.containerEngine = "singularity" // i.e., slurm_conda and conda params.use_conda = false + + +/******************************************************************************************************* +*************************************** Workflow Profiles ********************************************** +********************************************************************************************************/ + profiles { slurm { - process.executor = 'slurm' + process.executor = 'slurm' } conda { - conda.enabled = true - params.use_conda = true + conda.enabled = true + params.use_conda = true } singularity { singularity.enabled = true singularity.autoMounts = true - singularity.cacheDir = "singularity/" // local singularity images location + singularity.cacheDir = "singularity/" // local singularity images location params.containerEngine = "singularity" } @@ -156,10 +168,10 @@ profiles { // Maximum number of jobs to submit in parallel executor.queueSize = 20 -params.DB_ROOT = "/full/path/to/Reference_DBs" -// Mount the databases to their predefined locations in the Biobakery container + +// Mount Humann databases to their predefined locations in the Biobakery container being used if(params.database.chocophlan_dir == null || params.database.uniref_dir == null || params.database.metaphlan_db_dir == null || @@ -181,8 +193,13 @@ if(params.database.chocophlan_dir == null || } +/****************************************************************************************************************** +***************** Tune process specific resources (cpu, container, memory etc.) *********************************** +*******************************************************************************************************************/ process { + + //******************* Default process settings ************************// errorStrategy = { params.errorStrategy ? params.errorStrategy : "ignore"} queue = "normal,priority" maxRetries = 2 @@ -190,6 +207,11 @@ process { cache = 'lenient' cpus = 8 + /********************************************************************************************* + ******************************** Specific process settings ********************************** + *********************************************************************************************/ + + //************************* Generic process labels used throught the workflow ****************// withLabel: genelab { @@ -197,12 +219,6 @@ process { container = "olabiyi/genelab-utils:1.3.22" } - withName: GET_RUNSHEET { - cpus = 10 - conda = {params.conda.genelab != null ? 
params.conda.genelab : "envs/genelab.yaml"} - container = "olabiyi/genelab-utils:1.3.22" - publishDir = [path: params.genelab_dir , mode: params.publishDir_mode] - } withLabel: bit { @@ -212,7 +228,7 @@ process { memory = "5 GB" } -// Database set-up +//*************************************** Database set-up ********************************************// withLabel: humann_setup { conda = {params.conda.humann3 != null ? params.conda.humann3 : "envs/humann3.yaml"} container = "biobakery/humann:3.9" @@ -242,7 +258,15 @@ process { container = "quay.io/biocontainers/gtdbtk:1.5.0--pyhdfd78af_0" } -// Quality control and assesment +//************************* GLDS_accession runsheet and input file retrieval **************************************// + withName: GET_RUNSHEET { + cpus = 10 + conda = {params.conda.genelab != null ? params.conda.genelab : "envs/genelab.yaml"} + container = "olabiyi/genelab-utils:1.3.22" + publishDir = [path: params.genelab_dir , mode: params.publishDir_mode] + } + +//********************************** Read quality control and assesment ********************************************// withName: FASTQC { conda = {params.conda.qc != null ? params.conda.qc : "envs/qc.yaml"} container = "staphb/fastqc:0.12.1" @@ -267,7 +291,7 @@ process { } -// Read-based processing +//************************************ Read-based processing *********************************************************// withLabel: read_based { conda = {params.conda.humann3 != null ? params.conda.humann3 : "envs/humann3.yaml"} @@ -291,7 +315,7 @@ process { } -// Assembly-based proessing +//*************************************** Assembly-based proessing **************************************************// withLabel: assembly { publishDir = [path: params.assemblies_dir, mode: params.publishDir_mode] @@ -422,6 +446,10 @@ process { } +/***************************************************************************** +********************** Workflow Resource Usage capturing ********************* +******************************************************************************/ + // Adapted from : https://github.com/nf-core/rnaseq/blob/master/nextflow.config def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') timeline { @@ -437,12 +465,18 @@ trace { file = "../Resource_Usage/execution_trace_${trace_timestamp}.txt" } + + +/****************************************************************************** +**************************** Workflow Metadata ******************************** +*******************************************************************************/ + manifest { author = 'Olabiyi Aderemi Obayomi, Mike Douglas Lee' homePage = 'https://github.com/nasa/GeneLab_Data_Processing/blob/master/Metagenomics/' - description = 'GeneLab bioinformatics processing pipelines for metagenomics sequencing data' + description = 'Metagenomics workflow for pipeline document GL-DPPD-7107' mainScript = 'main.nf' defaultBranch = 'main' nextflowVersion = '>=22.10.6' - version = 'GL-DPPD-7107' + version = '1.0.0' } From f2997487fe824db266c67ac7b96e41a2de46e438 Mon Sep 17 00:00:00 2001 From: olabiyi Date: Fri, 28 Jun 2024 15:28:11 -0500 Subject: [PATCH 15/48] Fixed extra header issue --- .../SW_MGIllumina/workflow_code/SE_file.csv | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/SE_file.csv b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/SE_file.csv index 99b6e25f..50eace60 100644 --- 
a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/SE_file.csv +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/SE_file.csv @@ -1,4 +1,4 @@ -sample_id,forward,reverse,paired +sample_id,forward,paired RR23_FCS_FLT_F1,/path/to/Raw_Sequence_Data/RR23_FCS_FLT_F1_R1_HRremoved_raw.fastq.gz,false RR23_FCS_FLT_F2,/path/to/Raw_Sequence_Data/RR23_FCS_FLT_F2_R1_HRremoved_raw.fastq.gz,false RR23_FCS_FLT_F3,/path/to/Raw_Sequence_Data/RR23_FCS_FLT_F3_R1_HRremoved_raw.fastq.gz,false @@ -25,4 +25,4 @@ RR23_FCS_VIV_V5,/path/to/Raw_Sequence_Data/RR23_FCS_VIV_V5_R1_HRremoved_raw.fast RR23_FCS_VIV_V6,/path/to/Raw_Sequence_Data/RR23_FCS_VIV_V6_R1_HRremoved_raw.fastq.gz,false RR23_FCS_VIV_V7,/path/to/Raw_Sequence_Data/RR23_FCS_VIV_V7_R1_HRremoved_raw.fastq.gz,false RR23_FCS_VIV_V8,/path/to/Raw_Sequence_Data/RR23_FCS_VIV_V8_R1_HRremoved_raw.fastq.gz,false -RR23_FCS_VIV_V9,/path/to/Raw_Sequence_Data/RR23_FCS_VIV_V9_R1_HRremoved_raw.fastq.gz,false +RR23_FCS_VIV_V9,/path/to/Raw_Sequence_Data/RR23_FCS_VIV_V9_R1_HRremoved_raw.fastq.gz,false \ No newline at end of file From d119d4c5f6ced8413c56d259032480dced46d219 Mon Sep 17 00:00:00 2001 From: olabiyi Date: Mon, 1 Jul 2024 14:00:59 -0500 Subject: [PATCH 16/48] Fixed GTDBTK taxonomy variable issue --- .../workflow_code/bin/get_MAGs_estimates_and_taxonomy.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/get_MAGs_estimates_and_taxonomy.sh b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/get_MAGs_estimates_and_taxonomy.sh index 2bf57eb5..a1c502e0 100755 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/get_MAGs_estimates_and_taxonomy.sh +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/get_MAGs_estimates_and_taxonomy.sh @@ -2,7 +2,7 @@ MAGs_dir=$1 MAG_assembly_summaries=$2 MAGs_checkm_out=$3 -gtdbtk-out=$4 +gtdbtk_out=$4 # Making sure none of the intermediate files exist already rm -rf checkm-estimates.tmp \ @@ -17,7 +17,7 @@ for MAG in $(cut -f 1 ${MAG_assembly_summaries} | tail -n +2); do grep -w -m 1 "^${MAG}" ${MAGs_checkm_out} | \ cut -f 12,13,14 >> checkm-estimates.tmp - grep -w "^${MAG}" ${gtdbtk-out}/gtdbtk.*.summary.tsv | \ + grep -w "^${MAG}" ${gtdbtk_out}/gtdbtk.*.summary.tsv | \ cut -f 2 | sed 's/^.__//' | \ sed 's/;.__/\t/g' | \ awk 'BEGIN{ OFS=FS="\t" } { for (i=1; i<=NF; i++) if ( $i ~ /^ *$/ ) $i = "NA" }; 1' \ From 9b791491e29ce5bac52f8071424872aeb45b81cd Mon Sep 17 00:00:00 2001 From: olabiyi Date: Mon, 1 Jul 2024 16:18:07 -0500 Subject: [PATCH 17/48] Changed default database location --- .../SW_MGIllumina/workflow_code/main.nf | 2 +- .../SW_MGIllumina/workflow_code/nextflow.config | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/main.nf b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/main.nf index 7901c028..046f1692 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/main.nf +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/main.nf @@ -97,7 +97,7 @@ if (params.help) { println() println("Paths to existing databases and database links.") println(" --DB_ROOT [PATH] FULL PATH to root directory where the databases will be downloaded if they don't exist.") - println(" Relative paths such as '~/' and '../' will fail, 
please don't use them. Default: ./Reference_DBs/ ") + println(" Relative paths such as '~/' and '../' will fail, please don't use them. Default: ../Reference_DBs/ ") println("CAT database directory strings:") println(" The strings below will be added to the end of the --database.cat_db path arguement provided below.") println(" --cat_taxonomy_dir [PATH] CAT taxonomy database directory. Default: 2021-01-07_taxonomy/.") diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/nextflow.config b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/nextflow.config index c2db18bc..9f3e57fc 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/nextflow.config +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/nextflow.config @@ -46,13 +46,7 @@ params { logs_dir = "../Logs/" metadata_dir = "../Metadata/" - //************************* Database creation **********************************// - /* Root directory where the databases will be downloaded if they don't exist. - This should be provided as a full path (starting with '/'). - Note that relative paths such as '~/' and '../' are not expanded - by nextflow's evaluation of files, so don't use that. - */ - DB_ROOT = "${baseDir}/Reference_DBs" + //************************* Databases **********************************// database { CAT_DB_LINK = "https://tbb.bio.uu.nl/bastiaan/CAT_prepare/CAT_prepare_20210107.tar.gz" // Old link - https://data.gtdb.ecogenomic.org/releases/release202/202.0/auxillary_files/gtdbtk_r202_data.tar.gz @@ -169,7 +163,13 @@ profiles { // Maximum number of jobs to submit in parallel executor.queueSize = 20 - +/* + Root directory where the databases will be downloaded if they don't exist. + This should be provided as a full path (starting with '/'). + Note that relative paths such as '~/' and '../' are not expanded + by nextflow's evaluation of files, so don't use that. 
+*/ +params.DB_ROOT = ("${baseDir}".split("/")[0..-2]).join('/') + "/Reference_DBs" // Mount Humann databases to their predefined locations in the Biobakery container being used if(params.database.chocophlan_dir == null || From 6409cbf3b20d4bd4942f2f18620a4051a1142450 Mon Sep 17 00:00:00 2001 From: olabiyi Date: Tue, 27 Aug 2024 17:54:28 -0500 Subject: [PATCH 18/48] Updated versions and README --- .../GL-DPPD-7107-A.md | 1117 ++++++++++ Metagenomics/Illumina/README.md | 4 +- .../NF_MGIllumina/CHANGELOG.md | 4 + .../NF_MGIllumina/README.md | 179 ++ .../workflow_code/PE_file.csv | 0 .../workflow_code/SE_file.csv | 0 .../workflow_code/bin/clean-paths.sh | 24 + .../bin/combine-all-gene-tables.py | 0 ...evel-coverages-annots-and-tax-per-group.py | 0 .../workflow_code/bin/create_runsheet.sh | 0 .../workflow_code/bin/download-GTDBTK-db.sh | 0 .../bin/format-contig-tax-classifications.sh | 0 .../bin/format-gene-tax-classifications.sh | 0 .../generate-assembly-based-overview-table.sh | 0 .../workflow_code/bin/get-cov-and-depth.sh | 0 .../bin/get_MAGs_estimates_and_taxonomy.sh | 0 .../workflow_code/bin/parse-MAG-annots.py | 0 .../workflow_code/bin/prepull_singularity.sh | 31 + .../workflow_code/bin/swap-MAG-IDs.py | 0 .../workflow_code/config/bbtools_adapters.fa | 0 .../workflow_code/config/multiqc.config | 7 + .../NF_MGIllumina/workflow_code/envs/bit.yaml | 7 + .../NF_MGIllumina/workflow_code/envs/cat.yaml | 7 + .../workflow_code/envs/checkm.yaml | 7 + .../workflow_code/envs/genelab.yaml | 0 .../workflow_code/envs/gtdb-tk.yaml | 6 + .../workflow_code/envs/humann3.yaml | 8 + .../workflow_code/envs/image_def.bit | 32 + .../workflow_code/envs/image_def.genelab | 32 + .../workflow_code/envs/keggdecoder.yaml | 7 + .../workflow_code/envs/kofamscan.yaml | 9 + .../workflow_code/envs/mapping.yaml | 9 + .../workflow_code/envs/megahit.yaml | 8 + .../workflow_code/envs/metabat.yaml | 6 + .../workflow_code/envs/prodigal.yaml | 8 + .../NF_MGIllumina/workflow_code/envs/qc.yaml | 10 + .../workflow_code/main.nf | 2 +- .../workflow_code/modules/assembly.nf | 4 +- .../modules/assembly_annotation.nf | 6 +- .../modules/assembly_based_processing.nf | 0 .../workflow_code/modules/binning.nf | 0 .../modules/combine_contig_annotation.nf | 0 .../workflow_code/modules/coverage.nf | 2 +- .../workflow_code/modules/create_runsheet.nf | 4 +- .../modules/database_creation.nf | 2 +- .../modules/quality_assessment.nf | 10 +- .../modules/read_based_processing.nf | 4 +- .../workflow_code/modules/read_mapping.nf | 0 .../workflow_code/modules/summarize_MAG.nf | 6 +- .../summarize_assembly-based_processing.nf | 0 .../workflow_code/modules/summarize_bins.nf | 2 +- .../workflow_code/modules/zip_fasta.nf | 0 .../workflow_code/nextflow.config | 60 +- .../workflow_code/slurm_submit.slurm | 2 +- .../SW_MGIllumina/README.md | 172 +- .../SW_MGIllumina/workflow_code/Snakefile | 1925 +++++++++++++++++ .../workflow_code/bin/clean-paths.sh | 20 - .../SW_MGIllumina/workflow_code/config.yaml | 258 +++ .../SW_MGIllumina/workflow_code/envs/cat.yaml | 1 - .../scripts/combine-all-gene-tables.py | 312 +++ .../scripts/combine-benchmarks.sh | 18 + ...evel-coverages-annots-and-tax-per-group.py | 151 ++ .../format-contig-tax-classifications.sh | 6 + .../format-gene-tax-classifications.sh | 6 + .../generate-assembly-based-overview-table.sh | 74 + .../workflow_code/scripts/parse-MAG-annots.py | 30 + .../workflow_code/scripts/slurm-status.py | 17 + .../workflow_code/scripts/swap-MAG-IDs.py | 31 + 68 files changed, 4449 insertions(+), 198 deletions(-) create mode 
100644 Metagenomics/Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md create mode 100644 Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/CHANGELOG.md create mode 100644 Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/README.md rename Metagenomics/Illumina/Workflow_Documentation/{SW_MGIllumina => NF_MGIllumina}/workflow_code/PE_file.csv (100%) rename Metagenomics/Illumina/Workflow_Documentation/{SW_MGIllumina => NF_MGIllumina}/workflow_code/SE_file.csv (100%) create mode 100644 Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/clean-paths.sh rename Metagenomics/Illumina/Workflow_Documentation/{SW_MGIllumina => NF_MGIllumina}/workflow_code/bin/combine-all-gene-tables.py (100%) mode change 100755 => 100644 rename Metagenomics/Illumina/Workflow_Documentation/{SW_MGIllumina => NF_MGIllumina}/workflow_code/bin/combine-gene-level-coverages-annots-and-tax-per-group.py (100%) mode change 100755 => 100644 rename Metagenomics/Illumina/Workflow_Documentation/{SW_MGIllumina => NF_MGIllumina}/workflow_code/bin/create_runsheet.sh (100%) mode change 100755 => 100644 rename Metagenomics/Illumina/Workflow_Documentation/{SW_MGIllumina => NF_MGIllumina}/workflow_code/bin/download-GTDBTK-db.sh (100%) mode change 100755 => 100644 rename Metagenomics/Illumina/Workflow_Documentation/{SW_MGIllumina => NF_MGIllumina}/workflow_code/bin/format-contig-tax-classifications.sh (100%) mode change 100755 => 100644 rename Metagenomics/Illumina/Workflow_Documentation/{SW_MGIllumina => NF_MGIllumina}/workflow_code/bin/format-gene-tax-classifications.sh (100%) mode change 100755 => 100644 rename Metagenomics/Illumina/Workflow_Documentation/{SW_MGIllumina => NF_MGIllumina}/workflow_code/bin/generate-assembly-based-overview-table.sh (100%) mode change 100755 => 100644 rename Metagenomics/Illumina/Workflow_Documentation/{SW_MGIllumina => NF_MGIllumina}/workflow_code/bin/get-cov-and-depth.sh (100%) mode change 100755 => 100644 rename Metagenomics/Illumina/Workflow_Documentation/{SW_MGIllumina => NF_MGIllumina}/workflow_code/bin/get_MAGs_estimates_and_taxonomy.sh (100%) mode change 100755 => 100644 rename Metagenomics/Illumina/Workflow_Documentation/{SW_MGIllumina => NF_MGIllumina}/workflow_code/bin/parse-MAG-annots.py (100%) mode change 100755 => 100644 create mode 100644 Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/prepull_singularity.sh rename Metagenomics/Illumina/Workflow_Documentation/{SW_MGIllumina => NF_MGIllumina}/workflow_code/bin/swap-MAG-IDs.py (100%) mode change 100755 => 100644 rename Metagenomics/Illumina/Workflow_Documentation/{SW_MGIllumina => NF_MGIllumina}/workflow_code/config/bbtools_adapters.fa (100%) create mode 100644 Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/config/multiqc.config create mode 100644 Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/bit.yaml create mode 100644 Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/cat.yaml create mode 100644 Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/checkm.yaml rename Metagenomics/Illumina/Workflow_Documentation/{SW_MGIllumina => NF_MGIllumina}/workflow_code/envs/genelab.yaml (100%) create mode 100644 Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/gtdb-tk.yaml create mode 100644 Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/humann3.yaml create mode 100644 
Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/image_def.bit create mode 100644 Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/image_def.genelab create mode 100644 Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/keggdecoder.yaml create mode 100644 Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/kofamscan.yaml create mode 100644 Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/mapping.yaml create mode 100644 Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/megahit.yaml create mode 100644 Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/metabat.yaml create mode 100644 Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/prodigal.yaml create mode 100644 Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/qc.yaml rename Metagenomics/Illumina/Workflow_Documentation/{SW_MGIllumina => NF_MGIllumina}/workflow_code/main.nf (99%) rename Metagenomics/Illumina/Workflow_Documentation/{SW_MGIllumina => NF_MGIllumina}/workflow_code/modules/assembly.nf (96%) rename Metagenomics/Illumina/Workflow_Documentation/{SW_MGIllumina => NF_MGIllumina}/workflow_code/modules/assembly_annotation.nf (93%) rename Metagenomics/Illumina/Workflow_Documentation/{SW_MGIllumina => NF_MGIllumina}/workflow_code/modules/assembly_based_processing.nf (100%) rename Metagenomics/Illumina/Workflow_Documentation/{SW_MGIllumina => NF_MGIllumina}/workflow_code/modules/binning.nf (100%) rename Metagenomics/Illumina/Workflow_Documentation/{SW_MGIllumina => NF_MGIllumina}/workflow_code/modules/combine_contig_annotation.nf (100%) rename Metagenomics/Illumina/Workflow_Documentation/{SW_MGIllumina => NF_MGIllumina}/workflow_code/modules/coverage.nf (96%) rename Metagenomics/Illumina/Workflow_Documentation/{SW_MGIllumina => NF_MGIllumina}/workflow_code/modules/create_runsheet.nf (90%) rename Metagenomics/Illumina/Workflow_Documentation/{SW_MGIllumina => NF_MGIllumina}/workflow_code/modules/database_creation.nf (96%) rename Metagenomics/Illumina/Workflow_Documentation/{SW_MGIllumina => NF_MGIllumina}/workflow_code/modules/quality_assessment.nf (95%) rename Metagenomics/Illumina/Workflow_Documentation/{SW_MGIllumina => NF_MGIllumina}/workflow_code/modules/read_based_processing.nf (97%) rename Metagenomics/Illumina/Workflow_Documentation/{SW_MGIllumina => NF_MGIllumina}/workflow_code/modules/read_mapping.nf (100%) rename Metagenomics/Illumina/Workflow_Documentation/{SW_MGIllumina => NF_MGIllumina}/workflow_code/modules/summarize_MAG.nf (96%) rename Metagenomics/Illumina/Workflow_Documentation/{SW_MGIllumina => NF_MGIllumina}/workflow_code/modules/summarize_assembly-based_processing.nf (100%) rename Metagenomics/Illumina/Workflow_Documentation/{SW_MGIllumina => NF_MGIllumina}/workflow_code/modules/summarize_bins.nf (96%) rename Metagenomics/Illumina/Workflow_Documentation/{SW_MGIllumina => NF_MGIllumina}/workflow_code/modules/zip_fasta.nf (100%) rename Metagenomics/Illumina/Workflow_Documentation/{SW_MGIllumina => NF_MGIllumina}/workflow_code/nextflow.config (89%) rename Metagenomics/Illumina/Workflow_Documentation/{SW_MGIllumina => NF_MGIllumina}/workflow_code/slurm_submit.slurm (95%) create mode 100644 Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/Snakefile delete mode 100755 
Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/clean-paths.sh create mode 100644 Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/config.yaml create mode 100644 Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/combine-all-gene-tables.py create mode 100644 Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/combine-benchmarks.sh create mode 100644 Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/combine-gene-level-coverages-annots-and-tax-per-group.py create mode 100644 Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/format-contig-tax-classifications.sh create mode 100644 Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/format-gene-tax-classifications.sh create mode 100644 Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/generate-assembly-based-overview-table.sh create mode 100644 Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/parse-MAG-annots.py create mode 100644 Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/slurm-status.py create mode 100644 Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/swap-MAG-IDs.py diff --git a/Metagenomics/Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md b/Metagenomics/Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md new file mode 100644 index 00000000..55e70382 --- /dev/null +++ b/Metagenomics/Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md @@ -0,0 +1,1117 @@ +# Bioinformatics pipeline for Illumina metagenomics data + +> **This document holds an overview and some example commands of how GeneLab processes Illumina metagenomics datasets. Exact processing commands for specific datasets that have been released are provided with their processed data in the [Open Science Data Repository (OSDR)](https://osdr.nasa.gov/bio/repo/).** + +--- + +**Date:** September 15, 2021 +**Revision:** - +**Document Number:** GL-DPPD-7107 + +**Submitted by:** +Michael D. Lee (GeneLab Analysis Team) + +**Approved by:** +Sylvain Costes (GeneLab Project Manager) +Samrawit Gebre (GeneLab Deputy Project Manager and Interim GeneLab Configuration Manager) +Amanda Saravia-Butler (GeneLab Data Processing Lead) +Jonathan Galazka (GeneLab Project Scientist) + +--- + +# Table of contents + +- [**Software used**](#software-used) +- [**General processing overview with example commands**](#general-processing-overview-with-example-commands) + - [**Pre-processing**](#pre-processing) + - [1. Raw Data QC](#1-raw-data-qc) + - [2. Quality filtering/trimming](#2-quality-filteringtrimming) + - [3. Filtered/Trimmed Data QC](#3-filteredtrimmed-data-qc) + - [**Assembly-based processing**](#assembly-based-processing) + - [4. Sample assembly](#4-sample-assembly) + - [5. Renaming contigs and summarizing assemblies](#5-renaming-contigs-and-summarizing-assemblies) + - [6. Gene prediction](#6-gene-prediction) + - [7. Functional annotation](#7-functional-annotation) + - [8. Taxonomic classification](#8-taxonomic-classification) + - [9. Read-mapping](#9-read-mapping) + - [10. Getting coverage information and filtering based on detection](#10-getting-coverage-information-and-filtering-based-on-detection) + - [11. 
Combining gene-level coverage, taxonomy, and functional annotations into one table for each sample](#11-combining-gene-level-coverage-taxonomy-and-functional-annotations-into-one-table-for-each-sample) + - [12. Combining contig-level coverage and taxonomy into one table for each sample](#12-combining-contig-level-coverage-and-taxonomy-into-one-table-for-each-sample) + - [13. Generating normalized, gene-level-coverage summary tables of KO-annotations and taxonomy across samples](#13-generating-normalized-gene-level-coverage-summary-tables-of-ko-annotations-and-taxonomy-across-samples) + - [14. **M**etagenome-**A**ssembled **G**enome (MAG) recovery](#14-metagenome-assembled-genome-mag-recovery) + - [15. Generating MAG-level functional summary overview](#15-generating-mag-level-functional-summary-overview) + - [**Read-based processing**](#read-based-processing) + - [16. Taxonomic and functional profiling](#16-taxonomic-and-functional-profiling) + +--- + +# Software used + +|Program|Version|Relevant Links| +|:------|:-----:|------:| +|FastQC| 0.12.1 |[https://www.bioinformatics.babraham.ac.uk/projects/fastqc/](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)| +|MultiQC| 1.19 |[https://multiqc.info/](https://multiqc.info/)| +|bbduk| 38.86 |[https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/)| +|megahit| 1.2.9 |[https://github.com/voutcn/megahit#megahit](https://github.com/voutcn/megahit#megahit)| +|bit| 1.8.53 |[https://github.com/AstrobioMike/bioinf_tools#bioinformatics-tools-bit](https://github.com/AstrobioMike/bioinf_tools#bioinformatics-tools-bit)| +|bowtie2| 2.4.1 |[https://github.com/BenLangmead/bowtie2#overview](https://github.com/BenLangmead/bowtie2#overview)| +|samtools| 1.2 |[https://github.com/samtools/samtools#samtools](https://github.com/samtools/samtools#samtools)| +|prodigal| 2.6.3 |[https://github.com/hyattpd/Prodigal#prodigal](https://github.com/hyattpd/Prodigal#prodigal)| +|KOFamScan| 1.3.0 |[https://github.com/takaram/kofam_scan#kofamscan](https://github.com/takaram/kofam_scan#kofamscan)| +|CAT| 5.2.3 |[https://github.com/dutilh/CAT#cat-and-bat](https://github.com/dutilh/CAT#cat-and-bat)| +|Metabat2| 2.15 |[https://bitbucket.org/berkeleylab/metabat/src/master/](https://bitbucket.org/berkeleylab/metabat/src/master/)| +|checkm| 1.1.3 |[https://github.com/Ecogenomics/CheckM](https://github.com/Ecogenomics/CheckM)| +|gtdbtk| 2.4.0 |[https://github.com/Ecogenomics/GTDBTk](https://github.com/Ecogenomics/GTDBTk)| +|KEGGDecoder| 1.2.2 |[https://github.com/bjtully/BioData/tree/master/KEGGDecoder#kegg-decoder](https://github.com/bjtully/BioData/tree/master/KEGGDecoder#kegg-decoder) +|HUMAnN3| 3.9 |[https://huttenhower.sph.harvard.edu/humann3/](https://huttenhower.sph.harvard.edu/humann3/)| +|MetaPhlAn3| 4.10 |[https://github.com/biobakery/MetaPhlAn/tree/3.0](https://github.com/biobakery/MetaPhlAn/tree/3.0)| + +--- + +# General processing overview with example commands + +> Exact processing commands and output files listed in **bold** below are included with each Metagenomics Seq processed dataset in the [Open Science Data Repository (OSDR)](https://osdr.nasa.gov/bio/repo/). + +## Pre-processing +### 1. 
Raw Data QC + +``` +fastqc -o raw_fastqc_output *raw.fastq.gz +``` + +**Parameter Definitions:** + +* `-o` – the output directory to store results +* `*raw.fastq.gz` – the input reads are specified as a positional argument, and can be given all at once with wildcards like this, or as individual arguments with spaces in between them + +**Input data:** + +* *raw.fastq.gz (raw reads, after human read removal) + +**Output data:** + +* *fastqc.html (FastQC output html summary) +* *fastqc.zip (FastQC output data) + + +#### 1a. Compile Raw Data QC + +``` +multiqc -o raw_multiqc_output -n raw_multiqc -z raw_fastqc_output/ +# this is how it's packaged with our workflow outputs +zip -r raw_multiqc_GLmetagenomics_report.zip raw_multiqc_output +``` + +**Parameter Definitions:** + +* `-o` – the output directory to store results +* `-n` – the filename prefix of results +* `-z` – specifies to zip the output data directory +* `raw_fastqc_output/` – the directory holding the output data from the fastqc run, provided as a positional argument + +**Input data:** + +* raw_fastqc_output/*fastqc.zip (FastQC output data) + +**Output data:** + +* **raw_multiqc_GLmetagenomics_report.zip** (zip containing the following) + * **raw_multiqc.html** (multiqc output html summary) + * **raw_multiqc_data** (directory containing multiqc output data) + +
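+
+> FastQC can also process several files in parallel via its `-t`/`--threads` option. A minimal, optional sketch (not part of the standard commands above):
+
+```bash
+fastqc -t 4 -o raw_fastqc_output *raw.fastq.gz
+```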
+ +--- + +### 2. Quality filtering/trimming + +``` +bbduk.sh in=sample-1-R1-raw.fastq.gz in2=sample-1-R2-raw.fastq.gz out1=sample-1_R1_filtered.fastq.gz \ + out2=sample-1_R2_filtered.fastq.gz ref=ref-adapters.fa ktrim=l k=17 ftm=5 qtrim=rl \ + trimq=10 mlf=0.5 maxns=0 > bbduk.log 2>&1 + +# if libraries were prepared with the Swift1S kit +# bbduk.sh in=sample-1-R1-raw.fastq.gz in2=sample-1-R2-raw.fastq.gz out1=sample-1_R1_filtered.fastq.gz \ + out2=sample-1_R2_filtered.fastq.gz ref=ref-adapters.fa ktrim=l k=17 ftm=5 qtrim=rl \ + trimq=10 mlf=0.5 maxns=0 swift=t > bbduk.log 2>&1 + +``` + +**Parameter Definitions:** + +* `in` and `in2` – specifies the forward and reverse input reads, respectively (no `in2` if working with single-end data) + +* `out1` and `out2` – specifies the forward and reverse output reads, respectively (no `out2` if working with single-end data) + +* `ref` – specifies a fasta file holding potential adapter sequences (comes with bbduk installation) + +* `ktrim` – specifies to trim adapters from the 5’ end (left) if found + +* `k` – sets minimum length of kmer match to identify adapter sequences (provided by the “ref” file above) + +* `ftm` – sets a multiple of expected length the sequence should be (handles poor additional bases that are sometimes present, see “Force-Trim Modulo” section on [this page](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/)) + +* `qtrim` – sets quality-score-based trimming to be applied to left and right sides + +* `trimq` – sets the score to use for PHRED-algorithm trimming + +* `mlf` – sets the minimum length of reads retained based on their initial length + +* `maxns` – sets the maximum number of Ns allowed in a read before it will be filtered out + +* `swift` – tells the program to look for and trim low-complexity adaptase reminants from the Swift1S kit + +* `> bbduk.log 2>&1` – redirects the stderr and stdout to a log file for saving + +**Input data:** + +* *raw.fastq.gz (raw reads) + +**Output data:** + +* **\*_filtered.fastq.gz** (filtered/trimmed reads) +* **\*-bbduk.log** (log file of standard output and error from bbduk run) + +
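+
+> When many samples are being processed, the same `bbduk.sh` call can be wrapped in a simple loop. This is only a sketch, not part of the pipeline; it assumes a plain-text `samples.txt` file listing one sample prefix per line, with raw reads named `<sample>-R1-raw.fastq.gz` and `<sample>-R2-raw.fastq.gz`:
+
+```bash
+for sample in $(cat samples.txt)
+do
+    # same settings as the single-sample command above
+    bbduk.sh in=${sample}-R1-raw.fastq.gz in2=${sample}-R2-raw.fastq.gz \
+             out1=${sample}_R1_filtered.fastq.gz out2=${sample}_R2_filtered.fastq.gz \
+             ref=ref-adapters.fa ktrim=l k=17 ftm=5 qtrim=rl trimq=10 mlf=0.5 maxns=0 \
+             > ${sample}-bbduk.log 2>&1
+done
+```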
+ +--- + +### 3. Filtered/Trimmed Data QC +``` +fastqc -o filtered_fastqc_output/ *filtered.fastq.gz +``` + +**Parameter Definitions:** + +* `-o` – the output directory to store results +* `*filtered.fastq.gz` – the input reads are specified as a positional argument, and can be given all at once with wildcards like this, or as individual arguments with spaces in between them + +**Input data:** + +* *filtered.fastq.gz (filtered/trimmed reads) + +**Output data:** + +* *fastqc.html (FastQC output html summary) +* *fastqc.zip (FastQC output data) + + +#### 3a. Compile Filtered/Trimmed Data QC +``` +multiqc -o filtered_multiqc_output -n filtered_multiqc -z filtered_fastqc_output/ +# this is how it's packaged with our workflow outputs +zip -r filtered_multiqc_GLmetagenomics_report.zip filtered_multiqc_output +``` + +**Parameter Definitions:** + +* `-o` – the output directory to store results +* `-n` – the filename prefix of results +* `-z` – specifies to zip the output data directory +* `filtered_fastqc_output/` – the directory holding the output data from the fastqc run, provided as a positional argument + +**Input data:** + +* filtered_fastqc_output/*fastqc.zip (FastQC output data) + +**Output data:** + +* **filtered_multiqc_GLmetagenomics_report.zip** (zip containing the following) + * **filtered_multiqc.html** (multiqc output html summary) + * **filtered_multiqc_data** (directory containing multiqc output data) + +
+ +--- + +## Assembly-based processing +### 4. Sample assembly +``` +megahit -1 sample-1_R1_filtered.fastq.gz -2 sample-1_R2_filtered.fastq.gz \ + -o sample-1-assembly -t 10 --min-contig-length 500 > sample-1-assembly.log 2>&1 +``` + +**Parameter Definitions:** + +* `-1 and -2` – specifies the input forward and reverse reads (if single-end data, then neither `-1` nor `-2` are used, instead single-end reads are passed to `-r`) + +* `-o` – specifies output directory + +* `-t` – specifies the number of threads to use + +* `--min-contig-length` – specifies the minimum contig length to write out + +* `> sample-1-assembly.log 2>&1` – sends stdout/stderr to log file + + +**Input data:** + +* *fastq.gz (filtered/trimmed reads) + +**Output data:** + +* sample-1-assembly/final.contigs.fa (assembly file) +* **sample-1-assembly.log** (log file) + +
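+
+> For single-end data, the parameter definitions above imply a call along these lines (a sketch only; the same minimum-contig-length setting shown above can be added as desired):
+
+```bash
+megahit -r sample-1_R1_filtered.fastq.gz -o sample-1-assembly -t 10 > sample-1-assembly.log 2>&1
+```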
+ +--- + +### 5. Renaming contigs and summarizing assemblies + +#### 5a. Renaming contig headers +``` +bit-rename-fasta-headers -i sample-1-assembly/final.contigs.fa -w c_sample-1 -o sample-1-assembly.fasta +``` + +**Parameter Definitions:** + +* `-i` – input fasta file + +* `-w` – wanted header prefix (a number will be appended for each contig), starts with a “c_” to ensure they won’t start with a number which can be problematic + +* `-o` – output fasta file + + +**Input data:** + +* sample-1-assembly/final.contigs.fa (assembly file) + +**Output files:** + +* **sample-1-assembly.fasta** (contig-renamed assembly file) + + +#### 5b. Summarizing assemblies + +``` +bit-summarize-assembly -o assembly-summaries.tsv *assembly.fasta +``` + +**Parameter Definitions:** + +* `-o` – output summary table + +* – multiple input assemblies can be provided as positional arguments + + +**Input data:** + +* *-assembly.fasta (contig-renamed assembly files) + +**Output files:** + +* **assembly-summaries_GLmetagenomics.tsv** (table of assembly summary statistics) + +
+ +--- + +### 6. Gene prediction +``` +prodigal -a sample-1-genes.faa -d sample-1-genes.fasta -f gff -p meta -c -q \ + -o sample-1-genes.gff -i sample-1-assembly.fasta +``` +**Parameter Definitions:** + +* `-a` – specifies the output amino acid sequences file + +* `-d` – specifies the output nucleotide sequences file + +* `-f` – specifies the output format gene-calls file + +* `-p` – specifies which mode to run the gene-caller in + +* `-c` – no incomplete genes reported + +* `-q` – run in quiet mode (don’t output process on each contig) + +* `-o` – specifies the name of the output gene-calls file + +* `-i` – specifies the input assembly + +**Input data:** + +* sample-1-assembly.fasta (assembly file) + +**Output data:** + +* **sample-1-genes.faa** (gene-calls amino-acid fasta file) +* **sample-1-genes.fasta** (gene-calls nucleotide fasta file) +* **sample-1-genes.gff** (gene-calls in general feature format) + +
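+
+> Optional sanity check (not part of the pipeline): the number of genes called for a sample can be counted directly from the fasta headers, e.g.:
+
+```bash
+grep -c ">" sample-1-genes.faa
+```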
+ +--- + +### 7. Functional annotation +> **Notes** +> The annotation process overwrites the same temporary directory by default. So if running multiple processses at a time, it is necessary to specify a specific temporary directory with the `--tmp-dir` argument as shown below. + + +#### 7a. Downloading reference database of HMM models (only needs to be done once) + +``` +curl -LO ftp://ftp.genome.jp/pub/db/kofam/profiles.tar.gz +curl -LO ftp://ftp.genome.jp/pub/db/kofam/ko_list.gz +tar -xzvf profiles.tar.gz +gunzip ko_list.gz +``` + +#### 7b. Running KEGG annotation +``` +exec_annotation -p profiles/ -k ko_list --cpu 15 -f detail-tsv -o sample-1-KO-tab.tmp \ + --tmp-dir sample-1-tmp-KO --report-unannotated sample-1-genes.faa +``` + +**Parameter Definitions:** +* `-p` – specifies the directory holding the downloaded reference HMMs + +* `-k` – specifies the downloaded reference KO (Kegg Orthology) terms + +* `--cpu` – specifies the number of searches to run in parallel + +* `-f` – specifies the output format + +* `-o` – specifies the output file name + +* `--tmp-dir` – specifies the temporary directory to write to (needed if running more than one process concurrently, see Notes above) + +* `--report-unannotated` – specifies to generate an output for each entry + +* `sample-1-genes.faa` – the input file is specified as a positional argument + + +**Input data:** + +* sample-1-genes.faa (amino-acid fasta file) +* profiles/ (reference directory holding the KO HMMs) +* ko_list (reference list of KOs to scan for) + +**Output data:** + +* sample-1-KO-tab.tmp (table of KO annotations assigned to gene IDs) + + +#### 7c. Filtering output to retain only those passing the KO-specific score and top hits +``` +bit-filter-KOFamScan-results -i sample-1-KO-tab.tmp -o sample-1-annotations.tsv + + # removing temporary files +rm -rf sample-1-tmp-KO/ sample-1-KO-annots.tmp +``` + +**Parameter Definitions:** + +* `-i` – specifies the input table + +* `-o` – specifies the output table + + +**Input data:** + +* sample-1-KO-tab.tmp (table of KO annotations assigned to gene IDs) + +**Output data:** + +* sample-1-annotations.tsv (table of KO annotations assigned to gene IDs) + +
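+
+> As mentioned in the Notes above, each concurrent KOFamScan run needs its own temporary directory. A sketch of looping over samples with per-sample `--tmp-dir` values (assumes a `samples.txt` file listing sample prefixes; not part of the pipeline itself):
+
+```bash
+for sample in $(cat samples.txt)
+do
+    exec_annotation -p profiles/ -k ko_list --cpu 15 -f detail-tsv -o ${sample}-KO-tab.tmp \
+                    --tmp-dir ${sample}-tmp-KO --report-unannotated ${sample}-genes.faa
+
+    bit-filter-KOFamScan-results -i ${sample}-KO-tab.tmp -o ${sample}-annotations.tsv
+
+    # removing this sample's temporary files
+    rm -rf ${sample}-tmp-KO/ ${sample}-KO-tab.tmp
+done
+```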
+ +--- + +### 8. Taxonomic classification + +#### 8a. Pulling and un-packing pre-built reference db (only needs to be done once) +``` +wget tbb.bio.uu.nl/bastiaan/CAT_prepare/CAT_prepare_20200618.tar.gz +tar -xvzf CAT_prepare_20200618.tar.gz +``` + +#### 8b. Running taxonomic classification +``` +CAT contigs -c sample-1-assembly.fasta -d CAT_prepare_20200618/2020-06-18_database/ \ + -t CAT_prepare_20200618/2020-06-18_taxonomy/ -p sample-1-genes.faa \ + -o sample-1-tax-out.tmp -n 15 -r 3 --top 4 --I_know_what_Im_doing +``` + +**Parameter Definitions:** + +* `-c` – specifies the input assembly fasta file + +* `-d` – specifies the CAT reference sequence database + +* `-t` – specifies the CAT reference taxonomy database + +* `-p` – specifies the input protein fasta file + +* `-o` – specifies the output prefix + +* `-n` – specifies the number of cores to use + +* `-r` – specifies the number of top protein hits to consider in assigning tax + +* `--top` – specifies the number of protein alignments to store + +* `--I_know_what_Im_doing` – allows us to alter the `--top` parameter + + +**Input data:** + +* sample-1-assembly.fasta (assembly file) +* sample-1-genes.faa (gene-calls amino-acid fasta file) + +**Output data:** + +* sample-1-tax-out.tmp.ORF2LCA.txt (gene-calls taxonomy file) +* sample-1-tax-out.tmp.contig2classification.txt (contig taxonomy file) + +#### 8c. Adding taxonomy info from taxids to genes +``` +CAT add_names -i sample-1-tax-out.tmp.ORF2LCA.txt -o sample-1-gene-tax-out.tmp \ + -t CAT_prepare_20200618/2020-06-18_taxonomy/ --only_official +``` + +**Parameter Definitions:** + +* `-i` – specifies the input taxonomy file + +* `-o` – specifies the output file + +* `-t` – specifies the CAT reference taxonomy database + +* `--only_official` – specifies to add only standard taxonomic ranks + +**Input data:** + +* sample-1-tax-out.tmp.ORF2LCA.txt (gene-calls taxonomy file) + +**Output data:** + +* sample-1-gene-tax-out.tmp (gene-calls taxonomy file with lineage info added) + + + +#### 8d. Adding taxonomy info from taxids to contigs +``` +CAT add_names -i sample-1-tax-out.tmp.contig2classification.txt -o sample-1-contig-tax-out.tmp \ + -t CAT-ref/2020-06-18_taxonomy/ --only_official +``` + +**Parameter Definitions:** + +* `-i` – specifies the input taxonomy file + +* `-o` – specifies the output file + +* `-t` – specifies the CAT reference taxonomy database + +* `--only_official` – specifies to add only standard taxonomic ranks + + +**Input data:** + +* sample-1-tax-out.tmp.contig2classification.txt (contig taxonomy file) + +**Output data:** + +* sample-1-contig-tax-out.tmp (contig taxonomy file with lineage info added) + + +#### 8e. Formatting gene-level output with awk and sed +``` +awk -F $'\t' ' BEGIN { OFS=FS } { if ( $2 == "lineage" ) { print $1,$2,$4,$5,$6,$7,$8,$9,$10 } \ + else if ( $2 == "ORF has no hit to database" || $2 ~ /^no taxid found/ ) \ + { print $1,"NA","NA","NA","NA","NA","NA","NA","NA" } else { n=split($2,lineage,";"); \ + print $1,lineage[n],$4,$5,$6,$7,$8,$9,$10 } } ' sample-1-gene-tax-out.tmp | \ + sed 's/not classified/NA/g' | sed 's/superkingdom/domain/' | sed 's/^# ORF/gene_ID/' | \ + sed 's/lineage/taxid/' | sed 's/\*//g' > sample-1-gene-tax-out.tsv +``` + +#### 8f. 
Formatting contig-level output with awk and sed +``` +awk -F $'\t' ' BEGIN { OFS=FS } { if ( $2 == "classification" ) { print $1,$4,$6,$7,$8,$9,$10,$11,$12 } \ + else if ( $2 == "unclassified" ) { print $1,"NA","NA","NA","NA","NA","NA","NA","NA" } \ + else { n=split($4,lineage,";"); print $1,lineage[n],$6,$7,$8,$9,$10,$11,$12 } } ' sample-1-contig-tax-out.tmp | \ + sed 's/not classified/NA/g' | sed 's/superkingdom/domain/' | sed 's/: [0-9\.]*//g' | sed 's/^# contig/contig_ID/' | \ + sed 's/lineage/taxid/' | sed 's/\*//g' > sample-1-contig-tax-out.tsv + + # clearing intermediate files +rm sample-1*.tmp* +``` + +**Input data:** + +* sample-1-gene-tax-out.tmp (gene-calls taxonomy file with lineage info added) +* sample-1-contig-tax-out.tmp (contig taxonomy file with lineage info added) + + +**Output data:** + +* sample-1-gene-tax-out.tsv (gene-calls taxonomy file with lineage info added reformatted) +* sample-1-contig-tax-out.tsv (contig taxonomy file with lineage info added reformatted) + +
+ +--- + +### 9. Read-mapping + +#### 9a. Building reference index +``` +bowtie2-build sample-1-assembly.fasta sample-1-assembly-bt-index +``` + +**Parameter Definitions:** + +* `sample-1-assembly.fasta` - first positional argument specifies the input assembly + +* `sample-1-assembly-bt-index` - second positional argument specifies the prefix of the output index files + + +#### 9b. Performing mapping, conversion to bam, and sorting +``` +bowtie2 --threads 15 -x sample-1-assembly-bt-index -1 sample-1_R1_filtered.fastq.gz \ + -2 sample-1_R2_filtered.fastq.gz 2> sample-1-mapping-info.txt | samtools view -b | samtools sort -@ 15 > sample-1.bam +``` + +**Parameter Definitions:** + +* `--threads` – specifies the number of threads to run in parallel + +* `-x` – specifies the prefix of the reference index files to map to (generated in the previous `bowtie2-build` step + +* `-1 and -2` – specifies the forward and reverse reads to map (if single-end data, neither `-1` nor `-2` are provided, and the single-end reads are passed to `-r`) + +* `2> sample-1-mapping-info.txt` – capture the printed summary results in a log file + +* `samtools view -b` – convert the output directly to bam format (compressed) + +* `samtools sort -@` – sort the bam file using the specified number of threads + +* `>` – redirect the output to a file + +#### 9c. Indexing +``` +samtools index -@ 15 sample-1.bam +``` + +**Parameter Definitions:** +* `-@` – set number of threads to use + +* `sample-1.bam` - input bam file is provided as a positional argument as generated from the above mapping step + +**Input data:** + +* sample-1-assembly.fasta (assembly file) +* *.fastq.gz (filtered/trimmed reads) + +**Output data:** + +* **sample-1.bam** (mapping file) +* sample-1.bam.bai (bam index file) +* **sample-1-mapping-info.txt** (read-mapping log file) + +
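+
+> Optional: each sample's overall alignment rate can be pulled from its mapping log, e.g.:
+
+```bash
+grep "overall alignment rate" sample-1-mapping-info.txt
+```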
+ +--- + +### 10. Getting coverage information and filtering based on detection +> **Notes** +> “Detection” is a metric of what proportion of a reference sequence recruited reads (see [here](http://merenlab.org/2017/05/08/anvio-views/#detection)). Filtering based on detection is one way of helping to mitigate non-specific read-recruitment. + +#### 10a. Filtering coverage levels based on detection + +``` + # pileup.sh comes from the bbduk.sh package +pileup.sh -in sample-1.bam fastaorf=sample-1-genes.fasta outorf=sample-1-gene-cov-and-det.tmp \ + out=sample-1-contig-cov-and-det.tmp +``` + +**Parameter Definitions:** + +* `-in` – the input bam file + +* `fastaorf=` – input gene-calls nucleotide fasta file + +* `outorf=` – the output gene-coverage tsv file + +* `out=` – the output contig-coverage tsv file + + +#### 10b. Filtering gene coverage based on requiring 50% detection and parsing down to just gene ID and coverage +``` +grep -v "#" sample-1-gene-cov-and-det.tmp | awk -F $'\t' ' BEGIN { OFS=FS } { if ( $10 <= 0.5 ) $4 = 0 } \ + { print $1,$4 } ' > sample-1-gene-cov.tmp + +cat <( printf "gene_ID\tcoverage\n" ) sample-1-gene-cov.tmp > sample-1-gene-coverages.tsv +``` + +Filtering contig coverage based on requiring 50% detection and parsing down to just contig ID and coverage: +``` +grep -v "#" sample-1-contig-cov-and-det.tmp | awk -F $'\t' ' BEGIN { OFS=FS } { if ( $5 <= 50 ) $2 = 0 } \ + { print $1,$2 } ' > sample-1-contig-cov.tmp + +cat <( printf "contig_ID\tcoverage\n" ) sample-1-contig-cov.tmp > sample-1-contig-coverages.tsv + + # removing intermediate files + +rm sample-1-*.tmp +``` + +**Input data:** + +* sample-1.bam (mapping file) +* sample-1-genes.fasta (gene-calls nucleotide fasta file) + +**Output data:** + +* sample-1-gene-coverages.tsv (table with gene-level coverages) +* sample-1-contig-coverages.tsv (table with contig-level coverages) + +
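+
+> As a toy illustration of the 50%-detection rule applied above (not part of the pipeline), two made-up genes with coverages 12.5 and 8.2 and detections 0.40 and 0.80 in columns 4 and 10 would be handled like so:
+
+```bash
+printf "gene_1\t0\t0\t12.5\t0\t0\t0\t0\t0\t0.40\ngene_2\t0\t0\t8.2\t0\t0\t0\t0\t0\t0.80\n" | \
+    awk -F $'\t' ' BEGIN { OFS=FS } { if ( $10 <= 0.5 ) $4 = 0 } { print $1,$4 } '
+# gene_1's coverage is zeroed out (detection 0.40 <= 0.5); gene_2 keeps its 8.2
+```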
+ +--- + +### 11. Combining gene-level coverage, taxonomy, and functional annotations into one table for each sample +> **Notes** +> Just uses `paste`, `sed`, and `awk`, all are standard in any Unix-like environment. + +``` +paste <( tail -n +2 sample-1-gene-coverages.tsv | sort -V -k 1 ) <( tail -n +2 sample-1-annotations.tsv | sort -V -k 1 | cut -f 2- ) \ + <( tail -n +2 sample-1-gene-tax-out.tsv | sort -V -k 1 | cut -f 2- ) > sample-1-gene-tab.tmp + +paste <( head -n 1 sample-1-gene-coverages.tsv ) <( head -n 1 sample-1-annotations.tsv | cut -f 2- ) \ + <( head -n 1 sample-1-gene-tax-out.tsv | cut -f 2- ) > sample-1-header.tmp + +cat sample-1-header.tmp sample-1-gene-tab.tmp > sample-1-gene-coverage-annotation-and-tax.tsv + + # removing intermediate files +rm sample-1*tmp sample-1-gene-coverages.tsv sample-1-annotations.tsv sample-1-gene-tax-out.tsv +``` + +**Input data:** + +* sample-1-gene-coverages.tsv (table with gene-level coverages from step 10) +* sample-1-annotations.tsv (table of KO annotations assigned to gene IDs from step 7) +* sample-1-gene-tax-out.tsv (gene-level taxonomic classifications from step 8) + + +**Output data:** + +* **sample-1-gene-coverage-annotation-and-tax.tsv** (table with combined gene coverage, annotation, and taxonomy info) + +
+ +--- + +### 12. Combining contig-level coverage and taxonomy into one table for each sample +> **Notes** +> Just uses `paste`, `sed`, and `awk`, all are standard in any Unix-like environment. + +``` +paste <( tail -n +2 sample-1-contig-coverages.tsv | sort -V -k 1 ) \ + <( tail -n +2 sample-1-contig-tax-out.tsv | sort -V -k 1 | cut -f 2- ) > sample-1-contig.tmp + +paste <( head -n 1 sample-1-contig-coverages.tsv ) <( head -n 1 sample-1-contig-tax-out.tsv | cut -f 2- ) \ + > sample-1-contig-header.tmp + +cat sample-1-contig-header.tmp sample-1-contig.tmp > sample-1-contig-coverage-and-tax.tsv + + # removing intermediate files +rm sample-1*tmp sample-1-contig-coverages.tsv sample-1-contig-tax-out.tsv +``` + +**Input data:** + +* sample-1-contig-coverages.tsv (table with contig-level coverages from step 10) +* sample-1-contig-tax-out.tsv (contig-level taxonomic classifications from step 8) + + +**Output data:** + +* **sample-1-contig-coverage-and-tax.tsv** (table with combined contig coverage and taxonomy info) + +
+ +--- + +### 13. Generating normalized, gene-level-coverage summary tables of KO-annotations and taxonomy across samples +> **Notes** +> * To combine across samples to generate these summary tables, we need the same "units". This is done for annotations based on the assigned KO terms, and all non-annotated functions are included together as "Not annotated". It is done for taxonomic classifications based on taxids (full lineages included in the table), and any not classified are included together as "Not classified". +> * The values we are working with are coverage per gene (so they are number of bases recruited to the gene normalized by the length of the gene). These have been normalized by making the total coverage of a sample 1,000,000 and setting each individual gene-level coverage its proportion of that 1,000,000 total. So basically percent, but out of 1,000,000 instead of 100 to make the numbers more friendly. + +``` +bit-GL-combine-KO-and-tax-tables *-gene-coverage-annotation-and-tax.tsv -o Combined +``` + +**Parameter Definitions:** + +* takes positional arguments specifying the input tsv files, can be provided as a space-delimited list of files, or with wildcards like above + +- `-o` – specifies the output prefix + + +**Input data:** + +* *-gene-coverage-annotation-and-tax.tsv (tables with combined gene coverage, annotation, and taxonomy info generated for individual samples from step 12) + +**Output data:** + +* **Combined-gene-level-KO-function-coverages-CPM_GLmetagenomics.tsv** (table with all samples combined based on KO annotations; normalized to coverage per million genes covered) +* **Combined-gene-level-taxonomy-coverages-CPM_GLmetagenomics.tsv** (table with all samples combined based on gene-level taxonomic classifications; normalized to coverage per million genes covered) + +
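+
+> The normalization described in the Notes above is handled internally by `bit-GL-combine-KO-and-tax-tables`. Purely to illustrate the arithmetic, rescaling one sample's gene-level coverages (column 2 of its combined table) so they sum to 1,000,000 could look like this sketch (the output file name is just for illustration):
+
+```bash
+total=$(tail -n +2 sample-1-gene-coverage-annotation-and-tax.tsv | awk -F $'\t' ' { sum += $2 } END { print sum } ')
+
+tail -n +2 sample-1-gene-coverage-annotation-and-tax.tsv | \
+    awk -F $'\t' -v total=${total} ' BEGIN { OFS=FS } { print $1, $2 / total * 1000000 } ' > sample-1-gene-CPM-illustration.tsv
+```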
+ +--- + +### 14. **M**etagenome-**A**ssembled **G**enome (MAG) recovery + +#### 14a. Binning contigs +``` +jgi_summarize_bam_contig_depths --outputDepth sample-1-metabat-assembly-depth.tsv --percentIdentity 97 --minContigLength 1000 --minContigDepth 1.0 --referenceFasta sample-1-assembly.fasta sample-1.bam + +metabat2 --inFile sample-1-assembly.fasta --outFile sample-1 --abdFile sample-1-metabat-assembly-depth.tsv -t 4 + +mkdir sample-1-bins +mv sample-1*bin*.fasta sample-1-bins +zip -r sample-1-bins.zip sample-1-bins +``` + +**Parameter Definitions:** + +* `--outputDepth` – specifies the output depth file +* `--percentIdentity` – minimum end-to-end percent identity of a mapped read to be included +* `--minContigLength` – minimum contig length to include +* `--minContigDepth` – minimum contig depth to include +* `--referenceFasta` – the assembly fasta file generated in step 4 +* `sample-1.bam` – final positional arguments are the bam files generated in step 9 +* `--inFile` - the assembly fasta file generated in step 4 +* `--outFile` - the prefix of the identified bins output files +* `--abdFile` - the depth file generated by the previous `jgi_summarize_bam_contig_depths` command +* `-t` - specifies number of threads to use + + +**Input data:** + +* sample-1-assembly.fasta (assembly fasta file created in step 4) +* sample-1.bam (bam file created in step 9) + +**Output data:** + +* **sample-1-metabat-assembly-depth.tsv** (tab-delimited summary of coverages) +* sample-1-bins/sample-1-bin\*.fasta (fasta files of recovered bins) +* **sample-1-bins.zip** (zip file containing fasta files of recovered bins) + +#### 14b. Bin quality assessment +Utilizes the default `checkm` database available [here](https://data.ace.uq.edu.au/public/CheckM_databases/checkm_data_2015_01_16.tar.gz), `checkm_data_2015_01_16.tar.gz`. + +``` +checkm lineage_wf -f bins-overview_GLmetagenomics.tsv --tab_table -x fa ./ checkm-output-dir +``` + +**Parameter Definitions:** + +* `lineage_wf` – specifies the workflow being utilized +* `-f` – specifies the output summary file +* `--tab_table` – specifies the output summary file should be a tab-delimited table +* `-x` – specifies the extension that is on the bin fasta files that are being assessed +* `./` – first positional argument at end specifies the directory holding the bins generated in step 14a +* `checkm-output-dir` – second positional argument at end specifies the primary checkm output directory with detailed information + +**Input data:** + +* bin fasta files generated by step 14a + +**Output data:** + +* **bins-overview_GLmetagenomics.tsv** (tab-delimited file with quality estimates per bin) +* checkm-output-dir (directory holding detailed checkm outputs) + +#### 14c. 
Filtering MAGs + +``` +cat <( head -n 1 bins-overview_GLmetagenomics.tsv ) \ + <( awk -F $'\t' ' $12 >= 90 && $13 <= 10 && $14 == 0 ' bins-overview_GLmetagenomics.tsv | sed 's/bin./MAG-/' ) \ + > checkm-MAGs-overview.tsv + +# copying bins into a MAGs directory in order to run tax classification +awk -F $'\t' ' $12 >= 90 && $13 <= 10 && $14 == 0 ' bins-overview_GLmetagenomics.tsv | cut -f 1 > MAG-bin-IDs.tmp + +mkdir MAGs +for ID in $(cat MAG-bin-IDs.tmp) +do + MAG_ID=$(echo $ID | sed 's/bin./MAG-/') + cp ${ID}.fasta MAGs/${MAG_ID}.fasta +done + +for SAMPLE in $(cat MAG-bin-IDs.tmp | sed 's/-bin.*//' | sort -u); +do + mkdir ${SAMPLE}-MAGs + mv ${SAMPLE}-*MAG*.fasta ${SAMPLE}-MAGs + zip -r ${SAMPLE}-MAGs.zip ${SAMPLE}-MAGs +done +``` + +**Input data:** + +* bins-overview_GLmetagenomics.tsv (tab-delimited file with quality estimates per bin) + +**Output data:** + +* checkm-MAGs-overview.tsv (tab-delimited file with quality estimates per MAG) +* MAGs/\*.fasta (directory holding high-quality MAGs) +* **\*-MAGs.zip** (zip files containing directories of high-quality MAGs) + + +#### 14d. MAG taxonomic classification +Uses the default `gtdbtk` database, set up with the program's `download-db.sh` command. + +``` +gtdbtk classify_wf --genome_dir MAGs/ -x fa --out_dir gtdbtk-output-dir --skip_ani_screen +``` + +**Parameter Definitions:** + +* `classify_wf` – specifies the workflow being utilized +* `--genome_dir` – specifies the directory holding the MAGs generated in step 14c +* `-x` – specifies the extension that is on the MAG fasta files that are being taxonomically classified +* `--out_dir` – specifies the output directory +* `--skip_ani_screen` – specifies to skip the ANI pre-screening step (which compares genomes against a Mash/skani reference) and proceed directly to classification + +**Input data:** + +* MAGs/\*.fasta (directory holding high-quality MAGs) + +**Output data:** + +* gtdbtk-output-dir/gtdbtk.\*.summary.tsv (files with assigned taxonomy and info) + +
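+The reference-database setup itself is not shown above. As a rough sketch only, a manual setup along the following lines is what the helper tooling automates: the directory name below is a placeholder, the download link is the GTDB r220 package referenced in the accompanying workflow configuration, and `GTDBTK_DATA_PATH` is the environment variable `gtdbtk` checks for its reference data.
+
+```bash
+# sketch only: manually fetching and unpacking the GTDB-Tk reference data,
+# then pointing gtdbtk at it via its expected environment variable
+mkdir -p GTDB-tk-ref-db
+wget https://data.gtdb.ecogenomic.org/releases/release220/220.0/auxillary_files/gtdbtk_package/full_package/gtdbtk_r220_data.tar.gz
+tar -xzf gtdbtk_r220_data.tar.gz -C GTDB-tk-ref-db --strip-components 1
+export GTDBTK_DATA_PATH=$(pwd)/GTDB-tk-ref-db
+```
+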
+ +--- + +### 15. Generating MAG-level functional summary overview + +#### 15a. Getting KO annotations per MAG +This utilizes the helper script [`parse-MAG-annots.py`](../Workflow_Documentation/NF_MGIllumina/workflow_code/bin/parse-MAG-annots.py). + +```bash +for file in $( ls MAGs/*.fasta ) +do + + MAG_ID=$( echo ${file} | cut -f 2 -d "/" | sed 's/.fasta//' ) + sample_ID=$( echo ${MAG_ID} | sed 's/-MAG-[0-9]*$//' ) + + grep "^>" ${file} | tr -d ">" > ${MAG_ID}-contigs.tmp + + python parse-MAG-annots.py -i annotations-and-taxonomy/${sample_ID}-gene-coverage-annotation-and-tax.tsv \ + -w ${MAG_ID}-contigs.tmp -M ${MAG_ID} \ + -o MAG-level-KO-annotations_GLmetagenomics.tsv + + rm ${MAG_ID}-contigs.tmp + +done +``` + +**Parameter Definitions:** + +* `-i` – specifies the input sample gene-coverage-annotation-and-tax.tsv file generated in step 12 above + +* `-w` – specifies the appropriate temporary file holding all the contigs in the current MAG + +* `-M` – specifies the current MAG unique identifier + +* `-o` – specifies the output file + +**Input data:** + +* \*-gene-coverage-annotation-and-tax.tsv (sample gene-coverage-annotation-and-tax.tsv file generated in step 12 above) + +**Output data:** + +* **MAG-level-KO-annotations_GLmetagenomics.tsv** (tab-delimited table holding MAGs and their KO annotations) + + +#### 15b. Summarizing KO annotations with KEGG-Decoder + +```bash +KEGG-decoder -v interactive -i MAG-level-KO-annotations_GLmetagenomics.tsv -o MAG-KEGG-Decoder-out_GLmetagenomics.tsv +``` + +**Parameter Definitions:** + +* `-v interactive` – specifies to create an interactive html output + +* `-i` – specifies the input MAG-level-KO-annotations_GLmetagenomics.tsv file generated in step 15a above + +* `-o` – specifies the output table + +**Input data:** + +* MAG-level-KO-annotations_GLmetagenomics.tsv (tab-delimited table holding MAGs and their KO annotations, generated in step 15a above) + +**Output data:** + +* **MAG-KEGG-Decoder-out_GLmetagenomics.tsv** (tab-delimited table holding MAGs and the proportion of genes they hold that are known to be required for specific pathways/metabolisms) + +* **MAG-KEGG-Decoder-out_GLmetagenomics.html** (interactive heatmap html file of the above output table) + +
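+As an optional sanity check on the step 15a output before or after running KEGG-Decoder, the MAG-level KO table can be summarized quickly at the command line, e.g. counting how many KO-annotated gene rows were recorded per MAG. This is a sketch only and not part of the workflow; it assumes the MAG identifier sits in the first column and that the table has a single header row.
+
+```bash
+# count the number of KO-annotation rows recorded per MAG (assumed column layout)
+tail -n +2 MAG-level-KO-annotations_GLmetagenomics.tsv | cut -f 1 | sort | uniq -c
+```
+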
+ +--- + +## Read-based processing +### 16. Taxonomic and functional profiling +The following uses the `humann3` and `metaphlan3` reference databases downloaded on 26-Sept-2020 as follows: + +```bash +humann_databases --download chocophlan full +humann_databases --download uniref uniref90_diamond +humann_databases --download utility_mapping full +metaphlan --install +``` + +#### 16a. Running humann3 (which also runs metaphlan3) +```bash + # forward and reverse reads need to be provided combined if paired-end (if not paired-end, single-end reads are provided to the --input argument next) +cat sample-1_R1_filtered.fastq.gz sample-1_R2_filtered.fastq.gz > sample-1-combined.fastq.gz + +humann --input sample-1-combined.fastq.gz --output sample-1-humann3-out-dir --threads 15 \ + --output-basename sample-1 --metaphlan-options "--unknown_estimation --add_viruses \ + --sample_id sample-1" +``` + +**Parameter Definitions:** + +* `--input` – specifies the input combined forward and reverse reads (if paired-end) + +* `--output` – specifies the output directory + +* `--threads` – specifies the number of threads to use + +* `--output-basename` – specifies the prefix of the output files + +* `--metaphlan-options` – options to be passed to metaphlan + * `--unknown_estimation` – include unclassified in estimated relative abundances + * `--add_viruses` – include viruses in the reference database + * `--sample_id` – specifies the sample identifier we want in the table (rather than the full filename) + + +#### 16b. Merging multiple sample functional profiles into one table +```bash + # they need to be in their own directories +mkdir genefamily-results/ pathabundance-results/ pathcoverage-results/ + + # copying results from the previous humann3 step (16a) to get them all together in their own directories (as is needed) +cp *-humann3-out-dir/*genefamilies.tsv genefamily-results/ +cp *-humann3-out-dir/*abundance.tsv pathabundance-results/ +cp *-humann3-out-dir/*coverage.tsv pathcoverage-results/ + +humann_join_tables -i genefamily-results/ -o gene-families.tsv +humann_join_tables -i pathabundance-results/ -o path-abundances.tsv +humann_join_tables -i pathcoverage-results/ -o path-coverages.tsv +``` + +**Parameter Definitions:** + +* `-i` – the directory holding the input tables + +* `-o` – the name of the output combined table + + +#### 16c. Splitting results tables +The read-based functional annotation tables initially have taxonomic info and non-taxonomic info mixed together. `humann` comes with a helper script to split these. Here we are using that to generate both non-taxonomically grouped functional info files and taxonomically grouped ones. + +```bash +humann_split_stratified_table -i gene-families.tsv -o ./ +mv gene-families_stratified.tsv Gene-families-grouped-by-taxa_GLmetagenomics.tsv +mv gene-families_unstratified.tsv Gene-families_GLmetagenomics.tsv + +humann_split_stratified_table -i path-abundances.tsv -o ./ +mv path-abundances_stratified.tsv Path-abundances-grouped-by-taxa_GLmetagenomics.tsv +mv path-abundances_unstratified.tsv Path-abundances_GLmetagenomics.tsv + +humann_split_stratified_table -i path-coverages.tsv -o ./ +mv path-coverages_stratified.tsv Path-coverages-grouped-by-taxa_GLmetagenomics.tsv +mv path-coverages_unstratified.tsv Path-coverages_GLmetagenomics.tsv +``` + +**Parameter Definitions:** + +* `-i` – the input combined table + +* `-o` – output directory (here specifying the current directory) + + +#### 16d.
Normalizing gene families and pathway abundance tables +This generates normalized tables of the read-based functional outputs from humann that are more readily suitable for across-sample comparisons. + +```bash +humann_renorm_table -i Gene-families_GLmetagenomics.tsv -o Gene-families-cpm_GLmetagenomics.tsv --update-snames +humann_renorm_table -i Path-abundances_GLmetagenomics.tsv -o Path-abundances-cpm_GLmetagenomics.tsv --update-snames +``` + +**Parameter Definitions:** + +* `-i` – the input combined table + +* `-o` – name of the output normalized table + +* `--update-snames` – change suffix of column names in tables to "-CPM" + + +#### 16e. Generating a normalized gene-family table that is grouped by KEGG Orthologs (KOs) + +```bash +humann_regroup_table -i Gene-families_GLmetagenomics.tsv -g uniref90_ko | humann_rename_table -n kegg-orthology | \ + humann_renorm_table -o Gene-families-KO-cpm_GLmetagenomics.tsv --update-snames +``` + +**Parameter Definitions:** + +* `-i` – the input table + +* `-g` – the map to use to group uniref IDs into KEGG Orthologs + +* `|` – sending that output into the next humann command to add human-readable KEGG Orthology names + +* `-n` – specifying we are converting KEGG Orthology IDs into human-readable KEGG Orthology names + +* `|` – sending that output into the next humann command to normalize to copies-per-million + +* `-o` – specifying the final output file name + +* `--update-snames` – change suffix of column names in tables to "-CPM" + +#### 16f. Combining taxonomy tables + +```bash +merge_metaphlan_tables.py *-humann3-out-dir/*_humann_temp/*_metaphlan_bugs_list.tsv > Metaphlan-taxonomy_GLmetagenomics.tsv +``` + +**Parameter Definitions:** + +* input metaphlan tables are provided as positional arguments (produced during the humann3 run above, step 16a) + +* `>` – output is redirected from stdout to a file + + +**Input data:** + +* *fastq.gz (filtered/trimmed reads from step 2, forward and reverse reads concatenated if paired-end) + +**Output data:** + +* **Gene-families_GLmetagenomics.tsv** (gene-family abundances) +* **Gene-families-grouped-by-taxa_GLmetagenomics.tsv** (gene-family abundances grouped by taxa) +* **Gene-families-cpm_GLmetagenomics.tsv** (gene-family abundances normalized to copies-per-million) +* **Gene-families-KO-cpm_GLmetagenomics.tsv** (KO term abundances normalized to copies-per-million) +* **Pathway-abundances_GLmetagenomics.tsv** (pathway abundances) +* **Pathway-abundances-grouped-by-taxa_GLmetagenomics.tsv** (pathway abundances grouped by taxa) +* **Pathway-abundances-cpm_GLmetagenomics.tsv** (pathway abundances normalized to copies-per-million) +* **Pathway-coverages_GLmetagenomics.tsv** (pathway coverages) +* **Pathway-coverages-grouped-by-taxa_GLmetagenomics.tsv** (pathway coverages grouped by taxa) +* **Metaphlan-taxonomy_GLmetagenomics.tsv** (metaphlan estimated taxonomic relative abundances) + +--- diff --git a/Metagenomics/Illumina/README.md b/Metagenomics/Illumina/README.md index 881134f7..0b842d28 100644 --- a/Metagenomics/Illumina/README.md +++ b/Metagenomics/Illumina/README.md @@ -1,7 +1,7 @@ # GeneLab bioinformatics processing pipeline for Illumina metagenomics sequencing data -> **The document [`GL-DPPD-7107.md`](Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107.md) holds an overview and example commands for how GeneLab processes Illumina metagenomics sequencing datasets. See the [Repository Links](#repository-links) descriptions below for more information.
Processed data output files and processing code are provided for each GLDS dataset in the [Open Science Data Repository (OSDR)](https://osdr.nasa.gov/bio/repo/).** +> **The document [`GL-DPPD-7107-A.md`](Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md) holds an overview and example commands for how GeneLab processes Illumina metagenomics sequencing datasets. See the [Repository Links](#repository-links) descriptions below for more information. Processed data output files and processing code are provided for each GLDS dataset in the [Open Science Data Repository (OSDR)](https://osdr.nasa.gov/bio/repo/).** > > Note: The exact processing commands and MGIllumina version used for specific GLDS datasets can be found in the *_processing_info.zip file under "Files" for each respective GLDS dataset in the [Open Science Data Repository (OSDR)](https://osdr.nasa.gov/bio/repo/). @@ -26,4 +26,4 @@ --- **Developed and maintained by:** -Michael D. Lee (Mike.Lee@nasa.gov) +Michael D. Lee (Mike.Lee@nasa.gov) and Olabiyi A. Obayomi (olabiyi.a.obayomi@nasa.gov) diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/CHANGELOG.md b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/CHANGELOG.md new file mode 100644 index 00000000..c3da8cd0 --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/CHANGELOG.md @@ -0,0 +1,4 @@ +# Workflow change log + +## [1.0.0](https://github.com/nasa/GeneLab_Data_Processing/tree/NF_MGIllumina_1.0.0/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina) +- Workflow version converted from Snakemake to Nextflow \ No newline at end of file diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/README.md b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/README.md new file mode 100644 index 00000000..7e2899b2 --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/README.md @@ -0,0 +1,179 @@ +# Workflow Information and Usage Instructions + +## General Workflow Info + +### Implementation Tools +The current GeneLab Illumina metagenomics sequencing data processing pipeline (MGIllumina), [GL-DPPD-7107-A.md](../../Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md), is implemented as a [Nextflow](https://nextflow.io/) DSL2 workflow and utilizes [Singularity](https://docs.sylabs.io/guides/3.10/user-guide/introduction.html) containers or [conda](https://docs.conda.io/en/latest/) environments to install/run all tools. This workflow is run using the command line interface (CLI) of any unix-based system. While knowledge of creating workflows in nextflow is not required to run the workflow as is, [the Nextflow documentation](https://nextflow.io/docs/latest/index.html) is a useful resource for users who want to modify and/or extend this workflow. + +> **Note on reference databases** +> Many reference databases are relied upon throughout this workflow. They will be installed and set up automatically the first time the workflow is run. Altogether, once installed and unpacked, they will take up about 340 GB of storage, but they may also require up to 500 GB during installation and initial unpacking, so be sure there is enough room on your system before running the workflow. + + +## Utilizing the Workflow + +1. [Install Nextflow and Singularity](#1-install-nextflow-and-singularity) + 1a. [Install Nextflow](#1a-install-nextflow) + 1b. [Install Singularity](#1b-install-singularity) + +2. [Download the workflow files](#2-download-the-workflow-files) + +3.
[Fetch Singularity Images](#3-fetch-singularity-images) + +4. [Run the workflow](#4-run-the-workflow) + 4a. [Approach 1: Run slurm jobs in singularity containers with OSD accession as input](#4a-approach-1-run-slurm-jobs-in-singularity-containers-with-osd-accession-as-input) + 4b. [Approach 2: Run slurm jobs in singularity containers with a csv file as input](#4b-approach-2-run-slurm-jobs-in-singularity-containers-with-a-csv-file-as-input) + 4c. [Approach 3: Run jobs locally in conda environments and specify the path to one or more existing conda environments](#4c-approach-3-run-jobs-locally-in-conda-environments-and-specify-the-path-to-one-or-more-existing-conda-environments) + 4d. [Modify parameters and cpu resources in the nextflow config file](#4d-modify-parameters-and-cpu-resources-in-the-nextflow-config-file) + +5. [Workflow outputs](#5-workflow-outputs) + 5a. [Main outputs](#5a-main-outputs) + 5b. [Resource logs](#5b-resource-logs) + +
+ +--- + +### 1. Install Nextflow and Singularity + +#### 1a. Install Nextflow + +Nextflow can be installed either through [Anaconda](https://anaconda.org/bioconda/nextflow) or as documented on the [Nextflow documentation page](https://www.nextflow.io/docs/latest/getstarted.html). + +> Note: If you want to install Anaconda, we recommend installing a Miniconda Python3 version appropriate for your system, as instructed by [Happy Belly Bioinformatics](https://astrobiomike.github.io/unix/conda-intro#getting-and-installing-conda). +> +> Once conda is installed on your system, you can install the latest version of Nextflow by running the following commands: +> +> ```bash +> conda install -c bioconda nextflow +> nextflow self-update +> ``` + +
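+> You can confirm the installation worked, and see which version you have, with:
+>
+> ```bash
+> # prints the installed Nextflow version
+> nextflow -version
+> ```
+> The workflow's [nextflow.config](workflow_code/nextflow.config) specifies a minimum Nextflow version of 22.10.6.
+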
+ +#### 1b. Install Singularity + +Singularity is a container platform that allows usage of containerized software. This enables the GeneLab workflow to retrieve and use all software required for processing without the need to install the software directly on the user's system. + +We recommend installing Singularity on a system wide level as per the associated [documentation](https://docs.sylabs.io/guides/3.10/admin-guide/admin_quickstart.html). + +> Note: Singularity is also available through [Anaconda](https://anaconda.org/conda-forge/singularity). + +
+ +--- + +### 2. Download the workflow files + +All files required for utilizing the NF_MGIllumina GeneLab workflow for processing Illumina metagenomics data are in the [workflow_code](workflow_code) directory. To get a copy of the latest *NF_MGIllumina* version onto your system, download the workflow code as a zip file from the release page and unzip it by running the following commands: + +```bash +wget https://github.com/nasa/GeneLab_Data_Processing/releases/download/NF_MGIllumina/NF_MGIllumina.zip +unzip NF_MGIllumina.zip && cd NF_MGIllumina-X_X.X.X +``` +
+ +--- + +### 3. Fetch Singularity Images + +Although Nextflow can fetch Singularity images from a url, doing so may cause issues as detailed [here](https://github.com/nextflow-io/nextflow/issues/1210). + +To avoid this issue, run the following command to fetch the Singularity images prior to running the NF_MGIllumina workflow: +> Note: This command should be run in the location containing the `NF_MGIllumina` directory that was downloaded in [step 2](#2-download-the-workflow-files) above. + +```bash +bash ./bin/prepull_singularity.sh nextflow.config +``` + + +Once complete, a `singularity` folder containing the Singularity images will be created. Run the following command to export this folder as a Nextflow configuration environment variable to ensure Nextflow can locate the fetched images: + +```bash +export NXF_SINGULARITY_CACHEDIR=$(pwd)/singularity +``` + +
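+> Note: The exported variable only persists for the current shell session. If you will be running the workflow across multiple sessions or submitting it through a scheduler, one option is to add the export to your shell startup file, e.g. assuming a bash shell:
+
+```bash
+# optional: persist the Singularity cache location across sessions (bash assumed)
+echo "export NXF_SINGULARITY_CACHEDIR=$(pwd)/singularity" >> ~/.bashrc
+```
+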
+ +--- + +### 4. Run the Workflow + +For options and detailed help on how to run the workflow, run the following command: + +```bash +nextflow run main.nf --help +``` + +> Note: Nextflow commands use both single hyphen arguments (e.g. -help) that denote general nextflow arguments and double hyphen arguments (e.g. --csv_file) that denote workflow specific parameters. Take care to use the proper number of hyphens for each argument. + +
+ +#### 4a. Approach 1: Run slurm jobs in singularity containers with OSD accession as input + +```bash +nextflow run main.nf -resume -profile slurm,singularity --GLDS_accession OSD-574 +``` + +
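+If an OSD accession hosts raw files from more than one assay type, the workflow's `--RawFilePattern` parameter (defined in the workflow code, default `null`) can additionally be supplied to restrict which raw files are pulled from OSDR. The pattern below is purely hypothetical and only illustrates the syntax:
+
+```bash
+# hypothetical example: only pull raw files whose names contain "metagenomics"
+nextflow run main.nf -resume -profile slurm,singularity --GLDS_accession OSD-574 --RawFilePattern "metagenomics"
+```
+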
+ +#### 4b. Approach 2: Run slurm jobs in singularity containers with a csv file as input + +```bash +nextflow run main.nf -resume -profile slurm,singularity --csv_file PE_file.csv +``` + +
+ +#### 4c. Approach 3: Run jobs locally in conda environments and specify the path to one or more existing conda environment(s) + +```bash +nextflow run main.nf -resume -profile conda --csv_file SE_file.csv --conda.qc /path/to/envs/qc +``` + +
+ +**Required Parameters For All Approaches:** + +* `run main.nf` - Instructs nextflow to run the NF_MGIllumina workflow +* `-resume` - Resumes workflow execution using previously cached results +* `-profile` – Specifies the configuration profile(s) to load; `singularity` instructs nextflow to set up and use Singularity for all software called in the workflow + + + *Required only if you would like to pull and process data directly from OSDR* + +* `--GLDS_accession` – A GeneLab / OSD accession number, e.g. OSD-574. + +*Required only if --GLDS_accession is not passed as an argument* + +* `--csv_file` – A 3-column (single-end) or 4-column (paired-end) input csv file (sample_id, forward, [reverse,] paired). Please see the sample [SE_file.csv](workflow_code/SE_file.csv) and [PE_file.csv](workflow_code/PE_file.csv) in this repository for examples of how to format this file; a minimal sketch is also shown below. + +> See `nextflow run -h` and [Nextflow's CLI run command documentation](https://nextflow.io/docs/latest/cli.html#run) for more options and details on how to run nextflow. + +
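+For reference, a minimal paired-end input csv might look like the following sketch. Sample names and paths are placeholders, and the `paired` column is assumed here to take true/false values; a single-end file would simply drop the `reverse` column.
+
+```bash
+# hypothetical paired-end example of the csv layout described above
+cat > PE_file.csv <<'EOF'
+sample_id,forward,reverse,paired
+Sample-1,/path/to/Sample-1_R1_raw.fastq.gz,/path/to/Sample-1_R2_raw.fastq.gz,true
+Sample-2,/path/to/Sample-2_R1_raw.fastq.gz,/path/to/Sample-2_R2_raw.fastq.gz,true
+EOF
+```
+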
+ +#### 4d. Modify parameters and cpu resources in the nextflow config file + +Additionally, the parameters and workflow resources can be directly specified in the nextflow.config file. For detailed instructions on how to modify and set parameters in the nextflow.config file, please see the [documentation here](https://www.nextflow.io/docs/latest/config.html). + +Once you've downloaded the workflow template, you can modify the parameters in the `params` scope and cpus/memory requirements in the `process` scope in your downloaded version of the [nextflow.config](workflow_code/nextflow.config) file as needed in order to match your dataset and system setup. For example, you can directly set the full paths to available conda environments in the `conda` scope within the `params` scope. If necessary, you'll also need to modify each variable in the [nextflow.config](workflow_code/nextflow.config) file to be consistent with the study you want to process and the machine you're using. + 
+ +--- + +### 5. Workflow outputs + +#### 5a. Main outputs + +The outputs from this pipeline are documented in the [GL-DPPD-7107-A](../../Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md) processing protocol. + +#### 5b. Resource logs + +Standard nextflow resource usage logs are also produced as follows: + +- Output: + - Resource_Usage/execution_report_{timestamp}.html (an html report that includes metrics about the workflow execution including computational resources and exact workflow process commands) + - Resource_Usage/execution_timeline_{timestamp}.html (an html timeline for all processes executed in the workflow) + - Resource_Usage/execution_trace_{timestamp}.txt (an execution tracing file that contains information about each process executed in the workflow, including: submission time, start time, completion time, cpu and memory used, machine-readable output) + +> Further details about these logs can also be found within [this Nextflow documentation page](https://www.nextflow.io/docs/latest/tracing.html#execution-report). diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/PE_file.csv b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/PE_file.csv similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/PE_file.csv rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/PE_file.csv diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/SE_file.csv b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/SE_file.csv similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/SE_file.csv rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/SE_file.csv diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/clean-paths.sh b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/clean-paths.sh new file mode 100644 index 00000000..416758a2 --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/clean-paths.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash +set -e + +# only built for use on N288 cluster + +# example usage: bash clean-paths.sh + +# making sure by chance we are not overwriting a wanted file called 't' + +if [ -s t ]; then + printf "\n This simple program temporarily writes to a file called 't'\n" + printf " Since that exists already here, we are not going to continue.\n\n" + exit +fi + + +ROOT_DIR=$(echo $2 | awk '{N=split($0,a,"/"); for(i=0; i < N-1; i++) printf "%s/", a[i]}' | sed 's|//|/|') + + +sed -E 's|.*/GLDS_Datasets/(.+)|\1|g' ${1} \ + | sed -E 's|.+/miniconda.+/envs/[^/]*/||g' \ + | sed -E 's|/[^ ]*/GLDS-|GLDS-|g' \ + | sed -E 's|/[a-z]{6}/[^ ]*||g' \ + | sed -E "s|${ROOT_DIR}||g" > t && mv t ${1} \ No newline at end of file diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/combine-all-gene-tables.py b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/combine-all-gene-tables.py old mode 100755 new mode 100644 similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/combine-all-gene-tables.py rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/combine-all-gene-tables.py diff --git
a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/combine-gene-level-coverages-annots-and-tax-per-group.py b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/combine-gene-level-coverages-annots-and-tax-per-group.py old mode 100755 new mode 100644 similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/combine-gene-level-coverages-annots-and-tax-per-group.py rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/combine-gene-level-coverages-annots-and-tax-per-group.py diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/create_runsheet.sh b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/create_runsheet.sh old mode 100755 new mode 100644 similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/create_runsheet.sh rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/create_runsheet.sh diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/download-GTDBTK-db.sh b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/download-GTDBTK-db.sh old mode 100755 new mode 100644 similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/download-GTDBTK-db.sh rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/download-GTDBTK-db.sh diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/format-contig-tax-classifications.sh b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/format-contig-tax-classifications.sh old mode 100755 new mode 100644 similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/format-contig-tax-classifications.sh rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/format-contig-tax-classifications.sh diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/format-gene-tax-classifications.sh b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/format-gene-tax-classifications.sh old mode 100755 new mode 100644 similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/format-gene-tax-classifications.sh rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/format-gene-tax-classifications.sh diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/generate-assembly-based-overview-table.sh b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/generate-assembly-based-overview-table.sh old mode 100755 new mode 100644 similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/generate-assembly-based-overview-table.sh rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/generate-assembly-based-overview-table.sh diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/get-cov-and-depth.sh b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/get-cov-and-depth.sh old mode 100755 new mode 100644 similarity index 100% rename from 
Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/get-cov-and-depth.sh rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/get-cov-and-depth.sh diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/get_MAGs_estimates_and_taxonomy.sh b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/get_MAGs_estimates_and_taxonomy.sh old mode 100755 new mode 100644 similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/get_MAGs_estimates_and_taxonomy.sh rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/get_MAGs_estimates_and_taxonomy.sh diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/parse-MAG-annots.py b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/parse-MAG-annots.py old mode 100755 new mode 100644 similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/parse-MAG-annots.py rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/parse-MAG-annots.py diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/prepull_singularity.sh b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/prepull_singularity.sh new file mode 100644 index 00000000..125130f1 --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/prepull_singularity.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash + +# Addresses issue: https://github.com/nextflow-io/nextflow/issues/1210 + +CONFILE=${1:-nextflow.config} +OUTDIR=${2:-./singularity} + +if [ ! 
-e $CONFILE ]; then + echo "$CONFILE does not exist" + exit +fi + +TMPFILE=`mktemp` + +CURDIR=$(pwd) + +mkdir -p $OUTDIR + +cat ${CONFILE}|grep 'container'|perl -lane 'if ( $_=~/container\s*\=\s*\"(\S+)\"/ ) { $_=~/container\s*\=\s*\"(\S+)\"/; print $1 unless ( $1=~/^\s*$/ or $1=~/\.sif/ or $1=~/\.img/ ) ; }' > $TMPFILE + +cd ${OUTDIR} + +while IFS= read -r line; do + name=$line + name=${name/:/-} + name=${name//\//-} + echo $name + singularity pull ${name}.img docker://$line +done < $TMPFILE + +cd $CURDIR \ No newline at end of file diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/swap-MAG-IDs.py b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/swap-MAG-IDs.py old mode 100755 new mode 100644 similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/swap-MAG-IDs.py rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/swap-MAG-IDs.py diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/config/bbtools_adapters.fa b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/config/bbtools_adapters.fa similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/config/bbtools_adapters.fa rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/config/bbtools_adapters.fa diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/config/multiqc.config b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/config/multiqc.config new file mode 100644 index 00000000..e3bd4ac0 --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/config/multiqc.config @@ -0,0 +1,7 @@ +extra_fn_clean_exts: + - "_raw" + - "_HRremoved_raw" + - "_filtered" + +show_analysis_paths: False +show_analysis_time: False diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/bit.yaml b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/bit.yaml new file mode 100644 index 00000000..6e39c137 --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/bit.yaml @@ -0,0 +1,7 @@ +channels: + - conda-forge + - bioconda + - defaults + - astrobiomike +dependencies: + - bit=1.8.53 diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/cat.yaml b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/cat.yaml new file mode 100644 index 00000000..545c1870 --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/cat.yaml @@ -0,0 +1,7 @@ +name: CAT +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - cat=5.2.3 diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/checkm.yaml b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/checkm.yaml new file mode 100644 index 00000000..d52e414c --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/checkm.yaml @@ -0,0 +1,7 @@ +name: checkm +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - checkm-genome=1.1.3 diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/envs/genelab.yaml b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/genelab.yaml 
similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/envs/genelab.yaml rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/genelab.yaml diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/gtdb-tk.yaml b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/gtdb-tk.yaml new file mode 100644 index 00000000..90ca8d58 --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/gtdb-tk.yaml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - gtdbtk=2.4.0 diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/humann3.yaml b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/humann3.yaml new file mode 100644 index 00000000..fa616b0d --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/humann3.yaml @@ -0,0 +1,8 @@ +channels: + - conda-forge + - bioconda + - defaults + - biobakery +dependencies: + - humann=3.9 + - metaphlan=4.10 \ No newline at end of file diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/image_def.bit b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/image_def.bit new file mode 100644 index 00000000..e4730aeb --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/image_def.bit @@ -0,0 +1,32 @@ +FROM continuumio/anaconda3:2024.02-1 + +MAINTAINER Olabiyi Obayomi + +RUN /opt/conda/bin/conda init bash && \ + /opt/conda/bin/conda config --add channels bioconda && \ + /opt/conda/bin/conda config --add channels conda-forge && \ + /opt/conda/bin/conda update conda -y && \ + /opt/conda/bin/conda clean -afy + +# copy the necessary files +COPY bit.yaml ./ + +RUN apt-get --allow-releaseinfo-change update && \ + apt-get upgrade -y && \ + dpkg --configure -a + +# Install environment +RUN conda env create -f bit.yaml && \ + conda clean --all + +RUN apt-get clean && \ + apt-get autoremove + +RUN echo "source activate /opt/conda/envs/bit/" > ~/.bashrc + +RUN apt-get install -y groff + +ENV PATH="/opt/conda/envs/bit/bin/:$PATH" + +RUN apt-get install -y procps +CMD ["/bin/bash"] diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/image_def.genelab b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/image_def.genelab new file mode 100644 index 00000000..a9b44d69 --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/image_def.genelab @@ -0,0 +1,32 @@ +FROM continuumio/anaconda3:2024.02-1 + +MAINTAINER Olabiyi Obayomi + +RUN /opt/conda/bin/conda init bash && \ + /opt/conda/bin/conda config --add channels bioconda && \ + /opt/conda/bin/conda config --add channels conda-forge && \ + /opt/conda/bin/conda update conda -y && \ + /opt/conda/bin/conda clean -afy + +# copy the necessary files +COPY genelab.yaml ./ + +RUN apt-get --allow-releaseinfo-change update && \ + apt-get upgrade -y && \ + dpkg --configure -a + +# Install environment +RUN conda env create -f genelab.yaml && \ + conda clean --all + +RUN apt-get clean && \ + apt-get autoremove + +RUN echo "source activate /opt/conda/envs/genelab-utils/" > ~/.bashrc + +RUN apt-get install -y groff + +ENV PATH="/opt/conda/envs/genelab-utils/bin/:$PATH" + +RUN apt-get install -y procps +CMD ["/bin/bash"] diff 
--git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/keggdecoder.yaml b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/keggdecoder.yaml new file mode 100644 index 00000000..9ec5e2b7 --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/keggdecoder.yaml @@ -0,0 +1,7 @@ +channels: + - conda-forge +dependencies: + - python=3.6 + - pip + - pip: + - KEGGDecoder==1.2.2 diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/kofamscan.yaml b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/kofamscan.yaml new file mode 100644 index 00000000..6f5654f8 --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/kofamscan.yaml @@ -0,0 +1,9 @@ +channels: + - conda-forge + - bioconda + - defaults + - astrobiomike +dependencies: + - kofamscan=1.3.0 + - hmmer=3.3.0 + - bit=1.8.53 diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/mapping.yaml b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/mapping.yaml new file mode 100644 index 00000000..a31c7e76 --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/mapping.yaml @@ -0,0 +1,9 @@ +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bowtie2=2.4.1 + - tbb=2020.2 + - bbmap=38.86 + - samtools=1.2 diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/megahit.yaml b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/megahit.yaml new file mode 100644 index 00000000..42fa709a --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/megahit.yaml @@ -0,0 +1,8 @@ +channels: + - conda-forge + - bioconda + - defaults + - astrobiomike +dependencies: + - megahit=1.2.9 + - bit=1.8.53 diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/metabat.yaml b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/metabat.yaml new file mode 100644 index 00000000..de7774a2 --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/metabat.yaml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - metabat2=2.15 diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/prodigal.yaml b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/prodigal.yaml new file mode 100644 index 00000000..4872bf98 --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/prodigal.yaml @@ -0,0 +1,8 @@ +channels: + - conda-forge + - bioconda + - defaults + - astrobiomike +dependencies: + - prodigal=2.6.3 + - bit=1.8.53 diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/qc.yaml b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/qc.yaml new file mode 100644 index 00000000..3838ce5d --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/qc.yaml @@ -0,0 +1,10 @@ +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - fastqc=0.12.1 + - multiqc=1.19 + - bbmap=38.86 + - zip=3.0 + - python=3.8 diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/main.nf 
b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/main.nf similarity index 99% rename from Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/main.nf rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/main.nf index 046f1692..4039515a 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/main.nf +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/main.nf @@ -374,7 +374,7 @@ workflow { // Software Version Capturing - combining all captured sofware versions - nf_version = "Nextflow Version:".concat("${nextflow.version}\n<><><>\n") + nf_version = "Nextflow Version ".concat("${nextflow.version}\n<><><>\n") nextflow_version_ch = Channel.value(nf_version) // Write software versions to file diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/assembly.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/assembly.nf similarity index 96% rename from Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/assembly.nf rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/assembly.nf index b1fc59d1..f1e698d0 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/assembly.nf +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/assembly.nf @@ -1,7 +1,7 @@ #!/usr/bin/env nextflow nextflow.enable.dsl = 2 -params.paired = false -params.max_mem = 100e9 +//params.paired = false +//params.max_mem = 100e9 /**************************************************************************************** ************************** Sequence assembly and summary ******************************* diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/assembly_annotation.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/assembly_annotation.nf similarity index 93% rename from Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/assembly_annotation.nf rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/assembly_annotation.nf index b05ec015..d9dfae35 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/assembly_annotation.nf +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/assembly_annotation.nf @@ -1,8 +1,8 @@ #!/usr/bin/env nextflow nextflow.enable.dsl = 2 -params.cat_db = "/mnt/c/Users/olabi/Documents/bioinformatics/test/processing_info/ref-dbs/CAT_prepare_20210107/2021-01-07_CAT_database/" -params.ko_db_dir = "/mnt/c/Users/olabi/Documents/bioinformatics/test/processing_info/ref-dbs/kofamscan_db/" -params.block_size = 4 +//params.cat_db = "/mnt/c/Users/olabi/Documents/bioinformatics/test/processing_info/ref-dbs/CAT_prepare_20210107/2021-01-07_CAT_database/" +//params.ko_db_dir = "/mnt/c/Users/olabi/Documents/bioinformatics/test/processing_info/ref-dbs/kofamscan_db/" +//params.block_size = 4 /**************************************************************************************** ************************** Sequence Assembly Annotation ******************************* diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/assembly_based_processing.nf 
b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/assembly_based_processing.nf similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/assembly_based_processing.nf rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/assembly_based_processing.nf diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/binning.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/binning.nf similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/binning.nf rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/binning.nf diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/combine_contig_annotation.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/combine_contig_annotation.nf similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/combine_contig_annotation.nf rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/combine_contig_annotation.nf diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/coverage.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/coverage.nf similarity index 96% rename from Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/coverage.nf rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/coverage.nf index 0eb7ab42..fb5aca11 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/coverage.nf +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/coverage.nf @@ -1,6 +1,6 @@ #!/usr/bin/env nextflow nextflow.enable.dsl = 2 -params.pileup_mem = "5g" +//params.pileup_mem = "5g" /* This process pulls out coverage and detection information for each sample, gene-level and contig-level, diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/create_runsheet.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/create_runsheet.nf similarity index 90% rename from Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/create_runsheet.nf rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/create_runsheet.nf index 8dfe2a08..4aeaa844 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/create_runsheet.nf +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/create_runsheet.nf @@ -1,8 +1,8 @@ #!/usr/bin/env nextflow nextflow.enable.dsl = 2 -params.GLDS_accession = "OSD-574" -params.RawFilePattern = null // Pattern of files on OSDR for the OSD accession you want to process +//params.GLDS_accession = "OSD-574" +//params.RawFilePattern = null // Pattern of files on OSDR for the OSD accession you want to process process GET_RUNSHEET { diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/database_creation.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/database_creation.nf similarity index 96% rename from 
Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/database_creation.nf rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/database_creation.nf index 3b3f67cd..60ef9d03 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/database_creation.nf +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/database_creation.nf @@ -1,7 +1,7 @@ #!/usr/bin/env nextflow nextflow.enable.dsl = 2 -params.CAT_DL_LINK = "https://tbb.bio.uu.nl/bastiaan/CAT_prepare/CAT_prepare_20210107.tar.gz" +//params.CAT_DL_LINK = "https://tbb.bio.uu.nl/bastiaan/CAT_prepare/CAT_prepare_20210107.tar.gz" /**************************************************************************************** *************************** Metagenomics databases set-up ****************************** diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/quality_assessment.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/quality_assessment.nf similarity index 95% rename from Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/quality_assessment.nf rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/quality_assessment.nf index 919da2d8..e286900b 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/quality_assessment.nf +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/quality_assessment.nf @@ -6,11 +6,11 @@ nextflow.enable.dsl = 2 ****************************************************************************************/ // a 2-column (single-end) or 3-column (paired-end) file -params.prefix = "raw" // "filetered" -params.csv_file = "file.csv" -params.swift_1S = false -params.adapters = "${baseDir}/config/bbtools_dapters.fa" -params.multiqc_config = "config/multiqc.config" +//params.prefix = "raw" // "filetered" +//params.csv_file = "file.csv" +//params.swift_1S = false +//params.adapters = "${baseDir}/config/bbtools_dapters.fa" +//params.multiqc_config = "config/multiqc.config" process FASTQC { // FastQC performed on reads diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/read_based_processing.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/read_based_processing.nf similarity index 97% rename from Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/read_based_processing.nf rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/read_based_processing.nf index 92368b6a..d27366fd 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/read_based_processing.nf +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/read_based_processing.nf @@ -1,7 +1,7 @@ #!/usr/bin/env nextflow nextflow.enable.dsl = 2 -params.additional_filename_prefix = "" -params.assay_suffix = "_GLmetagenomics" +//params.additional_filename_prefix = "" +//params.assay_suffix = "_GLmetagenomics" /**************************************************************************************** ********************* Read-based processing using Humann3 ******************************* diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/read_mapping.nf 
b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/read_mapping.nf similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/read_mapping.nf rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/read_mapping.nf diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/summarize_MAG.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/summarize_MAG.nf similarity index 96% rename from Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/summarize_MAG.nf rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/summarize_MAG.nf index 3c3c0f7c..14840db4 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/summarize_MAG.nf +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/summarize_MAG.nf @@ -6,9 +6,9 @@ nextflow.enable.dsl = 2 ****************************************************************************************/ include { ZIP_FASTA as ZIP_MAGS } from "./zip_fasta.nf" -params.min_est_comp = 90 -params.max_est_redund = 10 -params.max_est_strain_het = 50 +//params.min_est_comp = 90 +//params.max_est_redund = 10 +//params.max_est_strain_het = 50 /* Scratch directory for gtdb-tk, if wanting to use disk space instead of RAM, can be memory intensive; diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/summarize_assembly-based_processing.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/summarize_assembly-based_processing.nf similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/summarize_assembly-based_processing.nf rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/summarize_assembly-based_processing.nf diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/summarize_bins.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/summarize_bins.nf similarity index 96% rename from Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/summarize_bins.nf rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/summarize_bins.nf index 5bbbe48f..53d56da1 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/summarize_bins.nf +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/summarize_bins.nf @@ -7,7 +7,7 @@ nextflow.enable.dsl = 2 include { ZIP_FASTA as ZIP_BINS } from "./zip_fasta.nf" -params.reduced_tree = "True" +//params.reduced_tree = "True" // Summarize bin assemblies process SUMMARIZE_BIN_ASSEMBLIES { diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/zip_fasta.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/zip_fasta.nf similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/modules/zip_fasta.nf rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/zip_fasta.nf diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/nextflow.config 
b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/nextflow.config similarity index 89% rename from Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/nextflow.config rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/nextflow.config index 9f3e57fc..f8895dc4 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/nextflow.config +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/nextflow.config @@ -24,7 +24,7 @@ params { filtered_R2_suffix = "_R2_filtered.fastq.gz" // If single-end - filtered_suffix = "_filtered.fastq.gz" + filtered_suffix = "_filtered.fastq.gz" // Directories @@ -49,7 +49,6 @@ params { //************************* Databases **********************************// database { CAT_DB_LINK = "https://tbb.bio.uu.nl/bastiaan/CAT_prepare/CAT_prepare_20210107.tar.gz" - // Old link - https://data.gtdb.ecogenomic.org/releases/release202/202.0/auxillary_files/gtdbtk_r202_data.tar.gz GTDBTK_LINK = "https://data.gtdb.ecogenomic.org/releases/release220/220.0/auxillary_files/gtdbtk_package/full_package/gtdbtk_r220_data.tar.gz" cat_db = null // "/path/to/Reference_DBs/CAT_prepare_20210107/" ko_db_dir = null // "/path/to/Reference_DBs/kofamscan_db/" @@ -61,8 +60,8 @@ params { } // Quality assessment parameters - swift_1S = false - adapters = "${baseDir}/config/bbtools_adapters.fa" + swift_1S = false + adapters = "${baseDir}/config/bbtools_adapters.fa" multiqc_config = "${baseDir}/config/multiqc.config" @@ -83,9 +82,9 @@ params { cat_db_sub_dir = "2021-01-07_CAT_database/" // MAG parameters - min_est_comp = 90 - max_est_redund = 10 - max_est_strain_het = 50 + min_est_comp = 90 // Minimum estimated completeness + max_est_redund = 10 // Maximum estimated redundancy + max_est_strain_het = 50 // Maximum estimated strain heterogeneity /* Scratch directory for gtdb-tk, if wanting to use disk space instead of RAM, can be memory intensive; @@ -98,20 +97,20 @@ params { conda{ - // Specify paths to your existing conda environments - genelab = null // "/path/to/envs/genelab-utils" - qc = null // "/path/to/envs/qc" - humann3 = null // "/path/to/envs/humann3" - cat = null // "/path/to/envs/genelab-utils/envs/CAT" - prodigal = null // "/path/to/envs/prodigal" - metabat = null // "/path/to/envs/metabat" - gtdbtk = null // "/path/to/envs/gtdbtk" - kegg_decoder = null // "/path/to/envs/kegg_decoder" - megahit = null // "/path/to/envs/megahit" - bit = null // "/path/to/envs/bit" - kofamscan = null // "/path/to/envs/kofamscan" - mapping = null // "/path/to/envs/mapping" - checkm = null // "/path/to/envs/checkm" + // Specify paths to existing conda environments + genelab = null // "/path/to/envs/genelab-utils" + qc = null // "/path/to/envs/qc" + humann3 = null // "/path/to/envs/humann3" + cat = null // "/path/to/envs/genelab-utils/envs/CAT" + prodigal = null // "/path/to/envs/prodigal" + metabat = null // "/path/to/envs/metabat" + gtdbtk = null // "/path/to/envs/gtdbtk" + kegg_decoder = null // "/path/to/envs/kegg_decoder" + megahit = null // "/path/to/envs/megahit" + bit = null // "/path/to/envs/bit" + kofamscan = null // "/path/to/envs/kofamscan" + mapping = null // "/path/to/envs/mapping" + checkm = null // "/path/to/envs/checkm" } GLDS_accession = false // GLDS or OSD acession number for the data to be processed @@ -121,10 +120,9 @@ params { debug = false // should info about the parameters set by the user be shown when the workflow starts. 
} -// Setting the default container engine as singularity +// Setting the default container engine to singularity params.containerEngine = "singularity" -// Conda shouldn't be used be default except when using conda-based profiles -// i.e., slurm_conda and conda +// Conda shouldn't be used by default except when using conda-based profiles params.use_conda = false @@ -208,10 +206,10 @@ process { cpus = 8 /********************************************************************************************* - ******************************** Specific process settings ********************************** + ******************************** Process Specific Settings ********************************** *********************************************************************************************/ - //************************* Generic process labels used throught the workflow ****************// + //************************* Generic process labels used throughout the workflow ****************// withLabel: genelab { @@ -255,7 +253,7 @@ process { withName: SETUP_GTDBTK_DB { conda = {params.conda.gtdbtk != null ? params.conda.gtdbtk : "envs/gtdb-tk.yaml.yaml"} - container = "quay.io/biocontainers/gtdbtk:1.5.0--pyhdfd78af_0" + container = "quay.io/biocontainers/gtdbtk:2.4.0--pyhdfd78af_1" } //************************* GLDS_accession runsheet and input file retrieval **************************************// @@ -286,7 +284,7 @@ process { container = "staphb/bbtools:38.86" cpus = 5 memory = "40 GB" - publishDir = publishDir = [[path: params.filtered_reads_dir, pattern: "*${params.filtered_suffix}" , mode: params.publishDir_mode], + publishDir = [[path: params.filtered_reads_dir, pattern: "*${params.filtered_suffix}" , mode: params.publishDir_mode], [path: params.logs_dir, pattern: "*-bbduk.log" , mode: params.publishDir_mode]] } @@ -426,7 +424,7 @@ process { withName: GTDBTK_ON_MAGS { conda = {params.conda.gtdbtk != null ? params.conda.gtdbtk : "envs/gtdb-tk.yaml.yaml"} - container = "quay.io/biocontainers/gtdbtk:2.4.0--pyhdfd78af_1" // "quay.io/biocontainers/gtdbtk:1.5.0--pyhdfd78af_0" + container = "quay.io/biocontainers/gtdbtk:2.4.0--pyhdfd78af_1" containerOptions = { params.containerEngine == "singularity" ? 
"-B \${PWD}:/data -B ${gtdbtk_db_dir}:/refdata" : "-v \${PWD}:/data -v ${gtdbtk_db_dir}:/refdata" } cpus = 8 memory = "600 GB" @@ -447,7 +445,7 @@ process { /***************************************************************************** -********************** Workflow Resource Usage capturing ********************* +********************** Workflow Resource Usage Capturing ********************* ******************************************************************************/ // Adapted from : https://github.com/nf-core/rnaseq/blob/master/nextflow.config @@ -474,7 +472,7 @@ trace { manifest { author = 'Olabiyi Aderemi Obayomi, Mike Douglas Lee' homePage = 'https://github.com/nasa/GeneLab_Data_Processing/blob/master/Metagenomics/' - description = 'Metagenomics workflow for pipeline document GL-DPPD-7107' + description = 'Metagenomics workflow for pipeline document GL-DPPD-7107-A' mainScript = 'main.nf' defaultBranch = 'main' nextflowVersion = '>=22.10.6' diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/slurm_submit.slurm b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/slurm_submit.slurm similarity index 95% rename from Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/slurm_submit.slurm rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/slurm_submit.slurm index 62c22964..232fe48c 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/slurm_submit.slurm +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/slurm_submit.slurm @@ -28,7 +28,7 @@ echo $HOSTNAME ## You can see a list of all available environments by running the command: conda env list ## ## If you need a conda envrionment installed request it using JIRA ## -source activate /path/to/envs/genelab-utils ## Replace conda_env_name with the name of the environment ## +source activate /path/to/envs/nextflow ## Replace conda_env_name with the name of the conda environment with nextflow installed ## ## Print the version of the tool you are using to ensure the tool version is recorded ## diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/README.md b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/README.md index 5c319fe5..131ca570 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/README.md +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/README.md @@ -1,173 +1,101 @@ -# Workflow Information and Usage Instructions +# SW_MGIllumina Workflow Information and Usage Instructions -## General Workflow Info -### Implementation Tools -The current GeneLab Illumina metagenomics sequencing data processing pipeline (MGIllumina), [GL-DPPD-7107.md](../../Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107.md), is implemented as a [Nextflow](https://nextflow.io/) DSL2 workflow and utilizes [Singularity](https://docs.sylabs.io/guides/3.10/user-guide/introduction.html) containers or [conda](https://docs.conda.io/en/latest/) environments to install/run all tools. This workflow is run using the command line interface (CLI) of any unix-based system. While knowledge of creating workflows in nextflow is not required to run the workflow as is, [the Nextflow documentation](https://nextflow.io/docs/latest/index.html) is a useful resource for users who want to modify and/or extend this workflow. 
+## General workflow info
+The current GeneLab Illumina metagenomics sequencing data processing pipeline (MGIllumina), [GL-DPPD-7107.md](../../Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107.md), is implemented as a [Snakemake](https://snakemake.readthedocs.io/en/stable/) workflow and utilizes [conda](https://docs.conda.io/en/latest/) environments to install/run all tools. This workflow (SW_MGIllumina) is run using the command line interface (CLI) of any unix-based system. The workflow can be used even if you are unfamiliar with Snakemake and conda, but if you want to learn more about those, [this Snakemake tutorial](https://snakemake.readthedocs.io/en/stable/tutorial/tutorial.html) within [Snakemake's documentation](https://snakemake.readthedocs.io/en/stable/) is a good place to start, and an introduction to conda with installation help and links to other resources can be found [here at Happy Belly Bioinformatics](https://astrobiomike.github.io/unix/conda-intro).

> **Note on reference databases**
-> Many reference databases are relied upon throughout this workflow. They will be installed and setup automatically the first time the workflow is run. All together, after installed and unpacked, they will take up about about 340 GB of storage, but they may also require up to 500GB during installation and initial un-packing, so be sure there is enough room on your system before running the workflow.
+> Many reference databases are relied upon throughout this workflow. They will be installed and set up automatically the first time the workflow is run. All together, once installed and unpacked, they will take up about 240 GB of storage, but they may also require up to 500 GB during installation and initial unpacking, so be sure there is enough room on your system before running the workflow.

+## Utilizing the workflow

-## Utilizing the Workflow
+1. [Install conda, mamba, and `genelab-utils` package](#1-install-conda-mamba-and-genelab-utils-package)
+2. [Download the workflow template files](#2-download-the-workflow-template-files)
+3. [Modify the variables in the config.yaml file](#3-modify-the-variables-in-the-configyaml-file)
+4. [Run the workflow](#4-run-the-workflow)

-1. [Install nextflow, conda and singularity](#1-install-nextflow-conda-and-singularity)
-   1a. [Install nextflow and conda](#1a-install-nextflow-and-conda)
-   1b. [Install singularity](#1b-install-singularity)
+### 1. Install conda, mamba, and `genelab-utils` package
+We recommend installing a Python3 version of Miniconda appropriate for your system, as exemplified in [the above link](https://astrobiomike.github.io/unix/conda-intro#getting-and-installing-conda).

-2. [Download the workflow files](#2-download-the-workflow-files)
-
-3. [Run the workflow](#3-run-the-workflow)
-   3a. [Approach 1: Run slurm jobs in singularity containers with OSD accession as input](#3a-approach-1-run-slurm-jobs-in-singularity-containers-with-osd-accession-as-input)
-   3b. [Approach 2: Run slurm jobs in singularity containers with a csv file as input](#3b-approach-2-run-slurm-jobs-in-singularity-containers-with-a-csv-file-as-input)
-   3c. [Approach 3: Run jobs locally in conda environments and specify the path to one or more existing conda environments](#3c-approach-run-jobs-locally-in-conda-environments-and-specify-the-path-to-one-or-more-existing-conda-environments)
-   3d. [Modify parameters and cpu resources in the nextflow config file](#3d-modify-parameters-and-cpu-resources-in-the-nextflow-config-file)

-4. 
[Workflow outputs](#4-workflow-outputs) - 4a. [Main outputs](#4a-main-outputs) - 4b. [Resource logs](#4b-resource-logs) - -
- -### 1. Install nextflow, conda and singularity - - - -#### 1a. Install nextflow and conda - -Nextflow can be installed either through [Anaconda](https://anaconda.org/bioconda/nextflow) or as documented on the [Nextflow documentation page](https://www.nextflow.io/docs/latest/getstarted.html). - -> Note: If you want to install anaconda, we recommend installing a miniconda, python3 version appropriate for your system, as instructed by [Happy Belly Bioinformatics](https://astrobiomike.github.io/unix/conda-intro#getting-and-installing-conda). - -We recommend installing a miniconda, python3 version appropriate for your system, as exemplified in [the above link](https://astrobiomike.github.io/unix/conda-intro#getting-and-installing-conda). - -Once conda is installed on your system, we recommend installing [mamba](https://github.com/mamba-org/mamba#mamba), as it generally allows for much faster conda installations. +Once conda is installed on your system, we recommend installing [mamba](https://github.com/mamba-org/mamba#mamba), as it generally allows for much faster conda installations: ```bash conda install -n base -c conda-forge mamba ``` -> You can read a quick intro to mamba [here](https://astrobiomike.github.io/unix/conda-intro#bonus-mamba-no-5). +> You can read a quick intro to mamba [here](https://astrobiomike.github.io/unix/conda-intro#bonus-mamba-no-5) if wanted. -Once mamba is installed, you can install the genelab-utils conda package which contains nextflow with the following command: +Once mamba is installed, you can install the genelab-utils conda package in a new environment with the following command: ```bash -mamba create -n genelab-utils -c conda-forge -c bioconda -c defaults -c astrobiomike genelab-utils +mamba create -n genelab-utils -c conda-forge -c bioconda -c defaults -c astrobiomike 'genelab-utils>=1.1.02' ``` + The environment then needs to be activated: ```bash conda activate genelab-utils +``` -# Test that nextflow is installed -nextflow -h +### 2. Download the workflow template files +The workflow files for processing Illumina metagenomics sequencing data are in the [workflow_code](workflow_code) directory. To get a copy of the latest SW_MGIllumina version on to your system, run the following command: -# Update nextflow -nextflow self-update +```bash +GL-get-workflow MG-Illumina ``` -
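+> Note: The `GL-get-workflow` command comes from the `genelab-utils` package installed in step 1, so it is only available while that conda environment is active. If the command is not found, a quick check like the following can help (a sketch; the `--help` flag is an assumption about the tool's command-line interface):
+> ```bash
+> # the currently active conda environment is marked with an asterisk
+> conda env list
+>
+> # assumed standard help flag; prints the available options if the tool is on the PATH
+> GL-get-workflow --help
+> ```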
+This will download the workflow into a directory called `SW_MGIllumina_*/`, with the workflow version number at the end.

-#### 1b. Install singularity
+> Note: If wanting an earlier version, the wanted version can be provided as an optional argument like so:
+> ```bash
+> GL-get-workflow MG-Illumina --wanted-version 2.0.0
+> ```

-Singularity is a container platform that allows usage of containerized software. This enables the GeneLab workflow to retrieve and use all software required for processing without the need to install the software directly on the user's system.
+### 3. Modify the variables in the config.yaml file
+Once you've downloaded the workflow template, you can modify the variables in your downloaded version of the [config.yaml](workflow_code/config.yaml) file as needed in order to match your dataset and system setup. For example, you will have to provide a text file containing a single-column list of unique sample identifiers (see an example of how to set this up below). You will also need to indicate the paths to your input data (raw reads) and the root directory where the reference databases should be stored (they will be set up automatically). Additionally, if necessary, you'll need to modify each variable in the config.yaml file to be consistent with the study you want to process and the machine you're using.

-We recommend installing singularity on a system wide level as per the associated [documentation](https://docs.sylabs.io/guides/3.10/admin-guide/admin_quickstart.html).
+> Note: If you are unfamiliar with how to specify paths, one place you can learn more is [here](https://astrobiomike.github.io/unix/getting-started#the-unix-file-system-structure).

-<br>
+**Example for how to create a single-column list of unique sample identifiers from your raw data file names**
+**Example for how to create a single-column list of unique sample identifiers from your raw data file names** -### 2. Download the workflow files - -All files required for utilizing the NF_XXX GeneLab workflow for processing metagenomics illumina data are in the [workflow_code](workflow_code) directory. To get a copy of latest *NF_XXX* version on to your system, the code can be downloaded as a zip file from the release page then unzipped after downloading by running the following commands: +For example, if you have paired-end read data for 2 samples located in `../Raw_Data/` relative to your workflow directory, that would look like this: ```bash -wget https://github.com/nasa/GeneLab_Data_Processing/releases/download/NF_MGIllumina/NF_MGIllumina.zip -unzip NF_MGIllumina.zip && cd NF_XXX-X_X.X.X +ls ../Raw_Data/ ``` -OR by using the genelab-utils conda package - -```bash -GL-get-workflow MG-Illumina ``` - -
- -### 3. Run the Workflow - -For options and detailed help on how to run the workflow, run the following command: - -```bash -nextflow run main.nf --help +Sample-1_R1_raw.fastq.gz +Sample-1_R2_raw.fastq.gz +Sample-2_R1_raw.fastq.gz +Sample-2_R2_raw.fastq.gz ``` -> Note: Nextflow commands use both single hyphen arguments (e.g. -help) that denote general nextflow arguments and double hyphen arguments (e.g. --csv_file) that denote workflow specific parameters. Take care to use the proper number of hyphens for each argument. - -
- -#### 3a. Approach 1: Run slurm jobs in singularity containers with OSD accession as input +You would set up your `unique-sample-IDs.txt` file as follows: ```bash -nextflow run main.nf -resume -profile slurm,singularity --GLDS_accession OSD-574 +cat unique-sample-IDs.txt ``` -
- -#### 3b. Approach 2: Run slurm jobs in singularity containers with a csv file as input - -```bash -nextflow run main.nf -resume -profile slurm,singularity --csv_file PE_file.csv +``` +Sample-1 +Sample-2 ``` -
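+Rather than typing this file out by hand, the list can usually be generated directly from the raw read file names. The exact pattern to strip depends on your file naming, so the following one-liner is only a sketch that assumes reads end in `_R1_raw.fastq.gz` and `_R2_raw.fastq.gz` as in the example above:
+
+```bash
+# strip the read-direction suffixes, then keep one entry per sample
+ls ../Raw_Data/ | sed 's/_R[12]_raw.fastq.gz//' | sort -u > unique-sample-IDs.txt
+```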
+### 4. Run the workflow -#### 3c. Approach 3: Run jobs locally in conda environments and specify the path to one or more existing conda environment(s) +While in the directory holding the Snakefile, config.yaml, and other workflow files that you downloaded in [step 2](#2-download-the-workflow-template-files), here is one example command of how to run the workflow: ```bash -nextflow run main.nf -resume -profile conda --csv_file SE_file.csv --conda.qc +snakemake --use-conda --conda-prefix ${CONDA_PREFIX}/envs -j 2 -p ``` -
- -**Required Parameters For All Approaches:** - -* `-run main.nf` - Instructs nextflow to run the NF_XXX workflow -* `-resume` - Resumes workflow execution using previously cached results -* `-profile` – Specifies the configuration profile(s) to load, `singularity` instructs nextflow to setup and use singularity for all software called in the workflow - - - *Required only if you would like to pull and process data directly from OSDR* - -* `--GLDS_accession` – A Genelab / OSD accession number e.g. OSD-574. - -*Required only if --GLDS_accession is not passed as an argument* - -* `--csv_file` – A 3-column (single-end) or 4-column (paired-end) input csv file (sample_id, forward, [reverse,] paired). Please see the sample `SE_file.csv` and `PE_file.csv` in this repository for examples on how to format this file. - -> See `nextflow run -h` and [Nextflow's CLI run command documentation](https://nextflow.io/docs/latest/cli.html#run) for more options and details on how to run nextflow. - -
- -#### 3d. Modify parameters and cpu resources in the nextflow config file - -Additionally, the parameters and workflow resources can be directly specified in the nextflow.config file. For detailed instructions on how to modify and set parameters in the nextflow.config file, please see the [documentation here](https://www.nextflow.io/docs/latest/config.html). - -Once you've downloaded the workflow template, you can modify the parameters in the `params` scope and cpus/memory requirements in the `process` scope in your downloaded version of the [nextflow.config](workflow_code/nextflow.config) file as needed in order to match your dataset and system setup. For example, you can directly set the the full paths to available conda environments in the `conda` scope within the `params` scope. Additionally, if necessary, you'll need to modify each variable in the nexflow.config file to be consistent with the study you want to process and the machine you're using. - -### 4. Workflow outputs - -#### 4a. Main outputs - -The outputs from this pipeline are documented in the [GL-DPPD-7107](../../Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107.md) processing protocol. - -#### 4b. Resource logs - -Standard nextflow resource usage logs are also produced as follows: +* `--use-conda` – specifies to use the conda environments included in the workflow (these are specified in the files in the workflow [envs/](workflow_code/envs) directory) +* `--conda-prefix` – indicates where the needed conda environments will be stored. Adding this option will also allow the same conda environments to be re-used when processing additional datasets, rather than making new environments each time you run the workflow. The value listed for this option, `${CONDA_PREFIX}/envs`, points to the default location for conda environments (note: the variable `${CONDA_PREFIX}` will be expanded to the appropriate location on whichever system it is run on). +* `-j` – assigns the number of jobs Snakemake should run concurrently (keep in mind that many of the thread and cpu parameters set in the config.yaml file will be multiplied by this) +* `-p` – specifies to print out each command being run to the screen -- Output: - - Resource_Usage/execution_report_{timestamp}.html (an html report that includes metrics about the workflow execution including computational resources and exact workflow process commands) - - Resource_Usage/execution_timeline_{timestamp}.html (an html timeline for all processes executed in the workflow) - - Resource_Usage/execution_trace_{timestamp}.txt (an execution tracing file that contains information about each process executed in the workflow, including: submission time, start time, completion time, cpu and memory used, machine-readable output) +See `snakemake -h` and [Snakemake's documentation](https://snakemake.readthedocs.io/en/stable/) for more options and details. -> Further details about these logs can also found within [this Nextflow documentation page](https://www.nextflow.io/docs/latest/tracing.html#execution-report). 
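+> Note: Before launching a full run, it can be helpful to do a dry-run first. The command below is the same example command with Snakemake's `-n` (dry-run) flag added, which only prints the jobs that would be executed without actually running anything:
+> ```bash
+> snakemake --use-conda --conda-prefix ${CONDA_PREFIX}/envs -j 2 -p -n
+> ```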
+--- diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/Snakefile b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/Snakefile new file mode 100644 index 00000000..17eb6870 --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/Snakefile @@ -0,0 +1,1925 @@ +############################################################################################ +## Snakefile for GeneLab Illumina metagenomics processing workflow ## +## Developed by Michael D. Lee (Mike.Lee@nasa.gov) ## +## Version 2.0.4 ## +############################################################################################ + +import os + +configfile: "config.yaml" + + +######################################## +############# General Info ############# +######################################## + +""" +See the corresponding 'config.yaml' file for general use information. +Variables that may need to be adjusted should usually be changed there, not here. +""" + + +######################################## +####### Assay-specific GL suffix ####### +######################################## + +assay_suffix = "_GLmetagenomics" + + +######################################## +######## Some colors and helpers ####### +######################################## + +tty_colors = { + 'green' : '\033[0;32m%s\033[0m', + 'yellow' : '\033[0;33m%s\033[0m', + 'red' : '\033[0;31m%s\033[0m' +} + +def color_text(text, color='green'): + if sys.stdout.isatty(): + return(tty_colors[color] % text) + else: + return(text) + + +######################################## +#### Reading samples file into list #### +######################################## + +sample_IDs_file = config["sample_info_file"] +sample_ID_list = [line.strip() for line in open(sample_IDs_file)] + +# making sure there are all unique names +if len(set(sample_ID_list)) != len(sample_ID_list): + + print(color_text("\n Not all sample IDs in the " + str(config["sample_info_file"]) + " file are unique :(\n", "yellow")) + print(" Exiting for now.\n") + exit(1) + +######################################## +######## Setting up directories ######## +######################################## + +if config["workflow"] == "both": + + dirs_to_create = [config["fastqc_out_dir"], config["filtered_reads_dir"], config["assembly_based_dir"], + config["read_based_dir"], config["assemblies_dir"], config["genes_dir"], + config["annotations_and_tax_dir"], config["mapping_dir"], config["combined_output_dir"], + config["bins_dir"], config["MAGs_dir"], config["logs_dir"], "benchmarks"] + +elif config["workflow"] == "assembly-based": + + dirs_to_create = [config["fastqc_out_dir"], config["filtered_reads_dir"], config["assembly_based_dir"], + config["assemblies_dir"], config["genes_dir"], + config["annotations_and_tax_dir"], config["mapping_dir"], config["combined_output_dir"], + config["bins_dir"], config["MAGs_dir"], config["logs_dir"], "benchmarks"] + + +elif config["workflow"] == "read-based": + + dirs_to_create = [config["fastqc_out_dir"], config["filtered_reads_dir"], config["read_based_dir"], + config["MAGs_dir"], config["logs_dir"], "benchmarks"] + +else: + + print(color_text("\n The 'workflow' variable in the config.yaml file needs to be one of 'assembly-based', 'read-based', or 'both'.", "yellow")) + print("\n Exiting for now.\n") + + exit(1) + +for dir in dirs_to_create: + try: + os.mkdir(dir) + except: + pass + + +######################################## +############# Rules start ############## 
+######################################## +# all rule depends on if assembly-based, read-based, or both + +if config["workflow"] == "both": + + rule all: + input: + config["combined_output_dir"] + config["additional_filename_prefix"] + f"Combined-gene-level-KO-function-coverages{assay_suffix}.tsv", + config["combined_output_dir"] + config["additional_filename_prefix"] + f"Combined-gene-level-taxonomy-coverages{assay_suffix}.tsv", + config["combined_output_dir"] + config["additional_filename_prefix"] + f"Combined-contig-level-taxonomy-coverages{assay_suffix}.tsv", + config["MAGs_dir"] + config["additional_filename_prefix"] + f"MAGs-overview{assay_suffix}.tsv", + config["MAGs_dir"] + config["additional_filename_prefix"] + f"MAG-KEGG-Decoder-out{assay_suffix}.tsv", + config["bins_dir"] + config["additional_filename_prefix"] + f"bins-overview{assay_suffix}.tsv", + config["assemblies_dir"] + config["additional_filename_prefix"] + f"assembly-summaries{assay_suffix}.tsv", + config["assembly_based_dir"] + config["additional_filename_prefix"] + f"Assembly-based-processing-overview{assay_suffix}.tsv", + config["fastqc_out_dir"] + config["additional_filename_prefix"] + f"raw_multiqc{assay_suffix}_report.zip", + config["fastqc_out_dir"] + config["additional_filename_prefix"] + f"filtered_multiqc{assay_suffix}_report.zip", + config["read_based_dir"] + config["additional_filename_prefix"] + f"Gene-families-cpm{assay_suffix}.tsv", + config["read_based_dir"] + config["additional_filename_prefix"] + f"Gene-families-KO-cpm{assay_suffix}.tsv", + config["read_based_dir"] + config["additional_filename_prefix"] + f"Metaphlan-taxonomy{assay_suffix}.tsv" + shell: + """ + bash scripts/combine-benchmarks.sh + """ + +elif config["workflow"] == "assembly-based": + + rule all: + input: + config["combined_output_dir"] + config["additional_filename_prefix"] + f"Combined-gene-level-KO-function-coverages{assay_suffix}.tsv", + config["combined_output_dir"] + config["additional_filename_prefix"] + f"Combined-gene-level-taxonomy-coverages{assay_suffix}.tsv", + config["combined_output_dir"] + config["additional_filename_prefix"] + f"Combined-contig-level-taxonomy-coverages{assay_suffix}.tsv", + config["MAGs_dir"] + config["additional_filename_prefix"] + f"MAGs-overview{assay_suffix}.tsv", + config["MAGs_dir"] + config["additional_filename_prefix"] + f"MAG-KEGG-Decoder-out{assay_suffix}.tsv", + config["bins_dir"] + config["additional_filename_prefix"] + f"bins-overview{assay_suffix}.tsv", + config["assemblies_dir"] + config["additional_filename_prefix"] + f"assembly-summaries{assay_suffix}.tsv", + config["assembly_based_dir"] + config["additional_filename_prefix"] + f"Assembly-based-processing-overview{assay_suffix}.tsv", + config["fastqc_out_dir"] + config["additional_filename_prefix"] + f"raw_multiqc{assay_suffix}_report.zip", + config["fastqc_out_dir"] + config["additional_filename_prefix"] + f"filtered_multiqc{assay_suffix}_report.zip" + shell: + """ + bash scripts/combine-benchmarks.sh + """ + +elif config["workflow"] == "read-based": + + rule all: + input: + config["fastqc_out_dir"] + config["additional_filename_prefix"] + f"raw_multiqc{assay_suffix}_report.zip", + config["fastqc_out_dir"] + config["additional_filename_prefix"] + f"filtered_multiqc{assay_suffix}_report.zip", + config["read_based_dir"] + config["additional_filename_prefix"] + f"Gene-families-cpm{assay_suffix}.tsv", + config["read_based_dir"] + config["additional_filename_prefix"] + f"Gene-families-KO-cpm{assay_suffix}.tsv", + config["read_based_dir"] + 
config["additional_filename_prefix"] + f"Metaphlan-taxonomy{assay_suffix}.tsv" + shell: + """ + bash scripts/combine-benchmarks.sh + """ + + +rule summarize_MAG_KO_annots_with_KEGG_Decoder: + conda: + "envs/keggdecoder.yaml" + input: + config["MAGs_dir"] + config["additional_filename_prefix"] + f"MAG-level-KO-annotations{assay_suffix}.tsv" + params: + MAGs_dir = config["MAGs_dir"], + mod_input_annotations = config["MAGs_dir"] + "mod-MAG-level-KO-annotations.tmp", + temp_output = config["MAGs_dir"] + "MAG-KEGG-Decoder-out.tmp", + mapping_file = config["MAGs_dir"] + "MAG-ID-map.tmp", + orig_html_output = config["MAGs_dir"] + "MAG-KEGG-Decoder-out.html", + final_html_output = config["MAGs_dir"] + config["additional_filename_prefix"] + f"MAG-KEGG-Decoder-out{assay_suffix}.html" + output: + config["MAGs_dir"] + config["additional_filename_prefix"] + f"MAG-KEGG-Decoder-out{assay_suffix}.tsv" + benchmark: + "benchmarks/summarize_MAG_KO_annots_with_KEGG_Decoder-benchmarks.tsv" + shell: + """ + # getting number of MAGs recovered + num_mags_recovered=$(find {params.MAGs_dir} -name "*.fasta" | wc -l | sed 's/^ *//') + # only running if any MAGs were recovered + if [ ${{num_mags_recovered}} -gt 0 ]; then + + # KEGGDecoder splits on the first underscore to identify unique genome/MAG IDs + # this can be problematic with how things are named, so we are swapping them all to not have + # any "_" first, then afterwards we are changing the output table back to the original names so + # they match elsewhere (they will still be slightly different in the html output, but that is + # only manually explored anyway) + + # making version of input for KEGGDecoder with no underscores + tr "_" "-" < {input} > {params.mod_input_annotations} + + # making mapping file + paste <( cut -f 1 {input} ) <( cut -f 1 {params.mod_input_annotations} ) > {params.mapping_file} + + + # running KEGGDecoder + # can only create html output if there are more than 1 + if [ ${{num_mags_recovered}} -gt 1 ]; then + KEGG-decoder -v interactive -i {params.mod_input_annotations} -o {params.temp_output} + ## adding additional prefix to html output if there is one + if [ {params.orig_html_output} != {params.final_html_output} ]; then + mv {params.orig_html_output} {params.final_html_output} + fi + else + KEGG-decoder -i {params.mod_input_annotations} -o {params.temp_output} + fi + + + # swapping MAG IDs back in output tsv from KEGGDecoder + python scripts/swap-MAG-IDs.py -i {params.temp_output} -m {params.mapping_file} -o {output} + + # removing intermediate files + rm {params.mod_input_annotations} {params.mapping_file} {params.temp_output} + + else + + printf "There were no MAGs recovered.\n" > {output} + + fi + """ + + +rule summarize_MAG_level_KO_annotations: + input: + MAG_overview = config["MAGs_dir"] + config["additional_filename_prefix"] + f"MAGs-overview{assay_suffix}.tsv", + trigger = expand(config["annotations_and_tax_dir"] + "{ID}-gene-coverage-annotation-and-tax.tsv", ID = sample_ID_list) + params: + MAGs_dir = config["MAGs_dir"], + annot_and_tax_dir = config["annotations_and_tax_dir"], + tmp_contig_IDs = "curr-contig-ids.tmp" + output: + config["MAGs_dir"] + config["additional_filename_prefix"] + f"MAG-level-KO-annotations{assay_suffix}.tsv" + benchmark: + "benchmarks/summarize_MAG_level_KO_annotations-benchmarks.tsv" + shell: + """ + # only running if any MAGs were recovered + if [ $(find {params.MAGs_dir} -name "*.fasta" | wc -l | sed 's/^ *//') -gt 0 ]; then + + for MAG in $(cut -f 1 {input.MAG_overview} | tail -n +2) + do + + 
sample_ID=$(echo $MAG | sed 's/-MAG-[0-9]*$//') + grep "^>" {params.MAGs_dir}${{MAG}}.fasta | tr -d ">" > {params.tmp_contig_IDs} + + python scripts/parse-MAG-annots.py -i {params.annot_and_tax_dir}${{sample_ID}}-gene-coverage-annotation-and-tax.tsv -w {params.tmp_contig_IDs} -M ${{MAG}} -o {output} + rm {params.tmp_contig_IDs} + + done + + else + + printf "There were no MAGs recovered.\n" > {output} + + fi + """ + + +rule generate_assembly_processing_overview_table: + input: + trigger = config["MAGs_dir"] + config["additional_filename_prefix"] + f"MAGs-overview{assay_suffix}.tsv" + params: + sample_IDs_file = config["sample_info_file"], + assemblies_dir = config["assemblies_dir"], + genes_dir = config["genes_dir"], + mapping_dir = config["mapping_dir"], + bins_dir = config["bins_dir"], + MAGs_dir = config["MAGs_dir"], + assembly_summaries = config["bins_dir"] + config["additional_filename_prefix"] + "bin-assembly-summaries.tsv", + checkm_results = config["bins_dir"] + config["additional_filename_prefix"] + "bins-checkm-out.tsv" + output: + config["assembly_based_dir"] + config["additional_filename_prefix"] + f"Assembly-based-processing-overview{assay_suffix}.tsv" + benchmark: + "benchmarks/generate_assembly_processing_overview_table-benchmarks.tsv" + shell: + """ + bash scripts/generate-assembly-based-overview-table.sh {params.sample_IDs_file} {params.assemblies_dir} {params.genes_dir} {params.mapping_dir} {params.bins_dir} {params.MAGs_dir} {output} + # removing intermediate files from assembly-based process + rm -rf {params.assembly_summaries} {params.checkm_results} + """ + + +rule generate_MAGs_overview_table: + input: + assembly_summaries = config["MAGs_dir"] + config["additional_filename_prefix"] + "MAG-assembly-summaries.tsv", + checkm_results = config["MAGs_dir"] + config["additional_filename_prefix"] + "MAGs-checkm-out.tsv", + gtdb_done_trigger = config["MAGs_dir"] + "gtdbtk-out" + params: + gtdb_results = config["MAGs_dir"] + "gtdbtk-out/gtdbtk.*.summary.tsv", + checkm_tmp = config["MAGs_dir"] + "checkm-estimates.tmp", + gtdb_tmp = config["MAGs_dir"] + "gtdb-taxonomies.tmp", + checkm_w_header_tmp = config["MAGs_dir"] + "checkm-estimates-with-headers.tmp", + gtdb_w_header_tmp = config["MAGs_dir"] + "gtdb-taxonomies-with-headers.tmp", + overview_tmp = config["MAGs_dir"] + "MAGs-overview.tmp", + overview_header_tmp = config["MAGs_dir"] + "MAGs-overview-header.tmp", + overview_sorted_tmp = config["MAGs_dir"] + "MAGs-overview-sorted.tmp", + MAGs_dir = config["MAGs_dir"] + output: + config["MAGs_dir"] + config["additional_filename_prefix"] + f"MAGs-overview{assay_suffix}.tsv" + benchmark: + "benchmarks/generate_MAGs_overview_table-benchmarks.tsv" + shell: + """ + # only running if any MAGs were recovered + if [ $(find {params.MAGs_dir} -name "*.fasta" | wc -l | sed 's/^ *//') -gt 0 ]; then + + # making sure none of the intermediate files exist already + rm -rf {params.checkm_tmp} {params.gtdb_tmp} {params.checkm_w_header_tmp} {params.gtdb_w_header_tmp} {params.overview_tmp} {params.overview_header_tmp} {params.overview_sorted_tmp} + + for MAG in $(cut -f 1 {input.assembly_summaries} | tail -n +2) + do + + grep -w -m 1 "^${{MAG}}" {input.checkm_results} | cut -f 12,13,14 >> {params.checkm_tmp} + grep -w "^${{MAG}}" {params.gtdb_results} | cut -f 2 | sed 's/^.__//' | sed $'s/;.__/\t/g' | awk -F $'\\t' ' BEGIN {{ OFS=FS }} {{ for (i=1; i<=NF; i++) if ( $i ~/^ *$/) $i = "NA" }}; 1 ' >> {params.gtdb_tmp} + + done + + # adding headers + cat <(printf "est. completeness\\test. 
redundancy\\test. strain heterogeneity\\n") {params.checkm_tmp} > {params.checkm_w_header_tmp} + cat <(printf "domain\\tphylum\\tclass\\torder\\tfamily\\tgenus\\tspecies\\n") {params.gtdb_tmp} > {params.gtdb_w_header_tmp} + + paste {input.assembly_summaries} {params.checkm_w_header_tmp} {params.gtdb_w_header_tmp} > {params.overview_tmp} + + # ordering by taxonomy + head -n 1 {params.overview_tmp} > {params.overview_header_tmp} + tail -n +2 {params.overview_tmp} | sort -t $'\\t' -k 14,20 > {params.overview_sorted_tmp} + + cat {params.overview_header_tmp} {params.overview_sorted_tmp} > {output} + + rm -rf {params.checkm_tmp} {params.gtdb_tmp} {params.checkm_w_header_tmp} {params.gtdb_w_header_tmp} {params.overview_tmp} {params.overview_header_tmp} {params.overview_sorted_tmp} {input} + + else + + rm -rf {params.MAGs_dir}* + + printf "There were no MAGs recovered.\n" > {output} + + fi + """ + + +rule summarize_MAG_assemblies: + """ summarize MAG assemblies """ + + conda: + "envs/bit.yaml" + input: + trigger = config["MAGs_dir"] + config["additional_filename_prefix"] + "MAGs-checkm-out.tsv" + params: + intermediate_file = config["MAGs_dir"] + "MAG-summaries.tmp", + MAGs_dir = config["MAGs_dir"] + output: + config["MAGs_dir"] + config["additional_filename_prefix"] + "MAG-assembly-summaries.tsv" + benchmark: + "benchmarks/summarize_MAG_assemblies-benchmarks.tsv" + shell: + """ + # only running if any MAGs were recovered + if [ $(find {params.MAGs_dir} -name "*.fasta" | wc -l | sed 's/^ *//') -gt 0 ]; then + + bit-summarize-assembly {params.MAGs_dir}*.fasta -o {params.intermediate_file} -t + + # slimming down the output + cut -f 1,2,3,5,6,8,11,18,19,20 {params.intermediate_file} > {output} + rm {params.intermediate_file} + + else + + printf "There were no MAGs recovered.\n" > {output} + + fi + """ + + +rule gtdbtk_on_MAGs: + """ assign taxonomy to MAGs with gtdb-tk """ + + conda: + "envs/gtdb-tk.yaml" + input: + trigger = config["MAGs_dir"] + config["additional_filename_prefix"] + "MAGs-checkm-out.tsv", + gtdbtk_db_trigger = config["REF_DB_ROOT_DIR"] + config["GTDB_DATA_PATH"] + "/" + config["GTDB_TRIGGER_FILE"] + params: + MAGs_dir = config["MAGs_dir"], + gtdbtk_db_dir = config["REF_DB_ROOT_DIR"] + config["GTDB_DATA_PATH"], + pplacer_cpus = config["gtdb_tk_checkm_pplacer_cpus"], + gtdb_tk_scratch_location = config["gtdb_tk_scratch_location"] + output: + directory(config["MAGs_dir"] + "gtdbtk-out") + resources: + cpus = config["gtdb_tk_num_cpus"], + mem_mb = config["gtdbtk_memory_resources"] + log: + config["logs_dir"] + "gtdbtk-run.log" + benchmark: + "benchmarks/run_gtdbtk_on_MAGs-with-1-pplacer-cpu-benchmarks.tsv" + shell: + """ + # making sure database variable is set properly (can be off if using previous db location with new gtdb-tk conda env) + # this runs if the exit status of seeking the help menu isn't 0 (e.g. gtdb-tk tells us something is wrong with where it's looking for the ref db) + if ! 
gtdbtk -h > /dev/null; then + # adding wanted location to this conda env PATH (gtdb-tk looks in the GTDBTK_DATA_PATH variable), + # so will be set when the conda environment is started from now on + mkdir -p ${{CONDA_PREFIX}}/etc/conda/activate.d/ + echo 'export GTDBTK_DATA_PATH={params.gtdbtk_db_dir}' >> ${{CONDA_PREFIX}}/etc/conda/activate.d/set_env_vars.sh + + # but still needs to be set for this particular session that is downloading and setting up the db + GTDBTK_DATA_PATH={params.gtdbtk_db_dir} + fi + + # only running if any MAGs were recovered + if [ $(find {params.MAGs_dir} -name "*.fasta" | wc -l | sed 's/^ *//') -gt 0 ]; then + + if [ "{params.gtdb_tk_scratch_location}" != "" ]; then + + gtdbtk classify_wf --scratch_dir {params.gtdb_tk_scratch_location} --genome_dir {params.MAGs_dir} -x fasta --out_dir {output} --cpus {resources.cpus} --pplacer_cpus {params.pplacer_cpus} > {log} 2>&1 + + else + + gtdbtk classify_wf --genome_dir {params.MAGs_dir} -x fasta --out_dir {output} --cpus {resources.cpus} --pplacer_cpus {params.pplacer_cpus} > {log} 2>&1 + + fi + + else + + mkdir -p {output} + printf "There were no MAGs recovered.\n" > {params.MAGs_dir}No-MAGs-recovered.txt + printf "\n\nThere were no MAGs recovered, so GTDB-tk was not run.\n\n" > {log} + + fi + """ + + + +rule filter_checkm_results_and_copy_MAGs: + """ + Filters checkm results based on est. completion, redundancy, and strain heterogeneity set in 'config.yaml' + Defaults are conservatively 90, 10, and 50 + """ + + input: + config["bins_dir"] + config["additional_filename_prefix"] + "bins-checkm-out.tsv" + output: + config["MAGs_dir"] + config["additional_filename_prefix"] + "MAGs-checkm-out.tsv" + params: + bins_dir = config["bins_dir"], + MAGs_dir = config["MAGs_dir"], + tmp_file = config["MAGs_dir"] + "MAGs-checkm-out.tmp", + min_est_comp = config["minimum_estimated_completion"], + max_est_redund = config["maximum_estimated_redundancy"], + max_est_strain_het = config["maximum_estimated_strain_heterogeneity"] + benchmark: + "benchmarks/filtering_checkm_results_and_copying_MAGs-benchmarks.tsv" + shell: + """ + # only running if there were bins recovered + if [ $(find {params.bins_dir} -name "*.fasta" | wc -l | sed 's/^ *//') -gt 0 ]; then + + cat <( printf "Bin Id\tMarker lineage\t# genomes\t# markers\t# marker sets\t0\t1\t2\t3\t4\t5+\tCompleteness\tContamination\tStrain heterogeneity\n" ) \ + <( awk -F $'\\t' ' $12 >= {params.min_est_comp} && $13 <= {params.max_est_redund} && $14 <= {params.max_est_strain_het} ' {input} ) > {params.tmp_file} + + sed 's/-bin\./-MAG-/' {params.tmp_file} > {output} + + for MAG in $(cut -f 1 {params.tmp_file} | tail -n +2) + do + new_ID=$(echo $MAG | sed 's/-bin\./-MAG-/') + cp {params.bins_dir}${{MAG}}.fasta {params.MAGs_dir}${{new_ID}}.fasta + done + + rm {params.tmp_file} + + else + + printf "There were no MAGs recovered.\n" > {output} + + fi + """ + + +rule generate_bins_overview_table: + input: + assembly_summaries = config["bins_dir"] + config["additional_filename_prefix"] + "bin-assembly-summaries.tsv", + checkm_results = config["bins_dir"] + config["additional_filename_prefix"] + "bins-checkm-out.tsv", + timing_trigger = config["MAGs_dir"] + config["additional_filename_prefix"] + "MAGs-checkm-out.tsv" + params: + checkm_tmp = config["bins_dir"] + "checkm-estimates.tmp", + checkm_w_header_tmp = config["bins_dir"] + "checkm-estimates-with-headers.tmp", + bins_dir = config["bins_dir"] + output: + config["bins_dir"] + config["additional_filename_prefix"] + 
f"bins-overview{assay_suffix}.tsv" + benchmark: + "benchmarks/generate_bins_overview_table-benchmarks.tsv" + shell: + """ + # only running if there were bins recovered + if [ $(find {params.bins_dir} -name "*.fasta" | wc -l | sed 's/^ *//') -gt 0 ]; then + + # making sure none of the intermediate files exist already + rm -rf {params.checkm_tmp} {params.checkm_w_header_tmp} + + for bin in $(cut -f 1 {input.assembly_summaries} | tail -n +2) + do + + grep -w -m 1 "^${{bin}}" {input.checkm_results} | cut -f 12,13,14 >> {params.checkm_tmp} + + done + + # adding header + cat <(printf "est. completeness\\test. redundancy\\test. strain heterogeneity\\n") {params.checkm_tmp} > {params.checkm_w_header_tmp} + + # combining + paste {input.assembly_summaries} {params.checkm_w_header_tmp} > {output} + + rm -rf {params.checkm_tmp} {params.checkm_w_header_tmp} + + else + + rm -rf {params.bins_dir}* + printf "There were no bins recovered.\n" > {output} + + fi + """ + + +rule checkm_on_bins: + """ runs checkm on recovered bins """ + + conda: + "envs/checkm.yaml" + input: + trigger = expand(config["mapping_dir"] + "{ID}-metabat-assembly-depth.tsv", ID = sample_ID_list) + params: + bins_dir = config["bins_dir"], + tmp_output_dir = config["bins_dir"] + "checkm-out-tmp/", + tmp_working_dir = config["bins_dir"] + "checkm-working-tmp/", + num_threads = config["gtdb_tk_checkm_pplacer_cpus"], + reduced_tree = config["reduced_tree"] + resources: + cpus = config["num_cpus"], + mem_mb = config["checkm_memory_resources"] + output: + config["bins_dir"] + config["additional_filename_prefix"] + "bins-checkm-out.tsv" + log: + config["logs_dir"] + "checkm.log" + benchmark: + "benchmarks/run_checkm_on_bins-benchmarks.tsv" + shell: + """ + # only running if there were bins recovered + if [ $(find {params.bins_dir} -name "*fasta" | wc -l | sed 's/^ *//') -gt 0 ]; then + + mkdir -p {params.tmp_working_dir} + + if [ "{params.reduced_tree}" == True ]; then + + checkm lineage_wf -f {output} --tab_table -t {resources.cpus} --reduced_tree --pplacer_threads {params.num_threads} -x fasta {params.bins_dir} {params.tmp_output_dir} --tmpdir {params.tmp_working_dir} > {log} 2>&1 + + else + + checkm lineage_wf -f {output} --tab_table -t {resources.cpus} --pplacer_threads {params.num_threads} -x fasta {params.bins_dir} {params.tmp_output_dir} --tmpdir {params.tmp_working_dir} > {log} 2>&1 + + fi + + rm -rf {params.tmp_output_dir} {params.tmp_working_dir} + + else + + printf "There were no bins recovered, so checkm was not run.\n" > {output} + + fi + """ + + +rule summarize_bin_assemblies: + """ summarize bin assemblies """ + + conda: + "envs/bit.yaml" + input: + trigger = expand(config["mapping_dir"] + "{ID}-metabat-assembly-depth.tsv", ID = sample_ID_list) + params: + intermediate_file = config["bins_dir"] + "bin-summaries.tmp", + bins_dir = config["bins_dir"] + output: + config["bins_dir"] + config["additional_filename_prefix"] + "bin-assembly-summaries.tsv" + benchmark: + "benchmarks/summarize_bin_assemblies-benchmarks.tsv" + shell: + """ + # only running if any bins were recovered + if [ $(find {params.bins_dir} -name "*.fasta" | wc -l | sed 's/^ *//') -gt 0 ]; then + + bit-summarize-assembly {params.bins_dir}*.fasta -o {params.intermediate_file} -t + + # slimming down the output + cut -f 1,2,3,5,6,8,11,18,19,20 {params.intermediate_file} > {output} + rm {params.intermediate_file} + + else + + printf "There were no bins recovered.\n" > {output} + + fi + """ + + +rule metabat_binning: + """ + This rule runs metabat2 for binning contigs. 
+ """ + + conda: + "envs/metabat.yaml" + input: + assembly = config["assemblies_dir"] + "{ID}-assembly.fasta", + bam = config["mapping_dir"] + "{ID}.bam" + params: + bins_dir = config["bins_dir"], + prefix = config["bins_dir"] + "{ID}-bin", + tmp_bins_file = "{ID}-bin-files.tmp", + tmp_rename_script = "{ID}-rename.tmp" + resources: + cpus = config["num_threads"] + output: + depth_file = config["mapping_dir"] + "{ID}-metabat-assembly-depth.tsv" + log: + config["logs_dir"] + "{ID}-bam-summarize-and-metabat.log" + benchmark: + "benchmarks/metabat_binning-{ID}-benchmarks.tsv" + shell: + """ + # only running if the assembly produced anything + if [ -s {input.assembly} ]; then + + jgi_summarize_bam_contig_depths --outputDepth {output.depth_file} --percentIdentity 97 --minContigLength 1000 --minContigDepth 1.0 --referenceFasta {input.assembly} {input.bam} > {log} 2>&1 + + # only running if there are contigs with coverage information in the coverage file we just generated + if [ $(wc -l {output.depth_file} | sed 's/^ *//' | cut -f 1 -d " ") -gt 1 ]; then + metabat2 --inFile {input.assembly} --outFile {params.prefix} --abdFile {output.depth_file} -t {resources.cpus} >> {log} 2>&1 + else + printf "\n\nThere was no coverage info generated in {output.depth_file}, so no binning with metabat was performed.\n\n" >> {log} + fi + + # changing extensions from .fa to .fasta to match nt fasta extension elsewhere in GeneLab + find {params.bins_dir} -name {wildcards.ID}*.fa > {params.tmp_bins_file} + + if [ -s {params.tmp_bins_file} ]; then + paste -d " " <( sed 's/^/mv /' {params.tmp_bins_file} ) <( sed 's/.fa/.fasta/' {params.tmp_bins_file} ) > {params.tmp_rename_script} + bash {params.tmp_rename_script} + fi + + rm -rf {params.tmp_bins_file} {params.tmp_rename_script} + + else + + touch {output} + printf "Binning not performed because the assembly didn't produce anything.\n" > {log} + + fi + """ + + +rule combine_read_based_processing_taxonomy: + """ + This rule includes final outputs from read-based functional annotation process as inputs even though they aren't used just so + we can delete those working directories when done with them here (ensuring the other processes are already done with them). + """ + conda: + "envs/humann3.yaml" + input: + in_files = expand(config["read_based_dir"] + "{ID}-humann3-out-dir/{ID}_metaphlan_bugs_list.tsv", ID = sample_ID_list), + trigger1 = config["read_based_dir"] + config["additional_filename_prefix"] + f"Gene-families-cpm{assay_suffix}.tsv", + trigger2 = config["read_based_dir"] + config["additional_filename_prefix"] + f"Gene-families-KO-cpm{assay_suffix}.tsv" + params: + dirs_to_remove = " ".join(expand(config["read_based_dir"] + "{ID}-humann3-out-dir/", ID = sample_ID_list)) + output: + config["read_based_dir"] + config["additional_filename_prefix"] + f"Metaphlan-taxonomy{assay_suffix}.tsv" + benchmark: + "benchmarks/combine_read_based_processing_taxonomy-benchmarks.tsv" + shell: + """ + merge_metaphlan_tables.py {input.in_files} > {output} 2> /dev/null + + # removing redundant text from headers (using the -i flag to keep it portable with darwin shell) + sed -i.tmp 's/_metaphlan_bugs_list//g' {output} + rm -rf {output}.tmp {params.dirs_to_remove} + """ + + +rule gen_read_based_processing_KO_table: + """ + This rule summarizes the read-based humann annotations based on Kegg Orthlogy terms. 
+ """ + conda: + "envs/humann3.yaml" + input: + gene_families = config["read_based_dir"] + config["additional_filename_prefix"] + f"Gene-families{assay_suffix}.tsv" + output: + gene_families_KOs_cpm = config["read_based_dir"] + config["additional_filename_prefix"] + f"Gene-families-KO-cpm{assay_suffix}.tsv" + benchmark: + "benchmarks/gen_read_based_processing_KO_table-benchmarks.tsv" + shell: + """ + humann_regroup_table -i {input} -g uniref90_ko 2> /dev/null | humann_rename_table -n kegg-orthology 2> /dev/null | humann_renorm_table -o {output} --update-snames > /dev/null 2>&1 + """ + + +rule gen_normalized_read_based_processing_tables: + """ + This rule generates some normalized tables of the read-based functional outputs from + humann that are more readily suitable for across sample comparisons. + """ + conda: + "envs/humann3.yaml" + input: + gene_families = config["read_based_dir"] + config["additional_filename_prefix"] + f"Gene-families{assay_suffix}.tsv", + path_abundances = config["read_based_dir"] + config["additional_filename_prefix"] + f"Pathway-abundances{assay_suffix}.tsv" + output: + gene_families_cpm = config["read_based_dir"] + config["additional_filename_prefix"] + f"Gene-families-cpm{assay_suffix}.tsv", + path_abundances_cpm = config["read_based_dir"] + config["additional_filename_prefix"] + f"Pathway-abundances-cpm{assay_suffix}.tsv" + benchmark: + "benchmarks/gen_normalized_read_based_processing_tables-benchmarks.tsv" + shell: + """ + humann_renorm_table -i {input.gene_families} -o {output.gene_families_cpm} --update-snames > /dev/null 2>&1 + humann_renorm_table -i {input.path_abundances} -o {output.path_abundances_cpm} --update-snames > /dev/null 2>&1 + """ + + +rule split_read_based_processing_tables: + """ + The read-based functional annotation tables have taxonomic info and non-taxonomic info mixed + together initially. humann comes with utility scripts to split these. This rule does that, + generating non-taxonomically grouped functional info files and taxonomically grouped ones. 
+ """ + conda: + "envs/humann3.yaml" + input: + gene_families = config["read_based_dir"] + config["additional_filename_prefix"] + "gene-families-initial.tsv", + path_abundances = config["read_based_dir"] + config["additional_filename_prefix"] + "pathway-abundances-initial.tsv", + path_coverages = config["read_based_dir"] + config["additional_filename_prefix"] + "pathway-coverages-initial.tsv" + params: + read_based_dir = config["read_based_dir"], + gene_families_initial_stratified = config["read_based_dir"] + config["additional_filename_prefix"] + "gene-families-initial_stratified.tsv", + gene_families_initial_unstratified = config["read_based_dir"] + config["additional_filename_prefix"] + "gene-families-initial_unstratified.tsv", + path_abundances_initial_stratified = config["read_based_dir"] + config["additional_filename_prefix"] + "pathway-abundances-initial_stratified.tsv", + path_abundances_initial_unstratified = config["read_based_dir"] + config["additional_filename_prefix"] + "pathway-abundances-initial_unstratified.tsv", + path_coverages_initial_stratified = config["read_based_dir"] + config["additional_filename_prefix"] + "pathway-coverages-initial_stratified.tsv", + path_coverages_initial_unstratified = config["read_based_dir"] + config["additional_filename_prefix"] + "pathway-coverages-initial_unstratified.tsv" + output: + gene_families = config["read_based_dir"] + config["additional_filename_prefix"] + f"Gene-families{assay_suffix}.tsv", + gene_families_grouped = config["read_based_dir"] + config["additional_filename_prefix"] + f"Gene-families-grouped-by-taxa{assay_suffix}.tsv", + path_abundances = config["read_based_dir"] + config["additional_filename_prefix"] + f"Pathway-abundances{assay_suffix}.tsv", + path_abundances_grouped = config["read_based_dir"] + config["additional_filename_prefix"] + f"Pathway-abundances-grouped-by-taxa{assay_suffix}.tsv", + path_coverages = config["read_based_dir"] + config["additional_filename_prefix"] + f"Pathway-coverages{assay_suffix}.tsv", + path_coverages_grouped = config["read_based_dir"] + config["additional_filename_prefix"] + f"Pathway-coverages-grouped-by-taxa{assay_suffix}.tsv", + benchmark: + "benchmarks/split_read_based_processing_tables-benchmarks.tsv" + shell: + """ + humann_split_stratified_table -i {input.gene_families} -o {params.read_based_dir} > /dev/null 2>&1 + mv {params.gene_families_initial_stratified} {output.gene_families_grouped} + mv {params.gene_families_initial_unstratified} {output.gene_families} + + humann_split_stratified_table -i {input.path_abundances} -o {params.read_based_dir} > /dev/null 2>&1 + mv {params.path_abundances_initial_stratified} {output.path_abundances_grouped} + mv {params.path_abundances_initial_unstratified} {output.path_abundances} + + humann_split_stratified_table -i {input.path_coverages} -o {params.read_based_dir} > /dev/null 2>&1 + mv {params.path_coverages_initial_stratified} {output.path_coverages_grouped} + mv {params.path_coverages_initial_unstratified} {output.path_coverages} + + rm {input} + """ + + +rule combine_read_based_processing_tables: + """ + This rule combines the read-based humann3 output functional tables from indiviual samples into single + tables across the GLDS dataset. 
+ """ + conda: + "envs/humann3.yaml" + input: + gene_families = expand(config["read_based_dir"] + "{ID}-humann3-out-dir/{ID}_genefamilies.tsv", ID = sample_ID_list), + path_abundances = expand(config["read_based_dir"] + "{ID}-humann3-out-dir/{ID}_pathabundance.tsv", ID = sample_ID_list), + path_coverages = expand(config["read_based_dir"] + "{ID}-humann3-out-dir/{ID}_pathcoverage.tsv", ID = sample_ID_list) + params: + gene_fam_dir = config["read_based_dir"] + "gene-family-results/", + path_abund_dir = config["read_based_dir"] + "path-abundance-results/", + path_cov_dir = config["read_based_dir"] + "path-coverage-results/", + utilities_path = config["REF_DB_ROOT_DIR"] + config["HUMANN3_DBS_DIR"] + "/utility_mapping" + output: + gene_families = config["read_based_dir"] + config["additional_filename_prefix"] + "gene-families-initial.tsv", + path_abundances = config["read_based_dir"] + config["additional_filename_prefix"] + "pathway-abundances-initial.tsv", + path_coverages = config["read_based_dir"] + config["additional_filename_prefix"] + "pathway-coverages-initial.tsv" + benchmark: + "benchmarks/combine_read_based_processing_tables-benchmarks.tsv" + shell: + """ + # setting humann3 utilities location (can be off if we pointed to a previously installed database, and doesn't hurt to reset if it was already good-to-go) + humann_config --update database_folders utility_mapping {params.utilities_path} > /dev/null 2>&1 + + # they each need to be in the same directories to be merged + mkdir -p {params.gene_fam_dir} {params.path_abund_dir} {params.path_cov_dir} + cp {input.gene_families} {params.gene_fam_dir} + cp {input.path_abundances} {params.path_abund_dir} + cp {input.path_coverages} {params.path_cov_dir} + + humann_join_tables -i {params.gene_fam_dir} -o {output.gene_families} > /dev/null 2>&1 + humann_join_tables -i {params.path_abund_dir} -o {output.path_abundances} > /dev/null 2>&1 + humann_join_tables -i {params.path_cov_dir} -o {output.path_coverages} > /dev/null 2>&1 + + rm -rf {params.gene_fam_dir} {params.path_abund_dir} {params.path_cov_dir} + """ + + +if config["single_end_data"] != "TRUE": + # humann3 rule if paired-end data + + rule humann3_PE: + """ + This rule runs humann3 and metaphlan4 on each individual sample generating the + read-based functional annotations and taxonomic classifications. 
+ """ + conda: + "envs/humann3.yaml" + input: + R1 = config["filtered_reads_dir"] + "{ID}" + config["filtered_R1_suffix"], + R2 = config["filtered_reads_dir"] + "{ID}" + config["filtered_R2_suffix"], + chocophlan_db_trigger = config["REF_DB_ROOT_DIR"] + config["HUMANN3_DBS_DIR"] + "/" + config["HUMANN3_CHOCOPHLAN_TRIGGER_FILE"], + uniref_db_trigger = config["REF_DB_ROOT_DIR"] + config["HUMANN3_DBS_DIR"] + "/" + config["HUMANN3_UNIREF_TRIGGER_FILE"], + utility_db_trigger = config["REF_DB_ROOT_DIR"] + config["HUMANN3_DBS_DIR"] + "/" + config["HUMANN3_UTILITY_MAPPING_TRIGGER_FILE"], + metaphlan_db_trigger = config["REF_DB_ROOT_DIR"] + config["METAPHLAN4_DB_DIR"] + "/" + config["METAPHLAN_TRIGGER_FILE"] + params: + combined_reads = config["read_based_dir"] + "{ID}-reads.tmp.fq.gz", + output_dir = config["read_based_dir"] + "{ID}-humann3-out-dir", + tmp_metaphlan = config["read_based_dir"] + "{ID}-humann3-out-dir/{ID}_humann_temp/{ID}_metaphlan_bugs_list.tsv", + tmp_dir = config["read_based_dir"] + "{ID}-humann3-out-dir/{ID}_humann_temp/", + metaphlan_dir = config["REF_DB_ROOT_DIR"] + config["METAPHLAN4_DB_DIR"] + output: + config["read_based_dir"] + "{ID}-humann3-out-dir/{ID}_genefamilies.tsv", + config["read_based_dir"] + "{ID}-humann3-out-dir/{ID}_pathabundance.tsv", + config["read_based_dir"] + "{ID}-humann3-out-dir/{ID}_pathcoverage.tsv", + config["read_based_dir"] + "{ID}-humann3-out-dir/{ID}_metaphlan_bugs_list.tsv" + resources: + cpus = config["num_threads"], + mem_mb = config["humann3_memory_resources"] + log: + config["logs_dir"] + "{ID}-humann3-run.log" + benchmark: + "benchmarks/run_humann3-{ID}-benchmarks.tsv" + shell: + """ + cat {input.R1} {input.R2} > {params.combined_reads} + humann --input {params.combined_reads} --output {params.output_dir} --threads {resources.cpus} --output-basename {wildcards.ID} --metaphlan-options "--index mpa_vJan21_CHOCOPhlAnSGB_202103 --bowtie2db {params.metaphlan_dir} --unclassified_estimation --add_viruses --sample_id {wildcards.ID}" --bowtie-options "--sensitive --mm" > {log} 2>&1 + mv {params.tmp_metaphlan} {output[3]} + rm -rf {params.combined_reads} {params.tmp_dir} + """ + +else: + # humann3 rule if single-end data + + rule humann3_SE: + """ + This rule runs humann3 and metaphlan4 on each individual sample generating the + read-based functional annotations and taxonomic classifications. 
+ """ + conda: + "envs/humann3.yaml" + input: + R1 = config["filtered_reads_dir"] + "{ID}" + config["filtered_suffix"], + chocophlan_db_trigger = config["REF_DB_ROOT_DIR"] + config["HUMANN3_DBS_DIR"] + "/" + config["HUMANN3_CHOCOPHLAN_TRIGGER_FILE"], + uniref_db_trigger = config["REF_DB_ROOT_DIR"] + config["HUMANN3_DBS_DIR"] + "/" + config["HUMANN3_UNIREF_TRIGGER_FILE"], + utility_db_trigger = config["REF_DB_ROOT_DIR"] + config["HUMANN3_DBS_DIR"] + "/" + config["HUMANN3_UTILITY_MAPPING_TRIGGER_FILE"], + metaphlan_db_trigger = config["REF_DB_ROOT_DIR"] + config["METAPHLAN4_DB_DIR"] + "/" + config["METAPHLAN_TRIGGER_FILE"] + params: + output_dir = config["read_based_dir"] + "{ID}-humann3-out-dir", + tmp_metaphlan = config["read_based_dir"] + "{ID}-humann3-out-dir/{ID}_humann_temp/{ID}_metaphlan_bugs_list.tsv", + tmp_dir = config["read_based_dir"] + "{ID}-humann3-out-dir/{ID}_humann_temp/", + metaphlan_dir = config["REF_DB_ROOT_DIR"] + config["METAPHLAN4_DB_DIR"] + output: + config["read_based_dir"] + "{ID}-humann3-out-dir/{ID}_genefamilies.tsv", + config["read_based_dir"] + "{ID}-humann3-out-dir/{ID}_pathabundance.tsv", + config["read_based_dir"] + "{ID}-humann3-out-dir/{ID}_pathcoverage.tsv", + config["read_based_dir"] + "{ID}-humann3-out-dir/{ID}_metaphlan_bugs_list.tsv" + resources: + cpus = config["num_threads"], + mem_mb = config["humann3_memory_resources"] + log: + config["logs_dir"] + "{ID}-humann3-run.log" + benchmark: + "benchmarks/run_humann3-{ID}-benchmarks.tsv" + shell: + """ + humann --input {input.R1} --output {params.output_dir} --threads {resources.cpus} --output-basename {wildcards.ID} --metaphlan-options "--bowtie2db {params.metaphlan_dir} --unclassified_estimation --add_viruses --sample_id {wildcards.ID}" --bowtie-options "--sensitive --mm" > {log} 2>&1 + mv {params.tmp_metaphlan} {output[3]} + rm -rf {params.tmp_dir} + """ + + +rule make_combined_contig_tax_tables: + conda: + "envs/bit.yaml" + input: + expand(config["annotations_and_tax_dir"] + "{ID}-contig-coverage-and-tax.tsv", ID = sample_ID_list) + params: + out_prefix = config["combined_output_dir"] + config["additional_filename_prefix"] + "Combined", + tmp_out = config["combined_output_dir"] + config["additional_filename_prefix"] + "Combined-contig-level-taxonomy-coverages.tsv", + tmp_out_CPM = config["combined_output_dir"] + config["additional_filename_prefix"] + "Combined-contig-level-taxonomy-coverages-CPM.tsv" + output: + combined_tax = config["combined_output_dir"] + config["additional_filename_prefix"] + f"Combined-contig-level-taxonomy-coverages{assay_suffix}.tsv", + norm_combined_tax = config["combined_output_dir"] + config["additional_filename_prefix"] + f"Combined-contig-level-taxonomy-coverages-CPM{assay_suffix}.tsv" + benchmark: + "benchmarks/make_combined_contig_tax_tables-benchmarks.tsv" + shell: + """ + bit-GL-combine-contig-tax-tables {input} -o {params.out_prefix} + # renaming to have GL assay-specific suffix + mv {params.tmp_out} {output.combined_tax} + mv {params.tmp_out_CPM} {output.norm_combined_tax} + """ + + +rule make_combined_gene_level_tables: + conda: + "envs/bit.yaml" + input: + expand(config["annotations_and_tax_dir"] + "{ID}-gene-coverage-annotation-and-tax.tsv", ID = sample_ID_list) + params: + out_prefix = config["combined_output_dir"] + config["additional_filename_prefix"] + "Combined", + tmp_combined_annots = config["combined_output_dir"] + config["additional_filename_prefix"] + "Combined-gene-level-KO-function-coverages.tsv", + tmp_norm_combined_annots = 
config["combined_output_dir"] + config["additional_filename_prefix"] + "Combined-gene-level-KO-function-coverages-CPM.tsv", + tmp_combined_tax = config["combined_output_dir"] + config["additional_filename_prefix"] + "Combined-gene-level-taxonomy-coverages.tsv", + tmp_norm_combined_tax = config["combined_output_dir"] + config["additional_filename_prefix"] + "Combined-gene-level-taxonomy-coverages-CPM.tsv" + output: + combined_annots = config["combined_output_dir"] + config["additional_filename_prefix"] + f"Combined-gene-level-KO-function-coverages{assay_suffix}.tsv", + norm_combined_annots = config["combined_output_dir"] + config["additional_filename_prefix"] + f"Combined-gene-level-KO-function-coverages-CPM{assay_suffix}.tsv", + combined_tax = config["combined_output_dir"] + config["additional_filename_prefix"] + f"Combined-gene-level-taxonomy-coverages{assay_suffix}.tsv", + norm_combined_tax = config["combined_output_dir"] + config["additional_filename_prefix"] + f"Combined-gene-level-taxonomy-coverages-CPM{assay_suffix}.tsv" + benchmark: + "benchmarks/make_combined_gene_level_tables-benchmarks.tsv" + shell: + """ + bit-GL-combine-KO-and-tax-tables {input} -o {params.out_prefix} + # renaming to have GL assay-specific suffix + mv {params.tmp_combined_annots} {output.combined_annots} + mv {params.tmp_norm_combined_annots} {output.norm_combined_annots} + mv {params.tmp_combined_tax} {output.combined_tax} + mv {params.tmp_norm_combined_tax} {output.norm_combined_tax} + """ + + +rule combine_contig_tax_and_coverage: + """ + This rule combines the contig-level taxonomic and coverage information for each individual sample. + """ + input: + cov = config["mapping_dir"] + "{ID}-contig-coverages.tsv", + tax = config["annotations_and_tax_dir"] + "{ID}-contig-tax.tsv" + params: + assembly = config["assemblies_dir"] + "{ID}-assembly.fasta", + AAs = config["genes_dir"] + "{ID}-genes.faa", + contig_tmp = config["annotations_and_tax_dir"] + "{ID}-contig.tmp", + header_tmp = config["annotations_and_tax_dir"] + "{ID}-contig-header.tmp", + contig_p1_tmp = config["annotations_and_tax_dir"] + "{ID}-contig-p1.tmp", + tax_col_tmp = config["annotations_and_tax_dir"] + "{ID}-tax-col.tmp" + output: + config["annotations_and_tax_dir"] + "{ID}-contig-coverage-and-tax.tsv" + benchmark: + "benchmarks/combine_contig_tax_and_coverage-{ID}-benchmarks.tsv" + shell: + """ + # only running if the assembly produced anything + if [ -s {params.assembly} ]; then + + # if there were no genes called, there is no contig-level taxonomy, so dealing with that here + if [ -s {params.AAs} ]; then + paste <( tail -n +2 {input.cov} | sort -V -k 1 ) <( tail -n +2 {input.tax} | sort -V -k 1 | cut -f 2- ) > {params.contig_tmp} + paste <( head -n 1 {input.cov} ) <( head -n 1 {input.tax} | cut -f 2- ) > {params.header_tmp} + cat {params.header_tmp} {params.contig_tmp} > {output} + rm -rf {params.contig_tmp} {params.header_tmp} + rm -rf {input} + + else + + paste <( tail -n +2 {input.cov} | sort -V -k 1 ) > {params.contig_p1_tmp} + sed 's/.*/NA/g' {params.contig_p1_tmp} > {params.tax_col_tmp} + paste {params.contig_p1_tmp} {params.tax_col_tmp} {params.tax_col_tmp} {params.tax_col_tmp} {params.tax_col_tmp} {params.tax_col_tmp} {params.tax_col_tmp} {params.tax_col_tmp} {params.tax_col_tmp} > {params.contig_tmp} + cat <( printf "contig_ID\tcoverage\ttaxid\tdomain\tphylum\tclass\torder\tfamily\tgenus\tspecies\n" ) {params.contig_tmp} > {output} + rm -rf {params.contig_p1_tmp} {params.tax_col_tmp} {params.contig_tmp} + rm -rf {input} + + fi + + else 
+ + printf "contig_ID\tcoverage\ttaxid\tdomain\tphylum\tclass\torder\tfamily\tgenus\tspecies\n" > {output} + rm -rf {input} + + fi + """ + + +rule combine_gene_annots_tax_and_coverage: + """ + This rule combines the gene-level functional annotations, taxonomic classifications, and coverage information for each individual sample. + """ + input: + cov = config["mapping_dir"] + "{ID}-gene-coverages.tsv", + annots = config["annotations_and_tax_dir"] + "{ID}-annotations.tsv", + tax = config["annotations_and_tax_dir"] + "{ID}-gene-tax.tsv" + params: + assembly = config["assemblies_dir"] + "{ID}-assembly.fasta", + AAs = config["genes_dir"] + "{ID}-genes.faa", + gene_tmp = config["annotations_and_tax_dir"] + "{ID}-gene.tmp", + header_tmp = config["annotations_and_tax_dir"] + "{ID}-gene-header.tmp" + output: + config["annotations_and_tax_dir"] + "{ID}-gene-coverage-annotation-and-tax.tsv" + benchmark: + "benchmarks/combine_gene_annots_tax_and_coverage-{ID}-benchmarks.tsv" + shell: + """ + # only running if the assembly produced anything and genes were identified (they are required for this) + if [ -s {params.assembly} ] && [ -s {params.AAs} ]; then + + paste <( tail -n +2 {input.cov} | sort -V -k 1 ) <( tail -n +2 {input.annots} | sort -V -k 1 | cut -f 2- ) <( tail -n +2 {input.tax} | sort -V -k 1 | cut -f 2- ) > {params.gene_tmp} + paste <( head -n 1 {input.cov} ) <( head -n 1 {input.annots} | cut -f 2- ) <( head -n 1 {input.tax} | cut -f 2- ) > {params.header_tmp} + + cat {params.header_tmp} {params.gene_tmp} > {output} + + rm -rf {params.gene_tmp} {params.header_tmp} + rm -rf {input} + + else + + printf "gene_ID\tcoverage\tKO_ID\tKO_function\ttaxid\tdomain\tphylum\tclass\torder\tfamily\tgenus\tspecies\n" > {output} + rm -rf {input} + + fi + """ + + +rule get_cov_and_det: + """ + This rule pulls out coverage and detection information for each sample, gene-level and contig-level, + and filters the gene-level coverage information based on requiring at least 50% detection. 
+ """ + + conda: + "envs/mapping.yaml" + input: + bam = config["mapping_dir"] + "{ID}.bam", + nt = config["genes_dir"] + "{ID}-genes.fasta" + params: + assembly = config["assemblies_dir"] + "{ID}-assembly.fasta", + gene_cov_and_det_tmp = config["mapping_dir"] + "{ID}-gene-cov-and-det.tmp", + contig_cov_and_det_tmp = config["mapping_dir"] + "{ID}-contig-cov-and-det.tmp", + gene_cov_tmp = config["mapping_dir"] + "{ID}-gene-cov.tmp", + contig_cov_tmp = config["mapping_dir"] + "{ID}-contig-cov.tmp", + pileup_mem = config["pileup_mem"] + output: + gene_covs = config["mapping_dir"] + "{ID}-gene-coverages.tsv", + contig_covs = config["mapping_dir"] + "{ID}-contig-coverages.tsv" + resources: + mem_mb = config["pileup_memory_resources"] + log: + config["logs_dir"] + "{ID}-pileup.log" + benchmark: + "benchmarks/get_cov_and_det-{ID}-benchmarks.tsv" + shell: + """ + # only running if the assembly produced anything + if [ -s {params.assembly} ]; then + + # only running on genes also if genes were identified + if [ -s {input.nt} ]; then + + pileup.sh -Xmx{params.pileup_mem} -in {input.bam} fastaorf={input.nt} outorf={params.gene_cov_and_det_tmp} out={params.contig_cov_and_det_tmp} > {log} 2>&1 + + # filtering coverages based on detection + # genes + grep -v "#" {params.gene_cov_and_det_tmp} | awk -F $'\\t' ' BEGIN {{OFS=FS}} {{ if ( $10 <= 0.5 ) $4 = 0 }} {{ print $1,$4 }} ' > {params.gene_cov_tmp} + cat <( printf "gene_ID\tcoverage\n" ) {params.gene_cov_tmp} > {output.gene_covs} + + # contigs + grep -v "#" {params.contig_cov_and_det_tmp} | awk -F $'\\t' ' BEGIN {{OFS=FS}} {{ if ( $5 <= 50 ) $2 = 0 }} {{ print $1,$2 }} ' > {params.contig_cov_tmp} + cat <( printf "contig_ID\tcoverage\n" ) {params.contig_cov_tmp} > {output.contig_covs} + + # removing intermediate files + rm {params.gene_cov_and_det_tmp} {params.contig_cov_and_det_tmp} {params.gene_cov_tmp} {params.contig_cov_tmp} + + else + + pileup.sh -in {input.bam} out={params.contig_cov_and_det_tmp} > {log} 2>&1 + + # filtering coverages based on detection + # contigs + grep -v "#" {params.contig_cov_and_det_tmp} | awk -F $'\\t' ' BEGIN {{OFS=FS}} {{ if ( $5 <= 50 ) $2 = 0 }} {{ print $1,$2 }} ' > {params.contig_cov_tmp} + cat <( printf "contig_ID\tcoverage\n" ) {params.contig_cov_tmp} > {output.contig_covs} + + # writing out empty genes coverage file + printf "gene_ID\tcoverage\n" > {output.gene_covs} + printf "\n\nGene-level coverage info not recovered because the assembly didn't have any genes identified.\n" >> {log} + + # removing intermediate files + rm {params.contig_cov_and_det_tmp} {params.contig_cov_tmp} + + fi + + else + + printf "gene_ID\tcoverage\n" > {output.gene_covs} + printf "contig_ID\tcoverage\n" > {output.contig_covs} + printf "Coverage info not recovered because the assembly didn't produce anything.\n" > {log} + + fi + """ + + +if config["single_end_data"] != "TRUE": + # mapping rule if paired-end data + + rule mapping_PE: + """ + This rule builds the bowtie2 index and runs the mapping for each sample. 
+ """ + conda: + "envs/mapping.yaml" + input: + assembly = config["assemblies_dir"] + "{ID}-assembly.fasta", + R1 = config["filtered_reads_dir"] + "{ID}" + config["filtered_R1_suffix"], + R2 = config["filtered_reads_dir"] + "{ID}" + config["filtered_R2_suffix"] + params: + index = config["mapping_dir"] + "{ID}-index", + mapping_info = config["mapping_dir"] + "{ID}-mapping-info.txt", + num_threads = config["num_threads"] + resources: + cpus = config["num_threads"], + mem_mb = config["mapping_memory_resources"] + output: + config["mapping_dir"] + "{ID}.bam" + log: + config["logs_dir"] + "{ID}-bowtie2-build.log" + benchmark: + "benchmarks/run_mapping-{ID}-benchmarks.tsv" + shell: + """ + # only running if the assembly produced anything + if [ -s {input.assembly} ]; then + + bowtie2-build {input.assembly} {params.index} > {log} 2>&1 + bowtie2 --mm -q --threads {params.num_threads} -x {params.index} -1 {input.R1} -2 {input.R2} --no-unal 2> {params.mapping_info} | samtools view -b | samtools sort -@ {params.num_threads} > {output} 2> /dev/null + rm {params.index}* + + else + + touch {output} + printf "Mapping not performed because the assembly didn't produce anything.\n" > {log} + + fi + """ + +else: + # mapping rule if single-end data + + rule mapping_SE: + """ + This rule builds the bowtie2 index and runs the mapping for each sample. + """ + conda: + "envs/mapping.yaml" + input: + assembly = config["assemblies_dir"] + "{ID}-assembly.fasta", + R1 = config["filtered_reads_dir"] + "{ID}" + config["filtered_suffix"] + params: + index = config["mapping_dir"] + "{ID}-index", + mapping_info = config["mapping_dir"] + "{ID}-mapping-info.txt", + num_threads = config["num_threads"] + resources: + cpus = config["num_threads"], + mem_mb = config["mapping_memory_resources"] + output: + config["mapping_dir"] + "{ID}.bam" + log: + config["logs_dir"] + "{ID}-bowtie2-build.log" + benchmark: + "benchmarks/run_mapping-{ID}-benchmarks.tsv" + shell: + """ + # only running if the assembly produced anything + if [ -s {input.assembly} ]; then + + bowtie2-build {input.assembly} {params.index} > {log} 2>&1 + bowtie2 --mm -q --threads {params.num_threads} -x {params.index} -r {input.R1} --no-unal 2> {params.mapping_info} | samtools view -b | samtools sort -@ {params.num_threads} > {output} 2> /dev/null + rm {params.index}* + + else + + touch {output} + printf "Mapping not performed because the assembly didn't produce anything.\n" > {log} + + fi + """ + + +rule tax_classification: + """ + This rule runs the gene- and contig-level taxonomic classifications for each assembly. 
+ """ + + conda: + "envs/cat.yaml" + input: + assembly = config["assemblies_dir"] + "{ID}-assembly.fasta", + AA = config["genes_dir"] + "{ID}-genes.faa", + cat_db_trigger = config["REF_DB_ROOT_DIR"] + config["CAT_DIR"] + "/" + config["CAT_TRIGGER_FILE"] + output: + gene_tax_out = config["annotations_and_tax_dir"] + "{ID}-gene-tax.tsv", + contig_tax_out = config["annotations_and_tax_dir"] + "{ID}-contig-tax.tsv" + params: + tmp_out_prefix = config["annotations_and_tax_dir"] + "{ID}-tax-out.tmp", + tmp_genes = config["annotations_and_tax_dir"] + "{ID}-gene-tax.tmp", + tmp_contigs = config["annotations_and_tax_dir"] + "{ID}-contig-tax.tmp", + cat_db = config["REF_DB_ROOT_DIR"] + config["CAT_DIR"] + config["CAT_DB"], + cat_tax = config["REF_DB_ROOT_DIR"] + config["CAT_DIR"] + config["CAT_TAX"], + block_size = config["block_size"] + resources: + cpus = config["num_cpus"], + mem_mb = config["CAT_memory_resources"] + log: + config["logs_dir"] + "{ID}-CAT.log" + benchmark: + "benchmarks/run_tax_classification-{ID}-benchmarks.tsv" + shell: + """ + # only running if assembly produced any contigs and genes were identified (they are required for this) + if [ -s {input.assembly} ] && [ -s {input.AA} ]; then + + CAT contigs -d {params.cat_db} -t {params.cat_tax} -n {resources.cpus} -r 3 --top 4 --I_know_what_Im_doing -c {input.assembly} -p {input.AA} -o {params.tmp_out_prefix} --no_stars --block_size {params.block_size} --index_chunks 2 --force > {log} 2>&1 + + # adding names to gene classifications + CAT add_names -i {params.tmp_out_prefix}.ORF2LCA.txt -o {params.tmp_genes} -t {params.cat_tax} --only_official --exclude_scores >> {log} 2>&1 + + # formatting gene classifications + bash scripts/format-gene-tax-classifications.sh {params.tmp_genes} {output.gene_tax_out} + + # adding names to contig classifications + CAT add_names -i {params.tmp_out_prefix}.contig2classification.txt -o {params.tmp_contigs} -t {params.cat_tax} --only_official --exclude_scores >> {log} 2>&1 + + # formatting contig classifications + bash scripts/format-contig-tax-classifications.sh {params.tmp_contigs} {output.contig_tax_out} + + rm -rf {params.tmp_out_prefix}* {params.tmp_genes} {params.tmp_contigs} + + else + + touch {output} + printf "Assembly-based taxonomic classification not performed because the assembly didn't produce anything and/or no genes were identified.\n" > {log} + + fi + """ + + +rule KO_annotation: + """ + This rule runs the gene-level (KO) functional annotation for each sample. 
+ """ + conda: + "envs/kofamscan.yaml" + input: + AAs = config["genes_dir"] + "{ID}-genes.faa", + kofamscan_db_trigger = config["REF_DB_ROOT_DIR"] + config["KOFAMSCAN_DIR"] + "/" + config["KOFAMSCAN_TRIGGER_FILE"] + output: + config["annotations_and_tax_dir"] + "{ID}-annotations.tsv" + params: + assembly = config["assemblies_dir"] + "{ID}-assembly.fasta", + ko_db_dir = config["REF_DB_ROOT_DIR"] + config["KOFAMSCAN_DIR"], + tmp_out = config["annotations_and_tax_dir"] + "{ID}-KO-tab.tmp", + tmp_dir = config["annotations_and_tax_dir"] + "{ID}-tmp-KO-dir" + resources: + cpus = config["num_cpus"], + mem_mb = config["KOFamScan_memory_resources"] + log: + config["logs_dir"] + "{ID}-kofamscan.log" + benchmark: + "benchmarks/run_KO_annotation-{ID}-benchmarks.tsv" + shell: + """ + # only running if assembly produced any contigs and genes were identified (they are required for this) + if [ -s {params.assembly} ] && [ -s {input.AAs} ]; then + + exec_annotation -p {params.ko_db_dir}/profiles/ -k {params.ko_db_dir}/ko_list --cpu {resources.cpus} -f detail-tsv -o {params.tmp_out} --tmp-dir {params.tmp_dir} --report-unannotated {input.AAs} > {log} 2>&1 + + bit-filter-KOFamScan-results -i {params.tmp_out} -o {output} + + rm -rf {params.tmp_out} {params.tmp_dir} + + else + + touch {output} + printf "Functional annotations not performed because the assembly didn't produce anything and/or no genes were identified.\n" > {log} + + fi + """ + + +rule call_genes: + """ + This rule calls genes on each assembly file. + """ + + conda: + "envs/prodigal.yaml" + input: + assembly = config["assemblies_dir"] + "{ID}-assembly.fasta" + output: + AA = config["genes_dir"] + "{ID}-genes.faa", + nt = config["genes_dir"] + "{ID}-genes.fasta", + gff = config["genes_dir"] + "{ID}-genes.gff" + log: + config["logs_dir"] + "{ID}-prodigal.log" + benchmark: + "benchmarks/call_genes-{ID}-benchmarks.tsv" + shell: + """ + # only running if assembly produced any contigs + if [ -s {input.assembly} ]; then + + prodigal -q -c -p meta -a {output.AA} -d {output.nt} -f gff -o {output.gff} -i {input.assembly} > {log} 2>&1 + + # removing line-wraps + bit-remove-wraps {output.AA} > {output.AA}.tmp 2> /dev/null && mv {output.AA}.tmp {output.AA} + bit-remove-wraps {output.nt} > {output.nt}.tmp 2> /dev/null && mv {output.nt}.tmp {output.nt} + + else + + touch {output} + printf "Gene-calling not performed because the assembly didn't produce anything.\n" > {log} + + fi + """ + + +rule summarize_assemblies: + """ + This rule summarizes and reports general stats for all individual sample assemblies in one table. + """ + conda: + "envs/bit.yaml" + input: + expand(config["assemblies_dir"] + "{ID}-assembly.fasta", ID = sample_ID_list) + output: + config["assemblies_dir"] + config["additional_filename_prefix"] + f"assembly-summaries{assay_suffix}.tsv" + benchmark: + "benchmarks/summarize_assemblies-benchmarks.tsv" + shell: + """ + bit-summarize-assembly -o {output} {input} + """ + + +if config["single_end_data"] != "TRUE": + # assembly rule if paired-end data + rule assemble_PE: + """ + This rule handles running the assembly for each individual sample. 
+ """ + conda: + "envs/megahit.yaml" + input: + R1 = config["filtered_reads_dir"] + "{ID}" + config["filtered_R1_suffix"], + R2 = config["filtered_reads_dir"] + "{ID}" + config["filtered_R2_suffix"] + params: + assemblies_dir = config["assemblies_dir"], + max_mem = config["max_mem_megahit"], + failed_assemblies_file = config["assemblies_dir"] + config["additional_filename_prefix"] + f"Failed-assemblies{assay_suffix}.tsv" + resources: + cpus = config["num_threads"], + mem_mb = config["megahit_memory_resources"] + output: + config["assemblies_dir"] + "{ID}-assembly.fasta" + log: + config["logs_dir"] + "{ID}-assembly.log" + benchmark: + "benchmarks/assemble-{ID}-benchmarks.tsv" + shell: + """ + # removing output directory if exists already but rule still needs to be run (because there is no --force option to megahit i dont't think): + rm -rf {params.assemblies_dir}{wildcards.ID}-megahit-out/ + + megahit -1 {input.R1} -2 {input.R2} -m {params.max_mem} -t {resources.cpus} --min-contig-len 500 -o {params.assemblies_dir}{wildcards.ID}-megahit-out > {log} 2>&1 + bit-rename-fasta-headers -i {params.assemblies_dir}{wildcards.ID}-megahit-out/final.contigs.fa -w c_{wildcards.ID} -o {output} + + rm -rf {params.assemblies_dir}{wildcards.ID}-megahit-out/ + + # checking the assembly produced anything (megahit can run, produce the output fasta, but it will be empty if no contigs were assembled) + if [ ! -s {output} ]; then + printf "{wildcards.ID}\tNo contigs assembled\n" >> {params.failed_assemblies_file} + fi + """ + +else: + # assembly rule if single-end data + rule assemble_SE: + """ + This rule handles running the assembly for each individual sample. + """ + conda: + "envs/megahit.yaml" + input: + R1 = config["filtered_reads_dir"] + "{ID}" + config["filtered_suffix"] + params: + assemblies_dir = config["assemblies_dir"], + max_mem = config["max_mem_megahit"], + failed_assemblies_file = config["assemblies_dir"] + config["additional_filename_prefix"] + f"Failed-assemblies{assay_suffix}.tsv" + resources: + cpus = config["num_threads"], + mem_mb = config["megahit_memory_resources"] + output: + config["assemblies_dir"] + "{ID}-assembly.fasta" + log: + config["logs_dir"] + "{ID}-assembly.log" + benchmark: + "benchmarks/assemble-{ID}-benchmarks.tsv" + shell: + """ + # removing output directory if exists already but rule still needs to be run (because there is no --force option to megahit i dont't think): + rm -rf {params.assemblies_dir}{wildcards.ID}-megahit-out/ + + megahit -r {input.R1} -m {params.max_mem} -t {resources.cpus} --min-contig-len 500 -o {params.assemblies_dir}{wildcards.ID}-megahit-out > {log} 2>&1 + bit-rename-fasta-headers -i {params.assemblies_dir}{wildcards.ID}-megahit-out/final.contigs.fa -w c_{wildcards.ID} -o {output} + + rm -rf {params.assemblies_dir}{wildcards.ID}-megahit-out/ + + # checking the assembly produced anything (megahit can run, produce the output fasta, but it will be empty if no contigs were assembled) + if [ ! -s {output} ]; then + printf "{wildcards.ID}\tNo contigs assembled\n" >> {params.failed_assemblies_file} + fi + """ + + +if config["single_end_data"] != "TRUE": + # quality-trimming/filtering rule if this is paired-end data + # quality-trimming/filtering rule run slightly different if data are generated with Swift 1S library prep + if config["swift_1S"] == "TRUE": + + rule bbduk_PE: + """ + This rule runs quality filtering/trimming on raw input fastq files for each individual sample. 
+ """ + + conda: + "envs/qc.yaml" + input: + in1 = config["raw_reads_dir"] + "{ID}" + config["raw_R1_suffix"], + in2 = config["raw_reads_dir"] + "{ID}" + config["raw_R2_suffix"] + output: + out1 = config["filtered_reads_dir"] + "{ID}" + config["filtered_R1_suffix"], + out2 = config["filtered_reads_dir"] + "{ID}" + config["filtered_R2_suffix"] + log: + config["logs_dir"] + "{ID}-bbduk.log" + benchmark: + "benchmarks/bbduk-{ID}-benchmarks.tsv" + shell: + """ + bbduk.sh in={input.in1} in2={input.in2} out1={output.out1} out2={output.out2} \ + ref=${{CONDA_PREFIX}}/opt/bbmap-38.86-0/resources/adapters.fa ktrim=l k=17 ftm=5 qtrim=rl \ + trimq=10 mlf=0.5 maxns=0 swift=t > {log} 2>&1 + """ + + else: + + rule bbduk_PE: + """ + This rule runs quality filtering/trimming on raw input fastq files for each individual sample. + """ + + conda: + "envs/qc.yaml" + input: + in1 = config["raw_reads_dir"] + "{ID}" + config["raw_R1_suffix"], + in2 = config["raw_reads_dir"] + "{ID}" + config["raw_R2_suffix"] + output: + out1 = config["filtered_reads_dir"] + "{ID}" + config["filtered_R1_suffix"], + out2 = config["filtered_reads_dir"] + "{ID}" + config["filtered_R2_suffix"] + log: + config["logs_dir"] + "{ID}-bbduk.log" + benchmark: + "benchmarks/bbduk-{ID}-benchmarks.tsv" + shell: + """ + bbduk.sh in={input.in1} in2={input.in2} out1={output.out1} out2={output.out2} \ + ref=${{CONDA_PREFIX}}/opt/bbmap-38.86-0/resources/adapters.fa ktrim=l k=17 ftm=5 qtrim=rl \ + trimq=10 mlf=0.5 maxns=0 > {log} 2>&1 + """ + +else: + # quality-trimming/filtering rule if this is single-end data + # quality-trimming/filtering rule run slightly different if data are generated with Swift 1S library prep + if config["swift_1S"] == "TRUE": + + rule bbduk_SE: + """ + This rule runs quality filtering/trimming on raw input fastq files for each individual sample. + """ + + conda: + "envs/qc.yaml" + input: + in1 = config["raw_reads_dir"] + "{ID}" + config["raw_suffix"] + output: + out1 = config["filtered_reads_dir"] + "{ID}" + config["filtered_suffix"] + log: + config["logs_dir"] + "{ID}-bbduk.log" + benchmark: + "benchmarks/bbduk-{ID}-benchmarks.tsv" + shell: + """ + bbduk.sh in={input.in1} out1={output.out1} \ + ref=${{CONDA_PREFIX}}/opt/bbmap-38.86-0/resources/adapters.fa ktrim=l k=17 ftm=5 qtrim=rl \ + trimq=10 mlf=0.5 maxns=0 swift=t > {log} 2>&1 + """ + + else: + + rule bbduk_SE: + """ + This rule runs quality filtering/trimming on raw input fastq files for each individual sample. + """ + + conda: + "envs/qc.yaml" + input: + in1 = config["raw_reads_dir"] + "{ID}" + config["raw_suffix"] + output: + out1 = config["filtered_reads_dir"] + "{ID}" + config["filtered_suffix"] + log: + config["logs_dir"] + "{ID}-bbduk.log" + benchmark: + "benchmarks/bbduk-{ID}-benchmarks.tsv" + shell: + """ + bbduk.sh in={input.in1} out1={output.out1} \ + ref=${{CONDA_PREFIX}}/opt/bbmap-38.86-0/resources/adapters.fa ktrim=l k=17 ftm=5 qtrim=rl \ + trimq=10 mlf=0.5 maxns=0 > {log} 2>&1 + """ + + +if config["single_end_data"] != "TRUE": + + # QC rules if this is paired-end data + rule raw_multiqc_PE: + """ + This rule collates all raw fastqc outputs. 
+ """ + + conda: + "envs/qc.yaml" + input: + expand(config["raw_reads_dir"] + "{ID}" + config["raw_R1_suffix"].rsplit(".", 2)[0] + "_fastqc.zip", ID = sample_ID_list), + expand(config["raw_reads_dir"] + "{ID}" + config["raw_R2_suffix"].rsplit(".", 2)[0] + "_fastqc.zip", ID = sample_ID_list) + params: + reads_dir = config["raw_reads_dir"], + int_out_dir = config["additional_filename_prefix"] + "raw_multiqc_report", + out_filename_prefix = config["additional_filename_prefix"] + "raw_multiqc", + int_out_data_dir = config["additional_filename_prefix"] + "raw_multiqc_data", + int_html_file = config["additional_filename_prefix"] + "raw_multiqc.html", + int_zip = config["additional_filename_prefix"] + "raw_multiqc_report.zip", + config_file = "config/multiqc.config" + output: + final_out_zip = config["fastqc_out_dir"] + config["additional_filename_prefix"] + f"raw_multiqc{assay_suffix}_report.zip" + benchmark: + "benchmarks/raw_multiqc-benchmarks.tsv" + shell: + """ + multiqc -q -n {params.out_filename_prefix} --force --cl-config 'max_table_rows: 99999999' --interactive --config {params.config_file} {input} > /dev/null 2>&1 + + # removing the individual fastqc files + rm -rf {params.reads_dir}*fastqc* + + # making an output report directory and moving things into it + mkdir -p {params.int_out_dir} + mv {params.int_html_file} {params.int_out_data_dir} {params.int_out_dir} + + # zipping and removing unzipped dir + zip -q -r {params.int_zip} {params.int_out_dir} && rm -rf {params.int_out_dir} + + # moving to final wanted location + mv {params.int_zip} {output.final_out_zip} + """ + + + rule raw_fastqc_PE: + """ + This rule runs fastqc on all raw input fastq files. + """ + + conda: + "envs/qc.yaml" + input: + config["raw_reads_dir"] + "{ID}" + config["raw_R1_suffix"], + config["raw_reads_dir"] + "{ID}" + config["raw_R2_suffix"] + output: + config["raw_reads_dir"] + "{ID}" + config["raw_R1_suffix"].rsplit(".", 2)[0] + "_fastqc.zip", + config["raw_reads_dir"] + "{ID}" + config["raw_R2_suffix"].rsplit(".", 2)[0] + "_fastqc.zip" + benchmark: + "benchmarks/raw_fastqc-{ID}-benchmarks.tsv" + shell: + """ + fastqc {input} -t 2 -q + """ + + + use rule raw_multiqc_PE as filtered_multiqc_PE with: + input: + expand(config["filtered_reads_dir"] + "{ID}" + config["filtered_R1_suffix"].rsplit(".", 2)[0] + "_fastqc.zip", ID = sample_ID_list), + expand(config["filtered_reads_dir"] + "{ID}" + config["filtered_R2_suffix"].rsplit(".", 2)[0] + "_fastqc.zip", ID = sample_ID_list) + params: + reads_dir = config["filtered_reads_dir"], + int_out_dir = config["additional_filename_prefix"] + "filtered_multiqc_report", + out_filename_prefix = config["additional_filename_prefix"] + "filtered_multiqc", + int_out_data_dir = config["additional_filename_prefix"] + "filtered_multiqc_data", + int_html_file = config["additional_filename_prefix"] + "filtered_multiqc.html", + int_zip = config["additional_filename_prefix"] + "filtered_multiqc_report.zip", + config_file = "config/multiqc.config" + output: + final_out_zip = config["fastqc_out_dir"] + config["additional_filename_prefix"] + f"filtered_multiqc{assay_suffix}_report.zip" + benchmark: + "benchmarks/filtered_multiqc-benchmarks.tsv" + + + use rule raw_fastqc_PE as filtered_fastqc_PE with: + input: + config["filtered_reads_dir"] + "{ID}" + config["filtered_R1_suffix"], + config["filtered_reads_dir"] + "{ID}" + config["filtered_R2_suffix"] + output: + config["filtered_reads_dir"] + "{ID}" + config["filtered_R1_suffix"].rsplit(".", 2)[0] + "_fastqc.zip", + config["filtered_reads_dir"] 
+ "{ID}" + config["filtered_R2_suffix"].rsplit(".", 2)[0] + "_fastqc.zip" + benchmark: + "benchmarks/filtered_fastqc-{ID}-benchmarks.tsv" + + + +else: + # QC rules if this is single-end data + rule raw_multiqc_SE: + """ + This rule collates all raw fastqc outputs. + """ + + conda: + "envs/qc.yaml" + input: + expand(config["raw_reads_dir"] + "{ID}" + config["raw_suffix"].rsplit(".", 2)[0] + "_fastqc.zip", ID = sample_ID_list) + params: + reads_dir = config["raw_reads_dir"], + int_out_dir = config["additional_filename_prefix"] + "raw_multiqc_report", + out_filename_prefix = config["additional_filename_prefix"] + "raw_multiqc", + int_out_data_dir = config["additional_filename_prefix"] + "raw_multiqc_data", + int_html_file = config["additional_filename_prefix"] + "raw_multiqc.html", + int_zip = config["additional_filename_prefix"] + "raw_multiqc_report.zip", + config_file = "config/multiqc.config" + output: + final_out_zip = config["fastqc_out_dir"] + config["additional_filename_prefix"] + f"raw_multiqc{assay_suffix}_report.zip" + benchmark: + "benchmarks/raw_multiqc-benchmarks.tsv" + shell: + """ + multiqc -q -n {params.out_filename_prefix} --force --cl-config 'max_table_rows: 99999999' --interactive --config {params.config_file} {input} > /dev/null 2>&1 + + # removing the individual fastqc files + rm -rf {params.reads_dir}*fastqc* + + # making an output report directory and moving things into it + mkdir -p {params.int_out_dir} + mv {params.int_html_file} {params.int_out_data_dir} {params.int_out_dir} + + # zipping and removing unzipped dir + zip -q -r {params.int_zip} {params.int_out_dir} && rm -rf {params.int_out_dir} + + # moving to final wanted location + mv {params.int_zip} {output.final_out_zip} + """ + + + rule raw_fastqc_SE: + """ + This rule runs fastqc on all raw input fastq files. + """ + + conda: + "envs/qc.yaml" + input: + config["raw_reads_dir"] + "{ID}" + config["raw_suffix"] + output: + config["raw_reads_dir"] + "{ID}" + config["raw_suffix"].rsplit(".", 2)[0] + "_fastqc.zip" + benchmark: + "benchmarks/raw_fastqc-{ID}-benchmarks.tsv" + shell: + """ + fastqc {input} -t 2 -q + """ + + + use rule raw_multiqc_SE as filtered_multiqc_SE with: + input: + expand(config["filtered_reads_dir"] + "{ID}" + config["filtered_suffix"].rsplit(".", 2)[0] + "_fastqc.zip", ID = sample_ID_list) + params: + reads_dir = config["filtered_reads_dir"], + int_out_dir = config["additional_filename_prefix"] + "filtered_multiqc_report", + out_filename_prefix = config["additional_filename_prefix"] + "filtered_multiqc", + int_out_data_dir = config["additional_filename_prefix"] + "filtered_multiqc_data", + int_html_file = config["additional_filename_prefix"] + "filtered_multiqc.html", + int_zip = config["additional_filename_prefix"] + "filtered_multiqc_report.zip", + config_file = "config/multiqc.config" + output: + final_out_zip = config["fastqc_out_dir"] + config["additional_filename_prefix"] + f"filtered_multiqc{assay_suffix}_report.zip" + benchmark: + "benchmarks/filtered_multiqc-benchmarks.tsv" + + + use rule raw_fastqc_SE as filtered_fastqc_SE with: + input: + config["filtered_reads_dir"] + "{ID}" + config["filtered_suffix"] + output: + config["filtered_reads_dir"] + "{ID}" + config["filtered_suffix"].rsplit(".", 2)[0] + "_fastqc.zip" + benchmark: + "benchmarks/filtered_fastqc-{ID}-benchmarks.tsv" + + +### database checking and setup rules ### +rule setup_CAT_db: + """ + This rule checks for the CAT reference database, and downloads if needed. 
+ """ + + conda: + "envs/cat.yaml" + output: + cat_db_trigger = config["REF_DB_ROOT_DIR"] + config["CAT_DIR"] + "/" + config["CAT_TRIGGER_FILE"] + params: + cat_db_dir = config["REF_DB_ROOT_DIR"] + config["CAT_DIR"], + compressed_cat = config["REF_DB_ROOT_DIR"] + config["CAT_DL_FILE"], + compressed_nr_faa = config["REF_DB_ROOT_DIR"] + config["CAT_DIR"] + config["CAT_DB"] + "/2021-01-07.nr.gz", + cat_dl_link = config["CAT_DL_LINK"], + REF_DB_ROOT_DIR = config["REF_DB_ROOT_DIR"] + log: + config["logs_dir"] + "setup-CAT-db.log" + benchmark: + "benchmarks/setup_CAT_db-benchmarks.tsv" + shell: + """ + mkdir -p {params.REF_DB_ROOT_DIR} + + printf "### Setting up CAT reference database ###\n\n" > {log} 2>&1 + + printf " Downloading reference db:\n\n" >> {log} 2>&1 + curl -L -C - -o {params.compressed_cat} {params.cat_dl_link} >> {log} 2>&1 + + printf "\n\n Extracting reference db:\n\n" >> {log} 2>&1 + tar -xvzf {params.compressed_cat} -C {params.REF_DB_ROOT_DIR} >> {log} 2>&1 + + rm {params.compressed_cat} {params.compressed_nr_faa} + + touch {output.cat_db_trigger} + """ + + +rule setup_KOFamScan_db: + """ + This rule checks for the KOFamScan db (minimally currently) and downloads if needed. + """ + + conda: + "envs/kofamscan.yaml" + output: + kofamscan_db_trigger = config["REF_DB_ROOT_DIR"] + config["KOFAMSCAN_DIR"] + "/" + config["KOFAMSCAN_TRIGGER_FILE"] + params: + ko_db_dir = config["REF_DB_ROOT_DIR"] + config["KOFAMSCAN_DIR"], + compressed_ko_list = config["REF_DB_ROOT_DIR"] + config["KOFAMSCAN_DIR"] + "/ko_list.gz", + compressed_profiles = config["REF_DB_ROOT_DIR"] + config["KOFAMSCAN_DIR"] + "/profiles.tar.gz" + log: + config["logs_dir"] + "setup-kofamscan-db.log" + benchmark: + "benchmarks/setup_KOFamScan_db-benchmarks.tsv" + shell: + """ + mkdir -p {params.ko_db_dir} + + printf "### Setting up KOFamScan reference database ###\n\n" > {log} 2>&1 + + # using https instead of ftp for those whose systems that don't have access to the ftp servers + + printf "\n Downloading ko_list file:\n\n" >> {log} 2>&1 + + if ! curl -L -C - --connect-timeout 15 -o {params.compressed_ko_list} ftp://ftp.genome.jp/pub/db/kofam/ko_list.gz >> {log} 2>&1 + then + printf "\n\n Downloading via http since ftp seemed to fail making the connection:\n\n" + curl -L -C - -o {params.compressed_ko_list} https://www.genome.jp/ftp/db/kofam/ko_list.gz >> {log} 2>&1 + fi + + printf "\n\n Downloading profiles.tar.gz file:\n\n" >> {log} 2>&1 + + + if ! curl -L -C - --connect-timeout 15 -o {params.compressed_profiles} ftp://ftp.genome.jp/pub/db/kofam/profiles.tar.gz >> {log} 2>&1 + then + printf "\n\n Downloading via http since ftp seemed to fail making the connection:\n\n" + curl -L -C - -o {params.compressed_profiles} https://www.genome.jp/ftp/db/kofam/profiles.tar.gz >> {log} 2>&1 + fi + + printf "\n\n Decompressing profiles.tar.gz file:\n\n" >> {log} 2>&1 + tar -xzf {params.compressed_profiles} -C {params.ko_db_dir} >> {log} 2>&1 + rm {params.compressed_profiles} + + gunzip {params.compressed_ko_list} + + touch {output.kofamscan_db_trigger} + """ + + +rule setup_gtdbtk_db: + """ + This rule checks for the gtdb-tk db (minimally currently) and downloads if needed. 
+ """ + + conda: + "envs/gtdb-tk.yaml" + output: + gtdbtk_db_trigger = config["REF_DB_ROOT_DIR"] + config["GTDB_DATA_PATH"] + "/" + config["GTDB_TRIGGER_FILE"] + params: + gtdbtk_db_dir = config["REF_DB_ROOT_DIR"] + config["GTDB_DATA_PATH"] + log: + config["logs_dir"] + "setup-gtdbtk-db.log" + benchmark: + "benchmarks/setup_gtdbtk_db-benchmarks.tsv" + shell: + """ + mkdir -p {params.gtdbtk_db_dir} + + # storing current working directory to be able to send the log file here + working_dir=$(pwd) + + cd {params.gtdbtk_db_dir} + + # adding wanted location to this conda env PATH (gtdb-tk looks in the GTDBTK_DATA_PATH variable), + # so will be set when the conda environment is started from now on + mkdir -p ${{CONDA_PREFIX}}/etc/conda/activate.d/ + echo 'export GTDBTK_DATA_PATH={params.gtdbtk_db_dir}' >> ${{CONDA_PREFIX}}/etc/conda/activate.d/set_env_vars.sh + + # but still needs to be set for this particular session that is downloading and setting up the db + GTDBTK_DATA_PATH={params.gtdbtk_db_dir} + + # now downloading + download-db.sh > ${{working_dir}}/{log} 2>&1 + + cd - > /dev/null + + touch {output.gtdbtk_db_trigger} + """ + + +rule setup_humann3_dbs: + """ + This rule checks for the databases required for humann3, downloads if needed. + """ + + conda: + "envs/humann3.yaml" + output: + chocophlan_db_trigger = config["REF_DB_ROOT_DIR"] + config["HUMANN3_DBS_DIR"] + "/" + config["HUMANN3_CHOCOPHLAN_TRIGGER_FILE"], + uniref_db_trigger = config["REF_DB_ROOT_DIR"] + config["HUMANN3_DBS_DIR"] + "/" + config["HUMANN3_UNIREF_TRIGGER_FILE"], + utility_db_trigger = config["REF_DB_ROOT_DIR"] + config["HUMANN3_DBS_DIR"] + "/" + config["HUMANN3_UTILITY_MAPPING_TRIGGER_FILE"], + metaphlan_db_trigger = config["REF_DB_ROOT_DIR"] + config["METAPHLAN4_DB_DIR"] + "/" + config["METAPHLAN_TRIGGER_FILE"] + params: + humann3_dbs_dir = config["REF_DB_ROOT_DIR"] + config["HUMANN3_DBS_DIR"], + metaphlan_dir = config["REF_DB_ROOT_DIR"] + config["METAPHLAN4_DB_DIR"] + resources: + mem_mb = 100000 + log: + config["logs_dir"] + "setup-humann3-dbs.log" + benchmark: + "benchmarks/setup_humann3_dbs-benchmarks.tsv" + shell: + """ + mkdir -p {params} + + printf "### Setting up humann3 reference databases ###\n\n" > {log} 2>&1 + + if [ ! -f {output.chocophlan_db_trigger} ] + then + printf " Downloading full chocophlan db:\n\n" >> {log} 2>&1 + humann3_databases --download chocophlan full {params.humann3_dbs_dir} >> {log} 2>&1 + touch {output.chocophlan_db_trigger} + fi + + if [ ! -f {output.uniref_db_trigger} ] + then + printf "\n\n Downloading uniref90_ec_filtered_diamond db:\n\n" >> {log} 2>&1 + humann3_databases --download uniref uniref90_ec_filtered_diamond {params.humann3_dbs_dir} >> {log} 2>&1 + touch {output.uniref_db_trigger} + fi + + if [ ! -f {output.utility_db_trigger} ] + then + printf "\n\n Downloading full utility_mapping db:\n\n" >> {log} 2>&1 + humann3_databases --download utility_mapping full {params.humann3_dbs_dir} >> {log} 2>&1 + touch {output.utility_db_trigger} + fi + + if [ ! 
-f {output.metaphlan_db_trigger} ] + then + printf "\n\n Downloading metaphlan db:\n\n" >> {log} 2>&1 + metaphlan --install --index mpa_vJan21_CHOCOPhlAnSGB_202103 --bowtie2db {params.metaphlan_dir} >> {log} 2>&1 + # above added due to issues discussed here: https://forum.biobakery.org/t/metaphlan-v4-0-2-and-huma-3-6-metaphlan-taxonomic-profile-provided-was-not-generated-with-the-expected-database/4296/29 + # metaphlan --install --bowtie2db {params.metaphlan_dir} >> {log} 2>&1 + touch {output.metaphlan_db_trigger} + fi + """ + + +rule clean_all: + shell: + """ + rm -rf {dirs_to_create} + """ diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/clean-paths.sh b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/clean-paths.sh deleted file mode 100755 index 4ac5d2de..00000000 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/bin/clean-paths.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/usr/bin/env bash -set -e -# only built for use on N288 cluster -# example usage: bash clean-paths.sh -# making sure by chance we are not overwriting a wanted file called 't' - -if [ -s t ]; then -printf "\n This simple program temporarily writes to a file called 't'\n" -printf " Since that exists already here, we are not going to continue.\n\n" -exit -fi - - -sed 's|/global/data/Data_Processing/Metagenomics_Datasets/GLDS_Datasets/||g' ${1} \ -| sed 's|/global/data/Data_Processing/Amplicon_Datasets/GLDS_Datasets/||g' \ -| sed 's|/global/data/Data_Processing/Metagenomics_Datasets/||g' \ -| sed 's|/global/data/Data_Processing/Amplicon_Datasets/||g' \ -| sed 's|/global/smf/miniconda38_admin/envs/[^/]*/||g' \ -| sed 's|/[^ ]*/GLDS-|GLDS-|g' \ -| sed 's|/global/[^ ]*||g' > t && mv t ${1} \ No newline at end of file diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/config.yaml b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/config.yaml new file mode 100644 index 00000000..d340c671 --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/config.yaml @@ -0,0 +1,258 @@ +############################################################################################ +## Configuration file for GeneLab Illumina metagenomics processing workflow ## +## Developed by Michael D. Lee (Mike.Lee@nasa.gov) ## +############################################################################################ + +############################################################ +##################### VARIABLES TO SET ##################### +############################################################ + +############################################################################ +##### This first set of variables needs to match what is on our system ##### +############################################################################ + +## single-column file with unique portion of sample names +sample_info_file: + "unique-sample-IDs.txt" + +## raw reads directory (can be relative to workflow directory, or needs to be full path) +raw_reads_dir: + "../Raw_Sequence_Data/" + +## if data are single-end only (only one read-file per sample), set this to "TRUE", anything else is considered paired-end +single_end_data: + "" + +## raw read suffixes (region following the unique part of the sample names) + # e.g. 
for "Sample-1_R1_raw.fastq.gz" would be "_R1_raw.fastq.gz" +raw_R1_suffix: + "_R1_raw.fastq.gz" +raw_R2_suffix: + "_R2_raw.fastq.gz" + + # if single-end data, set this one (others above don't matter) +raw_suffix: + "_raw.fastq.gz" + +## root directory of reference databases (or where they will be downloaded if they don't exist yet) + # this should be provided as a full path (starting with `/`) and include the ending `/` as in the + # below example (note that the the `~/` home shortcut is not expanded + # by snakemake's evaluation of files, so don't use that) + # also note, if this is a GeneLab processed dataset config file, the path may + # have been modified for security purposes and no longer be listed as a full path here +REF_DB_ROOT_DIR: + "/path/to/ref-dbs/" + +###################################################################### +##### The rest only need to be altered if we want to change them ##### +###################################################################### + +## run assembly-based workflow, read-based, or both +# (values need to be one of: "assembly-based", "read-based", or "both") +workflow: + "both" + +## number of threads to use PER snakemake job (which is set with the -j parameter passed to snakemake call) + # passed to megahit, bowtie2, samtools, metabat2, checkm-pplacer (many may be running concurrently) +num_threads: + 8 + +## number of CPUs to use PER snakemake job + # passed to KOFamScan, CAT, checkm (many may be running concurrently) +num_cpus: + 8 + +## number of cpus passed to pplacer by gtdb-tk and checkm, pplacer can have issues with memory with multiple cpus; see e.g. https://ecogenomics.github.io/GTDBTk/faq.html#gtdb-tk-reaches-the-memory-limit-pplacer-crashes +gtdb_tk_checkm_pplacer_cpus: + 1 + +## number of CPUs to use for gtdb-tk (only 1 gtdb-tk job will be run, so not multiplied) +gtdb_tk_num_cpus: + 8 + +## scratch directory for gtdb-tk, if wanting to use disk space instead of RAM, can be memory intensive; see https://ecogenomics.github.io/GTDBTk/faq.html#gtdb-tk-reaches-the-memory-limit-pplacer-crashes + # leave empty if wanting to use memory, the default, put in quotes the path to a directory that already exists if wanting to use disk space +gtdb_tk_scratch_location: + "" + +## maximum memory allowed passed to megahit assembler + # can be set either by proportion of available on system, e.g. 0.5 + # or by absolute value in bytes, e.g. 
100e9 would be 100 GB +max_mem_megahit: + 100e9 + +## Block size variable for CAT/diamond, lower value means less RAM usage; see https://github.com/bbuchfink/diamond/wiki/3.-Command-line-options#memory--performance-options +block_size: + 4 + +## reduced_tree option for checkm, limits the RAM usage to 16GB; https://github.com/Ecogenomics/CheckM/wiki/Genome-Quality-Commands#tree + # "TRUE" for yes, anything will be considered "FALSE" and the default full tree will be used +reduced_tree: + "" + +## MAG filtering cutoffs based on checkm quality assessments (in percent); see https://github.com/Ecogenomics/CheckM/wiki/Reported-Statistics +minimum_estimated_completion: + 90 +maximum_estimated_redundancy: + 10 +maximum_estimated_strain_heterogeneity: + 50 + +## quality trimmed/filtered suffixes +filtered_R1_suffix: + "_R1_filtered.fastq.gz" +filtered_R2_suffix: + "_R2_filtered.fastq.gz" + +# if single-end +filtered_suffix: + "_filtered.fastq.gz" + +## output directories (all relative to processing directory, will be created) +fastqc_out_dir: + "../FastQC_Outputs/" +filtered_reads_dir: + "../Filtered_Sequence_Data/" +assembly_based_dir: + "../Assembly-based_Processing/" +assemblies_dir: + "../Assembly-based_Processing/assemblies/" +genes_dir: + "../Assembly-based_Processing/predicted-genes/" +annotations_and_tax_dir: + "../Assembly-based_Processing/annotations-and-taxonomy/" +mapping_dir: + "../Assembly-based_Processing/read-mapping/" +combined_output_dir: + "../Assembly-based_Processing/combined-outputs/" +bins_dir: + "../Assembly-based_Processing/bins/" +MAGs_dir: + "../Assembly-based_Processing/MAGs/" +read_based_dir: + "../Read-based_Processing/" +logs_dir: + "logs/" + + +## additional prefix to add to output files that describe more than one sample (to make them unique compared to other datasets) +# leave as empty, i.e. "", if not wanted, include separator at end if adding one, e.g. 
"Swift1S_" +additional_filename_prefix: + "" + + +## setting for trimming recommended when working with Swift 1S libraries + # adds `swift=t` setting to bbduk quality trimming/filtering command + # for info on this see, e.g., https://swiftbiosci.com/wp-content/uploads/2019/03/16-0853-Tail-Trim-Final-442019.pdf + # set to "TRUE" if data was generated with Swift 1S library prep +swift_1S: + "FALSE" + +## memory used by bbmap's pileup.sh (within the get_cov_and_det rule) +# passed as the -Xmx parameter, 20g means 20 gigs of RAM, 20m means 20 megabytes +# 5g should be sufficient for most assemblies, but if that rule is failing, this may need to be increased +pileup_mem: + "5g" + +################################################################################################################ +##### Resource specifications that may need to be changed (mostly only necessary if using a job scheduler) ##### +####### Could leave these as-is to start, but they are here to be increased if a job fails due to memory ####### +################################################################################################################ + +### these are all passed in the "resources" directive of their respective rules in the Snakefile, going to + # the "mem_mb" argument (so should be provided in terms of megabytes) + +# passed to megahit in the assembly_PE and assembly_SE rules + # this should match what is passed to "max_mem_megahit" above, though it needs to be written differently + # this is passed as "mem_mb", so 100000 would be equal to the default 100e9 set above for "max_mem_megahit" +megahit_memory_resources: + 100000 + +# passed to pileup.sh within the get_cov_and_det rule + # should match what is passed to "pileup_mem" above, though needs to be written differently + # this is passed as "mem_mb", e.g., 5g above, for 5 gigabytes, would be 5000 megabytes, so we need to set this variable to 5000 +pileup_memory_resources: + 5000 + +# passed to mapping_SE and mapping_PE rules, passed as "mem_mb", so 25000 here means 25 gigabytes of memory will be allocated by the scheduler +mapping_memory_resources: + 25000 + +# passed to rule gtdbtk_on_MAGs +gtdbtk_memory_resources: + 500000 + +# passed to rule checkm_on_bins: +checkm_memory_resources: + 250000 + +# passed to humann3 on rules humann3_PE and humann3_SE: +humann3_memory_resources: + 100000 + +# passed to CAT in tax_classification rule: +CAT_memory_resources: + 40000 + +# passed to KOFamScan in rule KO_annotation +KOFamScan_memory_resources: + 5000 + + +####################################################### +################# REFERENCE DATABASES ################# +####################################################### +# The below variables probably shouldn't be changed unless we really want to for some reason. +# The workflow will check the location pointed to above for the below databases, and install them +# if they are not already there. It looks for the below "TRIGGER" filenames (they +# all end with "*_DB_SETUP") in the directory for each database, which it creates when +# it sets them up initially. If we want to point to DBs that already exist on our setup, +# we need to add these (empty) files to their respective directories. The +# workflow just checks the file is there to know it doesn't need to setup the DB. +# +# All together, after installed and unpacked, these will take up about 240 GB. But may +# require up to 500 GB during installation and initial un-packing. 
+ +## specific database locations +KOFAMSCAN_DIR: + "kofamscan_db" +KOFAMSCAN_TRIGGER_FILE: + "KO_DB_SETUP" +CAT_DIR: + "CAT_prepare_20210107" +CAT_DL_FILE: + "CAT_prepare_20210107.tar.gz" +CAT_DL_LINK: + "tbb.bio.uu.nl/bastiaan/CAT_prepare/CAT_prepare_20210107.tar.gz" +CAT_TRIGGER_FILE: + "CAT_DB_SETUP" +CAT_DB: + "/2021-01-07_CAT_database" +CAT_TAX: + "/2021-01-07_taxonomy" +GTDB_DATA_PATH: + "GTDB-tk-ref-db" +GTDB_TRIGGER_FILE: + "GTDBTK_DB_SETUP" +HUMANN3_DBS_DIR: + "humann3-db" +HUMANN3_CHOCOPHLAN_TRIGGER_FILE: + "CHOCOPHLAN_DB_SETUP" +HUMANN3_UNIREF_TRIGGER_FILE: + "UNIREF_DB_SETUP" +HUMANN3_UTILITY_MAPPING_TRIGGER_FILE: + "UTILITY_MAPPING_SETUP" +METAPHLAN4_DB_DIR: + "metaphlan4-db" +METAPHLAN_TRIGGER_FILE: + "METAPHLAN4_DB_SETUP" + +## example usage command ## +# snakemake --use-conda --conda-prefix ${CONDA_PREFIX}/envs -j 2 -p + +# `--use-conda` – this specifies to use the conda environments included in the workflow +# `--conda-prefix` – this allows us to point to where the needed conda environments should be stored. Including this means if we use the workflow on a different dataset somewhere else in the future, it will re-use the same conda environments rather than make new ones. The value listed here, `${CONDA_PREFIX}/envs`, is the default location for conda environments (the variable `${CONDA_PREFIX}` will be expanded to the appropriate location on whichever system it is run on). +# `-j` – this lets us set how many jobs Snakemake should run concurrently (keep in mind that many of the thread and cpu parameters set in the config.yaml file will be multiplied by this) +# `-p` – specifies to print out each command being run to the screen + +# See `snakemake -h` for more options and details. diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/envs/cat.yaml b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/envs/cat.yaml index 1e286a7c..f228b5fa 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/envs/cat.yaml +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/envs/cat.yaml @@ -1,4 +1,3 @@ -name: CAT channels: - conda-forge - bioconda diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/combine-all-gene-tables.py b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/combine-all-gene-tables.py new file mode 100644 index 00000000..01f44186 --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/combine-all-gene-tables.py @@ -0,0 +1,312 @@ +#!/usr/bin/env python + +""" +This is an ad hoc script for the corresponding workflow. It combines and summarizes all the group sample gene-level coverages by KO annotations (1 table) and taxonomy (another table), +including not annotated and not classified, and also produces CPM (coverage-per-million) normalized versions. + +Modified from my `bit-GL-combine-KO-and-tax-tables`: https://github.com/AstrobioMike/bioinf_tools +""" + +import os +import sys +import argparse +import textwrap +import pandas as pd +from math import isnan +from numpy import NaN + +parser = argparse.ArgumentParser(description="This is an ad hoc script for the corresponding workflow. 
It combines and summarizes \ + all the group sample gene-level coverages by KO annotations (1 table) and taxonomy (another table),\ + including not annotated and not classified, and also produces CPM (coverage-per-million) normalized versions.") + +required = parser.add_argument_group('required arguments') + +required.add_argument("input_tables", metavar="input-tables", type=str, nargs="+", help="Input coverage, annotation, and tax tables (as written, expected to end with extension '.tsv'.") +parser.add_argument("-o", "--output-dir", help='Desired output directory (default: "./")', action="store", default="./", dest="output_dir") + +if len(sys.argv)==1: + parser.print_help(sys.stderr) + sys.exit(0) + +args = parser.parse_args() + +################################################################################ + + +def main(): + + check_all_inputs_exist(args.input_tables) + + input_files, group_names = setup_input_lists(args.input_tables) + + KO_dict, tax_dict = {}, {} + + na_taxids = [] + + KO_collapsed_tabs, tax_collapsed_tabs, KO_dict, tax_dict = process_each_table(input_files, KO_dict, tax_dict, na_taxids) + + combined_KO_tab, combined_norm_KO_tab, combined_tax_tab, combined_norm_tax_tab = combine_tabs(KO_collapsed_tabs, tax_collapsed_tabs, KO_dict, tax_dict) + + # writing out tables + combined_KO_tab.to_csv(args.output_dir + "All-combined-KO-function-coverages.tsv", index=False, sep="\t") + combined_norm_KO_tab.to_csv(args.output_dir + "All-combined-KO-function-CPM-normalized-coverages.tsv", index=False, sep="\t") + + combined_tax_tab.to_csv(args.output_dir + "All-combined-taxonomy-coverages.tsv", index=False, sep="\t") + combined_norm_tax_tab.to_csv(args.output_dir + "All-combined-taxonomy-CPM-normalized-coverages.tsv", index=False, sep="\t") + + +################################################################################ + + +# setting some colors +tty_colors = { + 'green' : '\033[0;32m%s\033[0m', + 'yellow' : '\033[0;33m%s\033[0m', + 'red' : '\033[0;31m%s\033[0m' +} + + +### functions ### +def color_text(text, color='green'): + if sys.stdout.isatty(): + return tty_colors[color] % text + else: + return text + + +def wprint(text): + """ print wrapper """ + + print(textwrap.fill(text, width=80, initial_indent=" ", + subsequent_indent=" ", break_on_hyphens=False)) + + +def check_all_inputs_exist(input_tables): + + for file in input_tables: + if not os.path.exists(file): + print("") + wprint(color_text("It seems the specified input file '" + str(file) + "' can't be found.", "yellow")) + print("\nExiting for now.\n") + sys.exit(1) + + +def setup_input_lists(input_tables): + """ setting up input lists for file locations and sample names """ + + input_files = [] + group_names = [] + + for group in input_tables: + input_files.append(group) + group_names.append(os.path.splitext(os.path.basename(group))[0].replace("-gene-coverages-annotations-and-tax", "")) + + return(input_files, group_names) + + +def add_to_KO_dict(table, KO_dict): + """ function for building KO mapping dictionary """ + + for index, row in table.iterrows(): + + if str(row["KO_ID"]).startswith("K"): + + if str(row["KO_ID"]) not in KO_dict: + + KO_dict[row["KO_ID"]] = row["KO_function"] + + return(KO_dict) + + +def add_to_tax_dict(table, tax_dict, na_taxids): + """ function for building tax mapping dictionary """ + + for index, row in table.iterrows(): + + # skipping if not classified + if not pd.isna(row["taxid"]): + + if not row["taxid"] in tax_dict: + tax_dict[row["taxid"]] = row[["domain", "phylum", "class", "order", 
"family", "genus", "species"]].tolist() + + # some taxids have all NA for these ranks (like 1 and 131567), keep track so can sum with not classified + if len(set(row[["domain", "phylum", "class", "order", "family", "genus", "species"]].tolist())) == 1: + na_taxids.append(row["taxid"]) + + + return(tax_dict, na_taxids) + + +def get_na_taxids(tax_dict): + """ some taxids have all NA for these ranks (like 1 and 131567), keeping track of those so can sum together with the "Not classified" row """ + + na_taxids = [] + + for key in tax_dict: + if len(set(mock_dict[key])) == 1: + na_taxids.append(key) + + return(na_taxids) + + +def process_each_table(input_files, KO_dict, tax_dict, na_taxids): + """ reads in each table, normalizes coverage values, collapses based on KO annotations """ + + KO_collapsed_tabs = [] + tax_collapsed_tabs = [] + + for i in range(len(input_files)): + + tab = pd.read_csv(input_files[i], sep="\t", dtype = {'taxid': str}) + + # getting sample column names + sample_cols = tab.columns[11:].tolist() + + # building dictionaries that will hold all KO terms and taxa from all input files + KO_dict = add_to_KO_dict(tab, KO_dict) + tax_dict, na_taxids = add_to_tax_dict(tab, tax_dict, na_taxids) + + # making collapsed KO and tax tables + KO_tab = tab[['KO_ID'] + sample_cols].copy() + # collapsing based on KO terms + KO_tab = KO_tab.groupby(by = ['KO_ID'], dropna = False).sum() + + tax_tab = tab[['taxid'] + sample_cols].copy() + # setting any taxids that are all NA at these standard ranks to "NA" (some have an assigned taxid, but don't have a D/P/C/O/F/G/S taxid, like 1 and 131567) + tax_tab.replace(na_taxids, NaN, inplace = True) + # collapsing based on tax + tax_tab = tax_tab.groupby(by = ['taxid'], dropna = False).sum() + + # appending to lists of tables + KO_collapsed_tabs.append(KO_tab) + tax_collapsed_tabs.append(tax_tab) + + return(KO_collapsed_tabs, tax_collapsed_tabs, KO_dict, tax_dict) + + +def add_KO_functions(tab, KO_dict): + """ adds KO functions to combined table based on KO_ID and KO_dict object holding mappings """ + + KO_functions = [] + + for KO in tab.KO_ID: + + if KO in KO_dict: + + KO_functions.append(str(KO_dict[KO])) + + else: + + KO_functions.append("Not annotated") + + tab.insert(1, "KO_function", KO_functions) + + return(tab) + +def add_tax_info(tab, tax_dict): + """ adds lineage info back to combined table based on taxid and tax_dict object holding mappings """ + + domain_list, phylum_list, class_list, order_list, family_list, genus_list, species_list = [], [], [], [], [], [], [] + + for taxid in tab.taxid: + + if taxid in tax_dict: + + if isinstance(tax_dict[taxid][0], str): + domain_list.append(tax_dict[taxid][0]) + else: + domain_list.append("NA") + + if isinstance(tax_dict[taxid][1], str): + phylum_list.append(tax_dict[taxid][1]) + else: + phylum_list.append("NA") + + if isinstance(tax_dict[taxid][2], str): + class_list.append(tax_dict[taxid][2]) + else: + class_list.append("NA") + + if isinstance(tax_dict[taxid][3], str): + order_list.append(tax_dict[taxid][3]) + else: + order_list.append("NA") + + if isinstance(tax_dict[taxid][4], str): + family_list.append(tax_dict[taxid][4]) + else: + family_list.append("NA") + + if isinstance(tax_dict[taxid][5], str): + genus_list.append(tax_dict[taxid][5]) + else: + genus_list.append("NA") + + + if isinstance(tax_dict[taxid][6], str): + species_list.append(tax_dict[taxid][6]) + else: + species_list.append("NA") + + else: + domain_list.append("NA") + phylum_list.append("NA") + class_list.append("NA") + 
order_list.append("NA") + family_list.append("NA") + genus_list.append("NA") + species_list.append("NA") + + tab.insert(1, "domain", domain_list) + tab.insert(2, "phylum", phylum_list) + tab.insert(3, "class", class_list) + tab.insert(4, "order", order_list) + tab.insert(5, "family", family_list) + tab.insert(6, "genus", genus_list) + tab.insert(7, "species", species_list) + + return(tab) + + +def combine_tabs(KO_tab_list, tax_tab_list, KO_dict, tax_dict): + """ combines all KO tables into one and all tax tables into one """ + + # combining KO tabs + KO_combined_tab = pd.concat(KO_tab_list, axis=1).drop_duplicates().fillna(0).sort_index() + + # moving index to be column and changing that NaN to be "Not annotated", and naming column back to KO_ID + KO_combined_tab = KO_combined_tab.reset_index().fillna("Not annotated") + KO_combined_tab.rename(columns = {"index":'KO_ID'}, inplace = True) + + # adding KO functions + KO_combined_tab = add_KO_functions(KO_combined_tab, KO_dict) + + # combining tax tabs + tax_combined_tab = pd.concat(tax_tab_list, axis=1).drop_duplicates().fillna(0).sort_index() + + # moving index to be column and naming column back to taxid + tax_combined_tab = tax_combined_tab.reset_index() + + # changing the NaN to be "Not annotated" and naming column back to taxid + tax_combined_tab['index'] = tax_combined_tab['index'].fillna("Not classified") + tax_combined_tab.rename(columns = {"index":'taxid'}, inplace = True) + + # adding tax full lineage info + tax_combined_tab = add_tax_info(tax_combined_tab, tax_dict) + + # making CPM-normalized versions of each + KO_combined_norm_tab = KO_combined_tab.copy() + tax_combined_norm_tab = tax_combined_tab.copy() + + # getting sample column names + sample_cols = KO_combined_norm_tab.columns[2:].tolist() + + # making normalized versions + for col in sample_cols: + KO_combined_norm_tab[col] = KO_combined_norm_tab[col] / KO_combined_norm_tab[col].sum() * 1000000 + tax_combined_norm_tab[col] = tax_combined_norm_tab[col] / tax_combined_norm_tab[col].sum() * 1000000 + + return(KO_combined_tab, KO_combined_norm_tab, tax_combined_tab, tax_combined_norm_tab) + +if __name__ == "__main__": + main() diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/combine-benchmarks.sh b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/combine-benchmarks.sh new file mode 100644 index 00000000..7c006303 --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/combine-benchmarks.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash +set -e + +ls benchmarks/ > benchmark-filenames.tmp + +head -n 1 benchmarks/$( head -n 1 benchmark-filenames.tmp ) > benchmark-header.tmp + +paste <( printf "process" ) benchmark-header.tmp > building-tab.tmp + +for file in $(cat benchmark-filenames.tmp) +do + + cat <( paste <( echo ${file} | sed 's/-benchmarks.tsv//' ) <( tail -n +2 benchmarks/${file} ) ) >> building-tab.tmp + +done + +mv building-tab.tmp all-benchmarks.tsv +rm -rf benchmark-filenames.tmp benchmark-header.tmp diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/combine-gene-level-coverages-annots-and-tax-per-group.py b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/combine-gene-level-coverages-annots-and-tax-per-group.py new file mode 100644 index 00000000..f6737c0f --- /dev/null +++ 
b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/combine-gene-level-coverages-annots-and-tax-per-group.py @@ -0,0 +1,151 @@ +#!/usr/bin/env python + +""" +This is an ad hoc script for the corresponding workflow. It combines the sample gene-level coverage files, taxonomy, and KO annotations into one table for each group of samples. +It produces 2 output files, one normalized to coverage-per-million, one not normalized. + +Modified from my `bit-GL-combine-KO-and-tax-tables`: https://github.com/AstrobioMike/bioinf_tools +""" + +import os +import sys +import argparse +import textwrap +import pandas as pd +from math import isnan +from numpy import NaN + +parser = argparse.ArgumentParser(description="This is an ad hoc script for the corresponding workflow. It combines the individual sample gene-level coverage files and KO annotations into one table. \ + It produces 2 output files, one normalized to coverage-per-million, one not normalized.") + +required = parser.add_argument_group('required arguments') + + +required.add_argument("input_coverage_tables", metavar="input-coverage-tables", type=str, nargs="+", help="Input gene-level coverage tables (as written, expected to end with extension '.tsv'.") +required.add_argument("-a", "--KO-annotations-file", help="Input KO annotation table", action="store", dest="KO_tab") +required.add_argument("-t", "--taxonomy-file", help="Input taxonomy table", action="store", dest="tax_tab") +required.add_argument("-g", "--group-ID", help="Group ID", action="store", dest="group") +parser.add_argument("-o", "--output-dir", help="Output directory", action="store", dest="output_dir", default="./") + +if len(sys.argv)==1: + parser.print_help(sys.stderr) + sys.exit(0) + +args = parser.parse_args() + +################################################################################ + + +def main(): + + check_all_inputs_exist(args.input_coverage_tables, args.KO_tab, args.tax_tab) + + input_files, sample_names = setup_input_lists(args.input_coverage_tables, args.group) + + building_df = combine_KO_and_tax_tab(args.KO_tab, args.tax_tab) + + unnormd_combined_tab, normd_combined_tab = process_and_combine_each_coverage_table(input_files, sample_names) + + unnormd_final_tab, normd_final_tab = combine_all(building_df, unnormd_combined_tab, normd_combined_tab) + + # writing out + unnormd_final_tab.to_csv(args.output_dir + args.group + "-gene-coverages-annotations-and-tax.tsv", index=False, sep="\t", na_rep = "NA") + normd_final_tab.to_csv(args.output_dir + args.group + "-CPM-normalized-gene-coverages-annotations-and-tax.tsv", index=False, sep="\t", na_rep = "NA") + + +################################################################################ + + +# setting some colors +tty_colors = { + 'green' : '\033[0;32m%s\033[0m', + 'yellow' : '\033[0;33m%s\033[0m', + 'red' : '\033[0;31m%s\033[0m' +} + + +### functions ### +def color_text(text, color='green'): + if sys.stdout.isatty(): + return tty_colors[color] % text + else: + return text + + +def wprint(text): + """ print wrapper """ + + print(textwrap.fill(text, width=80, initial_indent=" ", + subsequent_indent=" ", break_on_hyphens=False)) + + +def check_all_inputs_exist(input_tables, KO_tab, tax_tab): + + for file in input_tables + [KO_tab] + [tax_tab]: + if not os.path.exists(file): + print("") + wprint(color_text("It seems the specified input file '" + str(file) + "' can't be found.", "yellow")) + print("\nExiting for now.\n") + sys.exit(1) + + +def setup_input_lists(input_tables, group): + """ 
setting up input lists for file locations and sample names """ + + input_files = [] + sample_names = [] + + for sample in input_tables: + input_files.append(sample) + sample_names.append(os.path.splitext(os.path.basename(sample))[0].replace("-" + group + "-gene-coverages", "")) + + return(input_files, sample_names) + + +def combine_KO_and_tax_tab(KO_tab, tax_tab): + + KO_df = pd.read_csv(KO_tab, sep="\t") + tax_df = pd.read_csv(tax_tab, sep="\t", dtype = {"taxid": pd.Int64Dtype()}) # this is needed to keep the taxids without a decimal while handling those that are NA + combined_df = pd.merge(KO_df, tax_df) + return(combined_df) + +def process_and_combine_each_coverage_table(input_files, sample_names): + """ reads in each table, creates combined tables, one normalized to coverage-per-million, one not normalized """ + + normd_tabs = [] + unnormd_tabs = [] + + # iterator to access the same input file and sample name + for i in range(len(input_files)): + + unnormd_tab = pd.read_csv(input_files[i], sep="\t") + + # generating a normalized version + normd_tab = unnormd_tab.copy() + normd_tab.coverage = normd_tab.coverage / normd_tab.coverage.sum() * 1000000 + + # changing coverage column headers to be sample name + unnormd_tab.rename(columns = {"coverage":sample_names[i]}, inplace = True) + normd_tab.rename(columns = {"coverage":sample_names[i]}, inplace = True) + + # adding to lists + unnormd_tabs.append(unnormd_tab) + normd_tabs.append(normd_tab) + + + # combining tables + unnormd_combined_tab = pd.concat(unnormd_tabs, axis = 1).T.drop_duplicates().T + normd_combined_tab = pd.concat(normd_tabs, axis = 1).T.drop_duplicates().T + + return(unnormd_combined_tab, normd_combined_tab) + +def combine_all(building_df, unnormd_combined_tab, normd_combined_tab): + """ combines KO annotations and tax with coverage tables """ + + final_unnormd_tab = pd.merge(building_df, unnormd_combined_tab) + final_normd_tab = pd.merge(building_df, normd_combined_tab) + + return(final_unnormd_tab, final_normd_tab) + +if __name__ == "__main__": + main() diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/format-contig-tax-classifications.sh b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/format-contig-tax-classifications.sh new file mode 100644 index 00000000..984b427d --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/format-contig-tax-classifications.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + +awk -F $'\t' ' BEGIN { OFS = FS } { if ( $2 == "classification" ) { print $1,$4,$6,$7,$8,$9,$10,$11,$12 } \ + else if ( $2 == "no taxid assigned" ) { print $1,"NA","NA","NA","NA","NA","NA","NA","NA" } \ + else { n=split($4,lineage,";"); print $1,lineage[n],$6,$7,$8,$9,$10,$11,$12 } } ' ${1} \ + | sed 's/no support/NA/g' | sed 's/superkingdom/domain/' | sed 's/^# contig/contig_ID/' | sed 's/lineage/taxid/' > ${2} diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/format-gene-tax-classifications.sh b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/format-gene-tax-classifications.sh new file mode 100644 index 00000000..7e7e9dff --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/format-gene-tax-classifications.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + + awk -F $'\t' ' BEGIN { OFS = FS } { if ( $3 == "lineage" ) { print $1,$3,$5,$6,$7,$8,$9,$10,$11 } \ + else if ( $2 == "ORF has no hit to 
database" || $2 ~ /^no taxid found/ ) { print $1,"NA","NA","NA","NA","NA","NA","NA","NA" } \ + else { n=split($3,lineage,";"); print $1,lineage[n],$5,$6,$7,$8,$9,$10,$11 } } ' ${1} \ + | sed 's/no support/NA/g' | sed 's/superkingdom/domain/' | sed 's/# ORF/gene_ID/' | sed 's/lineage/taxid/' > ${2} diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/generate-assembly-based-overview-table.sh b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/generate-assembly-based-overview-table.sh new file mode 100644 index 00000000..39b3aef0 --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/generate-assembly-based-overview-table.sh @@ -0,0 +1,74 @@ +#!/usr/bin/env bash + +sample_IDs_file=${1} +assemblies_dir=${2} +genes_dir=${3} +mapping_dir=${4} +bins_dir=${5} +MAGs_dir=${6} +output=${7} + +# starting output file +printf "Sample_ID\tassembly_produced\tgene_calls_identified\tread_mapping_successful\tbins_recovered\tMAGs_recovered\n" > ${output} + +# looping through all input files and generating columns for final table +for sample in $(cat ${sample_IDs_file}) +do + + # checking assembly + if [ ! -s ${assemblies_dir}/${sample}-assembly.fasta ]; then + printf "No\n" >> assembly-status.tmp + + # removing empty output fasta + rm -rf ${assemblies_dir}/${sample}-assembly.fasta + + else + printf "Yes\n" >> assembly-status.tmp + fi + + # checking gene calls + if [ ! -s ${genes_dir}/${sample}-genes.faa ]; then + printf "No\n" >> genes-status.tmp + + # removing empty output files + rm -rf ${genes_dir}/${sample}-genes.faa ${genes_dir}/${sample}-genes.fasta ${genes_dir}/${sample}-genes.gff + + else + printf "Yes\n" >> genes-status.tmp + fi + + # checking read-mapping outputs + if [ ! 
-s ${mapping_dir}/${sample}.bam ]; then + printf "No\n" >> mapping-status.tmp + + # removing empty output files + rm -rf ${mapping_dir}/${sample}.bam ${mapping_dir}/${sample}-metabat-assembly-depth.tsv + + else + printf "Yes\n" >> mapping-status.tmp + fi + + # getting number of bins recovered if any produced + if compgen -G "${bins_dir}*.fasta" > /dev/null; then + num_bins=$(ls ${bins_dir}*.fasta | grep -c "${sample}-bin.[0-9]*.fasta") + printf "${num_bins}\n" >> bins-status.tmp + else + printf "0\n" >> bins-status.tmp + fi + + # getting number of MAGs recovered + if compgen -G "${MAGs_dir}*.fasta" >/dev/null; then + num_MAGs=$(ls ${MAGs_dir}*.fasta | grep -c "${sample}-MAG-[0-9]*.fasta") + printf "${num_MAGs}\n" >> MAGs-status.tmp + else + printf "0\n" >> MAGs-status.tmp + fi + +done + +# combining, adding to output file and removing intermediates +cat <( paste ${sample_IDs_file} assembly-status.tmp \ + genes-status.tmp mapping-status.tmp \ + bins-status.tmp MAGs-status.tmp ) >> ${output} + +rm assembly-status.tmp genes-status.tmp mapping-status.tmp bins-status.tmp MAGs-status.tmp diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/parse-MAG-annots.py b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/parse-MAG-annots.py new file mode 100644 index 00000000..a0c172fd --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/parse-MAG-annots.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python + +import argparse + +parser = argparse.ArgumentParser(description='This script does whatever it needs to do.') + +required = parser.add_argument_group('required arguments') + +required.add_argument("-i", "--input-tsv", help='no help for you, come back, 2 years!', action="store", required=True) +required.add_argument("-w", "--wanted-things", help="what'd i tell you?", action="store", required=True) +required.add_argument("-M", "--MAG-ID", action="store", required=True) + +parser.add_argument("-o", "--output_tsv", help='Default: "out.tsv"', action="store", dest="output_tsv", default="out.tsv") + +args = parser.parse_args() + +targets_set = set(line.strip() for line in open(args.wanted_things)) + +out_tab = open(args.output_tsv, "a") + +for line in open(args.input_tsv): + line = line.strip().split("\t") + if line[2] != "NA": + + # dropping last coding seq # field so matches contig ID + if line[0].rsplit('_', 1)[0] in targets_set: + + out_tab.write(str(args.MAG_ID) + "\t" + line[2] + "\n") + +out_tab.close() diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/slurm-status.py b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/slurm-status.py new file mode 100644 index 00000000..2acb7e3e --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/slurm-status.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python +import subprocess +import sys + +jobid = sys.argv[1] + +# if wanting to use, this should be added to the snakemake call from the root workflow dir: `--cluster-status scripts/slurm-status.py` + +output = str(subprocess.check_output("sacct -j %s --format State --noheader | head -1 | awk '{print $1}'" % jobid, shell=True).strip()) + +running_status=["PENDING", "CONFIGURING", "COMPLETING", "RUNNING", "SUSPENDED"] +if "COMPLETED" in output: + print("success") +elif any(r in output for r in running_status): + print("running") +else: + print("failed") diff --git 
a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/swap-MAG-IDs.py b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/swap-MAG-IDs.py new file mode 100644 index 00000000..830c62dc --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/swap-MAG-IDs.py @@ -0,0 +1,31 @@ +#!/usr/bin/env python + +import argparse +import pandas as pd + +parser = argparse.ArgumentParser(description='This script swaps the MAG IDs back to what they were prior to running KEGGDecoder.') + +required = parser.add_argument_group('required arguments') + +required.add_argument("-i", "--input-tsv", help='Output table from KEGGDecoder', action="store", required=True) +required.add_argument("-m", "--map-tsv", help='Tab-delimited map with 1st column holding original name, and 2nd column holding modified name', action="store", required=True) + +parser.add_argument("-o", "--output-tsv", help='Output table with adjusted MAG IDs (Default: "out.tsv")', action="store", default="out.tsv") + +args = parser.parse_args() + +# reading in mapping file into dictionary +map_dict = {} +with open(args.map_tsv) as mapping: + for line in mapping: + line = line.strip().split("\t") + map_dict[line[1]] = line[0] + +# reading in output table from KEGGDecoder +in_tab = pd.read_csv(args.input_tsv, sep = "\t", index_col = 0) + +# renaming back to what they were before modifying to be compliant with KEGGDecoder +mod_tab = in_tab.rename(index = map_dict) + +# writing out modified file +mod_tab.to_csv(args.output_tsv, sep = "\t") From c7505897ff466e8f9af2c1c9ceb3da7954770a06 Mon Sep 17 00:00:00 2001 From: olabiyi Date: Tue, 27 Aug 2024 18:02:49 -0500 Subject: [PATCH 19/48] fixed README typo --- .../NF_MGIllumina/README.md | 29 ++++++++++--------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/README.md b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/README.md index 7e2899b2..28580e51 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/README.md +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/README.md @@ -3,12 +3,12 @@ ## General Workflow Info ### Implementation Tools + The current GeneLab Illumina metagenomics sequencing data processing pipeline (MGIllumina), [GL-DPPD-7107-A.md](../../Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md), is implemented as a [Nextflow](https://nextflow.io/) DSL2 workflow and utilizes [Singularity](https://docs.sylabs.io/guides/3.10/user-guide/introduction.html) containers or [conda](https://docs.conda.io/en/latest/) environments to install/run all tools. This workflow is run using the command line interface (CLI) of any unix-based system. While knowledge of creating workflows in nextflow is not required to run the workflow as is, [the Nextflow documentation](https://nextflow.io/docs/latest/index.html) is a useful resource for users who want to modify and/or extend this workflow. > **Note on reference databases** > Many reference databases are relied upon throughout this workflow. They will be installed and setup automatically the first time the workflow is run. All together, after installed and unpacked, they will take up about about 340 GB of storage, but they may also require up to 500GB during installation and initial un-packing, so be sure there is enough room on your system before running the workflow. - ## Utilizing the Workflow 1. 
[Install Nextflow and Singularity](#1-install-nextflow-and-singularity) @@ -19,21 +19,21 @@ The current GeneLab Illumina metagenomics sequencing data processing pipeline (M 3. [Fetch Singularity Images](#3-fetch-singularity-images) -4. [Run the workflow](#3-run-the-workflow) - 3a. [Approach 1: Run slurm jobs in singularity containers with OSD accession as input](#3a-approach-1-run-slurm-jobs-in-singularity-containers-with-osd-accession-as-input) - 3b. [Approach 2: Run slurm jobs in singularity containers with a csv file as input](#3b-approach-2-run-slurm-jobs-in-singularity-containers-with-a-csv-file-as-input) - 3c. [Approach 3: Run jobs locally in conda environments and specify the path to one or more existing conda environments](#3c-approach-run-jobs-locally-in-conda-environments-and-specify-the-path-to-one-or-more-existing-conda-environments) - 3d. [Modify parameters and cpu resources in the nextflow config file](#3d-modify-parameters-and-cpu-resources-in-the-nextflow-config-file) +4. [Run the workflow](#4-run-the-workflow) + 3a. [Approach 1: Run slurm jobs in singularity containers with OSD accession as input](#4a-approach-1-run-slurm-jobs-in-singularity-containers-with-osd-accession-as-input) + 3b. [Approach 2: Run slurm jobs in singularity containers with a csv file as input](#4b-approach-2-run-slurm-jobs-in-singularity-containers-with-a-csv-file-as-input) + 3c. [Approach 3: Run jobs locally in conda environments and specify the path to one or more existing conda environments](#4c-approach-run-jobs-locally-in-conda-environments-and-specify-the-path-to-one-or-more-existing-conda-environments) + 3d. [Modify parameters and cpu resources in the nextflow config file](#4d-modify-parameters-and-cpu-resources-in-the-nextflow-config-file) -4. [Workflow outputs](#4-workflow-outputs) - 4a. [Main outputs](#4a-main-outputs) - 4b. [Resource logs](#4b-resource-logs) +5. [Workflow outputs](#5-workflow-outputs) + 4a. [Main outputs](#5a-main-outputs) + 4b. [Resource logs](#5b-resource-logs)
--- -### 1. Install Nextflow and Singularity +### 1. Install Nextflow and Singularity #### 1a. Install Nextflow @@ -70,6 +70,7 @@ All files required for utilizing the NF_XXX GeneLab workflow for processing meta wget https://github.com/nasa/GeneLab_Data_Processing/releases/download/NF_MGIllumina/NF_MGIllumina.zip unzip NF_MGIllumina.zip && cd NF_XXX-X_X.X.X ``` +
--- @@ -79,13 +80,13 @@ unzip NF_MGIllumina.zip && cd NF_XXX-X_X.X.X Although Nextflow can fetch Singularity images from a url, doing so may cause issues as detailed [here](https://github.com/nextflow-io/nextflow/issues/1210). To avoid this issue, run the following command to fetch the Singularity images prior to running the NF_MGIllumina workflow: + > Note: This command should be run in the location containing the `NF_MGIllumina` directory that was downloaded in [step 2](#2-download-the-workflow-files) above. ```bash bash ./bin/prepull_singularity.sh nextflow.config ``` - Once complete, a `singularity` folder containing the Singularity images will be created. Run the following command to export this folder as a Nextflow configuration environment variable to ensure Nextflow can locate the fetched images: ```bash @@ -135,17 +136,17 @@ nextflow run main.nf -resume -profile conda --csv_file SE_file.csv --conda.qc

See `nextflow run -h` and [Nextflow's CLI run command documentation](https://nextflow.io/docs/latest/cli.html#run) for more options and details on how to run nextflow. From c6c929a29450d03ee2ada3fc54f07d9165b470e2 Mon Sep 17 00:00:00 2001 From: olabiyi Date: Tue, 27 Aug 2024 18:06:11 -0500 Subject: [PATCH 20/48] fixed README typo --- .../Workflow_Documentation/NF_MGIllumina/README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/README.md b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/README.md index 28580e51..6e5ece95 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/README.md +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/README.md @@ -20,14 +20,14 @@ The current GeneLab Illumina metagenomics sequencing data processing pipeline (M 3. [Fetch Singularity Images](#3-fetch-singularity-images) 4. [Run the workflow](#4-run-the-workflow) - 3a. [Approach 1: Run slurm jobs in singularity containers with OSD accession as input](#4a-approach-1-run-slurm-jobs-in-singularity-containers-with-osd-accession-as-input) - 3b. [Approach 2: Run slurm jobs in singularity containers with a csv file as input](#4b-approach-2-run-slurm-jobs-in-singularity-containers-with-a-csv-file-as-input) - 3c. [Approach 3: Run jobs locally in conda environments and specify the path to one or more existing conda environments](#4c-approach-run-jobs-locally-in-conda-environments-and-specify-the-path-to-one-or-more-existing-conda-environments) - 3d. [Modify parameters and cpu resources in the nextflow config file](#4d-modify-parameters-and-cpu-resources-in-the-nextflow-config-file) + 4a. [Approach 1: Run slurm jobs in singularity containers with OSD accession as input](#4a-approach-1-run-slurm-jobs-in-singularity-containers-with-osd-accession-as-input) + 4b. [Approach 2: Run slurm jobs in singularity containers with a csv file as input](#4b-approach-2-run-slurm-jobs-in-singularity-containers-with-a-csv-file-as-input) + 4c. [Approach 3: Run jobs locally in conda environments and specify the path to one or more existing conda environments](#4c-approach-run-jobs-locally-in-conda-environments-and-specify-the-path-to-one-or-more-existing-conda-environments) + 4d. [Modify parameters and cpu resources in the nextflow config file](#4d-modify-parameters-and-cpu-resources-in-the-nextflow-config-file) 5. [Workflow outputs](#5-workflow-outputs) - 4a. [Main outputs](#5a-main-outputs) - 4b. [Resource logs](#5b-resource-logs) + 5a. [Main outputs](#5a-main-outputs) + 5b. [Resource logs](#5b-resource-logs)
From 54effb8e7fc7c468291b20e8191d7823a3bdcdbd Mon Sep 17 00:00:00 2001 From: olabiyi Date: Tue, 27 Aug 2024 18:11:55 -0500 Subject: [PATCH 21/48] fixed README typo --- .../Illumina/Workflow_Documentation/NF_MGIllumina/README.md | 2 +- Metagenomics/Illumina/Workflow_Documentation/README.md | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/README.md b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/README.md index 6e5ece95..9a405619 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/README.md +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/README.md @@ -162,7 +162,7 @@ Once you've downloaded the workflow template, you can modify the parameters in t --- -### 5. Workflow outputs +### 5. Workflow outputs #### 5a. Main outputs diff --git a/Metagenomics/Illumina/Workflow_Documentation/README.md b/Metagenomics/Illumina/Workflow_Documentation/README.md index 468b5ca1..f739c11f 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/README.md +++ b/Metagenomics/Illumina/Workflow_Documentation/README.md @@ -6,8 +6,8 @@ |Pipeline Version|Current Workflow Version (for respective pipeline version)| |:---------------|:---------------------------------------------------------| -|*[GL-DPPD-7107.md](../Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107.md)|[2.0.4](SW_MGIllumina)| +|*[GL-DPPD-7107-A.md](../Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md)|[1.0.0](NF_MGIllumina)| *Current GeneLab Pipeline/Workflow Implementation -> See the [workflow change log](SW_MGIllumina/CHANGELOG.md) to access previous workflow versions and view all changes associated with each version update. +> See the [workflow change log](NF_MGIllumina/CHANGELOG.md) to access previous workflow versions and view all changes associated with each version update. From 8e36cc8371f47eab4c6e1698dfea12725e247791 Mon Sep 17 00:00:00 2001 From: asaravia-butler <70983120+asaravia-butler@users.noreply.github.com> Date: Wed, 28 Aug 2024 02:56:20 -0700 Subject: [PATCH 22/48] Updating header and correcting tool version typos --- .../GL-DPPD-7107-A.md | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/Metagenomics/Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md b/Metagenomics/Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md index 55e70382..6fc12617 100644 --- a/Metagenomics/Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md +++ b/Metagenomics/Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md @@ -4,18 +4,19 @@ --- -**Date:** September 15, 2021 -**Revision:** - +**Date:** 2024 +**Revision:** -A **Document Number:** GL-DPPD-7107 **Submitted by:** -Michael D. Lee (GeneLab Analysis Team) +Olabiyi A. 
Obayomi (GeneLab Analysis Team) **Approved by:** Sylvain Costes (GeneLab Project Manager) Samrawit Gebre (GeneLab Deputy Project Manager and Interim GeneLab Configuration Manager) -Amanda Saravia-Butler (GeneLab Data Processing Lead) -Jonathan Galazka (GeneLab Project Scientist) +Barbara Novak (GeneLab Data Processing Lead) +Amanda Saravia-Butler (GeneLab Science Lead) +Lauren Sanders (OSDR Project Scientist) --- @@ -55,7 +56,7 @@ Jonathan Galazka (GeneLab Project Scientist) |megahit| 1.2.9 |[https://github.com/voutcn/megahit#megahit](https://github.com/voutcn/megahit#megahit)| |bit| 1.8.53 |[https://github.com/AstrobioMike/bioinf_tools#bioinformatics-tools-bit](https://github.com/AstrobioMike/bioinf_tools#bioinformatics-tools-bit)| |bowtie2| 2.4.1 |[https://github.com/BenLangmead/bowtie2#overview](https://github.com/BenLangmead/bowtie2#overview)| -|samtools| 1.2 |[https://github.com/samtools/samtools#samtools](https://github.com/samtools/samtools#samtools)| +|samtools| 1.20 |[https://github.com/samtools/samtools#samtools](https://github.com/samtools/samtools#samtools)| |prodigal| 2.6.3 |[https://github.com/hyattpd/Prodigal#prodigal](https://github.com/hyattpd/Prodigal#prodigal)| |KOFamScan| 1.3.0 |[https://github.com/takaram/kofam_scan#kofamscan](https://github.com/takaram/kofam_scan#kofamscan)| |CAT| 5.2.3 |[https://github.com/dutilh/CAT#cat-and-bat](https://github.com/dutilh/CAT#cat-and-bat)| @@ -64,7 +65,7 @@ Jonathan Galazka (GeneLab Project Scientist) |gtdbtk| 2.4.0 |[https://github.com/Ecogenomics/GTDBTk](https://github.com/Ecogenomics/GTDBTk)| |KEGGDecoder| 1.2.2 |[https://github.com/bjtully/BioData/tree/master/KEGGDecoder#kegg-decoder](https://github.com/bjtully/BioData/tree/master/KEGGDecoder#kegg-decoder) |HUMAnN3| 3.9 |[https://huttenhower.sph.harvard.edu/humann3/](https://huttenhower.sph.harvard.edu/humann3/)| -|MetaPhlAn3| 4.10 |[https://github.com/biobakery/MetaPhlAn/tree/3.0](https://github.com/biobakery/MetaPhlAn/tree/3.0)| +|MetaPhlAn3| 4.1.0 |[https://github.com/biobakery/MetaPhlAn/tree/3.0](https://github.com/biobakery/MetaPhlAn/tree/3.0)| --- From b9e234b34e95bb4ef4cfb70f689300848846a83f Mon Sep 17 00:00:00 2001 From: asaravia-butler <70983120+asaravia-butler@users.noreply.github.com> Date: Wed, 28 Aug 2024 03:09:11 -0700 Subject: [PATCH 23/48] Adding updates from previous version --- .../GL-DPPD-7107-A.md | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/Metagenomics/Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md b/Metagenomics/Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md index 6fc12617..56f262eb 100644 --- a/Metagenomics/Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md +++ b/Metagenomics/Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md @@ -20,6 +20,21 @@ Lauren Sanders (OSDR Project Scientist) --- +## Updates from previous version + +- The following tool versions were updated: + - FastQC + - MultiQC + - bowtie2 + - samtools + - CAT + - gtdbtk + - HUMAnN3 + - MetaPhIAn3 +- In [step 14d](#14d-mag-taxonomic-classification), MAG taxonomic classification, gtdbtk classify_wf now takes a new argument "--skip_ani_screen" which specifies to skip the ani_screening step to classify genomes using mash and skani. 
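+
+  As a quick point of reference, a minimal sketch of the step 14d classification call using this new argument (with the same example directory names used later in this document) looks like:
+
+  ```bash
+  gtdbtk classify_wf --genome_dir MAGs/ -x fa --out_dir gtdbtk-output-dir --skip_ani_screen
+  ```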
+ +--- + # Table of contents - [**Software used**](#software-used) @@ -877,8 +892,8 @@ gtdbtk classify_wf --genome_dir MAGs/ -x fa --out_dir gtdbtk-output-dir --skip_ * `classify_wf` – specifies the workflow being utilized * `--genome_dir` – specifies the directory holding the MAGs generated in step 14c * `-x` – specifies the extension that is on the MAG fasta files that are being taxonomically classified -* `-out_dir` – specifies the output directory -* `-skip_ani_screen` - specifies to skip ani_screening step to classify genomes using mash and skani +* `--out_dir` – specifies the output directory +* `--skip_ani_screen` - specifies to skip ani_screening step to classify genomes using mash and skani **Input data:** From 42b723a90d82acbae67ef87b2d8b8c9a60fddbb4 Mon Sep 17 00:00:00 2001 From: Barbara Novak <19824106+bnovak32@users.noreply.github.com> Date: Thu, 5 Sep 2024 11:25:01 -0700 Subject: [PATCH 24/48] Update GL-DPPD-7107-A.md Changed text in "updates" section to more clearly explain the change in step 14d and fixed the capitalization on the GTDB-Tk software package. --- .../Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Metagenomics/Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md b/Metagenomics/Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md index 56f262eb..59cb2f3c 100644 --- a/Metagenomics/Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md +++ b/Metagenomics/Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md @@ -28,10 +28,10 @@ Lauren Sanders (OSDR Project Scientist) - bowtie2 - samtools - CAT - - gtdbtk + - GTDB-Tk - HUMAnN3 - MetaPhIAn3 -- In [step 14d](#14d-mag-taxonomic-classification), MAG taxonomic classification, gtdbtk classify_wf now takes a new argument "--skip_ani_screen" which specifies to skip the ani_screening step to classify genomes using mash and skani. +- In [step 14d](#14d-mag-taxonomic-classification), MAG taxonomic classification, added the new `--skip_ani_screen` argument to `gtdbtk classify_wf` to continue classifying genomes as in previous versions of GTDB-Tk. 
--- @@ -77,7 +77,7 @@ Lauren Sanders (OSDR Project Scientist) |CAT| 5.2.3 |[https://github.com/dutilh/CAT#cat-and-bat](https://github.com/dutilh/CAT#cat-and-bat)| |Metabat2| 2.15 |[https://bitbucket.org/berkeleylab/metabat/src/master/](https://bitbucket.org/berkeleylab/metabat/src/master/)| |checkm| 1.1.3 |[https://github.com/Ecogenomics/CheckM](https://github.com/Ecogenomics/CheckM)| -|gtdbtk| 2.4.0 |[https://github.com/Ecogenomics/GTDBTk](https://github.com/Ecogenomics/GTDBTk)| +|GTDB-Tk| 2.4.0 |[https://github.com/Ecogenomics/GTDBTk](https://github.com/Ecogenomics/GTDBTk)| |KEGGDecoder| 1.2.2 |[https://github.com/bjtully/BioData/tree/master/KEGGDecoder#kegg-decoder](https://github.com/bjtully/BioData/tree/master/KEGGDecoder#kegg-decoder) |HUMAnN3| 3.9 |[https://huttenhower.sph.harvard.edu/humann3/](https://huttenhower.sph.harvard.edu/humann3/)| |MetaPhlAn3| 4.1.0 |[https://github.com/biobakery/MetaPhlAn/tree/3.0](https://github.com/biobakery/MetaPhlAn/tree/3.0)| From 9a232e669f18b1850641642436ec566313277aa5 Mon Sep 17 00:00:00 2001 From: asaravia-butler <70983120+asaravia-butler@users.noreply.github.com> Date: Fri, 6 Sep 2024 11:48:45 -0700 Subject: [PATCH 25/48] Updating change description --- .../Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Metagenomics/Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md b/Metagenomics/Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md index 59cb2f3c..566310d3 100644 --- a/Metagenomics/Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md +++ b/Metagenomics/Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md @@ -31,7 +31,7 @@ Lauren Sanders (OSDR Project Scientist) - GTDB-Tk - HUMAnN3 - MetaPhIAn3 -- In [step 14d](#14d-mag-taxonomic-classification), MAG taxonomic classification, added the new `--skip_ani_screen` argument to `gtdbtk classify_wf` to continue classifying genomes as in previous versions of GTDB-Tk. +- In [step 14d](#14d-mag-taxonomic-classification), MAG taxonomic classification, added the new `--skip_ani_screen` argument to `gtdbtk classify_wf` to continue classifying genomes as in previous versions of GTDB-Tk, using mash and skani. 
--- From 9dfa6fa985d8c8f8b96a78da4f8d1c7b22712a56 Mon Sep 17 00:00:00 2001 From: olabiyi Date: Tue, 10 Sep 2024 08:51:10 -0700 Subject: [PATCH 26/48] Added post-processing workflow --- .../NF_MGIllumina/README.md | 25 +- ...L-gen-metagenomics-file-associations-table | 641 ++++++++++++++ .../bin/GL-gen-processed-metagenomics-readme | 267 ++++++ .../GL-validate-processed-metagenomics-data | 800 ++++++++++++++++++ .../workflow_code/bin/clean-paths.sh | 2 +- .../NF_MGIllumina/workflow_code/main.nf | 2 +- .../workflow_code/modules/genelab.nf | 283 +++++++ .../workflow_code/modules/summarize_MAG.nf | 2 +- .../workflow_code/nextflow.config | 2 +- .../workflow_code/post_processing.config | 165 ++++ .../workflow_code/post_processing.nf | 323 +++++++ 11 files changed, 2507 insertions(+), 5 deletions(-) create mode 100755 Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/GL-gen-metagenomics-file-associations-table create mode 100755 Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/GL-gen-processed-metagenomics-readme create mode 100755 Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/GL-validate-processed-metagenomics-data create mode 100644 Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/genelab.nf create mode 100644 Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/post_processing.config create mode 100644 Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/post_processing.nf diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/README.md b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/README.md index 9a405619..4a139c71 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/README.md +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/README.md @@ -29,6 +29,8 @@ The current GeneLab Illumina metagenomics sequencing data processing pipeline (M 5a. [Main outputs](#5a-main-outputs) 5b. [Resource logs](#5b-resource-logs) +6. [Post Processing](#6-post-processing) +
--- @@ -140,7 +142,8 @@ nextflow run main.nf -resume -profile conda --csv_file SE_file.csv --conda.qc

Further details about these logs can also be found within [this Nextflow documentation page](https://www.nextflow.io/docs/latest/tracing.html#execution-report).
+
+
+
+---
+
+### 6. Post Processing
+
+For options and detailed help on how to run the post-processing workflow, run the following command:
+
+```bash
+nextflow run post_processing.nf --help
+```
+
+To generate a README file, a protocols file, an md5sums table, and a file association table after running the processing workflow successfully, modify and set the parameters in [post_processing.config](workflow_code/post_processing.config), then run the following command:
+
+```bash
+nextflow -C post_processing.config run post_processing.nf -resume -profile slurm,singularity
+```
+
+The outputs of the run will be in a directory called `Post_Processing` by default.
diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/GL-gen-metagenomics-file-associations-table b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/GL-gen-metagenomics-file-associations-table
new file mode 100755
index 00000000..f3d93281
--- /dev/null
+++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/GL-gen-metagenomics-file-associations-table
@@ -0,0 +1,641 @@
+#!/usr/bin/env python
+
+"""
+This is a program for generating the file-associations table needed by Curation for newly processed metagenomics datasets.
+"""
+
+import os
+import sys
+import argparse
+import textwrap
+import pandas as pd
+import zipfile
+import re
+
+parser = argparse.ArgumentParser(description = "This program generates the file-associations table needed by Curation for \
+                                 newly processed metagenomics datasets. It is intended to be run after `GL-validate-processed-data` \
+                                 has been run successfully.")
+required = parser.add_argument_group('required arguments')
+required.add_argument("-g", "--GLDS-ID", help = 'GLDS ID (e.g. "GLDS-276")', action = "store", required = True)
+parser.add_argument("-i", "--isa-zip", help = "Appropriate ISA file for dataset (a zip archive, providing this will assume there is only one a_*metagenomic* assay table in there, \
+                    if that's not the case, explicitly provide the assay table to the '-a' argument instead)",
+                    action = "store", default = "")
+parser.add_argument("--assay-table",
+                    help = 'Appropriate assay table for dataset (this can be provided directly instead of being pulled from an ISA object)',
+                    action = "store", default = "")
+parser.add_argument("--runsheet",
+                    help = """
+                           Input csv runsheet file used to run nextflow. This argument must be set when running the workflow with an OSD/GLDS accession as input as opposed to passing an input csv file.
+                           This argument is used to get raw input file names that are used to retrieve raw read depths per sample.
+ """, + action = "store", default = "") + +parser.add_argument("--output", + help = 'Name of output log file (default: "_[]-associated-file-names.tsv", with appended prefix if one is provided)', + default = "", action = "store") +parser.add_argument("-p", "--output-prefix", help = "Output additional file prefix if there is one", action = "store", default = "") +parser.add_argument("--additional-string-to-remove-from-unique-filenames", + help = "If there is any additional text to remove from unqiue filenames, it can be provided here.", + action = "store") +parser.add_argument("--assay_suffix", help = "Genelab assay suffix", action = "store", default = "_GLmetagenomics") +parser.add_argument("--raw_file_prefix", help = "Prefix to be added to the raw data files alone (Default: _metagenomics_)", action = "store", default ="") +parser.add_argument("--file_prefix", help = "Prefix to be added to all files except the raw files (Default: _GLmetagenomics_)", action = "store", default ="") +parser.add_argument("--raw_suffix", help = "Raw reads suffix", action = "store", default ="_HRremoved_raw.fastq.gz") +parser.add_argument("--raw_R1_suffix", help = "Raw forward reads suffix", action = "store", default = "_R1_HRremoved_raw.fastq.gz") +parser.add_argument("--raw_R2_suffix", help = "Raw reverse reads suffix", action = "store", default = "_R2_HRremoved_raw.fastq.gz") +parser.add_argument("--filtered_suffix", help = "Filtered reads suffix", action = "store", default = "_filtered.fastq.gz") +parser.add_argument("--filtered_R1_suffix", help = "Filtered forward reads suffix", action = "store", default = "_R1_filtered.fastq.gz") +parser.add_argument("--filtered_R2_suffix", help = "Filtered reverse reads suffix", action = "store", default = "_R2_filtered.fastq.gz") +parser.add_argument("--processing_zip_file", help = "Specifies the name of processing_info.zip", + action = "store", default = "processing_info.zip") +parser.add_argument("--readme", help = "Specifies the name of README.txt", + action = "store", default = "README.txt") +parser.add_argument("--raw_reads_dir", help = "Specifies the name of the raw reads directory if they are to be included", + action = "store", default = "Raw_Sequence_Data/") +parser.add_argument("--fastqc_dir", help = "Specifies the location of fastqc and multiqc reports directory", + action = "store", default = "FastQC_Outputs/") +parser.add_argument("--filtered_reads_dir", help = "Specifies the name of the filtered reads directory", + action = "store", default = "Filtered_Sequence_Data/") +parser.add_argument("--read_based_dir", help = "Specifies the location of the directory containing results generated from read-based processing approach", + action = "store", default = "Read-based_Processing/") +parser.add_argument("--assembly_based_dir", help = "Specifies the location of the directory containing results generated from assembly-based approach", + action = "store", default = "Assembly-based_Processing/") +parser.add_argument("--assemblies_dir", help = "Specifies the location of the directory containing sample contig assemblies", + action = "store", default = "Assembly-based_Processing/assemblies/") +parser.add_argument("--genes_dir", help = "Specifies the location of the directory containing predicted genes", + action = "store", default = "Assembly-based_Processing/predicted-genes/") +parser.add_argument("--annotations_and_tax_dir", help = "Specifies the location of the directory containing contigs annotation and taxonomy", + action = "store", default = 
"Assembly-based_Processing/annotations-and-taxonomy/") +parser.add_argument("--mapping_dir", help = "Specifies the location of the directory containing per-sample bam, coverage, and mapping info files", + action = "store", default = "Assembly-based_Processing/read-mapping/") +parser.add_argument("--bins_dir", help = "Specifies the location of the directory containing recovered genome bins", + action = "store", default = "Assembly-based_Processing/bins/") +parser.add_argument("--MAGs_dir", help = "Specifies the location of the directory containing meta-assembled genomes (MAGs)", + action = "store", default = "Assembly-based_Processing/MAGs/") +parser.add_argument("--combined_output_dir", help = "Specifies the location of the directory containing contig annotation summary outputs with all samples combined", + action = "store", default = "Assembly-based_Processing/combined-outputs/") +parser.add_argument("--single-ended", help = "Add this flag if data are single-end sequencing.", action = "store_true") +parser.add_argument("--R1-used-as-single-ended-data", help = "Provide this flag if processing only R1 reads as single-end (as the expected raw \ + filename suffixes will have 'R1' in there)", + action = "store_true") +parser.add_argument("--include-raw-multiqc-in-output", + help = "Provide this flag if wanting to include the raw multiqc zip in the file-associations output table (may be wanted for older datasets)", + action = "store_true") +parser.add_argument("--use-sample-names-from-assay-table", + help = "Provide this flag if the unique filename strings in the processed outputs are taken directly from the \ + 'Sample Name' column of the input assay table.", action = "store_true") + + +if len(sys.argv)==1: + parser.print_help(sys.stderr) + sys.exit(0) + +args = parser.parse_args() + + +# Setting some colors +tty_colors = { + 'green' : '\033[0;32m%s\033[0m', + 'yellow' : '\033[0;33m%s\033[0m', + 'red' : '\033[0;31m%s\033[0m' +} + + +######################### Aesthetic functions ######################### +def color_text(text, color='green'): + if sys.stdout.isatty(): + return tty_colors[color] % text + else: + return text + + +def wprint(text): + """ Print wrapper """ + + print(textwrap.fill(text, width=80, initial_indent=" ", + subsequent_indent=" ", break_on_hyphens=False)) + + +def modify_symbolic_link(file_path): + """ Modify symbolic link such that it retruns a string containing the parent dir and the base name""" + full_path = os.path.realpath(file_path) + parent_dir = os.path.basename(os.path.dirname(full_path)) + base_name = os.path.basename(full_path) + mod_path = os.path.join(parent_dir, base_name) + return(mod_path.replace("_", " ").rstrip("/")) + + +#################### End of Aesthetic functions ######################### + + +def report_failure(message, color = "yellow"): + print("") + wprint(color_text(message, color)) + print("\nCuration file-associations table generation failed.\n") + sys.exit(1) + + +def preflight_checks(isa_zip, assay_table): + """Check that either one of isa_zip or assay_table is passed as argument""" + # Ensure that at least one of isa_zip or assay_table is passed as argument + if isa_zip == "" and assay_table == "": + report_failure("This program requires either an input ISA object (passed to '-i') or a specific assay table (passed to '-a').") + # Ensure that only one of isa_zip or assay_table is passed as argument + if isa_zip != "" and assay_table != "": + report_failure("This program requires *only* an input ISA object (passed to '-i') or a specific 
assay table (passed to '-a'), not both.") + + +def check_for_file_and_contents(file_path): + """Checks if file exists and that it is not empty""" + if not os.path.exists(file_path): + report_failure("The expected file '" + str(file_path) + "' does not exist.") + if not os.path.getsize(file_path) > 0: + report_failure("The file '" + str(file_path) + "' is empty.") + + +def get_assay_table_from_ISA(isa_zip): + """ Tries to find a single assay table in an isa object """ + + zip_file = zipfile.ZipFile(isa_zip) + isa_files = zip_file.namelist() + + # Getting wanted filename (those that start with "a_" and contains the word "metagenomic" seem to be what we want) + wanted_file_list = [item for item in isa_files if item.startswith("a_") and item.find("metagenomic") != -1] + if len(wanted_file_list) != 1: + report_failure("We couldn't find the correct assay table in the ISA object, consider passing it directly to the '-a' argument.") + + wanted_file = wanted_file_list[0] + + df = pd.read_csv(zip_file.open(wanted_file), sep = "\t") + + return(df) + + +def get_assay_table(isa_zip, assay_table): + """ Returns the assay table whether provided directly or pulled from ISA archive """ + + # Get assay table if we are using an input isa object + if isa_zip != "": + # Check if ISA exists and thet it isn't empty + check_for_file_and_contents(isa_zip) + + assay_table_df = get_assay_table_from_ISA(isa_zip) + + # Reading assay table if provided directly + else: + # Check if assay_table exists and that it isn't empty + check_for_file_and_contents(assay_table) + assay_table_df = pd.read_csv(assay_table, sep = "\t") + + return(assay_table_df) + + +def remove_suffixes(name, raw_file_prefix, raw_R1_suffix, raw_R2_suffix, raw_suffix): + """ This removes expected prefixes and suffixes """ + + # Removing expected prefix + curr_name = name.replace(raw_file_prefix, "") + + # Removing potential suffixes (also checking R2 in case they are not + # in the appropriate order in the sample table, e.g. R2 before R1) + curr_name = curr_name.replace(raw_R1_suffix, "") + curr_name = curr_name.replace(raw_R2_suffix, "") + curr_name = curr_name.replace(raw_suffix, "") + + return(curr_name) + +def get_sample_names_and_unique_filenames(assay_table, raw_file_prefix, raw_R1_suffix, + raw_R2_suffix, raw_suffix, + use_sample_names_from_assay_table, + additional_string_to_remove_from_unique_filenames): + """ + This gets the sample names ('Sample Name' column) from the assay table, + and tries to get what would have been the unique filename prefixes generated from + what's in the Raw Data File column of the assay table. + + Unless the --use-sample-names-from-assay-table flag was provided, then it just uses what's + in the 'Sample Name' column. 
+ """ + + sample_names = assay_table["Sample Name"].tolist() + + if use_sample_names_from_assay_table: + unique_filename_prefixes = sample_names + return(sample_names, unique_filename_prefixes) + + all_filenames = assay_table["Raw Data File"] + + unique_filename_prefixes = [] + + # Attempting to split if they have multiple files (like paired-end) + # and also removing the common prefixes and suffixes intending to create the same + # unique filenames used for processing + + for entry in all_filenames: + + # splitting if there are more than one (like with paired-end) + curr_name = entry.split(",")[0] + + curr_name = remove_suffixes(curr_name, raw_file_prefix, raw_R1_suffix, raw_R2_suffix, raw_suffix) + + unique_filename_prefixes.append(curr_name) + + + if additional_string_to_remove_from_unique_filenames: + + unique_filename_prefixes = [x.replace(additional_string_to_remove_from_unique_filenames, "") for x in unique_filename_prefixes] + + return(sample_names, unique_filename_prefixes) + + +def get_read_counts_from_raw_multiqc(raw_multiqc_stats_file_path, + fastqc_dir, output_prefix, raw_multiqc_zip): + + input_zip = os.path.join(fastqc_dir, output_prefix + raw_multiqc_zip) + zip_file = zipfile.ZipFile(input_zip) + df = pd.read_csv(zip_file.open(raw_multiqc_stats_file_path), sep = "\t", usecols = [0,6]) + df.columns = ["sample", "counts"] + df.set_index("sample", inplace = True) + + return(df) + + +def get_read_count_from_df(sample_name, read_counts_tab, + raw_suffix, raw_R1_suffix, single_ended, sample_raw_prefix_dict): + + if sample_raw_prefix_dict != "": + return(round(read_counts_tab.at[sample_raw_prefix_dict[sample_name], "counts"])) + + if single_ended: + return(round(read_counts_tab.at[str(sample_name) + \ + raw_suffix.replace("_raw.fastq.gz", ""), "counts"])) + else: + return(round(read_counts_tab.at[str(sample_name) + \ + raw_R1_suffix.replace("_raw.fastq.gz", ""), "counts"])) + + + +def write_colnames(raw_reads_dir, filtered_reads_dir, fastqc_dir, + assembly_based_dir, assemblies_dir, genes_dir, + annotations_and_tax_dir, mapping_dir, bins_dir, + MAGs_dir, combined_output_dir, read_based_dir): + """ Function to write the required column names""" + + # Get Parent directory and tag it unto the base directory name to confirm with what we expect as column headers + # for sundirectory outputs + assemblies_dir = modify_symbolic_link(assemblies_dir) if os.path.islink(assemblies_dir) else assemblies_dir.replace("_", " ").rstrip("/") + genes_dir = modify_symbolic_link(genes_dir) if os.path.islink(genes_dir) else genes_dir.replace("_", " ").rstrip("/") + annotations_and_tax_dir = modify_symbolic_link(annotations_and_tax_dir) if os.path.islink(annotations_and_tax_dir) else annotations_and_tax_dir.replace("_", " ").rstrip("/") + mapping_dir = modify_symbolic_link(mapping_dir) if os.path.islink(mapping_dir) else mapping_dir.replace("_", " ").rstrip("/") + bins_dir = modify_symbolic_link(bins_dir) if os.path.islink(bins_dir) else bins_dir.replace("_", " ").rstrip("/") + MAGs_dir = modify_symbolic_link(MAGs_dir) if os.path.islink(MAGs_dir) else MAGs_dir.replace("_", " ").rstrip("/") + combined_output_dir = modify_symbolic_link(combined_output_dir) if os.path.islink(combined_output_dir) else combined_output_dir.replace("_", " ").rstrip("/") + + + colnames = ["Sample Name", + "Parameter Value[README]", + f"Parameter Value[{raw_reads_dir}]", + "Parameter Value[Read Depth]", + "Unit", + f"Parameter Value[{filtered_reads_dir}]", + f"Parameter Value[{fastqc_dir}]", + f"Parameter 
Value[{assembly_based_dir}]", + f"Parameter Value[{assemblies_dir}]", + f"Parameter Value[{genes_dir}]", + f"Parameter Value[{annotations_and_tax_dir}]", + f"Parameter Value[{mapping_dir}]", + f"Parameter Value[{bins_dir}]", + f"Parameter Value[{MAGs_dir}]", + f"Parameter Value[{combined_output_dir}]", + f"Parameter Value[{read_based_dir}]", + "Parameter Value[Processing Info]"] + return colnames + + + +def create_constants(include_raw_multiqc_in_output, raw_multiqc_zip, + filtered_multiqc_zip, combined_prefix, assay_suffix): + """A function to create lists of contants to be in creating a file association table""" + if include_raw_multiqc_in_output: + fastqc = [combined_prefix + raw_multiqc_zip, + combined_prefix + filtered_multiqc_zip] + else: + fastqc = [combined_prefix + filtered_multiqc_zip] + + combined_outputs = [combined_prefix + f"Combined-gene-level-KO-function-coverages{assay_suffix}.tsv", + combined_prefix + f"Combined-gene-level-KO-function-coverages-CPM{assay_suffix}.tsv", + combined_prefix + f"Combined-gene-level-taxonomy-coverages{assay_suffix}.tsv", + combined_prefix + f"Combined-gene-level-taxonomy-coverages-CPM{assay_suffix}.tsv", + combined_prefix + f"Combined-contig-level-taxonomy-coverages{assay_suffix}.tsv", + combined_prefix + f"Combined-contig-level-taxonomy-coverages-CPM{assay_suffix}.tsv"] + + read_based_outputs = [combined_prefix + f"Gene-families{assay_suffix}.tsv", + combined_prefix + f"Gene-families-grouped-by-taxa{assay_suffix}.tsv", + combined_prefix + f"Gene-families-cpm{assay_suffix}.tsv", + combined_prefix + f"Gene-families-KO-cpm{assay_suffix}.tsv", + combined_prefix + f"Pathway-abundances{assay_suffix}.tsv", + combined_prefix + f"Pathway-abundances-grouped-by-taxa{assay_suffix}.tsv", + combined_prefix + f"Pathway-abundances-cpm{assay_suffix}.tsv", + combined_prefix + f"Pathway-coverages{assay_suffix}.tsv", + combined_prefix + f"Pathway-coverages-grouped-by-taxa{assay_suffix}.tsv", + combined_prefix + f"Metaphlan-taxonomy{assay_suffix}.tsv"] + + return fastqc, combined_outputs, read_based_outputs + +def rusheet_to_dict(runsheet): + """ Reads the input nextflow runsheet into a dataframe and converts it to + a dictionary with sample names as keys and raw reads forward prefix used + by multiqc as values + """ + def get_prefix(string): + basename = os.path.basename(string) + index = basename.rfind("_") + return(basename[0:index]) + df = pd.read_csv(runsheet, usecols=["sample_id", "forward"]) + df['forward'] = df.forward.apply(lambda row : get_prefix(row)) + sample_to_prefix_dict = {k:v['forward'] for k,v in df.set_index("sample_id").T.to_dict().items()} + return(sample_to_prefix_dict) + +def create_association_table(header_colnames, assembly_overview_tab, fastqc, combined_outputs, read_based_outputs, + unique_filename_prefixes, read_count_tab, sample_file_dict, file_prefix, combined_prefix, + readme, assay_suffix, raw_file_prefix, raw_suffix, raw_R1_suffix, raw_R2_suffix, + filtered_suffix, filtered_R1_suffix, filtered_R2_suffix, processing_info, + single_ended, R1_used_as_single_ended_data, assemblies_dir, assembly_suffix, assembly_files, + genes_dir, mapping_dir, bins_overview, bins_dir_files, MAGs_overview, MAGs_dir_files, + MAG_KO_files_list, sample_raw_prefix_dict, read_count_unit = "read"): + """Create association table and add data rows to it""" + + # Initialize association table + association_df = pd.DataFrame(columns = header_colnames) + filtered_reads_count = combined_prefix + f"filtered-read-counts{assay_suffix}.tsv" + # Create row + for sample in 
unique_filename_prefixes: + if sample_raw_prefix_dict != "": + # This expects that sample input fastq files will always end with + # HRremoved. if they don't, then you have to modify these hard coded strings. + if (single_ended and R1_used_as_single_ended_data) or (not single_ended): + raw_sample_name = re.sub("_R1_HRremoved$","", sample_raw_prefix_dict[sample]) + else: + raw_sample_name = re.sub("_HRremoved$","", sample_raw_prefix_dict[sample]) + # Single-end (Paired-end data where only the forward reads were analyzed) + if single_ended and R1_used_as_single_ended_data: + # If only forward read was used, still want to include both foward and reverse read names + # in the "Raw Data" column because it is tied to the hosted raw data, not just what was used here + curr_raw_data = [raw_file_prefix + sample + raw_R1_suffix, + raw_file_prefix + sample + raw_R2_suffix] + + if sample_raw_prefix_dict != "": + curr_raw_data = [raw_sample_name + raw_R1_suffix, raw_sample_name + raw_R2_suffix] + + curr_filt_data = [file_prefix + sample + filtered_R1_suffix, + filtered_reads_count] + # Single-end without reverse reads + elif single_ended: + curr_raw_data = [raw_file_prefix + sample + raw_suffix] + + if sample_raw_prefix_dict != "": + curr_raw_data = [raw_sample_name + raw_suffix] + + + curr_filt_data = [file_prefix + sample + filtered_suffix, + filtered_reads_count] + # Paired-end + else: + curr_raw_data = [raw_file_prefix + sample + raw_R1_suffix, + raw_file_prefix + sample + raw_R2_suffix] + + if sample_raw_prefix_dict != "": + curr_raw_data = [raw_sample_name + raw_R1_suffix, raw_sample_name + raw_R2_suffix] + + curr_filt_data = [file_prefix + sample + filtered_R1_suffix, + file_prefix + sample + filtered_R2_suffix, + filtered_reads_count] + # Get sample raw read count + curr_read_count = get_read_count_from_df(sample, read_count_tab, raw_suffix, + raw_R1_suffix, single_ended, sample_raw_prefix_dict) + + read_count_tab['samples'] = read_count_tab.index + contains_sample = read_count_tab['samples'].str.contains + # Only adding file to list if it exists and isn't empty (easier for curation this way) + curr_path = os.path.join(assemblies_dir, sample + assembly_suffix) + + if os.path.exists(curr_path) and os.path.getsize(curr_path) > 0: + curr_assembly = [file_prefix + sample + assembly_suffix] + assembly_files + else: + curr_assembly = [""] + + # Only adding file to list if it exists and isn't empty (easier for curation this way) + curr_genes = [] + gene_suffixes = ["-genes.faa", "-genes.fasta", "-genes.gff"] + for ext in gene_suffixes: + curr_path = os.path.join(genes_dir, sample + ext) + + if os.path.exists(curr_path) and os.path.getsize(curr_path) > 0: + curr_genes.append(file_prefix + sample + ext) + + # Adding empty value if all 3 missing (which i don't think happens as the gff has content either way) + if len(curr_genes) == 0: + curr_genes = [""] + + # These have headers even if no data for a sample, so no complications about being empty + curr_annots = [file_prefix + sample + "-gene-coverage-annotation-and-tax.tsv", + file_prefix + sample + "-contig-coverage-and-tax.tsv"] + + # Only adding file to list if it exists and isn't empty (easier for curation this way) + curr_read_mapping = [] + mapping_suffixes = [".bam", "-mapping-info.txt", "-metabat-assembly-depth.tsv"] + for ext in mapping_suffixes: + curr_path = os.path.join(mapping_dir, sample + ext) + + if os.path.exists(curr_path) and os.path.getsize(curr_path) > 0: + curr_read_mapping.append(file_prefix + sample + ext) + + # Adding empty value 
if all 3 missing + if len(curr_read_mapping) == 0: + curr_read_mapping = [""] + + if bins_overview[0] == f"{combined_prefix}bins-overview{assay_suffix}.tsv": + curr_bins = bins_overview + [file_prefix + file for file in bins_dir_files if file.startswith(sample)] + else: + curr_bins = [""] + + if MAGs_overview[0] == f"{combined_prefix}MAGs-overview{assay_suffix}.tsv": + curr_MAGs = MAGs_overview + [file_prefix + file for file in MAGs_dir_files if file.startswith(sample)] + MAG_KO_files_list + else: + curr_MAGs = [""] + + curr_row_as_list = [sample_file_dict[sample], + readme, + ", ".join(curr_raw_data), + curr_read_count, + read_count_unit, + ", ".join(curr_filt_data), + ", ".join(fastqc), + assembly_overview_tab, + ", ".join(curr_assembly), + ", ".join(curr_genes), + ", ".join(curr_annots), + ", ".join(curr_read_mapping), + ", ".join(curr_bins), + ", ".join(curr_MAGs), + ", ".join(combined_outputs), + ", ".join(read_based_outputs), + processing_info] + + # Append row to the association dataframe + association_df.loc[len(association_df)] = curr_row_as_list + + return association_df + + + +def write_association_table(outfile, association_df): + """Write to csv file""" + # Writing out + association_df.to_csv(outfile, sep = "\t", index = False) + + +def main(): + + ### Set variables ### + # Directories + fastqc_dir = str(args.fastqc_dir) + raw_reads_dir = str(args.raw_reads_dir) + filtered_reads_dir = str(args.filtered_reads_dir) + assembly_based_dir = str(args.assembly_based_dir) + assemblies_dir = str(args.assemblies_dir) + genes_dir = str(args.genes_dir) + annotations_and_tax_dir = str(args.annotations_and_tax_dir) + mapping_dir = str(args.mapping_dir) + bins_dir = str(args.bins_dir) + MAGs_dir = str(args.MAGs_dir) + combined_output_dir = str(args.combined_output_dir) + read_based_dir = str(args.read_based_dir) + + raw_reads_dir = raw_reads_dir.replace("_", " ").rstrip("/") + filtered_reads_dir = filtered_reads_dir.replace("_", " ").rstrip("/") + assembly_based_dir = assembly_based_dir.replace("_", " ").rstrip("/") + annotations_and_tax_dir = annotations_and_tax_dir.replace("_", " ").rstrip("/") + combined_output_dir = combined_output_dir.replace("_", " ").rstrip("/") + read_based_dir = read_based_dir.replace("_", " ").rstrip("/") + fastqc_dir = fastqc_dir.replace("_", " ").rstrip("/") + + + # Suffixes + filtered_suffix = str(args.filtered_suffix) + filtered_R1_suffix = str(args.filtered_R1_suffix) + filtered_R2_suffix = str(args.filtered_R2_suffix) + raw_suffix = str(args.raw_suffix) + raw_R1_suffix = str(args.raw_R1_suffix) + raw_R2_suffix = str(args.raw_R2_suffix) + if args.R1_used_as_single_ended_data: + raw_suffix = raw_R1_suffix + # Just in case user only specified --R1-used-as-single-ended, but didn't specify --single-ended + args.single_ended = True + assay_suffix = str(args.assay_suffix) + assembly_suffix = "-assembly.fasta" + + # This one is only used for the raw data files + raw_file_prefix = f"{args.GLDS_ID}_metagenomics_" if args.raw_file_prefix == "" else str(args.raw_file_prefix) + file_prefix = f"{args.GLDS_ID}_GMetagenomics_" if args.file_prefix == "" else str(args.file_prefix) + raw_multiqc_zip = f"raw_multiqc{assay_suffix}_report.zip" + filtered_multiqc_zip = f"filtered_multiqc{assay_suffix}_report.zip" + output_prefix = str(args.output_prefix) + combined_prefix = file_prefix + output_prefix + raw_multiqc_stats_file_path = output_prefix + f"raw_multiqc_report.zip".split(".")[0] + \ + f"/{output_prefix}raw_multiqc_data/multiqc_general_stats.txt" + processing_info = 
combined_prefix + str(args.processing_zip_file) + + assembly_overview_tab = f"{combined_prefix}Assembly-based-processing-overview{assay_suffix}.tsv" + failed_assemblies = os.path.join(args.assemblies_dir, f"{output_prefix}Failed-assemblies{assay_suffix}.tsv") + bins_overview_tab = os.path.join(args.bins_dir, f"{output_prefix}bins-overview{assay_suffix}.tsv") + mags_overview_tab = os.path.join(args.MAGs_dir,f"{output_prefix}MAGs-overview{assay_suffix}.tsv") + mag_ko_annotation = os.path.join(args.MAGs_dir, f"{output_prefix}MAG-level-KO-annotations{assay_suffix}.tsv") + mag_kegg_tab = os.path.join(args.MAGs_dir, f"{output_prefix}MAG-KEGG-Decoder-out{assay_suffix}.tsv") + mag_kegg_html = os.path.join(args.MAGs_dir, f"{output_prefix}MAG-KEGG-Decoder-out{assay_suffix}.html") + + + # Set output file name + if args.output == "" and output_prefix != "": + outfile = f"{args.GLDS_ID}_{output_prefix}-associated-file-names.tsv" + elif args.output == "": + outfile = f"{args.GLDS_ID}-associated-file-names.tsv" + else: + outfile = args.output + + readme = combined_prefix + args.readme + include_raw_multiqc_in_output = str(args.include_raw_multiqc_in_output) + + + if os.path.exists(failed_assemblies): + assembly_files = [f"{combined_prefix}assembly-summaries{assay_suffix}.tsv", + f"{combined_prefix}Failed-assemblies{assay_suffix}.tsv"] + else: + assembly_files = [f"{combined_prefix}assembly-summaries{assay_suffix}.tsv"] + + if os.path.exists(bins_overview_tab): + bins_overview = [f"{combined_prefix}bins-overview{assay_suffix}.tsv"] + bins_dir_files = [file for file in os.listdir(args.bins_dir) if file.endswith(".fasta")] + else: + bins_overview = [""] + bins_dir_files = [""] + + if os.path.exists(mags_overview_tab): + MAGs_overview = [f"{combined_prefix}MAGs-overview{assay_suffix}.tsv"] + MAGs_dir_files = [file for file in os.listdir(args.MAGs_dir) if file.endswith(".fasta")] + else: + MAGs_overview = [""] + MAGs_dir_files = [""] + + MAG_KO_files_list = [] + if os.path.exists(mag_ko_annotation): + MAG_KO_files_list.append(f"{combined_prefix}MAG-level-KO-annotations{assay_suffix}.tsv") + if os.path.exists(mag_kegg_tab): + MAG_KO_files_list.append(f"{combined_prefix}MAG-KEGG-Decoder-out{assay_suffix}.tsv") + if os.path.exists(mag_kegg_html): + MAG_KO_files_list.append(f"{combined_prefix}MAG-KEGG-Decoder-out{assay_suffix}.html") + + + # Check that either of ISA zip or assay table is passed as argument + preflight_checks(args.isa_zip, args.assay_table) + + assay_table = get_assay_table(args.isa_zip, args.assay_table) + + sample_names, unique_filename_prefixes = get_sample_names_and_unique_filenames(assay_table, raw_file_prefix, raw_R1_suffix, + raw_R2_suffix, raw_suffix, + args.use_sample_names_from_assay_table, + args.additional_string_to_remove_from_unique_filenames) + + sample_file_dict = dict(zip(unique_filename_prefixes, sample_names)) + + read_counts_df = get_read_counts_from_raw_multiqc(raw_multiqc_stats_file_path, args.fastqc_dir, + output_prefix, raw_multiqc_zip) + + ################################### Write file association table ########################################## + header = write_colnames(raw_reads_dir, filtered_reads_dir, fastqc_dir, + assembly_based_dir, args.assemblies_dir, args.genes_dir, + args.annotations_and_tax_dir, args.mapping_dir, args.bins_dir, + args.MAGs_dir, args.combined_output_dir, read_based_dir) + + fastqc, combined_outputs, read_based_outputs = create_constants(include_raw_multiqc_in_output, raw_multiqc_zip, + filtered_multiqc_zip, combined_prefix, assay_suffix) + + + # 
Retrieve a dictionary with sample names as keys and the raw fastq file prefix as values + sample_raw_prefix_dict = rusheet_to_dict(args.runsheet) if args.runsheet != "" else "" + + association_df = create_association_table(header, assembly_overview_tab, fastqc, combined_outputs, read_based_outputs, + unique_filename_prefixes, read_counts_df, sample_file_dict, file_prefix, + combined_prefix, readme, assay_suffix, raw_file_prefix, raw_suffix, + raw_R1_suffix, raw_R2_suffix, filtered_suffix, filtered_R1_suffix, + filtered_R2_suffix, processing_info, args.single_ended, + args.R1_used_as_single_ended_data, args.assemblies_dir, assembly_suffix, assembly_files, + args.genes_dir, args.mapping_dir, bins_overview, bins_dir_files, MAGs_overview, + MAGs_dir_files, MAG_KO_files_list, sample_raw_prefix_dict, read_count_unit = "read") + + + write_association_table(outfile, association_df) + + +if __name__ == "__main__": + main() diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/GL-gen-processed-metagenomics-readme b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/GL-gen-processed-metagenomics-readme new file mode 100755 index 00000000..9dd6aa2d --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/GL-gen-processed-metagenomics-readme @@ -0,0 +1,267 @@ +#!/usr/bin/env python + +""" +This is a program for generating a README.txt file for GeneLab processed metagenomics datasets. +""" + +import os +import sys +import argparse +import textwrap +import zipfile +import re + + +parser = argparse.ArgumentParser(description = "This program generates the corresponding README file for a GeneLab processed metagenomics dataset. It is intended to \ + be run before running `GL-validate-processed-metagenomics-data` and after processing_info.zip has been created.") + +required = parser.add_argument_group('required arguments') +required.add_argument("-g", "--GLDS-ID", help = 'GLDS ID (e.g. "GLDS-69")', action = "store", required = True) +parser.add_argument("--output", help = 'Name of output file (default: "README.txt", with appended prefix if one is provided)', default = "README.txt") +parser.add_argument("--name", help = 'Name of individual who performed the processing (default: "Michael D. Lee")', default = "Michael D.
Lee") +parser.add_argument("--email", help = 'Email address of individual who performed the processing (default: "Mike.Lee@nasa.gov")', default = "Mike.Lee@nasa.gov") +parser.add_argument("--protocol_ID", help = 'Protocol document ID followed (default: assay dependent)', default = "GL-DPPD-7104-B") +parser.add_argument("-p", "--output-prefix", help = "Output additional file prefix if there is one", action = "store", default = "") +parser.add_argument("--assay_suffix", help = "Genelab assay suffix", action = "store", default = "_GLAmpSeq") +parser.add_argument("--primers-already-trimmed", help = "Add this flag if primers were trimmed prior to GeneLab processing, \ + therefore there are no trimmed sequence data", action = "store_true") +parser.add_argument("--processing_zip_file", help = "Specifies the location of processing_info.zip", + action = "store", default = "processing_info.zip") +parser.add_argument("--raw-reads-dir", help = "Specifies the location of the raw reads directory if they are to be included", action = "store", default = "") +parser.add_argument("--fastqc_dir", help = "Specifies the location of fastqc and multiqc reports directory", + action = "store", default = "FastQC_Outputs/") +parser.add_argument("--filtered_reads_dir", help = "Specifies the location of the filtered reads directory", + action = "store", default = "Filtered_Sequence_Data/") +parser.add_argument("--read_based_dir", help = "Specifies the location of the directory containing the results generated from read-based approach", + action = "store", default = "Read-based_Processing/") +parser.add_argument("--assembly_based_dir", help = "Specifies the location of the directory containing results generated from assembly-based approach", + action = "store", default = "Assembly-based_Processing/") +parser.add_argument("--assemblies_dir", help = "Specifies the location of the directory containing sample contig assemblies", + action = "store", default = "Assembly-based_Processing/assemblies/") +parser.add_argument("--genes_dir", help = "Specifies the location of the directory containing predicted genes", + action = "store", default = "Assembly-based_Processing/predicted-genes/") +parser.add_argument("--annotations_and_tax_dir", help = "Specifies the location of the directory containing contigs annotation and taxonomy", + action = "store", default = "Assembly-based_Processing/annotations-and-taxonomy/") +parser.add_argument("--mapping_dir", help = "Specifies the location of the directory containing per-sample bam, coverage, and mapping info files", + action = "store", default = "Assembly-based_Processing/read-mapping/") +parser.add_argument("--bins_dir", help = "Specifies the location of the directory containing recovered genome bins", + action = "store", default = "Assembly-based_Processing/bins/") +parser.add_argument("--MAGs_dir", help = "Specifies the location of the directory containing metaassebled genomes (MAGs)", + action = "store", default = "Assembly-based_Processing/MAGs/") +parser.add_argument("--combined_output_dir", help = "Specifies the location of the directory containing summary outputs with all samples combined", + action = "store", default = "Assembly-based_Processing/combined-outputs/") + +if len(sys.argv)==1: + parser.print_help(sys.stderr) + sys.exit(0) + +args = parser.parse_args() + +# Setting some colors +tty_colors = { + 'green' : '\033[0;32m%s\033[0m', + 'yellow' : '\033[0;33m%s\033[0m', + 'red' : '\033[0;31m%s\033[0m' +} + + +### Functions ### +def color_text(text, color='green'): + if 
sys.stdout.isatty(): + return tty_colors[color] % text + else: + return text + + +def wprint(text): + """ Print wrapper """ + + print(textwrap.fill(text, width=80, initial_indent=" ", + subsequent_indent=" ", break_on_hyphens=False)) + + +def report_failure(message, color = "yellow"): + print("") + wprint(color_text(message, color)) + print("\nREADME-generation failed.\n") + + sys.exit(1) + + +def check_for_file_and_contents(file_path): + """ Used by get_processing_zip_contents function """ + + if not os.path.exists(file_path): + report_failure("The expected file '" + str(file_path) + "' does not exist.") + if not os.path.getsize(file_path) > 0: + report_failure("The file '" + str(file_path) + "' is empty.") + + +def get_processing_zip_contents(processing_zip_file): + """ This gets the filenames that are in the processing_info.zip to add them to the readme """ + # Check that the zip file exists and that it is not empty + + check_for_file_and_contents(processing_zip_file) + + with zipfile.ZipFile(processing_zip_file) as zip_obj: + + entries = zip_obj.namelist() + entries.sort() + + return(entries) + + +def write_header(output, GLDS_ID, name, email, protocol_ID): + + header = ["################################################################################\n", + "{:<77} {:>0}".format("## This directory holds processed data for NASA " + str(GLDS_ID), "##\n"), + "{:<77} {:>0}".format("## https://genelab-data.ndc.nasa.gov/genelab/accession/" + str(GLDS_ID) + "/", "##\n"), + "{:<77} {:>0}".format("##", "##\n"), + "{:<77} {:>0}".format("## Processed by " + str(name) + " (" + str(email) + ")", "##\n"), + "{:<77} {:>0}".format("## Based on " + str(protocol_ID), "##\n"), + "################################################################################\n\n", + "Summary of contents:\n\n"] + + output.writelines(header) + + + +def write_metagenomics_body(output, output_file, assay_suffix, output_prefix, processing_zip_file, + processing_zip_contents, fastqc_dir, raw_reads_dir, filtered_reads_dir, + read_based_dir, assembly_based_dir, assemblies_dir, mapping_dir, genes_dir, + annotations_and_tax_dir, bins_dir, MAGs_dir, combined_output_dir): + + # this file + output.write(" {:<75} {:>0}".format("- " + str(output_file), "- this file\n\n")) + + # fastqc info + output.write(" {:<75} {:>0}".format("- " + str(fastqc_dir), "- multiQC summary reports of FastQC runs\n\n")) + + # raw reads + if raw_reads_dir != "": + output.write(" {:<75} {:>0}".format("- " + str(raw_reads_dir), "- initial read fastq files\n\n")) + + # quality-filtered reads + output.write(" {:<75} {:>0}".format("- " + str(filtered_reads_dir), "- quality-filtered fastq files\n\n")) + + # outputs + output.write(" {:<75} {:>0}".format("- " + str(assembly_based_dir), "- results generated from assembly-based approach\n\n")) + + output.write(" {:<71} {:>0}".format(f"- {output_prefix}Assembly-based-processing-overview{assay_suffix}.tsv", "- Assembly-based overview per sample\n\n")) + + output.write(" {:<71} {:>0}".format("- " + str(assemblies_dir), "- per-sample assembly files and info\n")) + output.write(" {:<67} {:>0}".format("- *-assembly.fasta", "- fasta files of individual sample assemblies\n")) + output.write(" {:<67} {:>0}".format(f"- {output_prefix}assembly-summaries{assay_suffix}.tsv", "- table of all assemblies' summary statistics\n")) + output.write(" {:<67} {:>0}".format(f"- {output_prefix}Failed-assemblies{assay_suffix}.tsv", "- samples that didn't assemble any contigs (if any)\n\n")) + + output.write(" {:<71} {:>0}".format("- " + 
str(genes_dir), "- per-sample predicted gene files\n")) + output.write(" {:<67} {:>0}".format("- *.faa", "- gene amino-acid sequences\n")) + output.write(" {:<67} {:>0}".format("- *.fasta", "- gene nucleotide sequences\n")) + output.write(" {:<67} {:>0}".format("- *.gff", "- predicted genes in general feature format\n\n")) + + output.write(" {:<71} {:>0}".format("- " + str(annotations_and_tax_dir), "- per-sample Kegg Orthology (KO) annotations, taxonomy, and coverages\n")) + output.write(" {:<67} {:>0}".format("- *-gene-coverage-annotation-tax.tsv", "- tables with gene coverage, annotation, and taxonomy info\n")) + output.write(" {:<67} {:>0}".format("- *-contig-coverage-and-tax.tsv", "- tables with contig coverage and taxonomy info\n\n")) + + output.write(" {:<71} {:>0}".format("- " + str(mapping_dir), "- per-sample bam, coverage, and mapping info files\n")) + output.write(" {:<67} {:>0}".format("- *.bam", "- bam files\n")) + output.write(" {:<67} {:>0}".format("- *.tsv", "- coverage files used for metabat2 binning\n")) + output.write(" {:<67} {:>0}".format("- *.txt", "- stdout from bowtie2 mapping\n\n")) + + if os.path.exists(bins_dir): + output.write(" {:<71} {:>0}".format("- " + str(bins_dir), "- genomic bins recovered (if any)\n")) + output.write(" {:<67} {:>0}".format("- *.fasta", "- fasta files of bins recovered\n")) + output.write(" {:<67} {:>0}".format(f"- {output_prefix}bins-overview{assay_suffix}.tsv", "- summary stats of bins recovered\n\n")) + + if os.path.exists(MAGs_dir): + output.write(" {:<71} {:>0}".format("- " + str(MAGs_dir), "- high-quality Metagenome-Assembled Genomes recovered (if any; > 90% est. comp., < 10% est. redundancy)\n")) + output.write(" {:<67} {:>0}".format("- *.fasta", "- fasta files of MAGs\n")) + output.write(" {:<67} {:>0}".format(f"- {output_prefix}MAGs-overview{assay_suffix}.tsv", "- summary stats of MAGs including GTDB taxonomy\n")) + output.write(" {:<67} {:>0}".format(f"- {output_prefix}MAG-level-KO-annotations{assay_suffix}.tsv", "- KO functional annotations associated with each MAG\n")) + output.write(" {:<67} {:>0}".format(f"- {output_prefix}MAG-KEGG-Decoder*", "- KEGG-Decoder summaries of MAG functional annotations\n\n")) + + + output.write(" {:<71} {:>0}".format("- " + str(combined_output_dir), "- summary outputs with all samples combined\n")) + output.write(" {:<67} {:>0}".format(f"- {output_prefix}Combined-gene-level-KO-function-coverages{assay_suffix}.tsv", "- table of combined KO function coverages\n")) + output.write(" {:<67} {:>0}".format(f"- {output_prefix}Combined-gene-level-KO-function-coverages-CPM{assay_suffix}.tsv", "- table of combined KO function coverages, normalized to coverage per million\n")) + output.write(" {:<67} {:>0}".format(f"- {output_prefix}Combined-gene-level-taxonomy-coverages{assay_suffix}.tsv", "- table of combined, gene-level taxonomy coverages\n")) + output.write(" {:<67} {:>0}".format(f"- {output_prefix}Combined-gene-level-taxonomy-coverages-CPM{assay_suffix}.tsv", "- table of combined, gene-level taxonomy coverages, normalized to coverage per million\n")) + output.write(" {:<67} {:>0}".format(f"- {output_prefix}Combined-contig-level-taxonomy-coverages{assay_suffix}.tsv", "- table of combined, contig-level taxonomy coverages\n")) + output.write(" {:<67} {:>0}".format(f"- {output_prefix}Combined-contig-level-taxonomy-coverages-CPM{assay_suffix}.tsv", "- table of combined, contig-level taxonomy coverages, normalized to coverage per million\n\n")) + + output.write(" {:<75} {:>0}".format("- " + 
str(read_based_dir), "- results generated from read-based approach\n")) + output.write(" {:<71} {:>0}".format(f"- {output_prefix}Gene-families{assay_suffix}.tsv", "- gene-family abundances\n")) + output.write(" {:<71} {:>0}".format(f"- {output_prefix}Gene-families-grouped-by-taxa{assay_suffix}.tsv", "- gene-family abundances grouped by taxa\n")) + output.write(" {:<71} {:>0}".format(f"- {output_prefix}Gene-families-cpm{assay_suffix}.tsv", "- gene-family abundances normalized to copies-per-million\n")) + output.write(" {:<71} {:>0}".format(f"- {output_prefix}Gene-families-KO-cpm{assay_suffix}.tsv", "- KO term abundances normalized to copies-per-million\n")) + output.write(" {:<71} {:>0}".format(f"- {output_prefix}Pathway-abundances{assay_suffix}.tsv", "- pathway abundances\n")) + output.write(" {:<71} {:>0}".format(f"- {output_prefix}Pathway-abundances-grouped-by-taxa{assay_suffix}.tsv", "- pathway abundances grouped by taxa\n")) + output.write(" {:<71} {:>0}".format(f"- {output_prefix}Pathway-abundances-cpm{assay_suffix}.tsv", "- pathway abundances normalized to copies-per-million\n")) + output.write(" {:<71} {:>0}".format(f"- {output_prefix}Pathway-coverages{assay_suffix}.tsv", "- pathway coverages\n")) + output.write(" {:<71} {:>0}".format(f"- {output_prefix}Pathway-coverages-grouped-by-taxa{assay_suffix}.tsv", "- pathway coverages grouped by taxa\n")) + output.write(" {:<71} {:>0}".format(f"- {output_prefix}Metaphlan-taxonomy{assay_suffix}.tsv", "- metaphlan estimated taxonomic relative abundances\n\n")) + + # Processing info + output.write(" {:<75} {:>0}".format("- " + str(processing_zip_file), "- zip archive holding info related to processing\n")) + for item in processing_zip_contents: + + num_levels = item.count("/") + + if num_levels > 1 and not item.endswith("/"): + out_item = re.sub(r'^.*/', '', str(item)) + elif num_levels == 1 and not item.endswith("/"): + out_item = re.sub(r'^.*/', '', str(item)) + elif num_levels > 1: + out_item = re.sub(r'^[^/]*/', '', str(item)) + else: + out_item = str(item) + + if item.endswith('/'): + num_levels -= 1 + + num_spaces = num_levels * 4 + + output.write(" " + " " * num_spaces + "- " + out_item + "\n") + + output.write("\n") + + + + +def main(): + ### Variable setup ### + # Suffixes + output_prefix = str(args.output_prefix) + assay_suffix = str(args.assay_suffix) + # Directories + raw_reads_dir = str(args.raw_reads_dir) + fastqc_dir = str(args.fastqc_dir) + filtered_reads_dir = str(args.filtered_reads_dir) + read_based_dir = str(args.read_based_dir) + assembly_based_dir = str(args.assembly_based_dir) + assemblies_dir = str(args.assemblies_dir) + genes_dir = str(args.genes_dir) + annotations_and_tax_dir = str(args.annotations_and_tax_dir) + mapping_dir = str(args.mapping_dir) + bins_dir = str(args.bins_dir) + MAGs_dir = str(args.MAGs_dir) + combined_output_dir = str(args.combined_output_dir) + # Files + processing_zip_file = str(args.processing_zip_file) + output_file = str(args.output) + + processing_zip_contents = get_processing_zip_contents(processing_zip_file) + + with open(output_file, "w") as output: + + write_header(output, args.GLDS_ID, args.name, args.email, args.protocol_ID) + + write_metagenomics_body(output, output_file, assay_suffix, output_prefix, processing_zip_file, + processing_zip_contents, fastqc_dir, raw_reads_dir, filtered_reads_dir, + read_based_dir, assembly_based_dir, assemblies_dir, mapping_dir, genes_dir, + annotations_and_tax_dir, bins_dir, MAGs_dir, combined_output_dir) + + + +if __name__ == "__main__": + main() diff 
--git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/GL-validate-processed-metagenomics-data b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/GL-validate-processed-metagenomics-data new file mode 100755 index 00000000..d6d40abd --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/GL-validate-processed-metagenomics-data @@ -0,0 +1,800 @@ +#!/usr/bin/env python + +""" +This is a program for validating GeneLab pipeline processed metagenomics datasets. +""" + +import os +import sys +import argparse +import textwrap +import pandas as pd +import zipfile +from statistics import mean, median + +parser = argparse.ArgumentParser(description = "This program validates a GeneLab pipeline processed metagenomics dataset. It is intended to \ + only be run after `GL-gen-processed-metagenomics-readme` has been run successfully.") +required = parser.add_argument_group('required arguments') + +required.add_argument("-g", "--GLDS-ID", help = 'GLDS ID (e.g. "GLDS-276")', action = "store", required = True) +required.add_argument("-s", "--sample-IDs-file", help = "Single-column file with unique file-name prefixes for each sample", + action = "store", required = True) +parser.add_argument("--output", + help = 'Name of output log file (default: "_metagenomics-validation.log", with appended prefix if one is provided)', + default = "", action = "store") +parser.add_argument("-p", "--output-prefix", help = "Output additional file prefix if there is one", action = "store", default = "") +parser.add_argument("-l", "--V_V_guidelines_link", help = "Validation and verification guidelines link", action = "store", + default = "https://genelab-tools.arc.nasa.gov/confluence/pages/viewpage.action?pageId=8225175") +parser.add_argument("--zip_targets", help = "A comma separated list of target files and/or directories to check in processing_info.zip", + action = "store", default = "Snakefile,config.yaml,envs/,logs/,scripts/,unique-sample-IDs.txt") +parser.add_argument("--assay_suffix", help = "Genelab assay suffix", action = "store", default = "_GLmetagenomics") +parser.add_argument("--raw_suffix", help = "Raw reads suffix", action = "store", default ="_HRremoved_raw.fastq.gz") +parser.add_argument("--raw_R1_suffix", help = "Raw forward reads suffix", action = "store", default = "_R1_HRremoved_raw.fastq.gz") +parser.add_argument("--raw_R2_suffix", help = "Raw reverse reads suffix", action = "store", default = "_R2_HRremoved_raw.fastq.gz") +parser.add_argument("--filtered_suffix", help = "Filtered reads suffix", action = "store", default = "_filtered.fastq.gz") +parser.add_argument("--filtered_R1_suffix", help = "Filtered forward reads suffix", action = "store", default = "_R1_filtered.fastq.gz") +parser.add_argument("--filtered_R2_suffix", help = "Filtered reverse reads suffix", action = "store", default = "_R2_filtered.fastq.gz") +parser.add_argument("--logs_dir_basename", help = "Specifies the basename of the directory containing per sample run logs", + action = "store", default = "logs/") +parser.add_argument("--processing_zip_file", help = "Specifies the location of processing_info.zip", + action = "store", default = "processing_info.zip") +parser.add_argument("--readme", help = "Specifies the location of README.txt", + action = "store", default = "README.txt") +parser.add_argument("--raw_reads_dir", help = "Specifies the location of the raw reads directory if they are to be included", action = "store", default = "") 
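For reference, a minimal sketch of how these suffix and directory options are combined downstream: the validator joins each sample ID from the --sample-IDs-file with a suffix and checks that the resulting file exists and is non-empty. The sample name below is a hypothetical example, and the other values simply echo the argparse defaults above.

import os

# Hypothetical example values; in practice these come from the command-line
# options above and from the sample IDs file passed with -s.
sample = "Sample-1"
filtered_reads_dir = "Filtered_Sequence_Data/"
filtered_R1_suffix = "_R1_filtered.fastq.gz"
filtered_R2_suffix = "_R2_filtered.fastq.gz"

# Mirrors the per-sample existence checks performed later by check_fastq_files()
# for paired-end data.
print(os.path.join(filtered_reads_dir, sample + filtered_R1_suffix))  # Filtered_Sequence_Data/Sample-1_R1_filtered.fastq.gz
print(os.path.join(filtered_reads_dir, sample + filtered_R2_suffix))  # Filtered_Sequence_Data/Sample-1_R2_filtered.fastq.gz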
+parser.add_argument("--fastqc_dir", help = "Specifies the location of fastqc and multiqc reports directory", + action = "store", default = "FastQC_Outputs/") +parser.add_argument("--filtered_reads_dir", help = "Specifies the location of the filtered reads directory", + action = "store", default = "Filtered_Sequence_Data/") +parser.add_argument("--assembly_based_dir", help = "Specifies the location of the directory containing results generated from assembly-based processing approach", + action = "store", default = "Assembly-based_Processing/") +parser.add_argument("--read_based_dir", help = "Specifies the location of the directory containing results generated from read-based processing approach", + action = "store", default = "Read-based_Processing/") +parser.add_argument("--assemblies_dir", help = "Specifies the location of the directory containing sample contig assemblies", + action = "store", default = "Assembly-based_Processing/assemblies/") +parser.add_argument("--genes_dir", help = "Specifies the location of the directory containing predicted genes", + action = "store", default = "Assembly-based_Processing/predicted-genes/") +parser.add_argument("--annotations_and_tax_dir", help = "Specifies the location of the directory containing contigs annotation and taxonomy", + action = "store", default = "Assembly-based_Processing/annotations-and-taxonomy/") +parser.add_argument("--mapping_dir", help = "Specifies the location of the directory containing per-sample bam, coverage, and mapping info files", + action = "store", default = "Assembly-based_Processing/read-mapping/") +parser.add_argument("--bins_dir", help = "Specifies the location of the directory containing recovered genome bins", + action = "store", default = "Assembly-based_Processing/bins/") +parser.add_argument("--MAGs_dir", help = "Specifies the location of the directory containing meta-assembled genomes (MAGs)", + action = "store", default = "Assembly-based_Processing/MAGs/") +parser.add_argument("--combined_output_dir", help = "Specifies the location of the directory containing contig annotation summary outputs with all samples combined", + action = "store", default = "Assembly-based_Processing/combined-outputs/") +parser.add_argument("--single-ended", help = "Add this flag if data are single-end sequencing.", action = "store_true") +parser.add_argument("--primers-already-trimmed", help = "Add this flag if primers were trimmed prior to GeneLab processing, \ + therefore there are no trimmed sequence data", action = "store_true") +parser.add_argument("--R1-used-as-single-ended-data", help = "Provide this flag if processing only R1 reads as single-end (as the expected raw \ + filename suffixes will have 'R1' in there)", + action = "store_true") +parser.add_argument("--skip_raw_multiqc", help = "Provide this flag to skip checking for samples present in raw_multiqc_report.zip", + action = "store_true") + + +if len(sys.argv)==1: + parser.print_help(sys.stderr) + sys.exit(0) + +args = parser.parse_args() + + +################################################################################ + +# Setting some colors +tty_colors = { + 'green' : '\033[0;32m%s\033[0m', + 'yellow' : '\033[0;33m%s\033[0m', + 'red' : '\033[0;31m%s\033[0m' +} + + +######################### Aesthetic functions ################################ +def color_text(text, color='green'): + if sys.stdout.isatty(): + return tty_colors[color] % text + else: + return text + + +def wprint(text): + """ Print wrapper """ + print(textwrap.fill(text, width = 80, 
initial_indent=" ", + subsequent_indent=" ", break_on_hyphens=False)) + + +def report_failure(validation_log, message, color = "yellow", write_log = True): + print("") + wprint(color_text(message, color)) + print("\nValidation failed.\n") + + if write_log: + with open(validation_log, "a") as log: + log.write(message + "\n" + "Validation failed." + "\n\n") + sys.exit(1) + + +def modify_symbolic_link(file_path): + """ Modify symbolic link such that it retruns a string containing the parent dir and the base name""" + full_path = os.path.realpath(file_path) + parent_dir = os.path.basename(os.path.dirname(full_path)) + base_name = os.path.basename(full_path) + mod_path = os.path.join(parent_dir, base_name) + return(mod_path) + + +######################### End of Aesthetic functions ################################ + + +############ Main functions ############## + +def setup_log(validation_log, V_V_guidelines_link): + """ Writes validation log's header """ + + with open(validation_log, "w") as log: + log.write(f"Performing baseline Metagenomics V+V as per: {V_V_guidelines_link}\n\n") + command_run = " ".join(sys.argv) + log.write(f"Validation program executed as:\n {command_run}\n\n") + +def append_message_to_log(validation_log, message, one_return = False): + """ Appends line to validation log with one or two newline characters """ + + with open(validation_log, "a") as log: + if one_return: + log.write(f"{message}\n") + else: + log.write(f"{message}\n\n") + +def check_for_file_and_contents(validation_log, file_path): + """ Used by various functions to check if a file exists and that it is not empty """ + + if not os.path.exists(file_path): + report_failure(validation_log, "The expected file '" + str(file_path) + "' does not exist.") + if not os.path.getsize(file_path) > 0: + report_failure(validation_log, "The file '" + str(file_path) + "' is empty.") + +def check_expected_directories(validation_log, expected_dirs): + """ Checks that the expected directories exist """ + + for directory in expected_dirs: + if not os.path.isdir(directory): + report_failure(validation_log, "The directory '" + str(directory) + "' was expected but not found.") + +def read_samples(file_path): + """ Reads unique sample names from file_path into a list """ + + with open(file_path) as f: + sample_names = f.read().splitlines() + return(sample_names) + +def check_multiqc_outputs(validation_log, sample_names, multiqc_zip, + multiqc_stats_file_path, R1_suffix, + R2_suffix, unpaired_suffix, prefix, isSingle_ended): + """ Makes sure all samples' read files are in the multiqc outputs """ + + # Checking raw + zip_file = zipfile.ZipFile(multiqc_zip) + + df = pd.read_csv(zip_file.open(multiqc_stats_file_path), sep = "\t", usecols = ["Sample"]) + + file_prefixes_in_multiqc = df["Sample"].tolist() + + # If paired-end + if not isSingle_ended: + + R1_suffix = R1_suffix.split(".")[0].replace(f"_{prefix}", "") + R2_suffix = R2_suffix.split(".")[0].replace(f"_{prefix}", "") + + for sample in sample_names: + if not sample + R1_suffix in file_prefixes_in_multiqc: + report_failure(validation_log, f"The {prefix} multiqc output is missing the expected '" + \ + sample + R1_suffix + "' entry.") + if not sample + R2_suffix in file_prefixes_in_multiqc: + report_failure(validation_log, f"The {prefix} multiqc output is missing the expected '" + \ + sample + R2_suffix + "' entry.") + # If single-end + else: + + suffix = unpaired_suffix.split(".")[0].replace(f"_{prefix}", "") + + for sample in sample_names: + if not sample + suffix in 
file_prefixes_in_multiqc and not sample in file_prefixes_in_multiqc: + report_failure(validation_log, f"The {prefix} multiqc output is missing the expected '" + \ + sample + suffix + "' entry.") + + +def check_fastq_files(validation_log, sample_names, reads_dir, + unpaired_suffix, R1_suffix, R2_suffix, isSingle_ended): + """ Makes sure all expected read fastq files exist and hold something """ + + for sample in sample_names: + ## If paired-end + if not isSingle_ended: + check_for_file_and_contents(validation_log, os.path.join(reads_dir, sample + R1_suffix)) + check_for_file_and_contents(validation_log, os.path.join(reads_dir, sample + R2_suffix)) + ## If single-end + else: + check_for_file_and_contents(validation_log, os.path.join(reads_dir, sample + unpaired_suffix)) + + +def get_files_in_dir(dir_path): + + return([f for f in os.listdir(dir_path) if os.path.isfile(os.path.join(dir_path, f))]) + + +def check_read_based_outputs(validation_log, filenames, read_based_dir): + """ Makes sure outputs exist and aren't empty """ + + for file in filenames: + + check_for_file_and_contents(validation_log, os.path.join(read_based_dir, file)) + + +def check_general_fasta_format(validation_log, file_path): + """ + Check that a fasta file is formatted properly i.e. + the number of headers equals the number of sequences + """ + if not os.path.getsize(file_path) > 0: + report_failure(validation_log, "The fasta file '" + str(file_path) + "' is empty but isn't expected to be.") + + line_num = 0 + num_headers = 0 + num_seqs = 0 + + with open(file_path) as in_file: + + for line in in_file: + + # Keeping track of current line for reporting any problems + line_num += 1 + + if line.strip().startswith(">"): + num_headers += 1 + else: + num_seqs += 1 + + if num_headers != num_seqs + 1 and num_headers != num_seqs: + report_failure(validation_log, "Fasta file '" + str(file_path) + \ + "' does not seem to be formatted properly. 
Problem detected at line " + \ + str(line_num) + ".") + + +def get_failed_assemblies(failed_assemblies): + """ Retrieves a list of samples for which assembly failed """ + + failed_assemblies_list = [] + + if os.path.exists(failed_assemblies): + with open(failed_assemblies) as failed: + for line in failed: + failed_assemblies_list.append(line.strip().split("\t")[0]) + return(failed_assemblies_list) + + +def check_assembly_based_file(validation_log, sample, file_path, failed_assemblies_list, assembly = True): + + if not os.path.exists(file_path): + report_failure(validation_log, "The expected file '" + str(file_path) + "' does not exist.") + + if not os.path.getsize(file_path) > 0: + + # A sample can have no genes called even if the assembly produced contigs, + # so this is only throwing a warning if we are checking an assembly here + if sample not in failed_assemblies_list and assembly == True: + + report_failure(validation_log, "The file '" + str(file_path) + \ + "' is empty, but that sample isn't noted in the 'Failed-assemblies.tsv' file as it should be if the assembly failed.") + + +def check_assembly_based_genes_file(validation_log, sample, file_path, failed_assemblies_list, assembly = True): + """ + Separate function for working with expected output genes files, to handle + cases where assemblies can succeed while there are still no gene calls + + Just checks the file isn't empty if it exists + """ + + if os.path.exists(file_path) and sample not in failed_assemblies_list: + + if not os.path.getsize(file_path) > 0: + + report_failure(validation_log, "The expected file '" + str(file_path) + \ + "' exists, but appears to be empty when it shouldn't be.") + + +def check_assemblies(validation_log, sample_names, failed_assemblies_list, + assembly_suffix, assemblies_dir, assembly_summary): + """ A function to find sample assemblies """ + + ## Assemblies_dir ## + for sample in sample_names: + + curr_fasta_path = os.path.join(assemblies_dir, sample + assembly_suffix) + + # Checking that the file is present and not empty, unless it is noted + # in the Failed-assemblies file, then continuing to next sample + if sample not in failed_assemblies_list: + + check_assembly_based_file(validation_log, sample, curr_fasta_path, failed_assemblies_list) + + # Checking the general fasta format if present + check_general_fasta_format(validation_log, curr_fasta_path) + + # Making sure assembly summary file is there + assembly_summary_path = os.path.join(assemblies_dir, assembly_summary) + + if not os.path.exists(assembly_summary_path): + report_failure(validation_log, "The assembly summary file, " + str(assembly_summary_path) + \ + ", is expected but was not found.") + + +def check_genes(validation_log, sample_names, failed_assemblies_list, genes_dir, + predicted_gene_file_suffixes, gene_fasta_suffixes): + """ A function to find samples predicted genes associated files """ + + # If any assembly failed, these files won't exist for that assembly + # (they also may not exist if an assembly produced contigs too but not genes were called) + for sample in sample_names: + + if sample not in failed_assemblies_list: + + for suffix in predicted_gene_file_suffixes: + + curr_file_path = os.path.join(genes_dir, sample + suffix) + + # Checking that the file is present and not empty + check_assembly_based_genes_file(validation_log, sample, curr_file_path, + failed_assemblies_list, assembly = False) + + # Checking fasta format for those that exist + for suffix in gene_fasta_suffixes: + + curr_fasta_path = 
os.path.join(genes_dir, sample + suffix) + + if os.path.exists(curr_fasta_path) and os.path.getsize(curr_fasta_path) > 0: + check_general_fasta_format(validation_log, curr_fasta_path) + + +def check_contig_annotation(validation_log, sample_names, annotations_and_tax_dir, annotations_suffixes): + """Checks that the contig annotation files exist and that they are not empty""" + for sample in sample_names: + + for suffix in annotations_suffixes: + + curr_file_path = os.path.join(annotations_and_tax_dir, sample + suffix) + + check_for_file_and_contents(validation_log, curr_file_path) + + +def check_mapping(validation_log, sample_names, mapping_dir, failed_assemblies_list, + mapping_dir_suffixes_all_have, mapping_info_suffix): + """Checks that the outputs of read mapping to sample assemblies exist and that they are not empty""" + + for sample in sample_names: + + for suffix in mapping_dir_suffixes_all_have: + + curr_file_path = os.path.join(mapping_dir, sample + suffix) + + # Checking the file is present and not empty unless it is noted in the Failed-assemblies file + if sample not in failed_assemblies_list: + check_assembly_based_file(validation_log, sample, curr_file_path, failed_assemblies_list) + + # Checking for mapping-info file for those that should have it + if sample not in failed_assemblies_list: + + curr_file_path = os.path.join(mapping_dir, sample + mapping_info_suffix) + + check_assembly_based_file(validation_log, sample, curr_file_path, failed_assemblies_list) + + +def check_combined_outputs(validation_log, combined_output_dir, expected_assembly_combined_outputs): + """Checks that assembly-based summary files exist and that they are not empty""" + + for filename in expected_assembly_combined_outputs: + + curr_file_path = os.path.join(combined_output_dir, filename) + + check_for_file_and_contents(validation_log, curr_file_path) + + +def check_bins(validation_log, bins_dir, bins_summary, output_fasta_bins): + + # Checking for contents (checking fasta format not straightforward when there are softwraps, + # but don't want to remove them on these due to large contigs) + for bin_file in output_fasta_bins: + + curr_file_path = os.path.join(bins_dir, bin_file) + + if not os.path.getsize(curr_file_path) > 0: + + report_failure(validation_log, "The file '" + str(curr_file_path) + \ + "' is empty, but shouldn't be there if that's the case.") + + # Making sure summary table is there if there are any bins + if len(output_fasta_bins) > 0: + + bins_summary_path = os.path.join(bins_dir, bins_summary) + + if not os.path.exists(bins_summary_path): + + report_failure(validation_log, "The bins summary file, " + str(bins_summary_path) + \ + ", is expected but was not found.") + + +def check_mags(validation_log, MAGs_dir, output_fasta_MAGs, output_fasta_bins, MAGs_summary): + + # Checking for contents (checking fasta format not straightforward when there are softwraps, + # but don't want to remove them on these due to large contigs) + for MAG_file in output_fasta_MAGs: + + curr_file_path = os.path.join(MAGs_dir, MAG_file) + + if not os.path.getsize(curr_file_path) > 0: + + report_failure(validation_log, "The file '" + str(curr_file_path) + \ + "' is empty, but shouldn't be there if that's the case.") + + # Making sure summary table is there if there are any bins + if len(output_fasta_bins) > 0: + + MAGs_summary_path = os.path.join(MAGs_dir, MAGs_summary) + + if not os.path.exists(MAGs_summary_path): + + report_failure(validation_log, "The MAGs summary file, " + str(MAGs_summary_path) + \ + ", is 
expected but was not found.") + + +def check_assembly_based_overview_table(validation_log, expected_samples, overview_table_path): + """ Makes sure the output table exists and all input samples are in it """ + + # Making sure it exists and is not empty + check_for_file_and_contents(validation_log, overview_table_path) + + # Making sure all samples are in there + # reading in table and getting sample IDs in list + overview_tab = pd.read_csv(overview_table_path, sep = "\t") + samples_in_tab = overview_tab['Sample_ID'].tolist() + + missing_sample_IDs = [] + + for sample in expected_samples: + if sample not in samples_in_tab: + missing_sample_IDs.append(sample) + + if len(missing_sample_IDs) > 0: + report_failure(validation_log, "The assembly overview table, '" + \ + overview_table_path + \ + f"', does not contain these ({', '.join(missing_sample_IDs)}) expected sample(s).") + + +def check_metagenomics_processing_zip(validation_log, samples, processing_zip_file, expected_zip_contents, + expected_log_file_suffixes, logs_dir): + """ This makes sure a processing zip exists and has the expected core components """ + + # Check that the file exists and that it is not empty + check_for_file_and_contents(validation_log, processing_zip_file) + + with zipfile.ZipFile(processing_zip_file) as zip_obj: + entries = zip_obj.namelist() + ROOT_DIR = entries[0] + + for item in expected_zip_contents: + + if ROOT_DIR + item not in entries: + report_failure(validation_log, "The '" + str(processing_zip_file) + \ + "' does not have '" + str(item) + "' as expected.") + + # Checking log files + for sample in samples: + + for suffix in expected_log_file_suffixes: + + target_log = ROOT_DIR + logs_dir + sample + suffix + + if target_log not in entries: + report_failure(validation_log, "The '" + str(processing_zip_file) + \ + "' does not have the '" + str(target_log) + "' log file as expected.") + + +def report_success(validation_log): + print("") + wprint(color_text("Validation has completed successfully :)", "green")) + print(f"\n Log written to: '{validation_log}'\n") + + with open(validation_log, "a") as log: + + log.write(" -----------------------------------------------------------------------------\n") + log.write(" Validation completed successfully." 
+ "\n") + log.write(" -----------------------------------------------------------------------------\n") + +def gen_stats(list_of_ints): + + """ Returns min, max, mean, median of input integer list """ + + min_val = min(list_of_ints) + max_val = max(list_of_ints) + + mean_val = round(mean(list_of_ints), 2) + median_val = int(median(list_of_ints)) + + return(min_val, max_val, mean_val, median_val) + + +def get_read_count_stats(validation_log, prefix, multiqc_zip, multiqc_stats_file_path): + + """ Grabs read counts and summarizes """ + + zip_file = zipfile.ZipFile(multiqc_zip) + + df = pd.read_csv(zip_file.open(multiqc_stats_file_path), sep = "\t", usecols = [6]) + + df.columns = ["counts"] + counts = df.counts.tolist() + + # Getting rid of decimals + counts = [ int(round(i, 0)) for i in counts ] + + Min, Max, Mean, Median = gen_stats(counts) + + + print(f"\n {prefix.title()} read count summary:") + print(" {:<10} {:>0}".format("Min:", Min)) + print(" {:<10} {:>0}".format("Max:", Max)) + print(" {:<10} {:>0}".format("Mean:", Mean)) + print(" {:<10} {:>0}".format("Median:", Median)) + + with open(validation_log, "a") as log: + + log.write(f"\n {prefix.title()} read count summary:") + log.write("\n {:<10} {:>0}".format("Min:", Min)) + log.write("\n {:<10} {:>0}".format("Max:", Max)) + log.write("\n {:<10} {:>0}".format("Mean:", Mean)) + log.write("\n {:<10} {:>0}".format("Median:", Median)) + + +def main(): + + ###---------------------------------------- Variable setup ----------------------------------------------### + output_prefix = str(args.output_prefix) + fastqc_dir = str(args.fastqc_dir) + filtered_reads_dir = str(args.filtered_reads_dir) + read_based_dir = str(args.read_based_dir) + assembly_based_dir = str(args.assembly_based_dir) + assemblies_dir = str(args.assemblies_dir) + genes_dir = str(args.genes_dir) + annotations_and_tax_dir = str(args.annotations_and_tax_dir) + mapping_dir = str(args.mapping_dir) + bins_dir = str(args.bins_dir) + MAGs_dir = str(args.MAGs_dir) + combined_output_dir = str(args.combined_output_dir) + logs_dir = str(args.logs_dir_basename) + processing_zip_file = str(args.processing_zip_file) + + # Just in case user only specified --R1-used-as-single-ended, but didn't specify --single-ended + if args.R1_used_as_single_ended_data: + args.single_ended = True + + V_V_guidelines_link = str(args.V_V_guidelines_link ) + + # Suffixes + assay_suffix = str(args.assay_suffix) + raw_suffix = str(args.raw_suffix) + raw_R1_suffix = str(args.raw_R1_suffix) + raw_R2_suffix = str(args.raw_R2_suffix) + filtered_suffix = str(args.filtered_suffix) + filtered_R1_suffix = str(args.filtered_R1_suffix) + filtered_R2_suffix = str(args.filtered_R2_suffix) + assembly_suffix = "-assembly.fasta" + predicted_gene_file_suffixes = ["-genes.faa", "-genes.gff", "-genes.fasta"] + gene_fasta_suffixes = ["-genes.faa", "-genes.fasta"] + annotations_suffixes = ["-gene-coverage-annotation-and-tax.tsv", "-contig-coverage-and-tax.tsv"] + mapping_dir_suffixes_all_have = [".bam", "-metabat-assembly-depth.tsv"] + mapping_info_suffix = "-mapping-info.txt" + + # Expected Directories + expected_dirs = [fastqc_dir, filtered_reads_dir, assembly_based_dir, + assemblies_dir, genes_dir, annotations_and_tax_dir, mapping_dir, + combined_output_dir, bins_dir, MAGs_dir, read_based_dir] + + if args.raw_reads_dir != "": + expected_dirs.append(args.raw_reads_dir) + + + # Expected files + assembly_summary = f"{output_prefix}assembly-summaries{assay_suffix}.tsv" + failed_assemblies = 
os.path.join(assemblies_dir,f"{output_prefix}Failed-assemblies.tsv") + + raw_multiqc_zip = f"{output_prefix}raw_multiqc{assay_suffix}_report.zip" + filtered_multiqc_zip = f"{output_prefix}filtered_multiqc{assay_suffix}_report.zip" + raw_multiqc_stats_file_path = f"{output_prefix}raw_multiqc_report.zip".split(".")[0] + \ + f"/{output_prefix}raw_multiqc_data/multiqc_general_stats.txt" + filtered_multiqc_stats_file_path = f"{output_prefix}filtered_multiqc_report.zip".split(".")[0] + \ + f"/{output_prefix}filtered_multiqc_data/multiqc_general_stats.txt" + + + expected_assembly_combined_outputs = [f"{output_prefix}Combined-contig-level-taxonomy-coverages-CPM{assay_suffix}.tsv", + f"{output_prefix}Combined-gene-level-KO-function-coverages-CPM{assay_suffix}.tsv", + f"{output_prefix}Combined-gene-level-taxonomy-coverages-CPM{assay_suffix}.tsv", + f"{output_prefix}Combined-contig-level-taxonomy-coverages{assay_suffix}.tsv", + f"{output_prefix}Combined-gene-level-KO-function-coverages{assay_suffix}.tsv", + f"{output_prefix}Combined-gene-level-taxonomy-coverages{assay_suffix}.tsv"] + + assembly_based_overview_table = os.path.join(assembly_based_dir, f"{output_prefix}Assembly-based-processing-overview{assay_suffix}.tsv") + + expected_read_based_outputs = [f"{output_prefix}Gene-families-KO-cpm{assay_suffix}.tsv", + f"{output_prefix}Gene-families-cpm{assay_suffix}.tsv", + f"{output_prefix}Gene-families-grouped-by-taxa{assay_suffix}.tsv", + f"{output_prefix}Gene-families{assay_suffix}.tsv", + f"{output_prefix}Metaphlan-taxonomy{assay_suffix}.tsv", + f"{output_prefix}Pathway-abundances-cpm{assay_suffix}.tsv", + f"{output_prefix}Pathway-abundances-grouped-by-taxa{assay_suffix}.tsv", + f"{output_prefix}Pathway-abundances{assay_suffix}.tsv", + f"{output_prefix}Pathway-coverages-grouped-by-taxa{assay_suffix}.tsv", + f"{output_prefix}Pathway-coverages{assay_suffix}.tsv"] + + expected_zip_contents = str(args.zip_targets).split(",") + + #expected_log_file_suffixes = ["-CAT.log", "-assembly.log", "-bam-summarize-and-metabat.log", "-bowtie2-build.log", + # "-bbduk.log", "-kofamscan.log", "-pileup.log", "-prodigal.log", "-humann3-run.log"] + + expected_log_file_suffixes = ["-assembly.log", "-bbduk.log"] + + # Setting-up the output log file name + if args.output == "": + validation_log = f"{str(args.GLDS_ID)}_{output_prefix}metagenomics-validation.log" + else: + validation_log = str(args.output) + + # ------------------------------------------------- Logging Begins -----------------------------------------------------------# + + # Initializing the log file + setup_log(validation_log, V_V_guidelines_link) + append_message_to_log(validation_log, f"Summary of checks:") + + # Check if README.txt exists + check_for_file_and_contents(validation_log, args.readme) + append_message_to_log(validation_log, f" - populated {args.readme} detected") + # Check if the expected directories exist + check_expected_directories(validation_log, expected_dirs) + # Retrieve unique sample names from the sample IDs file + sample_names = read_samples(args.sample_IDs_file) + + # Check raw multiqc outputs + raw_multiqc_zip = os.path.join(fastqc_dir, raw_multiqc_zip) + raw_prefix = "raw" + + if not args.skip_raw_multiqc: + check_multiqc_outputs(validation_log, sample_names, raw_multiqc_zip, + raw_multiqc_stats_file_path, raw_R1_suffix, + raw_R2_suffix, raw_suffix, raw_prefix, args.single_ended) + append_message_to_log(validation_log, f" - all expected samples were found in raw multiqc files in {fastqc_dir}") + + # Check filtered multiqc outputs + 
filtered_multiqc_zip = os.path.join(fastqc_dir,filtered_multiqc_zip) + filtered_prefix = "filtered" + check_multiqc_outputs(validation_log, sample_names, filtered_multiqc_zip, + filtered_multiqc_stats_file_path, filtered_R1_suffix, + filtered_R2_suffix, filtered_suffix, filtered_prefix, args.single_ended) + append_message_to_log(validation_log, f" - all expected samples were found in filtered multiqc files in {fastqc_dir}") + + # Raw reads + if args.raw_reads_dir != "": + check_fastq_files(validation_log, sample_names, args.raw_reads_dir, + raw_suffix, raw_R1_suffix, raw_R2_suffix, args.single_ended) + append_message_to_log(validation_log, f" - all expected fastq read files were found in {args.raw_reads_dir}") + + # Filtered reads + check_fastq_files(validation_log, sample_names, filtered_reads_dir, + filtered_suffix, filtered_R1_suffix, + filtered_R2_suffix, args.single_ended) + append_message_to_log(validation_log, f" - all expected fastq read files were found in {filtered_reads_dir}") + + # -------- Read-based approach files checking --------# + check_read_based_outputs(validation_log, expected_read_based_outputs, read_based_dir) + for file in expected_read_based_outputs: + append_message_to_log(validation_log, f" - {file} was found in {read_based_dir}") + append_message_to_log(validation_log, f" - all expected files for read-based analysis were found in the {read_based_dir} directory") + + # ------------- Assembly-based approach files checking ---------# + # Get list of samples for which assembly failed + failed_assemblies_list = get_failed_assemblies(failed_assemblies) + + # Check assemblies + check_assemblies(validation_log, sample_names, failed_assemblies_list, + assembly_suffix, assemblies_dir, assembly_summary) + mod_assemblies_dir = modify_symbolic_link(assemblies_dir) if os.path.islink(assemblies_dir) else assemblies_dir + if len(failed_assemblies_list) > 0: + append_message_to_log(validation_log, f" - all samples except {', '.join(failed_assemblies_list)} had assemblies found in {mod_assemblies_dir}") + else: + append_message_to_log(validation_log, f" - all samples had assemblies found in {mod_assemblies_dir}") + append_message_to_log(validation_log, f" - {assembly_summary} was found in {mod_assemblies_dir}") + append_message_to_log(validation_log, f" - all expected assembly files for assembly-based analysis were found in the {mod_assemblies_dir} directory") + + # Check genes + check_genes(validation_log, sample_names, failed_assemblies_list, + genes_dir, predicted_gene_file_suffixes, gene_fasta_suffixes) + mod_genes_dir = modify_symbolic_link(genes_dir) if os.path.islink(genes_dir) else genes_dir + + checked_suffixes = predicted_gene_file_suffixes + gene_fasta_suffixes + for suffix in checked_suffixes: + append_message_to_log(validation_log, f" - *{suffix} predicted genes files were found in {mod_genes_dir}") + append_message_to_log(validation_log, f" - all expected predicted genes files for assembly-based analysis were found in the {mod_genes_dir} directory") + + # Check contig annotation + check_contig_annotation(validation_log, sample_names, annotations_and_tax_dir, annotations_suffixes) + mod_annotations_and_tax_dir = modify_symbolic_link(annotations_and_tax_dir) if os.path.islink(annotations_and_tax_dir) else annotations_and_tax_dir + + for suffix in annotations_suffixes: + append_message_to_log(validation_log, f" - *{suffix} annotation files were found in {mod_annotations_and_tax_dir}") + append_message_to_log(validation_log, f" - all expected annotation files for 
assembly-based analysis were found in the {mod_annotations_and_tax_dir} directory") + + # Check read mapping + check_mapping(validation_log, sample_names, mapping_dir, failed_assemblies_list, + mapping_dir_suffixes_all_have, mapping_info_suffix) + mod_mapping_dir = modify_symbolic_link(mapping_dir) if os.path.islink(mapping_dir) else mapping_dir + checked_suffixes = mapping_dir_suffixes_all_have + [mapping_info_suffix] + for suffix in checked_suffixes: + append_message_to_log(validation_log, f" - *{suffix} read mapping files were found in the {mod_mapping_dir}") + append_message_to_log(validation_log, f" - all expected read mapping files for assembly-based analysis were found in the {mod_mapping_dir} directory") + + # Combined contigs annotation outputs + check_combined_outputs(validation_log, combined_output_dir, expected_assembly_combined_outputs) + mod_combined_output_dir = modify_symbolic_link(combined_output_dir) if os.path.islink(combined_output_dir) else combined_output_dir + for file in expected_assembly_combined_outputs: + append_message_to_log(validation_log, f" - {file} was found in {mod_combined_output_dir}") + append_message_to_log(validation_log, f" - all expected contig annotation summary files for assembly-based analysis were found in the {mod_combined_output_dir} directory") + + # Check bins - Only if there were bins recovered + output_files_present = get_files_in_dir(bins_dir) + if output_files_present: + bins_summary = f"{output_prefix}bins-overview{assay_suffix}.tsv" + output_fasta_bins = [filename for filename in output_files_present if filename.endswith(".fasta")] + check_bins(validation_log, bins_dir, bins_summary, output_fasta_bins) + mod_bins_dir = modify_symbolic_link(bins_dir) if os.path.islink(bins_dir) else bins_dir + append_message_to_log(validation_log, f" - bins were found in {mod_bins_dir}") + append_message_to_log(validation_log, f" - {bins_summary} was found in {mod_bins_dir}") + else: + append_message_to_log(validation_log, f" - No bin was recovered for any sample") + + # Check MAGS - only if there were MAGs recovered + output_files_present = get_files_in_dir(MAGs_dir) + if output_files_present: + output_fasta_MAGs = [filename for filename in output_files_present if filename.endswith(".fasta")] + MAGs_summary = f"{output_prefix}MAGs-overview{assay_suffix}.tsv" + check_mags(validation_log, MAGs_dir, output_fasta_MAGs, output_fasta_bins, MAGs_summary) + mod_MAGs_dir = modify_symbolic_link(MAGs_dir) if os.path.islink(MAGs_dir) else MAGs_dir + append_message_to_log(validation_log, f" - MAGs were found in {mod_MAGs_dir}") + append_message_to_log(validation_log, f" - {MAGs_summary} was found in {mod_MAGs_dir} directory") + else: + append_message_to_log(validation_log, f" - No MAG was recovered for any sample") + + check_assembly_based_overview_table(validation_log, sample_names, assembly_based_overview_table) + append_message_to_log(validation_log, f" - {assembly_based_overview_table} was found in {assembly_based_dir}") + + append_message_to_log(validation_log, f" - all expected files were found in the {assembly_based_dir} directory") + + # Check processing info + check_metagenomics_processing_zip(validation_log, sample_names, processing_zip_file, + expected_zip_contents, expected_log_file_suffixes, logs_dir) + for file in expected_zip_contents: + append_message_to_log(validation_log, f" - {file} was found in {processing_zip_file}") + + suffixes_modified = [f"*{suffix}" for suffix in expected_log_file_suffixes] + files = ", ".join(suffixes_modified) + 
append_message_to_log(validation_log, + f" - all expected sample log files with these suffixes ({files}) were found in {processing_zip_file}") + append_message_to_log(validation_log, f" - all expected files were found in {processing_zip_file}") + + # ------------- Summarize ------------------------# + report_success(validation_log) + # Raw + get_read_count_stats(validation_log, raw_prefix, raw_multiqc_zip, raw_multiqc_stats_file_path) + # Filtered + get_read_count_stats(validation_log, filtered_prefix, filtered_multiqc_zip, filtered_multiqc_stats_file_path) + +if __name__ == "__main__": + main() diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/clean-paths.sh b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/clean-paths.sh index 416758a2..f5430059 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/clean-paths.sh +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/clean-paths.sh @@ -21,4 +21,4 @@ sed -E 's|.*/GLDS_Datasets/(.+)|\1|g' ${1} \ | sed -E 's|.+/miniconda.+/envs/[^/]*/||g' \ | sed -E 's|/[^ ]*/GLDS-|GLDS-|g' \ | sed -E 's|/[a-z]{6}/[^ ]*||g' \ - | sed -E "s|${ROOT_DIR}||g" > t && mv t ${1} \ No newline at end of file + | sed -E "s|${ROOT_DIR}||g" > t && mv t ${1} diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/main.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/main.nf index 4039515a..2249aa9f 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/main.nf +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/main.nf @@ -80,7 +80,7 @@ if (params.help) { println(" --genes_dir [PATH] Specifies where predicted genes from the assemblies will be published. Default: ../Assembly-based_Processing/predicted-genes/.") println(" --annotations_and_tax_dir [PATH] Contig taxonomy and annotation directory. Default: ../Assembly-based_Processing/annotations-and-taxonomy/.") println(" --mapping_dir [PATH] Read mapping to assembly directory. Default: ../Assembly-based_Processing/read-mapping/.") - println(" --combined_output_dir [PATH] Assembly summuries and reports across samples directory. Default: ../Assembly-based_Processing/combined-outputs/.") + println(" --combined_output_dir [PATH] Assembly summaries and reports across samples directory. Default: ../Assembly-based_Processing/combined-outputs/.") println(" --bins_dir [PATH] Assembly bins directory. Default: ../Assembly-based_Processing/bins/.") println(" --MAGs_dir [PATH] Meta assembled genomes (MAGs) directory. Default: ../Assembly-based_Processing/MAGs/.") println(" --read_based_dir [PATH] Read-based analysis outputs directory. 
Default: ../Read-based_Processing/.") diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/genelab.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/genelab.nf new file mode 100644 index 00000000..d3929896 --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/genelab.nf @@ -0,0 +1,283 @@ +#!/usr/bin/env nextflow +nextflow.enable.dsl = 2 + +process CLEAN_FASTQC_PATHS { + tag "Purging genelab paths from MultiQC zip files in ${params.directories.FastQC_Outputs}" + input: + path(FastQC_Outputs_dir) + output: + path("${OUT_DIR}"), emit: clean_dir + script: + OUT_DIR = "${FastQC_Outputs_dir.baseName}" + """ + WORKDIR=`pwd` + mv ${FastQC_Outputs_dir} FastQC_Outputs_dir + + [ -d ${OUT_DIR}/ ] || mkdir ${OUT_DIR}/ && \\ + cp -r FastQC_Outputs_dir/* ${OUT_DIR}/ + + [ -f ${OUT_DIR}/versions.txt ] && rm -rf ${OUT_DIR}/versions.txt + + cat `which clean-paths.sh` > \${WORKDIR}/clean-paths.sh + chmod +x \${WORKDIR}/clean-paths.sh + + echo "Purging paths from multiqc outputs" + cd \${WORKDIR}/${OUT_DIR}/ + echo "Cleaning raw multiqc files with path info" + unzip raw_multiqc${params.assay_suffix}_report.zip && rm raw_multiqc${params.assay_suffix}_report.zip + cd raw_multiqc_report/raw_multiqc_data/ + + # No reason not to just run it on all + echo "Purging paths in all raw QC files..." + find . -type f -exec bash \${WORKDIR}/clean-paths.sh '{}' ${params.baseDir} \\; + cd \${WORKDIR}/${OUT_DIR}/ + + echo "Re-zipping up raw multiqc" + zip -r raw_multiqc${params.assay_suffix}_report.zip raw_multiqc_report/ && rm -rf raw_multiqc_report/ + + echo "Cleaning filtered multiqc files with path info..." + unzip filtered_multiqc${params.assay_suffix}_report.zip && rm filtered_multiqc${params.assay_suffix}_report.zip + cd filtered_multiqc_report/filtered_multiqc_data/ + + + # No reason not to just run it on all + echo "Purging paths in all filtered QC files..." + find . -type f -exec bash \${WORKDIR}/clean-paths.sh '{}' ${params.baseDir} \\; + cd \${WORKDIR}/${OUT_DIR}/ + + + echo "Re-zipping up filtered multiqc..." + zip -r filtered_multiqc${params.assay_suffix}_report.zip filtered_multiqc_report/ && rm -rf filtered_multiqc_report/ + cd \${WORKDIR} + + echo "Purging paths from multiqc outputs completed successfully..." + + echo "Done! Paths purged successfully." 
+ """ + +} + +process PACKAGE_PROCESSING_INFO { + + tag "Purging file paths and zipping processing info" + + input: + val(files_and_dirs) + output: + path("processing_info${params.assay_suffix}.zip"), emit: zip + + script: + """ + cat `which clean-paths.sh` > clean-paths.sh + chmod +x ./clean-paths.sh + [ -d processing_info/ ] || mkdir processing_info/ && \\ + cp -r ${files_and_dirs.join(" ")} processing_info/ + + echo "Purging file paths" + find processing_info/ -type f -exec bash ./clean-paths.sh '{}' ${params.baseDir} \\; + + # Purge file paths and then zip + zip -r processing_info${params.assay_suffix}.zip processing_info/ + """ +} + + +process GENERATE_README { + + beforeScript "chmod +x ${baseDir}/bin/*" + tag "Generating README for ${OSD_accession}" + input: + tuple val(name), val(email), val(output_prefix), + val(OSD_accession), val(protocol_id), + val(FastQC_Outputs), val(Filtered_Sequence_Data), + val(Read_Based_Processing), val(Assembly_Based_Processing), + val(Assemblies), val(Genes), val(Annotations_And_Tax), + val(Mapping), val(Combined_Output) + path(processing_info) + path(Bins) + path(MAGS) + output: + path("README${params.assay_suffix}.txt"), emit: readme + + script: + """ + GL-gen-processed-metagenomics-readme \\ + --output 'README${params.assay_suffix}.txt' \\ + --GLDS-ID '${OSD_accession}' \\ + --output-prefix '${output_prefix}' \\ + --name '${name}' \\ + --email '${email}' \\ + --protocol_ID '${protocol_id}' \\ + --assay_suffix '${params.assay_suffix}' \\ + --processing_zip_file '${processing_info}' \\ + --fastqc_dir '${FastQC_Outputs}' \\ + --filtered_reads_dir '${Filtered_Sequence_Data}' \\ + --read_based_dir '${Read_Based_Processing}' \\ + --assembly_based_dir '${Assembly_Based_Processing}' \\ + --assemblies_dir '${Assemblies}' \\ + --genes_dir '${Genes}' \\ + --annotations_and_tax_dir '${Annotations_And_Tax}' \\ + --mapping_dir '${Mapping}' \\ + --bins_dir '${Bins}' \\ + --MAGs_dir '${MAGS}' \\ + --combined_output_dir '${Combined_Output}' ${params.readme_extra} + """ + +} + + +process VALIDATE_PROCESSING { + + tag "Running automated validation and verification...." 
+ + input: + // Labels + tuple val(GLDS_accession), val(V_V_guidelines_link), val(output_prefix), + val(target_files), val(assay_suffix), val(log_dir_basename), + val(raw_suffix), val(raw_R1_suffix), val(raw_R2_suffix), + val(filtered_suffix), val(filtered_R1_suffix), val(filtered_R2_suffix) + // Directory paths + tuple path(Filtered_Sequence_Data), path(Read_Based), + path(Assembly_Based), path(Assemblies), path(Mapping), + path(Genes), path(Annotation_And_Tax), path(Bins), + path(MAGS), path(Combined_Output), path(FastQC_Outputs) + // File paths + path(sample_ids_file) + path(README) + path(processing_info) + + output: + path("${GLDS_accession}_${output_prefix}metagenomics-validation.log"), emit: log + + script: + """ + GL-validate-processed-metagenomics-data \\ + --output '${GLDS_accession}_${output_prefix}metagenomics-validation.log' \\ + --GLDS-ID '${GLDS_accession}' \\ + --readme '${README}' \\ + --sample-IDs-file '${sample_ids_file}' \\ + --V_V_guidelines_link '${V_V_guidelines_link}' \\ + --processing_zip_file '${processing_info}' \\ + --output-prefix '${output_prefix}' \\ + --zip_targets '${target_files}' \\ + --assay_suffix '${assay_suffix}' \\ + --raw_suffix '${raw_suffix}' \\ + --raw_R1_suffix '${raw_R1_suffix}' \\ + --raw_R2_suffix '${raw_R2_suffix}' \\ + --filtered_suffix '${filtered_suffix}' \\ + --filtered_R1_suffix '${filtered_R1_suffix}' \\ + --filtered_R2_suffix '${filtered_R2_suffix}' \\ + --logs_dir_basename '${log_dir_basename}' \\ + --fastqc_dir ${FastQC_Outputs} \\ + --filtered_reads_dir ${Filtered_Sequence_Data} \\ + --read_based_dir ${Read_Based} \\ + --assembly_based_dir ${Assembly_Based} \\ + --assemblies_dir ${Assemblies} \\ + --genes_dir ${Genes} \\ + --annotations_and_tax_dir ${Annotation_And_Tax} \\ + --mapping_dir ${Mapping} \\ + --bins_dir ${Bins} \\ + --MAGs_dir ${MAGS} \\ + --combined_output_dir ${Combined_Output} ${params.validation_extra} + """ +} + + +process GENERATE_CURATION_TABLE { + + beforeScript "chmod +x ${baseDir}/bin/*" + tag "Generating a file association table for curation..." + + input: + // GeneLab accession and Suffixes + tuple val(GLDS_accession), val(output_prefix), val(assay_suffix), + val(raw_suffix), val(raw_R1_suffix), val(raw_R2_suffix), + val(filtered_suffix), val(filtered_R1_suffix), val(filtered_R2_suffix) + // File labels + tuple val(processing_zip_file), val(readme) + // Directory labels as paths - these paths are utilized as mere labels by the script + tuple path(raw_reads_dir), path(filtered_reads_dir), path(read_based_dir), + path(assembly_based_dir), path(annotation_and_tax_dir), path(combined_output_dir) + // Directory paths + tuple path(Assemblies), path(Genes), path(Mapping), + path(Bins), path(MAGS), path(FastQC_Outputs) + path(assay_table) + path(runsheet) + + output: + path("${GLDS_accession}_${output_prefix}-associated-file-names.tsv"), emit: curation_table + + script: + def INPUT_TABLE = "${params.files.assay_table}" == "" ? 
"--isa-zip ${assay_table}" : "--assay-table ${assay_table}" + """ + GL-gen-metagenomics-file-associations-table ${INPUT_TABLE} \\ + --runsheet '${runsheet}' \\ + --output '${GLDS_accession}_${output_prefix}-associated-file-names.tsv' \\ + --GLDS-ID '${GLDS_accession}' \\ + --output-prefix '${output_prefix}' \\ + --assay_suffix '${assay_suffix}' \\ + --raw_suffix '${raw_suffix}' \\ + --raw_R1_suffix '${raw_R1_suffix}' \\ + --raw_R2_suffix '${raw_R2_suffix}' \\ + --filtered_suffix '${filtered_suffix}' \\ + --filtered_R1_suffix '${filtered_R1_suffix}' \\ + --filtered_R2_suffix '${filtered_R2_suffix}' \\ + --processing_zip_file '${processing_zip_file}' \\ + --readme '${readme}' \\ + --fastqc_dir '${FastQC_Outputs}' \\ + --assemblies_dir '${Assemblies}' \\ + --genes_dir '${Genes}' \\ + --mapping_dir '${Mapping}' \\ + --bins_dir '${Bins}' \\ + --MAGs_dir '${MAGS}' \\ + --raw_reads_dir '${raw_reads_dir}' \\ + --filtered_reads_dir '${filtered_reads_dir}' \\ + --read_based_dir '${read_based_dir}' \\ + --assembly_based_dir '${assembly_based_dir}' \\ + --annotations_and_tax_dir '${annotation_and_tax_dir}' \\ + --combined_output_dir '${combined_output_dir}' ${params.file_association_extra} + """ +} + + +process GENERATE_MD5SUMS { + + tag "Generating md5sums for the files to be released on OSDR..." + + input: + path(processing_info) + path(README) + val(dirs) + + output: + path("processed_md5sum${params.assay_suffix}.tsv"), emit: md5sum + script: + """ + mkdir processing/ && \\ + cp -r ${dirs.join(" ")} ${processing_info} ${README} \\ + processing/ + + # Generate md5sums + find -L processing/ -type f -exec md5sum '{}' \\; | + awk -v OFS='\\t' 'BEGIN{OFS="\\t"; printf "File Path\\tFile Name\\tmd5\\n"} \\ + {N=split(\$2,a,"/"); sub(/processing\\//, "", \$2); print \$2,a[N],\$1}' \\ + | grep -v "versions.txt" > processed_md5sum${params.assay_suffix}.tsv + """ +} + + +process GENERATE_PROTOCOL { + + beforeScript "chmod +x ${baseDir}/bin/*" + tag "Generating your analysis protocol..." + + input: + path(software_versions) + val(protocol_id) + output: + path("protocol.txt") + script: + """ + generate_protocol.sh ${software_versions} ${protocol_id} > protocol.txt + """ +} diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/summarize_MAG.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/summarize_MAG.nf index 14840db4..882c22a3 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/summarize_MAG.nf +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/summarize_MAG.nf @@ -17,7 +17,7 @@ leave empty if wanting to use memory, the default, put in quotes the path to a d already exists if wanting to use disk space */ -params.gtdb_tk_scratch_location = "" +//params.gtdb_tk_scratch_location = "" /* Retrieve MAGS. Filters checkm results based on estimate completion, redundancy, and diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/nextflow.config b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/nextflow.config index f8895dc4..82df899d 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/nextflow.config +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/nextflow.config @@ -167,7 +167,7 @@ executor.queueSize = 20 Note that relative paths such as '~/' and '../' are not expanded by nextflow's evaluation of files, so don't use that. 
*/ -params.DB_ROOT = ("${baseDir}".split("/")[0..-2]).join('/') + "/Reference_DBs" +params.DB_ROOT = "${baseDir.getParent()}/Reference_DBs" // Mount Humann databases to their predefined locations in the Biobakery container being used if(params.database.chocophlan_dir == null || diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/post_processing.config b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/post_processing.config new file mode 100644 index 00000000..6260e97a --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/post_processing.config @@ -0,0 +1,165 @@ +//******** Global parameters *****************// +params { + publishDir_mode = "link" // "copy", "link", "symlink" + //-------- Parameters used to generate README.txt ------------------// + name = "FirstName M. LastName" // name of analyst + email = "name@nasa.gov" // email of analyst + // Genelab pipeline document protocol id used to process the data + protocol_id = "GL-DPPD-7107-A" + GLDS_accession = "" // e.g. "GLDS-574" + OSD_accession = "" // e.g. "OSD-574" + assay_suffix = "_GLmetagenomics" + readme = "README${params.assay_suffix}.txt" + processing_zip_file = "processing_info${params.assay_suffix}.zip" + logs = "Logs/" // base directory name of directory containing sample logs from processing - should always end with '/' + + + /* extra parameters and arguments to GL-gen-processed-metagenomics-data-readme command. + run `GL-gen-processed-metagenomics-readme --help` for extra parameters that can be set + "--raw-reads-dir '../Raw_Sequence_Data/' " for Metagenomics + + */ + readme_extra = "" + + output_prefix = "" + V_V_guidelines_link = "https://genelab-tools.arc.nasa.gov/confluence/pages/viewpage.action?pageId=8225175" + // A comma separated list of files and/or directories to find in processing_info.zip + target_files = "main.nf,nextflow.config,unique-sample-IDs.txt,envs/,bin/,config/,modules/,${params.logs}" + // Suffixes + raw_suffix = "_HRremoved_raw.fastq.gz" + raw_R1_suffix = "_R1_HRremoved_raw.fastq.gz" + raw_R2_suffix = "_R2_HRremoved_raw.fastq.gz" + filtered_suffix = "_filtered.fastq.gz" + filtered_R1_suffix = "_R1_filtered.fastq.gz" + filtered_R2_suffix = "_R2_filtered.fastq.gz" + /* Extra parameters and arguments to GL-validate-processed-metagenomics-data command + run `GL-validate-processed-metagenomics-data --help` for extra parameters that can be set + "--single-ended" if data are single-ended + "--R1-used-as-single-ended-data" if processing only R1 reads as single-end + */ + validation_extra = "--skip_raw_multiqc" + + /* Extra parameters and arguments to GL-gen-metagenomics-file-associations-table command + run `GL-gen-metagenomics-file-associations-table --help` for extra parameters that can be set + "--single-ended" if data are single-ended + "--R1-used-as-single-ended-data" if processing only R1 reads as single-end + */ + file_association_extra = "--use-sample-names-from-assay-table" + + + files { + main = "./main.nf" + config = "./nextflow.config" + samples = "./unique-sample-IDs.txt" + assay_table = "" //"../GeneLab/a_OSD-574_metagenomic-sequencing_whole-genome-shotgun-sequencing_illumina.txt" + isa_zip = "" //"../GeneLab/OSD-574_metadata_OSD-574-ISA.zip" + runsheet = "" // "../GeneLab/GLfile.csv" + software_versions = "" //"../Metadata/software_versions.txt" + } + + // Make sure you always end the directory names with a forward slash "/" and that if you use + // relative paths, they are located in the run 
directory (./) or in its parent (../) + directories { + bin = "./bin/" + envs = "./envs/" + config = "./config/" + modules = "./modules/" + logs = "../${params.logs}" + Raw_Sequence_Data = "../Raw_Sequence_Data" + FastQC_Outputs = "../FastQC_Outputs/" + Read_Based_Processing = "../Read-based_Processing/" + Filtered_Sequence_Data = "../Filtered_Sequence_Data/" + Assembly_Based_Processing = "../Assembly-based_Processing/" + Assemblies = "../Assembly-based_Processing/assemblies/" + Genes = "../Assembly-based_Processing/predicted-genes/" + Annotations_And_Tax = "../Assembly-based_Processing/annotations-and-taxonomy/" + Mapping = "../Assembly-based_Processing/read-mapping/" + Combined_Output = "../Assembly-based_Processing/combined-outputs/" + Bins = "../Assembly-based_Processing/bins/" + MAGS = "../Assembly-based_Processing/MAGs/" + Output_dir = "../Post_Processing/" + } + + conda{ + // Specify paths to existing conda environments + // Leave as is if you'd like to create a new conda environment + genelab = null // "/path/to/envs/genelab-utils" + } + +} + + +params.baseDir = "${baseDir}" +parent_dir = "${baseDir.getParent()}" +// Setting the default container engine as singularity +params.containerEngine = "singularity" +// Conda shouldn't be used by default except when using conda-based profiles +// i.e., slurm_conda and conda +params.use_conda = false + + +/*************************************************************************************** +******************************** Workflow Profiles ************************************ +****************************************************************************************/ +profiles { + slurm { + process.executor = 'slurm' + } + + conda { + conda.enabled = true + params.use_conda = true + } + + singularity { + singularity.enabled = true + singularity.autoMounts = true + singularity.cacheDir = "singularity/" // local singularity images location + params.containerEngine = "singularity" + } + + docker { + docker.enabled = true + docker.runOptions = '-u $(id -u):$(id -g)' + docker.userEmulation = true + params.containerEngine = "docker" + } +} + +// Maximum number of jobs to submit in parallel +executor.queueSize = 20 + +/************************************************************************************ +*********** Tune process specific resources (cpu, container, memory etc.) *********** +*************************************************************************************/ +process { + //******************* Default process settings ************************// + errorStrategy = "ignore" + cpus = 2 + memory = '5 GB' + cache = 'lenient' + conda = {params.conda.genelab != null ? params.conda.genelab : "envs/genelab.yaml"} + container = "olabiyi/genelab-utils:1.3.22" + publishDir = [path: params.directories.Output_dir, mode: params.publishDir_mode] + + + // Mount Parent directory for processes that copy files + withName: "PACKAGE_PROCESSING_INFO|GENERATE_MD5SUMS" { + + containerOptions = { params.containerEngine == "singularity" ? 
"-B ${parent_dir}" : "-v ${parent_dir}" } + } +} + + +/****************************************************************************** +**************************** Workflow Metadata ******************************** +*******************************************************************************/ +manifest { + author = 'Olabiyi Aderemi Obayomi' + homePage = 'https://github.com/nasa/GeneLab_Data_Processing/blob/master/Metagenomics/' + description = 'Metagenomics Illumina post-processing workflow' + mainScript = 'post_processing.nf' + defaultBranch = 'main' + nextflowVersion = '>=22.10.1' + version = '1.0.0' +} diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/post_processing.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/post_processing.nf new file mode 100644 index 00000000..b9603162 --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/post_processing.nf @@ -0,0 +1,323 @@ +#!/usr/bin/env nextflow +nextflow.enable.dsl=2 + +// color defs +c_back_bright_red = "\u001b[41;1m"; +c_bright_green = "\u001b[32;1m"; +c_blue = "\033[0;34m"; +c_reset = "\033[0m"; + +params.help = false +params.debug = false + + +/************************************************** +* HELP MENU ************************************** +**************************************************/ +if(params.help){ + + println() + println("GeneLab Post Processing Pipeline: $workflow.manifest.version") + println("USAGE:") + println("Example: Submit and run jobs with slurm in singularity containers.") + println(" > nextflow -C post_processing.config run post_processing.nf -resume -profile slurm,singularity") + println() + println("Required Parameters:") + println("""-profile [STRING] Specifies the profile to be used to run the workflow. Options are [slurm, singularity, docker, and conda]. + singularity, docker and conda will run the workflow locally using singularity, docker, and conda, respectively. + To combine profiles, separate two or more profiles with a comma. + For example, to combine slurm and singularity profiles, pass 'slurm,singularity' as argument. """) + println(" --publishDir_mode [STRING] Specifies how nextflow handles output file publishing. Options can be found here https://www.nextflow.io/docs/latest/process.html#publishdir Default: link.") + println(" --GLDS_accession [STRING] A Genelab GLDS accession number. Example GLDS-574. Default: empty string") + println(" --OSD_accession [STRING] A Genelab OSD accession number. Example OSD-574. Default: empty string") + println(" --name [STRING] The analyst's full name. E.g. 'FirstName A. LastName'. Default: FirstName A. LastName") + println(" --email [STRING] The analyst's email address. E.g. 'mail@nasa.gov'. Default: mail@nasa.gov") + println(" --logs [STRING] Base directory name of directory containig per sample logs from processing - should always end with '/'. E.g. 'Logs/'. Default: Logs/") + println(" --assay_suffix [STRING] Genelab's assay suffix. Default: _GLmetagenomics.") + println(" --output_prefix [STRING] Unique name to tag onto output files. Default: empty string.") + println(" --V_V_guidelines_link [URL] Genelab metagenomics data validation and verification guidelines link. Default: https://genelab-tools.arc.nasa.gov/confluence/pages/viewpage.action?pageId=8225175.") + println(" --target_files [STRING] A comma separated list of target files and/or directories to find in processing_info.zip. 
Default: main.nf,nextflow.config,unique-sample-IDs.txt,envs/,bin/,config/,modules/,<--logs>.") + println("File Suffixes:") + println(" --raw_suffix [STRING] Suffix used for the raw reads during processing. Only applicable when input reads are single-end. Default: _HRremoved_raw.fastq.gz.") + println(" --raw_R1_suffix [STRING] Suffix used for the raw forward reads during processing. Default: _R1_HRremoved_raw.fastq.gz.") + println(" --raw_R2_suffix [STRING] Suffix used for the raw reverse reads during processing. Default: _R2_HRremoved_raw.fastq.gz.") + + println(" --filtered_suffix [STRING] Suffix used for quality filtered reads during processing. Only applicable when input reads are single-end. Default: _filtered.fastq.gz.") + println(" --filtered_R1_suffix [STRING] Suffix to use for quality filtered forward reads during processing. Default: _R1_filtered.fastq.gz.") + println(" --filtered_R2_suffix [STRING] Suffix to use for quality filtered reverse reads during processing. Default: _R2_filtered.fastq.gz.") + println() + println("Extra parameters to scripts:") + println(" --readme_extra [STRING] Extra parameters and arguments to GL-gen-processed-metagenomics-data-readme command. Run 'GL-gen-processed-metagenomics-readme --help' for extra parameters that can be set. Example '--raw-reads-dir ../Raw_Sequence_Data/'. Default: empty string") + println(" --validation_extra [STRING] Extra parameters and arguments to GL-validate-processed-metagenomics-data command. Run 'GL-validate-processed-metagenomics-data --help' for extra parameters that can be set. Example '--single-ended --R1-used-as-single-ended-data --skip_raw_multiqc'. Default: '--skip_raw_multiqc' ") + println(" --file_association_extra [STRING] Extra parameters and arguments to GL-gen-metagenomics-file-associations-table command. Run 'GL-gen-metagenomics-file-associations-table --help' for extra parameters that can be set. Example '--single-ended --R1-used-as-single-ended-data'. Default: '--use-sample-names-from-assay-table' ") + println() + println("Files:") + println(" --files.main [PATH] The main workflow script used for processing. Default: ./main.nf") + println(" --files.config [PATH] The main workflow configuration file used for processing. Default: ./nextflow.config") + println(" --files.samples [PATH] A single column file with sample ids on each line generated after running the processing pipeline. Default: ./unique-sample-IDs.txt") + println(" --files.assay_table [PATH] GLDS assay table generated after running the processing pipeline with accession number as input.") + println(" Example, ../Genelab/a_OSD-574_metagenomic-sequencing_whole-genome-shotgun-sequencing_illumina.txt. Default: empty string") + println(" --files.isa_zip [PATH] Genelab ISA zip files containing an assay atable for the OSD accession. This is only required if --files.assay_table is not set.") + println(" Example, ../Genelab/OSD-574_metadata_OSD-574-ISA.zip. Default: empty string") + println(" --files.runsheet [PATH] A 3-column (single-end) or 4-column (paired-end) input file (sample_id, forward, [reverse,] paired) used to run the processing pipeline. This is the value set to the paremater --csv_file when run the processing pipeline with a csv file as input otherwise it is the GLfile.csv in the GeneLab directory if --GLDS_accession was used as input. Example '../GeneLab/GLfile.csv'. Default: empty string") + println(" --files.software_versions [PATH] A file generated after running the processing pipeline listing the software versions used. 
Default: ../Metadata/software_versions.txt") + println() + println("Directories:") + println(" --directories.config [PATH] A directory containing configuration files used in the processing pipeline. Only relevent in Metagenomics and AmpIllumina workflows. Default: ./config/") + println(" --directories.bin [PATH] A directory containing scripts used by nextflow. Default: ./bin/") + println(" --directories.envs [PATH] A directory containing conda yaml files. Default: ./envs/") + println(" --directories.config [PATH] A directory containing config files. Default: ./config/") + println(" --directories.modules [PATH] A directory containing nextflow module scripts. Default: ./modules/") + println(" --directories.Raw_Sequence_Data [PATH] A directory containing raw sequence and raw sequence outputs. Default: ../Raw_Sequence_Data/") + println(" --directories.FastQC_Outputs [PATH] A directory containing fastqc and multiqc zip reports. Default: ../FastQC_Outputs/") + println(" --directories.Filtered_Sequence_Data [PATH] A directory containing the outputs of read filtering after running the processing pipeline. Default: ../Filtered_Sequence_Data/") + println(" --directories.Read_based_Processing [PATH] A directory containing the outputs of read based processing after running the processing pipeline. Default: ../Read_based_Processing/") + println(" --directories.Assembly_based_Processing [PATH] A directory containing the outputs of assembly based processing after running the processing pipeline. Default: ../Assembly_based_Processing/") + println(" --directories.Assemblies [PATH] A directory containing sample contig assemblies after running the processing pipeline. Default: ../Assembly_based_Processing/assemblies/") + println(" --directories.Genes [PATH] A directory containing sample predicted genes after running the processing pipeline. Default: ../Assembly_based_Processing/predicted-genes/") + println(" --directories.Annotations_And_Tax [PATH] A directory containing sample gene and contig annotations after running the processing pipeline. Default: ../Assembly_based_Processing/annotations-and-taxonomy/") + println(" --directories.Mapping [PATH] A directory containing sample read mapping (bam) files after running the processing pipeline. Default: ../Assembly_based_Processing/read-mapping/") + println(" --directories.Combined_Output [PATH] A directory containing assembly summaries and reports across samples after running the processing pipeline. Default: ../Assembly_based_Processing/combined-outputs/") + println(" --directories.Bins [PATH] A directory containing metagenome bins after running the processing pipeline. Default: ../Assembly_based_Processing/bins/") + println(" --directories.MAGS [PATH] A directory containing metagenome assembled genomes (MAGS) after running the processing pipeline. Default: ../Assembly_based_Processing/MAGs/") + println(" --directories.Output_dir [PATH] Specifies the directory where outputs of this post-processing workflow will be published. Default: ../Post_Processing/") + println() + println("Optional arguments:") + println(" --help Print this help message and exit") + println() + println("Paths to existing conda environments to use otherwise a new one will be created using the yaml file in envs/.") + println(" --conda.genelab [PATH] Path to a conda environment containing genelab-utils. 
Default: null.") + exit 0 +} + + +/************************************************ +*********** Show pipeline parameters ************ +*************************************************/ +if(params.debug){ + +log.info """ + GeneLab Post Processing Pipeline: $workflow.manifest.version + + You have set the following parameters: + Profile: ${workflow.profile} + Analyst's Name : ${params.name} + Analyst's Email : ${params.email} + GLDS Accession : ${params.GLDS_accession} + OSD Accession : ${params.OSD_accession} + Assay Suffix: ${params.assay_suffix} + Output Prefix: ${params.output_prefix} + Logs: ${params.logs} + V & V Link: ${params.V_V_guidelines_link} + Target Files: ${params.target_files} + Nextflow Directory publishing mode: ${params.publishDir_mode} + + Suffixes: + Raw Suffix: ${params.raw_suffix} + Raw R1 suffix: ${params.raw_R1_suffix} + Raw R2 suffix: ${params.raw_R2_suffix} + Filtered Suffix: ${params.filtered_suffix} + Filtered R1 suffix: ${params.filtered_R1_suffix} + Filtered R2 suffix: ${params.filtered_R2_suffix} + + Extra scripts parameters: + Readme Script Extra: ${params.readme_extra} + Validation Script Extra : ${params.validation_extra} + File association Script Extra: ${params.file_association_extra} + + Files: + Main Workflow Script: ${params.files.main} + Nextflow Config File: ${params.files.config} + Samples: ${params.files.samples} + Assay Table: ${params.files.assay_table} + ISA Zip: ${params.files.isa_zip} + Input Runsheet: ${params.files.runsheet} + Software Versions: ${params.files.software_versions} + + Directories: + Config: ${params.directories.config} + Bin: ${params.directories.bin} + Conda Environments: ${params.directories.envs} + Modules: ${params.directories.modules} + Raw Reads Directory: ${params.directories.Raw_Sequence_Data} + Filtered Sequence Data: ${params.directories.Filtered_Sequence_Data} + FastQC Outputs: ${params.directories.FastQC_Outputs} + Read-based Processing: ${params.directories.Read_based_Processing} + Assemblies: ${params.directories.Assemblies} + Genes: ${params.directories.Genes} + Annotations And Taxonomy: ${params.directories.Annotations_And_Tax} + Mapping: ${params.directories.Mapping} + Combined Output: ${params.directories.Combined_Output} + Bins: ${params.directories.Bins} + MAGS: ${params.directories.MAGS} + Pipeline Outputs: ${params.directories.Output_dir} + """ + +} + + +include { CLEAN_FASTQC_PATHS; PACKAGE_PROCESSING_INFO; GENERATE_README; VALIDATE_PROCESSING; + GENERATE_CURATION_TABLE; GENERATE_MD5SUMS; GENERATE_PROTOCOL} from './modules/genelab.nf' + +workflow { + + // ---------------------- Input channels -------------------------------- // + // Input files + sample_ids_file = Channel.fromPath(params.files.samples, checkIfExists: true) + software_versions = Channel.fromPath(params.files.software_versions, checkIfExists: true) + + // Directories + Bins = Channel.fromPath(params.directories.Bins, type: 'dir', checkIfExists: true) + MAGS = Channel.fromPath(params.directories.MAGS, type: 'dir', checkIfExists: true) + + // Input Value channels + OSD_ch = Channel.of([params.name, params.email, params.output_prefix, + params.OSD_accession, params.protocol_id, + params.directories.FastQC_Outputs, + params.directories.Filtered_Sequence_Data, + params.directories.Read_Based_Processing, + params.directories.Assembly_Based_Processing, + params.directories.Assemblies, + params.directories.Genes, + params.directories.Annotations_And_Tax, + params.directories.Mapping, + params.directories.Combined_Output]) + + GLDS_ch = 
Channel.of([params.GLDS_accession, params.V_V_guidelines_link, params.output_prefix, + params.target_files, params.assay_suffix, params.logs, + params.raw_suffix, params.raw_R1_suffix, params.raw_R2_suffix, + params.filtered_suffix, params.filtered_R1_suffix, params.filtered_R2_suffix]) + + suffix_ch = Channel.of([params.GLDS_accession, params.output_prefix, params.assay_suffix, + params.raw_suffix, params.raw_R1_suffix, params.raw_R2_suffix, + params.filtered_suffix, params.filtered_R1_suffix, params.filtered_R2_suffix]) + + file_label_ch = Channel.of([params.processing_zip_file, params.readme]) + + // processed as paths but utilized as labels in the genberate curation association table script + dir_label_ch = Channel.of([params.directories.Raw_Sequence_Data, + params.directories.Filtered_Sequence_Data, + params.directories.Read_Based_Processing, + params.directories.Assembly_Based_Processing, + params.directories.Annotations_And_Tax, + params.directories.Combined_Output]) + .collect() + .map{ Raw_Sequence_Data, Filtered_Sequence_Data, Read_Based_Processing, + Assembly_Based_Processing, Annotations_And_Tax, Combined_Output -> + tuple( file(Raw_Sequence_Data, checkIfExists: true), + file(Filtered_Sequence_Data, checkIfExists: true), + file(Read_Based_Processing, checkIfExists: true), + file(Assembly_Based_Processing, checkIfExists: true), + file(Annotations_And_Tax, checkIfExists: true), + file(Combined_Output, checkIfExists: true) + ) + } + + // If the assay table is provided use it as the input table otherwise use the isa_zip + assay_table_ch = Channel.fromPath("${params.files.assay_table}" == "" ? "${params.files.isa_zip}" : "${params.files.assay_table}", + checkIfExists: true) + + // Runsheet used to execute the processing workflow + runsheet_ch = Channel.fromPath(params.files.runsheet) + + + + // Files and directories to be packaged in processing_info.zip + files_and_dirs_ch = Channel.of(params.directories.config, params.directories.logs, + params.directories.bin, params.directories.modules, + params.directories.envs, params.files.main, + params.files.config, params.files.samples) + .collect() + .map{ config_dir, logs, bin, modules, envs, main, config_file, samples -> + tuple( file(config_dir, checkIfExists: true), + file(logs, checkIfExists: true), + file(bin, checkIfExists: true), + file(modules, checkIfExists: true), + file(envs, checkIfExists: true), + file(main, checkIfExists: true), + file(config_file, checkIfExists: true), + file(samples, checkIfExists: true) + ) } + + // ---------------------- Post-processing begins ---------------------------------// + PACKAGE_PROCESSING_INFO(files_and_dirs_ch) + + + GENERATE_README(OSD_ch, PACKAGE_PROCESSING_INFO.out.zip, Bins, MAGS) + + + FastQC_Outputs_dir = Channel.fromPath(params.directories.FastQC_Outputs, + type: 'dir', checkIfExists: true) + CLEAN_FASTQC_PATHS(FastQC_Outputs_dir) + + validation_dirs_ch = Channel.of(params.directories.Filtered_Sequence_Data, + params.directories.Read_Based_Processing, + params.directories.Assembly_Based_Processing, + params.directories.Assemblies, + params.directories.Mapping, + params.directories.Genes, + params.directories.Annotations_And_Tax, + params.directories.Bins, + params.directories.MAGS, + params.directories.Combined_Output) + .concat(CLEAN_FASTQC_PATHS.out.clean_dir) + .collect() + .map{ filtered_sequence, read_based, assembly_based, assemblies, + mapping, genes, annotation, bins, mags, combined_output, fastqc -> + tuple( file(filtered_sequence, checkIfExists: true), + file(read_based, 
checkIfExists: true), + file(assembly_based, checkIfExists: true), + file(assemblies, checkIfExists: true), + file(mapping, checkIfExists: true), + file(genes, checkIfExists: true), + file(annotation, checkIfExists: true), + file(bins, checkIfExists: true), + file(mags, checkIfExists: true), + file(combined_output, checkIfExists: true), + file(fastqc, checkIfExists: true) + ) } + + // Automatic verification and validation + VALIDATE_PROCESSING(GLDS_ch, validation_dirs_ch, + sample_ids_file, + GENERATE_README.out.readme, + PACKAGE_PROCESSING_INFO.out.zip) + // Generate md5sums + dirs_ch = Channel.of(params.directories.Read_Based_Processing, + params.directories.Filtered_Sequence_Data, + params.directories.Assembly_Based_Processing) + .concat(CLEAN_FASTQC_PATHS.out.clean_dir) + .collect() + .map{ read_based, filtered_sequence, assembly_based, fastqc -> + tuple( file(read_based, checkIfExists: true), + file(filtered_sequence, checkIfExists: true), + file(assembly_based, checkIfExists: true), + file(fastqc, checkIfExists: true) + ) } + + GENERATE_MD5SUMS(PACKAGE_PROCESSING_INFO.out.zip, + GENERATE_README.out.readme, dirs_ch) + + // Generate curation file association table + curation_dirs_ch = Channel.of(params.directories.Assemblies, + params.directories.Genes, + params.directories.Mapping, + params.directories.Bins, + params.directories.MAGS) + .concat(CLEAN_FASTQC_PATHS.out.clean_dir) + .collect() + .map{ assemblies, genes, mapping, bins, mags, fastqc -> + tuple( file(assemblies, checkIfExists: true), + file(genes, checkIfExists: true), + file(mapping, checkIfExists: true), + file(bins, checkIfExists: true), + file(mags, checkIfExists: true), + file(fastqc, checkIfExists: true) + ) } + + GENERATE_CURATION_TABLE(suffix_ch, file_label_ch, + dir_label_ch, + curation_dirs_ch, + assay_table_ch, runsheet_ch) + GENERATE_PROTOCOL(software_versions, params.protocol_id) +} From 0279deb2786f18b92117313ef74f7f899520d69d Mon Sep 17 00:00:00 2001 From: asaravia-butler <70983120+asaravia-butler@users.noreply.github.com> Date: Wed, 11 Sep 2024 09:18:33 -0700 Subject: [PATCH 27/48] Typo fixes --- .../Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Metagenomics/Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md b/Metagenomics/Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md index 566310d3..c8dff248 100644 --- a/Metagenomics/Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md +++ b/Metagenomics/Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md @@ -29,8 +29,8 @@ Lauren Sanders (OSDR Project Scientist) - samtools - CAT - GTDB-Tk - - HUMAnN3 - - MetaPhIAn3 + - HUMAnN + - MetaPhlAn - In [step 14d](#14d-mag-taxonomic-classification), MAG taxonomic classification, added the new `--skip_ani_screen` argument to `gtdbtk classify_wf` to continue classifying genomes as in previous versions of GTDB-Tk, using mash and skani. 
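For readers unfamiliar with that flag, a minimal sketch of a `gtdbtk classify_wf` call that includes it is shown below. This is an illustration only, not the pipeline's exact invocation; the input/output directory names, file extension, and CPU count are placeholders.

```bash
# Hypothetical example only: paths, extension, and CPU count are placeholders,
# not values defined by the pipeline document.
gtdbtk classify_wf --genome_dir MAGs/ \
                   --out_dir gtdbtk-output/ \
                   -x fasta \
                   --cpus NumberOfThreads \
                   --skip_ani_screen
```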
--- @@ -75,12 +75,12 @@ Lauren Sanders (OSDR Project Scientist) |prodigal| 2.6.3 |[https://github.com/hyattpd/Prodigal#prodigal](https://github.com/hyattpd/Prodigal#prodigal)| |KOFamScan| 1.3.0 |[https://github.com/takaram/kofam_scan#kofamscan](https://github.com/takaram/kofam_scan#kofamscan)| |CAT| 5.2.3 |[https://github.com/dutilh/CAT#cat-and-bat](https://github.com/dutilh/CAT#cat-and-bat)| -|Metabat2| 2.15 |[https://bitbucket.org/berkeleylab/metabat/src/master/](https://bitbucket.org/berkeleylab/metabat/src/master/)| +|MetaBAT| 2.15 |[https://bitbucket.org/berkeleylab/metabat/src/master/](https://bitbucket.org/berkeleylab/metabat/src/master/)| |checkm| 1.1.3 |[https://github.com/Ecogenomics/CheckM](https://github.com/Ecogenomics/CheckM)| |GTDB-Tk| 2.4.0 |[https://github.com/Ecogenomics/GTDBTk](https://github.com/Ecogenomics/GTDBTk)| |KEGGDecoder| 1.2.2 |[https://github.com/bjtully/BioData/tree/master/KEGGDecoder#kegg-decoder](https://github.com/bjtully/BioData/tree/master/KEGGDecoder#kegg-decoder) -|HUMAnN3| 3.9 |[https://huttenhower.sph.harvard.edu/humann3/](https://huttenhower.sph.harvard.edu/humann3/)| -|MetaPhlAn3| 4.1.0 |[https://github.com/biobakery/MetaPhlAn/tree/3.0](https://github.com/biobakery/MetaPhlAn/tree/3.0)| +|HUMAnN| 3.9 |[https://github.com/biobakery/humann](https://github.com/biobakery/humann)| +|MetaPhlAn| 4.1.0 |[https://github.com/biobakery/MetaPhlAn](https://github.com/biobakery/MetaPhlAn)| --- From ba86dc55cceb0e96a0e4cf5af3de9b9114fd8e58 Mon Sep 17 00:00:00 2001 From: asaravia-butler <70983120+asaravia-butler@users.noreply.github.com> Date: Wed, 11 Sep 2024 09:54:00 -0700 Subject: [PATCH 28/48] Typo fixes --- .../Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Metagenomics/Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md b/Metagenomics/Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md index c8dff248..ce602a76 100644 --- a/Metagenomics/Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md +++ b/Metagenomics/Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md @@ -979,7 +979,7 @@ KEGG-decoder -v interactive -i MAG-level-KO-annotations_GLmetagenomics.tsv -o MA ## Read-based processing ### 16. Taxonomic and functional profiling -The following uses the `humann3` and `metaphlan3` reference databases downloaded on 26-Sept-2020 as follows: +The following uses the `humann` and `metaphlan` reference databases downloaded on 13-Jun-2024 as follows: ```bash humann_databases --download chocophlan full @@ -988,7 +988,7 @@ humann_databases --download utility_mapping full metaphlan --install ``` -#### 16a. Running humann3 (which also runs metaphlan3) +#### 16a. Running humann (which also runs metaphlan) ```bash # forward and reverse reads need to be provided combined if paired-end (if not paired-end, single-end reads are provided to the --input argument next) cat sample-1_R1_filtered.fastq.gz sample-1_R2_filtered.fastq.gz > sample-1-combined.fastq.gz From 28843a39dcff0f06fc3ddc424eabd04451f8bfb7 Mon Sep 17 00:00:00 2001 From: Barbara Novak <19824106+bnovak32@users.noreply.github.com> Date: Wed, 11 Sep 2024 10:59:56 -0700 Subject: [PATCH 29/48] Update GL-DPPD-7107-A.md - minor changes to software names in table (to match capitalization in each tool's documentation) - updated mutliqc commands to remove "-z" option which is not used in any workflows - updated thread specifications to use generic "NumerOfThreads" rather than specific number. 
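One practical note on the database-setup commands shown in the step 16 preamble above: both tools can also be pointed at an explicit install location, which is convenient on shared systems. The sketch below assumes the standard `humann_databases` and `metaphlan` interfaces; the paths are placeholders, not locations defined by this workflow.

```bash
# Placeholder install locations - adjust to wherever reference databases are kept.
humann_databases --download chocophlan full /path/to/humann_dbs
humann_databases --download uniref uniref90_diamond /path/to/humann_dbs
humann_databases --download utility_mapping full /path/to/humann_dbs
metaphlan --install --bowtie2db /path/to/metaphlan_db
```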
--- .../GL-DPPD-7107-A.md | 28 +++++++++---------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/Metagenomics/Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md b/Metagenomics/Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md index ce602a76..f6f41aa6 100644 --- a/Metagenomics/Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md +++ b/Metagenomics/Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md @@ -68,17 +68,17 @@ Lauren Sanders (OSDR Project Scientist) |FastQC| 0.12.1 |[https://www.bioinformatics.babraham.ac.uk/projects/fastqc/](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)| |MultiQC| 1.19 |[https://multiqc.info/](https://multiqc.info/)| |bbduk| 38.86 |[https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/)| -|megahit| 1.2.9 |[https://github.com/voutcn/megahit#megahit](https://github.com/voutcn/megahit#megahit)| +|MEGAHIT| 1.2.9 |[https://github.com/voutcn/megahit#megahit](https://github.com/voutcn/megahit#megahit)| |bit| 1.8.53 |[https://github.com/AstrobioMike/bioinf_tools#bioinformatics-tools-bit](https://github.com/AstrobioMike/bioinf_tools#bioinformatics-tools-bit)| |bowtie2| 2.4.1 |[https://github.com/BenLangmead/bowtie2#overview](https://github.com/BenLangmead/bowtie2#overview)| |samtools| 1.20 |[https://github.com/samtools/samtools#samtools](https://github.com/samtools/samtools#samtools)| -|prodigal| 2.6.3 |[https://github.com/hyattpd/Prodigal#prodigal](https://github.com/hyattpd/Prodigal#prodigal)| +|Prodigal| 2.6.3 |[https://github.com/hyattpd/Prodigal#prodigal](https://github.com/hyattpd/Prodigal#prodigal)| |KOFamScan| 1.3.0 |[https://github.com/takaram/kofam_scan#kofamscan](https://github.com/takaram/kofam_scan#kofamscan)| |CAT| 5.2.3 |[https://github.com/dutilh/CAT#cat-and-bat](https://github.com/dutilh/CAT#cat-and-bat)| |MetaBAT| 2.15 |[https://bitbucket.org/berkeleylab/metabat/src/master/](https://bitbucket.org/berkeleylab/metabat/src/master/)| -|checkm| 1.1.3 |[https://github.com/Ecogenomics/CheckM](https://github.com/Ecogenomics/CheckM)| +|CheckM| 1.1.3 |[https://github.com/Ecogenomics/CheckM](https://github.com/Ecogenomics/CheckM)| |GTDB-Tk| 2.4.0 |[https://github.com/Ecogenomics/GTDBTk](https://github.com/Ecogenomics/GTDBTk)| -|KEGGDecoder| 1.2.2 |[https://github.com/bjtully/BioData/tree/master/KEGGDecoder#kegg-decoder](https://github.com/bjtully/BioData/tree/master/KEGGDecoder#kegg-decoder) +|KEGG-Decoder| 1.2.2 |[https://github.com/bjtully/BioData/tree/master/KEGGDecoder#kegg-decoder](https://github.com/bjtully/BioData/tree/master/KEGGDecoder#kegg-decoder) |HUMAnN| 3.9 |[https://github.com/biobakery/humann](https://github.com/biobakery/humann)| |MetaPhlAn| 4.1.0 |[https://github.com/biobakery/MetaPhlAn](https://github.com/biobakery/MetaPhlAn)| @@ -113,7 +113,7 @@ fastqc -o raw_fastqc_output *raw.fastq.gz #### 1a. 
Compile Raw Data QC ``` -multiqc -o raw_multiqc_output -n raw_multiqc -z raw_fastqc_output/ +multiqc -o raw_multiqc_output -n raw_multiqc raw_fastqc_output/ # this is how it's packaged with our workflow outputs zip -r raw_multiqc_GLmetagenomics_report.zip raw_multiqc_output ``` @@ -122,7 +122,6 @@ zip -r raw_multiqc_GLmetagenomics_report.zip raw_multiqc_output * `-o` – the output directory to store results * `-n` – the filename prefix of results -* `-z` – specifies to zip the output data directory * `raw_fastqc_output/` – the directory holding the output data from the fastqc run, provided as a positional argument **Input data:** @@ -175,7 +174,7 @@ bbduk.sh in=sample-1-R1-raw.fastq.gz in2=sample-1-R2-raw.fastq.gz out1=sample-1_ * `maxns` – sets the maximum number of Ns allowed in a read before it will be filtered out -* `swift` – tells the program to look for and trim low-complexity adaptase reminants from the Swift1S kit +* `swift` – tells the program to look for and trim low-complexity adaptase reminants from the Swift1S kit * `> bbduk.log 2>&1` – redirects the stderr and stdout to a log file for saving @@ -214,7 +213,7 @@ fastqc -o filtered_fastqc_output/ *filtered.fastq.gz #### 3a. Compile Filtered/Trimmed Data QC ``` -multiqc -o filtered_multiqc_output -n filtered_multiqc -z filtered_fastqc_output/ +multiqc -o filtered_multiqc_output -n filtered_multiqc filtered_fastqc_output/ # this is how it's packaged with our workflow outputs zip -r filtered_multiqc_GLmetagenomics_report.zip filtered_multiqc_output ``` @@ -223,7 +222,6 @@ zip -r filtered_multiqc_GLmetagenomics_report.zip filtered_multiqc_output * `-o` – the output directory to store results * `-n` – the filename prefix of results -* `-z` – specifies to zip the output data directory * `filtered_fastqc_output/` – the directory holding the output data from the fastqc run, provided as a positional argument **Input data:** @@ -244,7 +242,7 @@ zip -r filtered_multiqc_GLmetagenomics_report.zip filtered_multiqc_output ### 4. Sample assembly ``` megahit -1 sample-1_R1_filtered.fastq.gz -2 sample-1_R2_filtered.fastq.gz \ - -o sample-1-assembly -t 10 --min-contig-length 500 > sample-1-assembly.log 2>&1 + -o sample-1-assembly -t NumberOfThreads --min-contig-length 500 > sample-1-assembly.log 2>&1 ``` **Parameter Definitions:** @@ -587,8 +585,8 @@ bowtie2-build sample-1-assembly.fasta sample-1-assembly-bt-index #### 9b. Performing mapping, conversion to bam, and sorting ``` -bowtie2 --threads 15 -x sample-1-assembly-bt-index -1 sample-1_R1_filtered.fastq.gz \ - -2 sample-1_R2_filtered.fastq.gz 2> sample-1-mapping-info.txt | samtools view -b | samtools sort -@ 15 > sample-1.bam +bowtie2 --threads NumberOfThreads -x sample-1-assembly-bt-index -1 sample-1_R1_filtered.fastq.gz \ + -2 sample-1_R2_filtered.fastq.gz 2> sample-1-mapping-info.txt | samtools view -b | samtools sort -@ NumberOfThreads > sample-1.bam ``` **Parameter Definitions:** @@ -609,7 +607,7 @@ bowtie2 --threads 15 -x sample-1-assembly-bt-index -1 sample-1_R1_filtered.fastq #### 9c. 
Indexing ``` -samtools index -@ 15 sample-1.bam +samtools index -@ NumberOfThreads sample-1.bam ``` **Parameter Definitions:** @@ -787,7 +785,7 @@ bit-GL-combine-KO-and-tax-tables *-gene-coverage-annotation-and-tax.tsv -o Combi ``` jgi_summarize_bam_contig_depths --outputDepth sample-1-metabat-assembly-depth.tsv --percentIdentity 97 --minContigLength 1000 --minContigDepth 1.0 --referenceFasta sample-1-assembly.fasta sample-1.bam -metabat2 --inFile sample-1-assembly.fasta --outFile sample-1 --abdFile sample-1-metabat-assembly-depth.tsv -t 4 +metabat2 --inFile sample-1-assembly.fasta --outFile sample-1 --abdFile sample-1-metabat-assembly-depth.tsv -t NumberOfThreads mkdir sample-1-bins mv sample-1*bin*.fasta sample-1-bins @@ -993,7 +991,7 @@ metaphlan --install # forward and reverse reads need to be provided combined if paired-end (if not paired-end, single-end reads are provided to the --input argument next) cat sample-1_R1_filtered.fastq.gz sample-1_R2_filtered.fastq.gz > sample-1-combined.fastq.gz -humann --input sample-1-combined.fastq.gz --output sample-1-humann3-out-dir --threads 15 \ +humann --input sample-1-combined.fastq.gz --output sample-1-humann3-out-dir --threads NumberOfThreads \ --output-basename sample-1 --metaphlan-options "--unknown_estimation --add_viruses \ --sample_id sample-1" ``` From 0c853540b1947560b9db09f5fabc33b9cb4d0881 Mon Sep 17 00:00:00 2001 From: asaravia-butler <70983120+asaravia-butler@users.noreply.github.com> Date: Tue, 22 Oct 2024 21:40:31 -0700 Subject: [PATCH 30/48] Adding notes. --- .../Workflow_Documentation/NF_MGIllumina/CHANGELOG.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/CHANGELOG.md b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/CHANGELOG.md index c3da8cd0..8be31953 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/CHANGELOG.md +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/CHANGELOG.md @@ -1,4 +1,6 @@ # Workflow change log +> ***Note:** The initial GeneLab Illumina metagenomics sequencing data processing pipeline, [GL-DPPD-7101](../../Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107.md), was wrapped in a Snakemake workflow and can be found in the [SW_MGIllumina](../SW_MGIllumina) directory. The current pipeline version [GL-DPPD-7101-A](../../Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md), is wrapped in a Nextflow workflow and can be found in the [NF_MGIllumina](./) directory. 
This change log details changes for the Nextflow workflow implementation only.* + ## [1.0.0](https://github.com/nasa/GeneLab_Data_Processing/tree/NF_MGIllumina_1.0.0/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina) -- workflow version that converted snakemake to nextflow \ No newline at end of file +- workflow version that converted snakemake to nextflow From e5110cb8e28e3ff9761f6fe8e28b82ee9a91def8 Mon Sep 17 00:00:00 2001 From: asaravia-butler <70983120+asaravia-butler@users.noreply.github.com> Date: Tue, 22 Oct 2024 21:41:45 -0700 Subject: [PATCH 31/48] Typo fix --- .../Illumina/Workflow_Documentation/NF_MGIllumina/CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/CHANGELOG.md b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/CHANGELOG.md index 8be31953..5a7ab315 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/CHANGELOG.md +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/CHANGELOG.md @@ -1,6 +1,6 @@ # Workflow change log -> ***Note:** The initial GeneLab Illumina metagenomics sequencing data processing pipeline, [GL-DPPD-7101](../../Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107.md), was wrapped in a Snakemake workflow and can be found in the [SW_MGIllumina](../SW_MGIllumina) directory. The current pipeline version [GL-DPPD-7101-A](../../Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md), is wrapped in a Nextflow workflow and can be found in the [NF_MGIllumina](./) directory. This change log details changes for the Nextflow workflow implementation only.* +> ***Note:** The initial GeneLab Illumina metagenomics sequencing data processing pipeline, [GL-DPPD-7101](../../Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107.md), was wrapped in a Snakemake workflow and can be found in the [SW_MGIllumina](../SW_MGIllumina) directory. The current pipeline version, [GL-DPPD-7101-A](../../Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md), is wrapped in a Nextflow workflow and can be found in the [NF_MGIllumina](./) directory. 
This change log details changes for the Nextflow workflow implementation only.* ## [1.0.0](https://github.com/nasa/GeneLab_Data_Processing/tree/NF_MGIllumina_1.0.0/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina) - workflow version that converted snakemake to nextflow From c3ca4edfa6456e448712bae293288e0327b813fc Mon Sep 17 00:00:00 2001 From: asaravia-butler <70983120+asaravia-butler@users.noreply.github.com> Date: Tue, 22 Oct 2024 22:01:33 -0700 Subject: [PATCH 32/48] Adding missing pipeline info --- Metagenomics/Illumina/Workflow_Documentation/README.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/Metagenomics/Illumina/Workflow_Documentation/README.md b/Metagenomics/Illumina/Workflow_Documentation/README.md index f739c11f..4951b808 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/README.md +++ b/Metagenomics/Illumina/Workflow_Documentation/README.md @@ -4,9 +4,11 @@ ## MGIllumina Pipeline Version and Corresponding Workflow -|Pipeline Version|Current Workflow Version (for respective pipeline version)| -|:---------------|:---------------------------------------------------------| -|*[GL-DPPD-7107-A.md](../Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md)|[1.0.0](NF_MGIllumina)| +|Pipeline Version|Current Workflow Version (for respective pipeline version)|Nextflow Version| +|:---------------|:---------------------------------------------------------|:---------------| +|*[GL-DPPD-7107-A.md](../Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md)|[NF_MGIllumina-A_1.0.0](NF_MGIllumina-A)|23.10.1| +|[GL-DPPD-7107.md](../Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107.md)|[SW_MGIllumina_2.0.4](SW_MGIllumina)|N/A (Snakemake vXXXX)| + *Current GeneLab Pipeline/Workflow Implementation From d2f33ddb948da0ea88953a0fe7b6744619fbc687 Mon Sep 17 00:00:00 2001 From: asaravia-butler <70983120+asaravia-butler@users.noreply.github.com> Date: Tue, 22 Oct 2024 22:23:19 -0700 Subject: [PATCH 33/48] Version info updates --- .../NF_MGIllumina/README.md | 24 ++++++++++--------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/README.md b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/README.md index 4a139c71..2b7e3adb 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/README.md +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/README.md @@ -4,7 +4,7 @@ ### Implementation Tools -The current GeneLab Illumina metagenomics sequencing data processing pipeline (MGIllumina), [GL-DPPD-7107-A.md](../../Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md), is implemented as a [Nextflow](https://nextflow.io/) DSL2 workflow and utilizes [Singularity](https://docs.sylabs.io/guides/3.10/user-guide/introduction.html) containers or [conda](https://docs.conda.io/en/latest/) environments to install/run all tools. This workflow is run using the command line interface (CLI) of any unix-based system. While knowledge of creating workflows in nextflow is not required to run the workflow as is, [the Nextflow documentation](https://nextflow.io/docs/latest/index.html) is a useful resource for users who want to modify and/or extend this workflow. 
+The current GeneLab Illumina metagenomics sequencing data processing pipeline (MGIllumina-A), [GL-DPPD-7107-A.md](../../Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md), is implemented as a [Nextflow](https://nextflow.io/) DSL2 workflow and utilizes [Singularity](https://docs.sylabs.io/guides/3.10/user-guide/introduction.html) containers or [conda](https://docs.conda.io/en/latest/) environments to install/run all tools. This workflow is run using the command line interface (CLI) of any unix-based system. While knowledge of creating workflows in nextflow is not required to run the workflow as is, [the Nextflow documentation](https://nextflow.io/docs/latest/index.html) is a useful resource for users who want to modify and/or extend this workflow. > **Note on reference databases** > Many reference databases are relied upon throughout this workflow. They will be installed and setup automatically the first time the workflow is run. All together, after installed and unpacked, they will take up about about 340 GB of storage, but they may also require up to 500GB during installation and initial un-packing, so be sure there is enough room on your system before running the workflow. @@ -66,11 +66,11 @@ We recommend installing Singularity on a system wide level as per the associated ### 2. Download the workflow files -All files required for utilizing the NF_XXX GeneLab workflow for processing metagenomics illumina data are in the [workflow_code](workflow_code) directory. To get a copy of latest *NF_XXX* version on to your system, the code can be downloaded as a zip file from the release page then unzipped after downloading by running the following commands: +All files required for utilizing the NF_MGIllumina-A GeneLab workflow for processing metagenomics Illumina data are in the [workflow_code](workflow_code) directory. To get a copy of latest *NF_MGIllumina-A* version on to your system, the code can be downloaded as a zip file from the release page then unzipped after downloading by running the following commands: ```bash -wget https://github.com/nasa/GeneLab_Data_Processing/releases/download/NF_MGIllumina/NF_MGIllumina.zip -unzip NF_MGIllumina.zip && cd NF_XXX-X_X.X.X +wget https://github.com/nasa/GeneLab_Data_Processing/releases/download/NF_MGIllumina-A_1.0.0/NF_MGIllumina-A_1.0.0.zip +unzip NF_MGIllumina-A_1.0.0.zip && cd NF_MGIllumina-A_1.0.0 ```
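As a quick sanity check after unpacking (a minimal sketch, assuming the release zip unpacks into `NF_MGIllumina-A_1.0.0/` with the workflow_code contents, `main.nf`, `nextflow.config`, `bin/`, `config/`, `envs/`, `modules/`, and `slurm_submit.slurm`, at its top level, and that Nextflow is already on the PATH):

```bash
# Sketch only: confirm the unpacked release looks complete before running anything.
cd NF_MGIllumina-A_1.0.0

# File and directory names are assumed from the repository's workflow_code layout;
# adjust if the release archive is organized differently.
ls main.nf nextflow.config slurm_submit.slurm bin/ config/ envs/ modules/

# Print the Nextflow version in use (the version listed for this workflow is 23.10.1).
nextflow -version
```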
@@ -81,9 +81,9 @@ unzip NF_MGIllumina.zip && cd NF_XXX-X_X.X.X Although Nextflow can fetch Singularity images from a url, doing so may cause issues as detailed [here](https://github.com/nextflow-io/nextflow/issues/1210). -To avoid this issue, run the following command to fetch the Singularity images prior to running the NF_MGIllumina workflow: +To avoid this issue, run the following command to fetch the Singularity images prior to running the NF_MGIllumina-A workflow: -> Note: This command should be run in the location containing the `NF_MGIllumina` directory that was downloaded in [step 2](#2-download-the-workflow-files) above. +> Note: This command should be run from within the `NF_MGIllumina-A_1.0.0` directory that was downloaded in [step 2](#2-download-the-workflow-files) above. ```bash bash ./bin/prepull_singularity.sh nextflow.config @@ -101,6 +101,8 @@ export NXF_SINGULARITY_CACHEDIR=$(pwd)/singularity ### 4. Run the Workflow +> ***Note:** All the commands in this step must be run from within the `NF_MGIllumina-A_1.0.0` directory that was downloaded in [step 2](#2-download-the-workflow-files) above.* + For options and detailed help on how to run the workflow, run the following command: ```bash @@ -137,9 +139,9 @@ nextflow run main.nf -resume -profile conda --csv_file SE_file.csv --conda.qc

See `nextflow run -h` and [Nextflow's CLI run command documentation](https://nextflow.io/docs/latest/cli.html#run) for more options and details on how to run nextflow. @@ -159,7 +161,7 @@ nextflow run main.nf -resume -profile conda --csv_file SE_file.csv --conda.qc

@@ -175,7 +177,7 @@ The outputs from this pipeline are documented in the [GL-DPPD-7107-A](../../Pipe Standard nextflow resource usage logs are also produced as follows: -- Output: +- **Output:** - Resource_Usage/execution_report_{timestamp}.html (an html report that includes metrics about the workflow execution including computational resources and exact workflow process commands) - Resource_Usage/execution_timeline_{timestamp}.html (an html timeline for all processes executed in the workflow) - Resource_Usage/execution_trace_{timestamp}.txt (an execution tracing file that contains information about each process executed in the workflow, including: submission time, start time, completion time, cpu and memory used, machine-readable output) From 11d1dbdbb997d8b3ce3c206658c23c630e0a486d Mon Sep 17 00:00:00 2001 From: asaravia-butler <70983120+asaravia-butler@users.noreply.github.com> Date: Tue, 22 Oct 2024 22:24:34 -0700 Subject: [PATCH 34/48] Formatting update --- .../Illumina/Workflow_Documentation/NF_MGIllumina/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/README.md b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/README.md index 2b7e3adb..55a7a650 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/README.md +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/README.md @@ -9,6 +9,8 @@ The current GeneLab Illumina metagenomics sequencing data processing pipeline (M > **Note on reference databases** > Many reference databases are relied upon throughout this workflow. They will be installed and setup automatically the first time the workflow is run. All together, after installed and unpacked, they will take up about about 340 GB of storage, but they may also require up to 500GB during installation and initial un-packing, so be sure there is enough room on your system before running the workflow. +
+ ## Utilizing the Workflow 1. [Install Nextflow and Singularity](#1-install-nextflow-and-singularity) From cfdd9a724e8ffcc162ed838aa0ae2a315c062e5b Mon Sep 17 00:00:00 2001 From: asaravia-butler Date: Tue, 22 Oct 2024 22:30:00 -0700 Subject: [PATCH 35/48] renaming NF_MGIllumina to NF_MGIllumina-A --- .../{NF_MGIllumina => NF_MGIllumina-A}/CHANGELOG.md | 0 .../{NF_MGIllumina => NF_MGIllumina-A}/README.md | 0 .../{NF_MGIllumina => NF_MGIllumina-A}/workflow_code/PE_file.csv | 0 .../{NF_MGIllumina => NF_MGIllumina-A}/workflow_code/SE_file.csv | 0 .../workflow_code/bin/GL-gen-metagenomics-file-associations-table | 0 .../workflow_code/bin/GL-gen-processed-metagenomics-readme | 0 .../workflow_code/bin/GL-validate-processed-metagenomics-data | 0 .../workflow_code/bin/clean-paths.sh | 0 .../workflow_code/bin/combine-all-gene-tables.py | 0 .../bin/combine-gene-level-coverages-annots-and-tax-per-group.py | 0 .../workflow_code/bin/create_runsheet.sh | 0 .../workflow_code/bin/download-GTDBTK-db.sh | 0 .../workflow_code/bin/format-contig-tax-classifications.sh | 0 .../workflow_code/bin/format-gene-tax-classifications.sh | 0 .../workflow_code/bin/generate-assembly-based-overview-table.sh | 0 .../workflow_code/bin/get-cov-and-depth.sh | 0 .../workflow_code/bin/get_MAGs_estimates_and_taxonomy.sh | 0 .../workflow_code/bin/parse-MAG-annots.py | 0 .../workflow_code/bin/prepull_singularity.sh | 0 .../workflow_code/bin/swap-MAG-IDs.py | 0 .../workflow_code/config/bbtools_adapters.fa | 0 .../workflow_code/config/multiqc.config | 0 .../workflow_code/envs/bit.yaml | 0 .../workflow_code/envs/cat.yaml | 0 .../workflow_code/envs/checkm.yaml | 0 .../workflow_code/envs/genelab.yaml | 0 .../workflow_code/envs/gtdb-tk.yaml | 0 .../workflow_code/envs/humann3.yaml | 0 .../workflow_code/envs/image_def.bit | 0 .../workflow_code/envs/image_def.genelab | 0 .../workflow_code/envs/keggdecoder.yaml | 0 .../workflow_code/envs/kofamscan.yaml | 0 .../workflow_code/envs/mapping.yaml | 0 .../workflow_code/envs/megahit.yaml | 0 .../workflow_code/envs/metabat.yaml | 0 .../workflow_code/envs/prodigal.yaml | 0 .../{NF_MGIllumina => NF_MGIllumina-A}/workflow_code/envs/qc.yaml | 0 .../{NF_MGIllumina => NF_MGIllumina-A}/workflow_code/main.nf | 0 .../workflow_code/modules/assembly.nf | 0 .../workflow_code/modules/assembly_annotation.nf | 0 .../workflow_code/modules/assembly_based_processing.nf | 0 .../workflow_code/modules/binning.nf | 0 .../workflow_code/modules/combine_contig_annotation.nf | 0 .../workflow_code/modules/coverage.nf | 0 .../workflow_code/modules/create_runsheet.nf | 0 .../workflow_code/modules/database_creation.nf | 0 .../workflow_code/modules/genelab.nf | 0 .../workflow_code/modules/quality_assessment.nf | 0 .../workflow_code/modules/read_based_processing.nf | 0 .../workflow_code/modules/read_mapping.nf | 0 .../workflow_code/modules/summarize_MAG.nf | 0 .../workflow_code/modules/summarize_assembly-based_processing.nf | 0 .../workflow_code/modules/summarize_bins.nf | 0 .../workflow_code/modules/zip_fasta.nf | 0 .../workflow_code/nextflow.config | 0 .../workflow_code/post_processing.config | 0 .../workflow_code/post_processing.nf | 0 .../workflow_code/slurm_submit.slurm | 0 58 files changed, 0 insertions(+), 0 deletions(-) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina => NF_MGIllumina-A}/CHANGELOG.md (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina => NF_MGIllumina-A}/README.md (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina => 
NF_MGIllumina-A}/workflow_code/PE_file.csv (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina => NF_MGIllumina-A}/workflow_code/SE_file.csv (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina => NF_MGIllumina-A}/workflow_code/bin/GL-gen-metagenomics-file-associations-table (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina => NF_MGIllumina-A}/workflow_code/bin/GL-gen-processed-metagenomics-readme (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina => NF_MGIllumina-A}/workflow_code/bin/GL-validate-processed-metagenomics-data (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina => NF_MGIllumina-A}/workflow_code/bin/clean-paths.sh (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina => NF_MGIllumina-A}/workflow_code/bin/combine-all-gene-tables.py (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina => NF_MGIllumina-A}/workflow_code/bin/combine-gene-level-coverages-annots-and-tax-per-group.py (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina => NF_MGIllumina-A}/workflow_code/bin/create_runsheet.sh (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina => NF_MGIllumina-A}/workflow_code/bin/download-GTDBTK-db.sh (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina => NF_MGIllumina-A}/workflow_code/bin/format-contig-tax-classifications.sh (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina => NF_MGIllumina-A}/workflow_code/bin/format-gene-tax-classifications.sh (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina => NF_MGIllumina-A}/workflow_code/bin/generate-assembly-based-overview-table.sh (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina => NF_MGIllumina-A}/workflow_code/bin/get-cov-and-depth.sh (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina => NF_MGIllumina-A}/workflow_code/bin/get_MAGs_estimates_and_taxonomy.sh (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina => NF_MGIllumina-A}/workflow_code/bin/parse-MAG-annots.py (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina => NF_MGIllumina-A}/workflow_code/bin/prepull_singularity.sh (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina => NF_MGIllumina-A}/workflow_code/bin/swap-MAG-IDs.py (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina => NF_MGIllumina-A}/workflow_code/config/bbtools_adapters.fa (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina => NF_MGIllumina-A}/workflow_code/config/multiqc.config (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina => NF_MGIllumina-A}/workflow_code/envs/bit.yaml (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina => NF_MGIllumina-A}/workflow_code/envs/cat.yaml (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina => NF_MGIllumina-A}/workflow_code/envs/checkm.yaml (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina => NF_MGIllumina-A}/workflow_code/envs/genelab.yaml (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina => NF_MGIllumina-A}/workflow_code/envs/gtdb-tk.yaml (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina => NF_MGIllumina-A}/workflow_code/envs/humann3.yaml (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina => 
NF_MGIllumina-A}/workflow_code/envs/image_def.bit (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina => NF_MGIllumina-A}/workflow_code/envs/image_def.genelab (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina => NF_MGIllumina-A}/workflow_code/envs/keggdecoder.yaml (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina => NF_MGIllumina-A}/workflow_code/envs/kofamscan.yaml (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina => NF_MGIllumina-A}/workflow_code/envs/mapping.yaml (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina => NF_MGIllumina-A}/workflow_code/envs/megahit.yaml (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina => NF_MGIllumina-A}/workflow_code/envs/metabat.yaml (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina => NF_MGIllumina-A}/workflow_code/envs/prodigal.yaml (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina => NF_MGIllumina-A}/workflow_code/envs/qc.yaml (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina => NF_MGIllumina-A}/workflow_code/main.nf (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina => NF_MGIllumina-A}/workflow_code/modules/assembly.nf (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina => NF_MGIllumina-A}/workflow_code/modules/assembly_annotation.nf (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina => NF_MGIllumina-A}/workflow_code/modules/assembly_based_processing.nf (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina => NF_MGIllumina-A}/workflow_code/modules/binning.nf (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina => NF_MGIllumina-A}/workflow_code/modules/combine_contig_annotation.nf (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina => NF_MGIllumina-A}/workflow_code/modules/coverage.nf (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina => NF_MGIllumina-A}/workflow_code/modules/create_runsheet.nf (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina => NF_MGIllumina-A}/workflow_code/modules/database_creation.nf (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina => NF_MGIllumina-A}/workflow_code/modules/genelab.nf (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina => NF_MGIllumina-A}/workflow_code/modules/quality_assessment.nf (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina => NF_MGIllumina-A}/workflow_code/modules/read_based_processing.nf (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina => NF_MGIllumina-A}/workflow_code/modules/read_mapping.nf (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina => NF_MGIllumina-A}/workflow_code/modules/summarize_MAG.nf (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina => NF_MGIllumina-A}/workflow_code/modules/summarize_assembly-based_processing.nf (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina => NF_MGIllumina-A}/workflow_code/modules/summarize_bins.nf (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina => NF_MGIllumina-A}/workflow_code/modules/zip_fasta.nf (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina => NF_MGIllumina-A}/workflow_code/nextflow.config (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina => 
NF_MGIllumina-A}/workflow_code/post_processing.config (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina => NF_MGIllumina-A}/workflow_code/post_processing.nf (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina => NF_MGIllumina-A}/workflow_code/slurm_submit.slurm (100%) diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/CHANGELOG.md b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/CHANGELOG.md similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/CHANGELOG.md rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/CHANGELOG.md diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/README.md b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/README.md similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/README.md rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/README.md diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/PE_file.csv b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/PE_file.csv similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/PE_file.csv rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/PE_file.csv diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/SE_file.csv b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/SE_file.csv similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/SE_file.csv rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/SE_file.csv diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/GL-gen-metagenomics-file-associations-table b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/GL-gen-metagenomics-file-associations-table similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/GL-gen-metagenomics-file-associations-table rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/GL-gen-metagenomics-file-associations-table diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/GL-gen-processed-metagenomics-readme b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/GL-gen-processed-metagenomics-readme similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/GL-gen-processed-metagenomics-readme rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/GL-gen-processed-metagenomics-readme diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/GL-validate-processed-metagenomics-data b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/GL-validate-processed-metagenomics-data similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/GL-validate-processed-metagenomics-data rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/GL-validate-processed-metagenomics-data diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/clean-paths.sh 
b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/clean-paths.sh similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/clean-paths.sh rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/clean-paths.sh diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/combine-all-gene-tables.py b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/combine-all-gene-tables.py similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/combine-all-gene-tables.py rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/combine-all-gene-tables.py diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/combine-gene-level-coverages-annots-and-tax-per-group.py b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/combine-gene-level-coverages-annots-and-tax-per-group.py similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/combine-gene-level-coverages-annots-and-tax-per-group.py rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/combine-gene-level-coverages-annots-and-tax-per-group.py diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/create_runsheet.sh b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/create_runsheet.sh similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/create_runsheet.sh rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/create_runsheet.sh diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/download-GTDBTK-db.sh b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/download-GTDBTK-db.sh similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/download-GTDBTK-db.sh rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/download-GTDBTK-db.sh diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/format-contig-tax-classifications.sh b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/format-contig-tax-classifications.sh similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/format-contig-tax-classifications.sh rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/format-contig-tax-classifications.sh diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/format-gene-tax-classifications.sh b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/format-gene-tax-classifications.sh similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/format-gene-tax-classifications.sh rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/format-gene-tax-classifications.sh diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/generate-assembly-based-overview-table.sh 
b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/generate-assembly-based-overview-table.sh similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/generate-assembly-based-overview-table.sh rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/generate-assembly-based-overview-table.sh diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/get-cov-and-depth.sh b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/get-cov-and-depth.sh similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/get-cov-and-depth.sh rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/get-cov-and-depth.sh diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/get_MAGs_estimates_and_taxonomy.sh b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/get_MAGs_estimates_and_taxonomy.sh similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/get_MAGs_estimates_and_taxonomy.sh rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/get_MAGs_estimates_and_taxonomy.sh diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/parse-MAG-annots.py b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/parse-MAG-annots.py similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/parse-MAG-annots.py rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/parse-MAG-annots.py diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/prepull_singularity.sh b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/prepull_singularity.sh similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/prepull_singularity.sh rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/prepull_singularity.sh diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/swap-MAG-IDs.py b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/swap-MAG-IDs.py similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/swap-MAG-IDs.py rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/swap-MAG-IDs.py diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/config/bbtools_adapters.fa b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/config/bbtools_adapters.fa similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/config/bbtools_adapters.fa rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/config/bbtools_adapters.fa diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/config/multiqc.config b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/config/multiqc.config similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/config/multiqc.config rename to 
Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/config/multiqc.config diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/bit.yaml b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/envs/bit.yaml similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/bit.yaml rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/envs/bit.yaml diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/cat.yaml b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/envs/cat.yaml similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/cat.yaml rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/envs/cat.yaml diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/checkm.yaml b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/envs/checkm.yaml similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/checkm.yaml rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/envs/checkm.yaml diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/genelab.yaml b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/envs/genelab.yaml similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/genelab.yaml rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/envs/genelab.yaml diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/gtdb-tk.yaml b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/envs/gtdb-tk.yaml similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/gtdb-tk.yaml rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/envs/gtdb-tk.yaml diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/humann3.yaml b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/envs/humann3.yaml similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/humann3.yaml rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/envs/humann3.yaml diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/image_def.bit b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/envs/image_def.bit similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/image_def.bit rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/envs/image_def.bit diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/image_def.genelab b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/envs/image_def.genelab similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/image_def.genelab rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/envs/image_def.genelab diff --git 
a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/keggdecoder.yaml b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/envs/keggdecoder.yaml similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/keggdecoder.yaml rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/envs/keggdecoder.yaml diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/kofamscan.yaml b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/envs/kofamscan.yaml similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/kofamscan.yaml rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/envs/kofamscan.yaml diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/mapping.yaml b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/envs/mapping.yaml similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/mapping.yaml rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/envs/mapping.yaml diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/megahit.yaml b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/envs/megahit.yaml similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/megahit.yaml rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/envs/megahit.yaml diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/metabat.yaml b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/envs/metabat.yaml similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/metabat.yaml rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/envs/metabat.yaml diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/prodigal.yaml b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/envs/prodigal.yaml similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/prodigal.yaml rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/envs/prodigal.yaml diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/qc.yaml b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/envs/qc.yaml similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/qc.yaml rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/envs/qc.yaml diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/main.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/main.nf similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/main.nf rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/main.nf diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/assembly.nf 
b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/assembly.nf similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/assembly.nf rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/assembly.nf diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/assembly_annotation.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/assembly_annotation.nf similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/assembly_annotation.nf rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/assembly_annotation.nf diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/assembly_based_processing.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/assembly_based_processing.nf similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/assembly_based_processing.nf rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/assembly_based_processing.nf diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/binning.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/binning.nf similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/binning.nf rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/binning.nf diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/combine_contig_annotation.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/combine_contig_annotation.nf similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/combine_contig_annotation.nf rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/combine_contig_annotation.nf diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/coverage.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/coverage.nf similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/coverage.nf rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/coverage.nf diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/create_runsheet.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/create_runsheet.nf similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/create_runsheet.nf rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/create_runsheet.nf diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/database_creation.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/database_creation.nf similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/database_creation.nf rename to 
Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/database_creation.nf diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/genelab.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/genelab.nf similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/genelab.nf rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/genelab.nf diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/quality_assessment.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/quality_assessment.nf similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/quality_assessment.nf rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/quality_assessment.nf diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/read_based_processing.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/read_based_processing.nf similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/read_based_processing.nf rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/read_based_processing.nf diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/read_mapping.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/read_mapping.nf similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/read_mapping.nf rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/read_mapping.nf diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/summarize_MAG.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/summarize_MAG.nf similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/summarize_MAG.nf rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/summarize_MAG.nf diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/summarize_assembly-based_processing.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/summarize_assembly-based_processing.nf similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/summarize_assembly-based_processing.nf rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/summarize_assembly-based_processing.nf diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/summarize_bins.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/summarize_bins.nf similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/summarize_bins.nf rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/summarize_bins.nf diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/zip_fasta.nf 
b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/zip_fasta.nf similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/zip_fasta.nf rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/zip_fasta.nf diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/nextflow.config b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/nextflow.config similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/nextflow.config rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/nextflow.config diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/post_processing.config b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/post_processing.config similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/post_processing.config rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/post_processing.config diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/post_processing.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/post_processing.nf similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/post_processing.nf rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/post_processing.nf diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/slurm_submit.slurm b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/slurm_submit.slurm similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/slurm_submit.slurm rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/slurm_submit.slurm From e5c4fe6c2763a518d5044467c8cb280bcddbefa3 Mon Sep 17 00:00:00 2001 From: olabiyi Date: Wed, 23 Oct 2024 13:04:06 -0700 Subject: [PATCH 36/48] Edited README and accession parameter --- .../NF_MGIllumina-A/README.md | 23 ++++++++++++++++--- .../NF_MGIllumina-A/workflow_code/main.nf | 10 ++++---- .../workflow_code/nextflow.config | 2 +- .../workflow_code/slurm_submit.slurm | 7 +++--- .../Illumina/Workflow_Documentation/README.md | 2 +- 5 files changed, 31 insertions(+), 13 deletions(-) diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/README.md b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/README.md index 55a7a650..af45d7d0 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/README.md +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/README.md @@ -118,7 +118,7 @@ nextflow run main.nf --help #### 4a. Approach 1: Run slurm jobs in singularity containers with OSD accession as input ```bash -nextflow run main.nf -resume -profile slurm,singularity --GLDS_accession OSD-574 +nextflow run main.nf -resume -profile slurm,singularity --accession OSD-574 ```
@@ -149,7 +149,7 @@ nextflow run main.nf -resume -profile conda --csv_file SE_file.csv --conda.qc
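The Approach 1 command above is also what the bundled `slurm_submit.slurm` wraps once it is edited for a given cluster (its changes appear later in this series). A minimal launch sketch, assuming the Singularity cache variable from the container setup step and a site where batch jobs are submitted with `sbatch`:

```bash
# Sketch only: launch Approach 1 directly or as a SLURM batch job.
# Assumes the current directory is the unpacked workflow directory and that
# NXF_SINGULARITY_CACHEDIR points at the pre-pulled Singularity images.
export NXF_SINGULARITY_CACHEDIR=$(pwd)/singularity

# Option A: run in the current shell.
nextflow run main.nf -resume -profile slurm,singularity --accession OSD-574

# Option B: submit the provided wrapper script instead (edit its conda path and,
# if used, the Tower variables first, as shown in the slurm_submit.slurm changes).
sbatch slurm_submit.slurm
```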

_-associated-file-names.tsv (File association table for curation) + + - Post_processing/_metagenomics-validation.log (Automatic verification and validation log file) + + - Post_processing/processed_md5sum_GLmetagenomics.tsv (md5sums for the files to be released on OSDR) + + - Post_processing/processing_info_GLmetagenomics.zip (Zip file containing all files used to run the workflow and required logs with paths purged) + + - Post_processing/protocol.txt (File describing the methods used by the workflow) + + - Post_processing/README_GLmetagenomics.txt (README file listing and describing the outputs of the workflow) + diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/main.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/main.nf index 2249aa9f..9f412edc 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/main.nf +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/main.nf @@ -23,7 +23,7 @@ if (params.help) { println(" > nextflow run main.nf -resume -profile slurm,conda --csv_file SE_file.csv") println() println("Example 3: Run jobs locally in conda environments, supply a GLDS accession, and specify the path to an existing conda environment.") - println(" > nextflow run main.nf -resume -profile conda --GLDS_accession OSD-574 --conda.qc ") + println(" > nextflow run main.nf -resume -profile conda --accession OSD-574 --conda.qc ") println() println("Required arguments:") println("""-profile [STRING] Specifies the profile to be used to run the workflow. Options are [slurm, singularity, docker, and conda]. @@ -86,7 +86,7 @@ if (params.help) { println(" --read_based_dir [PATH] Read-based analysis outputs directory. Default: ../Read-based_Processing/.") println() println("Genelab specific arguements:") - println(" --GLDS_accession [STRING] A Genelab accession number if the --csv_file parameter is not set. If this parameter is set, it will ignore the --csv_file parameter.") + println(" --accession [STRING] A Genelab accession number if the --csv_file parameter is not set. If this parameter is set, it will ignore the --csv_file parameter.") println(" --RawFilePattern [STRING] If we do not want to download all files (which we often won't), we can specify a pattern here to subset the total files.") println(" For example, if we know we want to download just the fastq.gz files, we can say 'fastq.gz'. We can also provide multiple patterns") println(" as a comma-separated list. 
For example, If we want to download the fastq.gz files that also have 'NxtaFlex', 'metagenomics', and 'raw' in") @@ -145,7 +145,7 @@ log.info """ You have set the following parameters: Profile: ${workflow.profile} Input csv file : ${params.csv_file} - GLDS Accession : ${params.GLDS_accession} + GLDS or OSD Accession : ${params.accession} GLDS Raw File Pattern: ${params.RawFilePattern} Workflow : ${params.workflow} Nextflow Directory publishing mode: ${params.publishDir_mode} @@ -317,9 +317,9 @@ workflow { // Software Version Capturing - runsheet software_versions_ch = Channel.empty() // Parse file input - if(params.GLDS_accession){ + if(params.accession){ - GET_RUNSHEET(params.GLDS_accession) + GET_RUNSHEET(params.accession) GET_RUNSHEET.out.input_file .splitCsv(header:true) .set{file_ch} diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/nextflow.config b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/nextflow.config index 82df899d..5dd544e3 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/nextflow.config +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/nextflow.config @@ -113,7 +113,7 @@ params { checkm = null // "/path/to/envs/checkm" } - GLDS_accession = false // GLDS or OSD acession number for the data to be processed + accession = false // GLDS or OSD acession number for the data to be processed // Pattern of files on OSDR for the GLDS_accession you want to process. RawFilePattern = null // "_metaG", "_HRremoved" errorStrategy = "terminate" diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/slurm_submit.slurm b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/slurm_submit.slurm index 232fe48c..59c7fb82 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/slurm_submit.slurm +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/slurm_submit.slurm @@ -26,7 +26,6 @@ echo $HOSTNAME ## Activate the conda environemnt containing the tools you need to run your job ## ## You can see a list of all available environments by running the command: conda env list ## -## If you need a conda envrionment installed request it using JIRA ## source activate /path/to/envs/nextflow ## Replace conda_env_name with the name of the conda environment with nextflow installed ## @@ -40,8 +39,10 @@ echo "" ## The command(s) that you want to run in this slurm job ## export NXF_SINGULARITY_CACHEDIR=singularity/ -#nextflow run main.nf -profile slurm,singularity -resume --csv_file PE_file.csv ## Replace command with the command(s) you want to run ## -nextflow run main.nf -profile slurm,singularity --GLDS_accession OSD-574 -resume +export TOWER_ACCESS_TOKEN= +export TOWER_WORKSPACE_ID= +#nextflow run main.nf -profile slurm,singularity -resume --csv_file PE_file.csv -with-tower ## Replace command with the command(s) you want to run ## +nextflow run main.nf -profile slurm,singularity --accession OSD-574 -resume -with-tower ## Add a time-stamp at the end of the job then calculate how long the job took to run in seconds, minutes, and hours ## diff --git a/Metagenomics/Illumina/Workflow_Documentation/README.md b/Metagenomics/Illumina/Workflow_Documentation/README.md index 4951b808..e6a7e126 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/README.md +++ b/Metagenomics/Illumina/Workflow_Documentation/README.md @@ -7,7 +7,7 @@ |Pipeline Version|Current Workflow 
Version (for respective pipeline version)|Nextflow Version| |:---------------|:---------------------------------------------------------|:---------------| |*[GL-DPPD-7107-A.md](../Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md)|[NF_MGIllumina-A_1.0.0](NF_MGIllumina-A)|23.10.1| -|[GL-DPPD-7107.md](../Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107.md)|[SW_MGIllumina_2.0.4](SW_MGIllumina)|N/A (Snakemake vXXXX)| +|[GL-DPPD-7107.md](../Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107.md)|[SW_MGIllumina_2.0.4](SW_MGIllumina)|N/A (Snakemake v7.26.0)| *Current GeneLab Pipeline/Workflow Implementation From da5d9db1fca40a8aebb255681132e7ff1f37c92d Mon Sep 17 00:00:00 2001 From: olabiyi Date: Wed, 23 Oct 2024 14:06:59 -0700 Subject: [PATCH 37/48] minor README edit --- .../Illumina/Workflow_Documentation/NF_MGIllumina-A/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/README.md b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/README.md index af45d7d0..75f2ec15 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/README.md +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/README.md @@ -151,7 +151,7 @@ nextflow run main.nf -resume -profile conda --csv_file SE_file.csv --conda.qc

Date: Wed, 23 Oct 2024 15:58:52 -0700 Subject: [PATCH 38/48] renamed accession parameter --- .../workflow_code/modules/create_runsheet.nf | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/create_runsheet.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/create_runsheet.nf index 4aeaa844..4cf3a6a5 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/create_runsheet.nf +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/create_runsheet.nf @@ -1,7 +1,7 @@ #!/usr/bin/env nextflow nextflow.enable.dsl = 2 -//params.GLDS_accession = "OSD-574" +//params.accession = "OSD-574" //params.RawFilePattern = null // Pattern of files on OSDR for the OSD accession you want to process process GET_RUNSHEET { @@ -9,7 +9,7 @@ process GET_RUNSHEET { beforeScript "chmod +x ${baseDir}/bin/create_runsheet.sh" input: - val(GLDS_accession) + val(accession) output: path("a_*metagenomic*.txt"), emit: assay_TABLE path("*.zip"), emit: zip @@ -18,19 +18,19 @@ process GET_RUNSHEET { script: """ # Download ISA zip file for the GLDS_accession then unzip it - GL-download-GLDS-data -g ${GLDS_accession} -p ISA -f && unzip *-ISA.zip + GL-download-GLDS-data -g ${accession} -p ISA -f && unzip *-ISA.zip if [ ${params.RawFilePattern} == null ];then # Attempt to download the sequences using the assay table, if that fails then # attempt retrieving all fastq.gz files - GL-download-GLDS-data -f -g ${GLDS_accession} -a a_*metagenomic*.txt -o Raw_Sequence_Data || \\ - GL-download-GLDS-data -f -g ${GLDS_accession} -p ".fastq.gz" -o Raw_Sequence_Data + GL-download-GLDS-data -f -g ${accession} -a a_*metagenomic*.txt -o Raw_Sequence_Data || \\ + GL-download-GLDS-data -f -g ${accession} -p ".fastq.gz" -o Raw_Sequence_Data else - GL-download-GLDS-data -f -g ${GLDS_accession} -p ${params.RawFilePattern} -o Raw_Sequence_Data + GL-download-GLDS-data -f -g ${accession} -p ${params.RawFilePattern} -o Raw_Sequence_Data fi @@ -39,8 +39,8 @@ process GET_RUNSHEET { grep '+' *wanted-file-download-commands.sh | \\ sort -u | \\ awk '{gsub(/\\+/,"%2B", \$NF);print}' \\ - > plus_containing_${GLDS_accession}-wanted-file-download-commands.sh - cat plus_containing_${GLDS_accession}-wanted-file-download-commands.sh | parallel -j $task.cpus + > plus_containing_${accession}-wanted-file-download-commands.sh + cat plus_containing_${accession}-wanted-file-download-commands.sh | parallel -j $task.cpus fi # Create runsheet from the assay table @@ -52,7 +52,7 @@ process GET_RUNSHEET { workflow { - GET_RUNSHEET(params.GLDS_accession) + GET_RUNSHEET(params.accession) file_ch = GET_RUNSHEET.out.input_file .splitCsv(header:true) From 1aeeb736967e9d399df7a74747b49f1b5f283029 Mon Sep 17 00:00:00 2001 From: asaravia-butler <70983120+asaravia-butler@users.noreply.github.com> Date: Thu, 24 Oct 2024 11:22:42 -0700 Subject: [PATCH 39/48] Updatign signature matrix --- .../Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/Metagenomics/Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md b/Metagenomics/Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md index f6f41aa6..6d289563 100644 --- a/Metagenomics/Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md +++ b/Metagenomics/Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md @@ -4,7 +4,7 @@ --- -**Date:** 
2024 +**Date:** 10/XX/2024 **Revision:** -A **Document Number:** GL-DPPD-7107 @@ -12,11 +12,10 @@ Olabiyi A. Obayomi (GeneLab Analysis Team) **Approved by:** -Sylvain Costes (GeneLab Project Manager) -Samrawit Gebre (GeneLab Deputy Project Manager and Interim GeneLab Configuration Manager) -Barbara Novak (GeneLab Data Processing Lead) +Samrawit Gebre (OSDR Project Manager) +Lauren Sanders (OSDR Project Scientist) Amanda Saravia-Butler (GeneLab Science Lead) -Lauren Sanders (OSDR Project Scientist) +Barbara Novak (GeneLab Data Processing Lead) --- From 59b165346895f2cb3cbc4102ecd384440a30a330 Mon Sep 17 00:00:00 2001 From: asaravia-butler <70983120+asaravia-butler@users.noreply.github.com> Date: Thu, 24 Oct 2024 11:31:11 -0700 Subject: [PATCH 40/48] Updating nextflow version --- Metagenomics/Illumina/Workflow_Documentation/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Metagenomics/Illumina/Workflow_Documentation/README.md b/Metagenomics/Illumina/Workflow_Documentation/README.md index e6a7e126..eb2a7ad8 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/README.md +++ b/Metagenomics/Illumina/Workflow_Documentation/README.md @@ -6,7 +6,7 @@ |Pipeline Version|Current Workflow Version (for respective pipeline version)|Nextflow Version| |:---------------|:---------------------------------------------------------|:---------------| -|*[GL-DPPD-7107-A.md](../Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md)|[NF_MGIllumina-A_1.0.0](NF_MGIllumina-A)|23.10.1| +|*[GL-DPPD-7107-A.md](../Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md)|[NF_MGIllumina-A_1.0.0](NF_MGIllumina-A)|24.04.4| |[GL-DPPD-7107.md](../Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107.md)|[SW_MGIllumina_2.0.4](SW_MGIllumina)|N/A (Snakemake v7.26.0)| From 54c5bd85cb1eb9c6bb937f834c12fd2eab28c275 Mon Sep 17 00:00:00 2001 From: asaravia-butler <70983120+asaravia-butler@users.noreply.github.com> Date: Thu, 24 Oct 2024 11:33:57 -0700 Subject: [PATCH 41/48] Formatting updates --- .../Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Metagenomics/Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md b/Metagenomics/Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md index 6d289563..4ba4fe29 100644 --- a/Metagenomics/Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md +++ b/Metagenomics/Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md @@ -14,7 +14,7 @@ Olabiyi A. 
Obayomi (GeneLab Analysis Team) **Approved by:** Samrawit Gebre (OSDR Project Manager) Lauren Sanders (OSDR Project Scientist) -Amanda Saravia-Butler (GeneLab Science Lead) +Amanda Saravia-Butler (GeneLab Science Lead) Barbara Novak (GeneLab Data Processing Lead) --- From 0dcec9baf530cb6515976c90a451fcd231612f2a Mon Sep 17 00:00:00 2001 From: asaravia-butler <70983120+asaravia-butler@users.noreply.github.com> Date: Thu, 24 Oct 2024 11:34:38 -0700 Subject: [PATCH 42/48] Formatting updates --- .../Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Metagenomics/Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md b/Metagenomics/Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md index 4ba4fe29..c12c3b39 100644 --- a/Metagenomics/Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md +++ b/Metagenomics/Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md @@ -4,7 +4,7 @@ --- -**Date:** 10/XX/2024 +**Date:** October XX, 2024 **Revision:** -A **Document Number:** GL-DPPD-7107 From 311800e7486d313682b752804bc25e05e2a3523b Mon Sep 17 00:00:00 2001 From: olabiyi Date: Fri, 1 Nov 2024 21:15:20 -0500 Subject: [PATCH 43/48] Fixed read mapping bug --- .../NF_MGIllumina-A/workflow_code/modules/create_runsheet.nf | 3 ++- .../NF_MGIllumina-A/workflow_code/modules/read_mapping.nf | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/create_runsheet.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/create_runsheet.nf index 4cf3a6a5..cde64bcb 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/create_runsheet.nf +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/create_runsheet.nf @@ -7,6 +7,7 @@ nextflow.enable.dsl = 2 process GET_RUNSHEET { beforeScript "chmod +x ${baseDir}/bin/create_runsheet.sh" + tag "Downloading raw fastq files and runsheet for ${accession}..." input: val(accession) @@ -17,7 +18,7 @@ process GET_RUNSHEET { path("versions.txt"), emit: version script: """ - # Download ISA zip file for the GLDS_accession then unzip it + # Download ISA zip file for the GLDS/OSD accession then unzip it GL-download-GLDS-data -g ${accession} -p ISA -f && unzip *-ISA.zip if [ ${params.RawFilePattern} == null ];then diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/read_mapping.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/read_mapping.nf index 06427714..a8096907 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/read_mapping.nf +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/read_mapping.nf @@ -30,6 +30,7 @@ process MAPPING { else touch ${sample_id}.sam + echo "Mapping not performed for ${sample_id} because the assembly didn't produce anything." > ${sample_id}-mapping-info.txt printf "Mapping not performed for ${sample_id} because the assembly didn't produce anything.\\n" fi @@ -48,6 +49,7 @@ process MAPPING { else touch ${sample_id}.sam + echo "Mapping not performed for ${sample_id} because the assembly didn't produce anything." 
> ${sample_id}-mapping-info.txt printf "Mapping not performed for ${sample_id} because the assembly didn't produce anything.\\n" fi From e91d7a5d6559dbe29966e1d9949e44cd526145a1 Mon Sep 17 00:00:00 2001 From: olabiyi Date: Mon, 4 Nov 2024 12:57:33 -0800 Subject: [PATCH 44/48] Fixed typos and no assemblies produced bug --- .../NF_MGIllumina-A/README.md | 20 +++++------ .../workflow_code/bin/generate_protocol.sh | 36 +++++++++++++++++++ .../workflow_code/modules/assembly.nf | 1 + .../modules/assembly_based_processing.nf | 6 ++++ .../workflow_code/nextflow.config | 7 ++++ 5 files changed, 60 insertions(+), 10 deletions(-) create mode 100755 Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/generate_protocol.sh diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/README.md b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/README.md index 75f2ec15..aa2acc1c 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/README.md +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/README.md @@ -115,7 +115,7 @@ nextflow run main.nf --help
-#### 4a. Approach 1: Run slurm jobs in singularity containers with OSD accession as input +#### 4a. Approach 1: Run slurm jobs in singularity containers with OSD or GLDS accession as input ```bash nextflow run main.nf -resume -profile slurm,singularity --accession OSD-574 @@ -195,30 +195,30 @@ Standard nextflow resource usage logs are also produced as follows: For options and detailed help on how to run the post-processing workflow, run the following command: ```bash -nextflow run post_processng.nf --help +nextflow run post_processing.nf --help ``` To generate a README file, a protocols file, a md5sums table and a file association table after running the processing workflow sucessfully, modify and set the parameters in [post_processing.config](workflow_code/post_processing.config) then run the following command: ```bash -nextflow -C post_processing.config run post_processng.nf -resume -profile slurm,singularity +nextflow -C post_processing.config run post_processing.nf -resume -profile slurm,singularity ``` The outputs of the run will be in a directory called `Post_Processing` by default and they are as follows: - - Post_processing/FastQC_Outputs/filtered_multiqc_GLmetagenomics_report.zip (Filtered sequence multiqc report with paths purged) + - Post_processing/FastQC_Outputs/filtered_multiqc_GLmetagenomics_report.zip (Filtered sequence multiqc report with paths purged) - - Post_processing/FastQC_Outputs/raw_multiqc_GLmetagenomics_report.zip (Raw sequence multiqc report with paths purged) + - Post_processing/FastQC_Outputs/raw_multiqc_GLmetagenomics_report.zip (Raw sequence multiqc report with paths purged) - - Post_processing/_-associated-file-names.tsv (File association table for curation) + - Post_processing/_-associated-file-names.tsv (File association table for curation) - - Post_processing/_metagenomics-validation.log (Automatic verification and validation log file) + - Post_processing/_metagenomics-validation.log (Automatic verification and validation log file) - - Post_processing/processed_md5sum_GLmetagenomics.tsv (md5sums for the files to be released on OSDR) + - Post_processing/processed_md5sum_GLmetagenomics.tsv (md5sums for the files to be released on OSDR) - - Post_processing/processing_info_GLmetagenomics.zip (Zip file containing all files used to run the workflow and required logs with paths purged) + - Post_processing/processing_info_GLmetagenomics.zip (Zip file containing all files used to run the workflow and required logs with paths purged) - - Post_processing/protocol.txt (File describing the methods used by the workflow) + - Post_processing/protocol.txt (File describing the methods used by the workflow) - Post_processing/README_GLmetagenomics.txt (README file listing and describing the outputs of the workflow) diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/generate_protocol.sh b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/generate_protocol.sh new file mode 100755 index 00000000..d15f68b6 --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/generate_protocol.sh @@ -0,0 +1,36 @@ +#!/usr/bin/env bash + +# Generate protocol according to a pipeline document + +# USAGE: +# generate_protocol.sh +# EXAMPLE +# generate_protocol.sh ../Metadata/software_versions.txt GL-DPPD-7107-A + +FASTQC=`grep -i 'fastqc' $1 | awk '{print $2}' |sed -E 's/v//'` +MULTIQC=`grep -i 'multiqc' $1 | awk '{print $3}'` +BBMAP=`grep -i 'bbtools' $1 | awk '{print $2}'` 
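+# Each of these assignments greps one tool's entry out of the software versions file ($1) and keeps only the version string (sed strips a leading 'v' where one is present).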
+HUMANN=`grep -i 'humann' $1 | awk '{print $2}'|sed -E 's/v//'`
+MEGAHIT=`grep -i 'megahit' $1 | awk '{print $2}'|sed -E 's/v//'`
+PRODIGAL=`grep -i 'prodigal' $1 | awk '{print $2}'|sed -E 's/[vV:]//g'`
+CAT=`grep 'CAT' $1 | awk '{print $2}'|sed -E 's/v//'`
+KOFAMSCAN=`grep 'exec_annotation' $1 | awk '{print $2}'`
+BOWTIE2=`grep -i 'bowtie' $1 | awk '{print $3}'`
+SAMTOOLS=`grep -i 'samtools' $1 | awk '{print $2}'`
+METABAT2=`grep -i 'metabat' $1 | awk '{print $2}'`
+BIT=`grep -i 'bioinformatics tools' $1 | awk '{print $3}' | sed 's/v//' | sed -E 's/.+([0-9]+.[0-9]+.[0-9]+).+/\1/'`
+CHECKM=`grep -i 'checkm' $1 | awk '{print $2}' |sed -E 's/v//'`
+GTDBTK=`grep -i '^GTDB' $1 | awk '{print $2}' |sed -E 's/v//' | head -n2` # If 2 versions are used, choose the second
+
+PROTOCOL_ID=$2
+
+PROTOCOL="Data were processed as described in ${PROTOCOL_ID} (https://github.com/nasa/GeneLab_Data_Processing/blob/master/Metagenomics/Illumina/Pipeline_GL-DPPD-7107_Versions/${PROTOCOL_ID}.md), using workflow NF_MGIllumina v1.0.0 (https://github.com/nasa/GeneLab_Data_Processing/tree/NF_MGIllumina_1.0.0/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina). \
+ In brief, quality assessment of reads was performed with FastQC v${FASTQC} and reports were summarized with MultiQC v${MULTIQC}. \
+ Quality trimming and filtering were performed with bbmap v${BBMAP}. Read-based processing was performed with humann3 v${HUMANN}. \
+ Individual samples were assembled with megahit v${MEGAHIT}. Genes were called with prodigal v${PRODIGAL}. \
+ Taxonomic classification of genes and contigs was performed with CAT v${CAT}. Functional annotation was done with KOFamScan v${KOFAMSCAN}. \
+ Reads were mapped to assemblies with bowtie2 v${BOWTIE2} and coverage information was extracted for reads and contigs with samtools v${SAMTOOLS} and bbmap v${BBMAP}. \
+ Binning of contigs was performed with metabat2 v${METABAT2}. Bins were summarized with bit v${BIT} and estimates of quality were generated with checkm v${CHECKM}. \
+ High-quality bins (> 90% est. completeness and < 10% est. redundancy) were taxonomically classified with gtdb-tk v${GTDBTK}."
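+# Print the assembled protocol paragraph to stdout so the caller can capture it (e.g. redirect it into the protocol.txt produced during post-processing).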
+ +echo ${PROTOCOL} diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/assembly.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/assembly.nf index f1e698d0..7a4229b0 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/assembly.nf +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/assembly.nf @@ -61,6 +61,7 @@ process RENAME_HEADERS { output: tuple val(sample_id), path("${sample_id}-assembly.fasta"), emit: contigs path("versions.txt"), emit: version + path("Failed-assemblies.tsv"), optional: true, emit: failed_assembly script: """ bit-rename-fasta-headers -i ${assembly} \\ diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/assembly_based_processing.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/assembly_based_processing.nf index a4d54ab2..86ee76df 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/assembly_based_processing.nf +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/assembly_based_processing.nf @@ -48,6 +48,12 @@ workflow assembly_based { sample_id, assembly -> file("${assembly}") }.collect() SUMMARIZE_ASSEMBLIES(assemblies_ch) + + // Write failed assemblies to a Failed assemblies file + failed_assemblies = RENAME_HEADERS.out.failed_assembly + failed_assemblies + .map{ it.text } + .collectFile(name: "${params.assemblies_dir}/Failed-assemblies.tsv", cache: false) // Map reads to assembly MAPPING(assembly_ch.join(filtered_ch)) diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/nextflow.config b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/nextflow.config index 5dd544e3..22d73881 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/nextflow.config +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/nextflow.config @@ -327,6 +327,13 @@ process { publishDir = [path: params.logs_dir, pattern: "*-assembly.log", mode: params.publishDir_mode] } + withName: RENAME_HEADERS{ + + publishDir = [path: params.assemblies_dir, pattern: "*-assembly.fasta" , mode: params.publishDir_mode] + + } + + withLabel: mapping { conda = {params.conda.mapping != null ? 
params.conda.mapping : "envs/mapping.yaml"} cpus = 8 From 620598fc0e4df529579b5ea1b4d333066bd10dab Mon Sep 17 00:00:00 2001 From: OLABIYI ADEREMI OBAYOMI Date: Thu, 1 May 2025 15:14:55 -0500 Subject: [PATCH 45/48] Metagenomics Illumina Nextflow conversion (#134) * improved checkm performance by running separately on every bin * fixed typo in config * updated the nextflow version * changed the default value of accession input parameter * reverted AmpIllumina pipeline doc to remove updates * added launchDir variable * made format changes to config files * updated README * updated MGIllumina pipeline doc after CCB approval * added launch scripts and fixed bugs * deleted cluster path * fixed humman utilities mounting bug * commented out singularity cache_dir * updated the content of processing_info.zip to match other GeneLab Nextflow workflows --- .../GL-DPPD-7104-B.md | 8 +- .../GL-DPPD-7107-A.md | 2 +- .../NF_MGIllumina-A/README.md | 15 +- .../workflow_code/bin/clean-paths.sh | 19 +- .../workflow_code/bin/get-cov-and-depth.sh | 67 ----- .../NF_MGIllumina-A/workflow_code/launch.sh | 102 +++++++ .../workflow_code/launch.slurm | 51 ++++ .../NF_MGIllumina-A/workflow_code/main.nf | 172 +++++++----- .../modules/assembly_based_processing.nf | 14 +- .../workflow_code/modules/coverage.nf | 2 - .../workflow_code/modules/create_runsheet.nf | 2 +- .../modules/database_creation.nf | 2 +- .../workflow_code/modules/genelab.nf | 12 +- .../modules/quality_assessment.nf | 6 +- .../modules/read_based_processing.nf | 8 +- .../workflow_code/modules/summarize_bins.nf | 56 +++- .../workflow_code/nextflow.config | 188 +++++++------ .../workflow_code/post_processing.config | 154 ++++++----- .../workflow_code/post_processing.nf | 257 ++++++++++-------- 19 files changed, 666 insertions(+), 471 deletions(-) delete mode 100644 Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/get-cov-and-depth.sh create mode 100755 Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/launch.sh create mode 100755 Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/launch.slurm diff --git a/Amplicon/Illumina/Pipeline_GL-DPPD-7104_Versions/GL-DPPD-7104-B.md b/Amplicon/Illumina/Pipeline_GL-DPPD-7104_Versions/GL-DPPD-7104-B.md index cec64577..7f1d7c96 100644 --- a/Amplicon/Illumina/Pipeline_GL-DPPD-7104_Versions/GL-DPPD-7104-B.md +++ b/Amplicon/Illumina/Pipeline_GL-DPPD-7104_Versions/GL-DPPD-7104-B.md @@ -38,7 +38,7 @@ Amanda Saravia-Butler (GeneLab Data Processing Lead) - Assay-specific suffixes were added where needed for GeneLab repo ("GLAmpSeq") -- The ITS UNITE reference database used was updated to "UNITE_v2023_July2023.RData", from http://www2.decipher.codes/Classification/TrainingSets/ +- The ITS UNITE reference database used was updated to "UNITE_v2023_July2023.RData", from https://www2.decipher.codes/data/Downloads/TrainingSets/ - Several program versions were updated (all versions listed in [Software used](#software-used) below) --- @@ -103,8 +103,8 @@ Amanda Saravia-Butler (GeneLab Data Processing Lead) |Program used| Database| Relevant Links| |:-----|:-----:|--------:| -|DECIPHER| SILVA SSU r138 | [http://www2.decipher.codes/Classification/TrainingSets/SILVA_SSU_r138_2019.RData](http://www2.decipher.codes/Classification/TrainingSets/)| -|DECIPHER| UNITE v2020 | [http://www2.decipher.codes/Classification/TrainingSets/UNITE_v2020_February2020.RData](http://www2.decipher.codes/Classification/TrainingSets/)| +|DECIPHER| SILVA SSU r138 | 
[https://www2.decipher.codes/data/Downloads/TrainingSets/SILVA_SSU_r138_2019.RData](https://www2.decipher.codes/data/Downloads/TrainingSets/)| +|DECIPHER| UNITE v2023 | [https://www2.decipher.codes/data/Downloads/TrainingSets/UNITE_v2023_July2023.RData](https://www2.decipher.codes/data/Downloads/TrainingSets/)| --- @@ -443,7 +443,7 @@ dna <- DNAStringSet(getSequences(seqtab.nochim)) Downloading the reference R taxonomy object: ```R -download.file( url=“http://www2.decipher.codes/Classification/TrainingSets/SILVA_SSU_r138_2019.RData”, destfile=“SILVA_SSU_r138_2019.RData”) +download.file( url=“https://www2.decipher.codes/data/Downloads/TrainingSets/SILVA_SSU_r138_2019.RData”, destfile=“SILVA_SSU_r138_2019.RData”) ``` **Parameter Definitions:** diff --git a/Metagenomics/Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md b/Metagenomics/Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md index c12c3b39..1709a87c 100644 --- a/Metagenomics/Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md +++ b/Metagenomics/Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md @@ -4,7 +4,7 @@ --- -**Date:** October XX, 2024 +**Date:** October 28, 2024 **Revision:** -A **Document Number:** GL-DPPD-7107 diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/README.md b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/README.md index aa2acc1c..2c4d1468 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/README.md +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/README.md @@ -51,7 +51,10 @@ Nextflow can be installed either through [Anaconda](https://anaconda.org/biocond > conda install -c bioconda nextflow > nextflow self-update > ``` - +> You may also install [mamba](https://mamba.readthedocs.io/en/latest/index.html) which is a faster implementation of conda like so: +> ```bash +> conda install -c conda-forge mamba +> ```
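+> For example, once mamba is available you should be able to launch the workflow through the `mamba` profile (the same profile used in Approach 3 below):
+> ```bash
+> nextflow run main.nf -resume -profile mamba --input_file PE_file.csv
+> ```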
#### 1b. Install Singularity @@ -111,7 +114,7 @@ For options and detailed help on how to run the workflow, run the following comm nextflow run main.nf --help ``` -> Note: Nextflow commands use both single hyphen arguments (e.g. -help) that denote general nextflow arguments and double hyphen arguments (e.g. --csv_file) that denote workflow specific parameters. Take care to use the proper number of hyphens for each argument. +> Note: Nextflow commands use both single hyphen arguments (e.g. -help) that denote general nextflow arguments and double hyphen arguments (e.g. --input_file) that denote workflow specific parameters. Take care to use the proper number of hyphens for each argument.
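+> For example, in `nextflow run main.nf -resume --input_file PE_file.csv`, `-resume` is interpreted by Nextflow itself, while `--input_file` is passed through to the workflow as `params.input_file`.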
@@ -126,7 +129,7 @@ nextflow run main.nf -resume -profile slurm,singularity --accession OSD-574 #### 4b. Approach 2: Run slurm jobs in singularity containers with a csv file as input ```bash -nextflow run main.nf -resume -profile slurm,singularity --csv_file PE_file.csv +nextflow run main.nf -resume -profile slurm,singularity --input_file PE_file.csv ```
@@ -134,7 +137,7 @@ nextflow run main.nf -resume -profile slurm,singularity --csv_file PE_file.csv #### 4c. Approach 3: Run jobs locally in conda environments and specify the path to one or more existing conda environment(s) ```bash -nextflow run main.nf -resume -profile conda --csv_file SE_file.csv --conda.qc +nextflow run main.nf -resume -profile mamba --input_file SE_file.csv --conda_megahit ```
@@ -153,7 +156,7 @@ nextflow run main.nf -resume -profile conda --csv_file SE_file.csv --conda.qc

See `nextflow run -h` and [Nextflow's CLI run command documentation](https://nextflow.io/docs/latest/cli.html#run) for more options and details on how to run nextflow. @@ -163,7 +166,7 @@ nextflow run main.nf -resume -profile conda --csv_file SE_file.csv --conda.qc

diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/clean-paths.sh b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/clean-paths.sh index f5430059..bdbc4b4f 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/clean-paths.sh +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/clean-paths.sh @@ -13,12 +13,25 @@ if [ -s t ]; then exit fi - +FILE=$1 ROOT_DIR=$(echo $2 | awk '{N=split($0,a,"/"); for(i=0; i < N-1; i++) printf "%s/", a[i]}' | sed 's|//|/|') + +# Remove path in paired end runsheet +if [ `awk 'NR==1{print}' ${FILE} | grep -c reverse` -gt 0 ]; then + + awk 'BEGIN{FS=OFS=","} NR==1{print} NR>1{split($2, f, "/");split($3, r, "/"); print $1,f[length(f)],r[length(r)],$4}' ${FILE} > temp && mv temp ${FILE} + +# Remove path in single end runsheet +elif [ `awk 'NR==1{print}' ${FILE} | grep -c forward` -gt 0 ]; then + + + awk 'BEGIN{FS=OFS=","} NR==1{print} NR>1{split($2, f, "/"); print $1,f[length(f)],$3}' ${FILE} > temp && mv temp ${FILE} + +fi -sed -E 's|.*/GLDS_Datasets/(.+)|\1|g' ${1} \ +sed -E 's|.*/GLDS_Datasets/(.+)|\1|g' ${FILE} \ | sed -E 's|.+/miniconda.+/envs/[^/]*/||g' \ | sed -E 's|/[^ ]*/GLDS-|GLDS-|g' \ | sed -E 's|/[a-z]{6}/[^ ]*||g' \ - | sed -E "s|${ROOT_DIR}||g" > t && mv t ${1} + | sed -E "s|${ROOT_DIR}||g" > t && mv t ${FILE} diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/get-cov-and-depth.sh b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/get-cov-and-depth.sh deleted file mode 100644 index a0641fed..00000000 --- a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/get-cov-and-depth.sh +++ /dev/null @@ -1,67 +0,0 @@ -#!/usr/bin/env bash -SAMPLE_ID=$1 -ASSEMBLY=$2 -NT=$3 -BAM=$4 -PILEUP_MEM=$5 - - -# Only running if the assembly produced anything -if [ -s ${ASSEMBLY} ]; then - - # Only running on genes also if genes were identified - if [ -s ${NT} ]; then - - pileup.sh -Xmx${PILEUP_MEM} -in ${BAM} \ - fastaorf=${NT} outorf=${SAMPLE_ID}-gene-cov.tmp \ - out=${SAMPLE_ID}-contig-cov-and-det.tmp - - # Filtering coverages based on detection - # Genes - grep -v "#" ${SAMPLE_ID}-gene-cov-and-det.tmp | \ - awk -F $'\t' ' BEGIN {OFS=FS} { if ( $10 <= 0.5 ) $4 = 0 } { print \$1,\$4 } ' \ - > ${SAMPLE_ID}-gene-cov.tmp - - cat <( printf "gene_ID\tcoverage\n" ) ${SAMPLE_ID}-gene-cov.tmp \ - > ${SAMPLE_ID}-gene-coverages.tsv - - # Contigs - grep -v "#" ${SAMPLE_ID}-contig-cov-and-det.tmp | \ - awk -F $'\t' ' BEGIN {OFS=FS} { if ( $5 <= 50 ) $2 = 0 } { print $1,$2 } ' \ - > ${SAMPLE_ID}-contig-cov.tmp - - cat <( printf "contig_ID\tcoverage\n" ) ${SAMPLE_ID}-contig-cov.tmp \ - > ${SAMPLE_ID}-contig-coverages.tsv - - # Removing intermediate files - rm ${SAMPLE_ID}-gene-cov-and-det.tmp ${SAMPLE_ID}-contig-cov-and-det.tmp \ - ${SAMPLE_ID}-gene-cov.tmp ${SAMPLE_ID}-contig-cov.tmp - - else - - pileup.sh -in ${BAM} out=${SAMPLE_ID}-contig-cov-and-det.tmp - - # Filtering coverages based on detection - # Contigs - grep -v "#" ${SAMPLE_ID}-contig-cov-and-det.tmp | \ - awk -F $'\t' ' BEGIN {OFS=FS} { if ( $5 <= 50 ) $2 = 0 } { print $1,$2 } ' \ - > ${SAMPLE_ID}-contig-cov.tmp - cat <( printf "contig_ID\tcoverage\n" ) ${SAMPLE_ID}-contig-cov.tmp \ - > ${SAMPLE_ID}-contig-coverages.tsv - - # Writing out empty genes coverage file - printf "gene_ID\tcoverage\n" > ${SAMPLE_ID}-gene-coverages.tsv - printf "\n\nGene-level coverage info not recovered because 
the assembly didn't have any genes identified.\n" - - # Removing intermediate files - rm ${SAMPLE_ID}-contig-cov-and-det.tmp ${SAMPLE_ID}-contig-cov.tmp - - fi - -else - - printf "gene_ID\tcoverage\n" > ${SAMPLE_ID}-gene-coverages.tsv - printf "contig_ID\tcoverage\n" > ${SAMPLE_ID}-contig-coverages.tsv - printf "Coverage info not recovered because the assembly didn't produce anything.\n" - -fi \ No newline at end of file diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/launch.sh b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/launch.sh new file mode 100755 index 00000000..0e6b2bec --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/launch.sh @@ -0,0 +1,102 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Script to launch a nextflow workflow on slurm cluster + +# Usage: bash ./launch.sh [mode] [main.nf] [config] '[extra arguments]' +# Examples + +# Processing: +# bash ./launch.sh processing path/to/main.nf path/to/nextflow.config '--accession OSD-574' + +# Postprocessing: +# bash ./launch.sh post_processing path/to/post_processing.nf path/to/post_processing.config \ +# '--name FirstNAme M. LastName --email email@doamin.com --GLDS_accession GLDS-574 --OSD_accession OSD-574 --isa_zip ../GeneLab/OSD-574_metadata_GLDS-574-ISA.zip --runsheet ../GeneLab/GLfile.csv' + + + +MODE=${1:-''} # Script run mode i.e. processing or post_processing +MAIN=${2:-''} # Path to the main.nf or post_processing.nf nextflow script for processing and post_processing, respectively. +CONFIG=${3:-''} # nextflow config file i.e. nextflow.config or post_processing.config +EXTRA=${4:-''} # extra arguments to the nextflow run command + + +#============================================================================== +# SETUP START +#============================================================================== +eval "$(conda shell.bash hook)" +conda activate /path/to/conda/envs/nextflow +export NXF_SINGULARITY_CACHEDIR= +export TOWER_ACCESS_TOKEN= +export TOWER_WORKSPACE_ID= + +#============================================================================== +# UMASK CONFIGURATION +#============================================================================== +echo "Setting umask to enable group read-access by default" +umask u=rwx,g=rx +echo "Umask settings for this launch: $(umask -S)" + + +#============================================================================== +# NEXTFLOW COMMAND START +#============================================================================== +if [ ${MODE} == "processing" ]; then + + RUN_NAME=MAIN_$(date +%Y%m%d%H%M%S) + + RUN_COMMAND="nextflow -C ${CONFIG} + run \ + -name ${RUN_NAME} \ + ${MAIN} \ + -resume \ + -profile slurm,singularity \ + -with-tower \ + -process.queue 'normal' \ + -ansi-log false \ + ${EXTRA}" + + echo "Running command: ${RUN_COMMAND}" + echo "" + [ -d processing_scripts ] || mkdir processing_scripts + eval ${RUN_COMMAND} && echo ${RUN_COMMAND} > processing_scripts/command.txt + + # Save the nextflow log to a file + echo "Creating Nextflow processing info file..." 
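+    # 'nextflow log <run name> -f name,script' lists each executed task's name and its script block, capturing a record of exactly what was run.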
+ nextflow log ${RUN_NAME} -f name,script > processing_scripts/nextflow_processing_info_GLmetagenomics.txt + echo nextflow log ${RUN_NAME} -f name,script >> processing_scripts/nextflow_processing_info_GLmetagenomics.txt + echo "Nextflow processing info written to processing_scripts/nextflow_processing_info_GLmetagenomics.txt" + + +elif [ ${MODE} == "post_processing" ];then + + + RUN_NAME=POST_$(date +%Y%m%d%H%M%S) + + RUN_COMMAND="nextflow -C ${CONFIG} + run \ + -name ${RUN_NAME} \ + ${MAIN} \ + -resume \ + -profile slurm,singularity \ + -with-tower \ + -process.queue 'normal' \ + -ansi-log false \ + ${EXTRA}" + + echo "Running command: ${RUN_COMMAND}" + echo "" + eval ${RUN_COMMAND} + +else + echo 'Please provide a valid mode to run the workflow.' + echo 'Either processing or post_processing for running the processing or post_processing workflows, respectively.' + exit 1 +fi + + +# Set permissions on launch directory +echo "" +echo "Setting permissions on launch directory..." +chmod -R 755 . +echo "Permissions set to 755 recursively on launch directory" diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/launch.slurm b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/launch.slurm new file mode 100755 index 00000000..c2403403 --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/launch.slurm @@ -0,0 +1,51 @@ +#!/bin/bash + +#SBATCH --job-name="nf_master" ## Replace job_name with the name of the job you are running ## +#SBATCH --output=nf_master.o.%j ## Replace job_name with the name of the job you are running ## +#SBATCH --error=nf_master.e.%j ## Replace job_name with the name of the job you are running ## +#SBATCH --partition=normal ## Specifies the job queue to use, for urgent jobs change normal to priority ## +#SBATCH --mem=20G ## Memory required to run the job in MB, this example is showing 10,000 MB or 10GB, change this number based on how much RAM you need ## +#SBATCH --cpus-per-task=1 ## Number of CPUs to run the job, this example is showing 5 CPUs, change this number based on how many CPUs you need ## +#SBATCH --mail-user=name@domain.com ## Specifies the e-mail address to e-mail when the job is complete, replace this e-mail address with your NASA e-mail address ## +#SBATCH --mail-type=END ## Tells slurm to e-mail the address above when the job has completed ## + +. ~/.profile + + +echo "nf_master" ## Replace job_name with the name of the job you are running ## +echo "" + + +## Add a time-stamp at the start of the job ## +start=$(date +%s) +echo "start time: $start" + +## Print the name of the compute node executing the job ## +echo $HOSTNAME + +WORKFLOW_DIR='/path/to/nextflow/workflow_code' +# Processing +bash ./launch.sh processing ${WORKFLOW_DIR}/main.nf ${WORKFLOW_DIR}/nextflow.config '--accession OSD-574' + +# Post Processing +#bash ./launch.sh post_processing ${WORKFLOW_DIR}/post_processing.nf ${WORKFLOW_DIR}/post_processing.config \ +# '--name First M. 
Last --email name@domain.com --GLDS_accession GLDS-574 --OSD_accession OSD-574 --isa_zip ../GeneLab/OSD-574_metadata_OSD-574-ISA.zip --runsheet ../GeneLab/GLfile.csv' + + +## Add a time-stamp at the end of the job then calculate how long the job took to run in seconds, minutes, and hours ## +echo "" +end=$(date +%s) +echo "end time: $end" +runtime_s=$(echo $(( end - start ))) +echo "total run time(s): $runtime_s" +sec_per_min=60 +sec_per_hr=3600 +runtime_m=$(echo "scale=2; $runtime_s / $sec_per_min;" | bc) +echo "total run time(m): $runtime_m" +runtime_h=$(echo "scale=2; $runtime_s / $sec_per_hr;" | bc) +echo "total run time(h): $runtime_h" +echo "" + + +## Print the slurm job ID so you have it recorded and can view slurm job statistics if needed ## +echo "slurm job ID: ${SLURM_JOB_ID}" diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/main.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/main.nf index 9f412edc..c53a3d55 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/main.nf +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/main.nf @@ -1,14 +1,14 @@ #!/usr/bin/env nextflow nextflow.enable.dsl=2 -// color defs +// Terminal text color defintions c_back_bright_red = "\u001b[41;1m"; -c_bright_green = "\u001b[32;1m"; -c_blue = "\033[0;34m"; -c_reset = "\033[0m"; +c_bright_green = "\u001b[32;1m"; +c_blue = "\033[0;34m"; +c_reset = "\033[0m"; params.help = false -params.debug = false + /************************************************** * HELP MENU ************************************** **************************************************/ @@ -17,19 +17,19 @@ if (params.help) { println("Nextflow Metagenomics Illumina Consensus Pipeline: $workflow.manifest.version") println("USAGE:") println("Example 1: Submit and run jobs with slurm in singularity containers.") - println(" > nextflow run main.nf -resume -profile slurm,singularity --csv_file PE_file.csv") + println(" > nextflow run main.nf -resume -profile slurm,singularity --input_file PE_file.csv") println() println("Example 2: : Submit and run jobs with slurm in conda environments.") - println(" > nextflow run main.nf -resume -profile slurm,conda --csv_file SE_file.csv") + println(" > nextflow run main.nf -resume -profile slurm,conda --input_file SE_file.csv") println() println("Example 3: Run jobs locally in conda environments, supply a GLDS accession, and specify the path to an existing conda environment.") - println(" > nextflow run main.nf -resume -profile conda --accession OSD-574 --conda.qc ") + println(" > nextflow run main.nf -resume -profile mamba --accession OSD-574 --conda_megahit ") println() println("Required arguments:") println("""-profile [STRING] Specifies the profile to be used to run the workflow. Options are [slurm, singularity, docker, and conda]. singularity, docker and conda will run the pipeline locally using singularity, docker, and conda, respectively. To combine profiles, separate two or more profiles with comma. For example, to combine slurm and singularity profiles, pass 'slurm,singularity' as argument. """) - println("--csv_file [PATH] A 3-column (single-end) or 4-column (paired-end) input file (sample_id, forward, [reverse,] paired). Required only if a GLDS accession is not provided.") + println("--input_file [PATH] A 3-column (single-end) or 4-column (paired-end) csv input file (sample_id, forward, [reverse,] paired). Required only if a GLDS accession is not provided. 
Default : null") println(" Please see the files: SE_file.csv and PE_file.csv for single-end and paired-end examples, respectively.") println(" The sample_id column should contain unique sample ids.") println(" The forward and reverse columns should contain the absolute or relative path to the sample's forward and reverse reads.") @@ -86,7 +86,7 @@ if (params.help) { println(" --read_based_dir [PATH] Read-based analysis outputs directory. Default: ../Read-based_Processing/.") println() println("Genelab specific arguements:") - println(" --accession [STRING] A Genelab accession number if the --csv_file parameter is not set. If this parameter is set, it will ignore the --csv_file parameter.") + println(" --accession [STRING] A Genelab accession number if the --input_file parameter is not set. If this parameter is set, it will ignore the --input_file parameter. Default: null.") println(" --RawFilePattern [STRING] If we do not want to download all files (which we often won't), we can specify a pattern here to subset the total files.") println(" For example, if we know we want to download just the fastq.gz files, we can say 'fastq.gz'. We can also provide multiple patterns") println(" as a comma-separated list. For example, If we want to download the fastq.gz files that also have 'NxtaFlex', 'metagenomics', and 'raw' in") @@ -102,33 +102,33 @@ if (params.help) { println(" The strings below will be added to the end of the --database.cat_db path arguement provided below.") println(" --cat_taxonomy_dir [PATH] CAT taxonomy database directory. Default: 2021-01-07_taxonomy/.") println(" --cat_db_sub_dir [PATH] CAT database sub directory. Default: 2021-01-07_CAT_database/.") - println(" --database.CAT_DB_LINK [URL] CAT database online download link. Default: https://tbb.bio.uu.nl/bastiaan/CAT_prepare/CAT_prepare_20210107.tar.gz.") + println(" --CAT_DB_LINK [URL] CAT database online download link. Default: https://tbb.bio.uu.nl/bastiaan/CAT_prepare/CAT_prepare_20210107.tar.gz.") println("CAT database ") - println(" --database.cat_db [PATH] Path to CAT database. Example, /path/to/Reference_DBs/CAT_prepare_20210107/. Default: null.") + println(" --cat_db [PATH] Path to CAT database. Example, /path/to/Reference_DBs/CAT_prepare_20210107/. Default: null.") println("Humann database:") - println(" --database.metaphlan_db_dir [PATH] Path to metaphlan database. Example, /path/to/Reference_DBs/metaphlan4-db/. Default: null.") - println(" --database.chocophlan_dir [PATH] Path to Humann's chocophlan nucleotide database. Example, /path/to/Reference_DBs/humann3-db/chocophlan/. Default: null.") - println(" --database.uniref_dir [PATH] Path to Humann's Uniref protein database. Example, /path/to/Reference_DBs/humann3-db/uniref/. Default: null.") - println(" --database.utilities_dir [PATH] Path to Humann's untilities database. Example, /path/to/Reference_DBs/humann3-db/utility_mapping/. Default: null.") + println(" --metaphlan_db_dir [PATH] Path to metaphlan database. Example, /path/to/Reference_DBs/metaphlan4-db/. Default: null.") + println(" --chocophlan_dir [PATH] Path to Humann's chocophlan nucleotide database. Example, /path/to/Reference_DBs/humann3-db/chocophlan/. Default: null.") + println(" --uniref_dir [PATH] Path to Humann's Uniref protein database. Example, /path/to/Reference_DBs/humann3-db/uniref/. Default: null.") + println(" --utilities_dir [PATH] Path to Humann's untilities database. Example, /path/to/Reference_DBs/humann3-db/utility_mapping/. 
Default: null.") println("GTDBTK database:") - println(" --database.GTDBTK_LINK [URL] GTDBTK database online download link. Default: https://data.gtdb.ecogenomic.org/releases/release220/220.0/auxillary_files/gtdbtk_package/full_package/gtdbtk_r220_data.tar.gz.") - println(" --database.gtdbtk_db_dir [PATH] Path to GTDBTK database. Example, /path/Reference_DBs/GTDB-tk-ref-db/. Default: null.") + println(" --GTDBTK_LINK [URL] GTDBTK database online download link. Default: https://data.gtdb.ecogenomic.org/releases/release220/220.0/auxillary_files/gtdbtk_package/full_package/gtdbtk_r220_data.tar.gz.") + println(" --gtdbtk_db_dir [PATH] Path to GTDBTK database. Example, /path/Reference_DBs/GTDB-tk-ref-db/. Default: null.") println("kofam scan database database:") - println(" --database.ko_db_dir [PATH] Path to kofam scan database. Example, /path/to/Reference_DBs/kofamscan_db/. Default: null.") + println(" --ko_db_dir [PATH] Path to kofam scan database. Example, /path/to/Reference_DBs/kofamscan_db/. Default: null.") println() println("Paths to existing conda environments to use, otherwise, new ones will be created using the yaml files in envs/.") - println(" --conda.qc [PATH] Path to a conda environment containing fastqc, multiqc, zip and python. Default: null.") - println(" --conda.humann3 [PATH] Path to a conda environment with humann3 installed. Default: null.") - println(" --conda.cat [PATH] Path to a conda environment containing CAT (Contig annotation tool). Default: null.") - println(" --conda.prodigal [PATH] Path to a conda environment with prodigal installed. Default: null.") - println(" --conda.metabat [PATH] Path to a conda environment containing metabat. Default: null.") - println(" --conda.gtdbtk [PATH] Path to a conda environment containing gtdbtk. Default: null.") - println(" --conda.kegg_decoder [PATH] Path to a conda environment with kegg_decoder installed. Default: null.") - println(" --conda.megahit [PATH] Path to a conda environment containing megahit. Default: null.") - println(" --conda.bit [PATH] Path to a conda environment with bit installed. Default: null.") - println(" --conda.kofamscan [PATH] Path to a conda environment containing KOFAM SCAN. Default: null.") - println(" --conda.mapping [PATH] Path to a conda environment with bowtie and samtools installed. Default: null.") - println(" --conda.checkm [PATH] Path to a conda environment with checkm installed. Default: null.") + println(" --conda_qc [PATH] Path to a conda environment containing fastqc, multiqc, zip and python. Default: null.") + println(" --conda_humann3 [PATH] Path to a conda environment with humann3 installed. Default: null.") + println(" --conda_cat [PATH] Path to a conda environment containing CAT (Contig annotation tool). Default: null.") + println(" --conda_prodigal [PATH] Path to a conda environment with prodigal installed. Default: null.") + println(" --conda_metabat [PATH] Path to a conda environment containing metabat. Default: null.") + println(" --conda_gtdbtk [PATH] Path to a conda environment containing gtdbtk. Default: null.") + println(" --conda_kegg_decoder [PATH] Path to a conda environment with kegg_decoder installed. Default: null.") + println(" --conda_megahit [PATH] Path to a conda environment containing megahit. Default: null.") + println(" --conda_bit [PATH] Path to a conda environment with bit installed. Default: null.") + println(" --conda_kofamscan [PATH] Path to a conda environment containing KOFAM SCAN. 
Default: null.") + println(" --conda_mapping [PATH] Path to a conda environment with bowtie and samtools installed. Default: null.") + println(" --conda_checkm [PATH] Path to a conda environment with checkm installed. Default: null.") println() print("Advanced users can edit the nextflow.config file for more control over default settings such container choice, number of cpus, memory per task etc.") exit 0 @@ -139,12 +139,12 @@ if (params.help) { *************************************************/ if (params.debug) { -log.info """ +log.info """${c_blue} Nextflow Metagenomics Illumina Consensus Pipeline: $workflow.manifest.version You have set the following parameters: Profile: ${workflow.profile} - Input csv file : ${params.csv_file} + Input csv file : ${params.input_file} GLDS or OSD Accession : ${params.accession} GLDS Raw File Pattern: ${params.RawFilePattern} Workflow : ${params.workflow} @@ -187,37 +187,38 @@ log.info """ Additional Filename Prefix: ${params.additional_filename_prefix} Conda Environments: - qc: ${params.conda.qc} - humann3: ${params.conda.humann3} - CAT: ${params.conda.cat} - prodigal: ${params.conda.prodigal} - metabat: ${params.conda.metabat} - gtdbtk: ${params.conda.gtdbtk} - kegg decoder: ${params.conda.kegg_decoder} - megahit: ${params.conda.megahit} - bit: ${params.conda.bit} - kofamscan: ${params.conda.kofamscan} - mapping: ${params.conda.mapping} - checkm: ${params.conda.checkm} + qc: ${params.conda_qc} + humann3: ${params.conda_humann3} + CAT: ${params.conda_cat} + prodigal: ${params.conda_prodigal} + metabat: ${params.conda_metabat} + gtdbtk: ${params.conda_gtdbtk} + kegg decoder: ${params.conda_kegg_decoder} + megahit: ${params.conda_megahit} + bit: ${params.conda_bit} + kofamscan: ${params.conda_kofamscan} + mapping: ${params.conda_mapping} + checkm: ${params.conda_checkm} Databases: CAT Taxonomy: ${params.cat_taxonomy_dir} CAT DB sub directory: ${params.cat_db_sub_dir} - CAT URL: ${params.database.CAT_DB_LINK} - CAT DB: ${params.database.cat_db} - KOFAM Scan: ${params.database.ko_db_dir} - Metaphlan: ${params.database.metaphlan_db_dir} - Chocophlan: ${params.database.chocophlan_dir} - Uniref: ${params.database.uniref_dir} - Utilities: ${params.database.utilities_dir} - GTDBTK URL: ${params.database.GTDBTK_LINK} - GTDBTK DB: ${params.database.gtdbtk_db_dir} - """.stripIndent() + CAT URL: ${params.CAT_DB_LINK} + CAT DB: ${params.cat_db} + KOFAM Scan: ${params.ko_db_dir} + Metaphlan: ${params.metaphlan_db_dir} + Chocophlan: ${params.chocophlan_dir} + Uniref: ${params.uniref_dir} + Utilities: ${params.utilities_dir} + GTDBTK URL: ${params.GTDBTK_LINK} + GTDBTK DB: ${params.gtdbtk_db_dir} + ${c_reset}""" } // Create GLDS runsheet include { GET_RUNSHEET } from "./modules/create_runsheet.nf" +// Make Humann3 database include { make_humann_db } from "./modules/database_creation.nf" // Read quality check and filtering @@ -240,14 +241,9 @@ workflow run_read_based_analysis { main: - chocophlanDirExists = params.database.chocophlan_dir != null - unirefDirExists = params.database.uniref_dir != null - metaphlanDirExists = params.database.metaphlan_db_dir != null - utilitiesDirExists = params.database.utilities_dir != null - - // if any of the four databases - if(!chocophlanDirExists ||!unirefDirExists || - !metaphlanDirExists || !utilitiesDirExists) { + // If any of the four databases does not exist i.e. 
the paramater is set to null + if(!params.chocophlan_dir || !params.uniref_dir || + !params.metaphlan_db_dir || !params.utilities_dir) { make_humann_db() read_based(filtered_ch, @@ -263,10 +259,10 @@ workflow run_read_based_analysis { }else{ read_based(filtered_ch, - params.database.chocophlan_dir, - params.database.uniref_dir, - params.database.metaphlan_db_dir, - params.database.utilities_dir) + params.chocophlan_dir, + params.uniref_dir, + params.metaphlan_db_dir, + params.utilities_dir) software_versions_ch = read_based.out.versions } @@ -287,9 +283,9 @@ workflow run_assembly_based_analysis { main: software_versions_ch = Channel.empty() - kofam_db = params.database.ko_db_dir - cat_db = params.database.cat_db - gtdbtk_db_dir = params.database.gtdbtk_db_dir + kofam_db = params.ko_db_dir + cat_db = params.cat_db + gtdbtk_db_dir = params.gtdbtk_db_dir // Run assembly based workflow assembly_based(file_ch, filtered_ch, kofam_db, @@ -313,6 +309,15 @@ def deleteWS(string){ // Main workflow workflow { + + // Sanity check : Test input requirement + if (!params.accession && !params.input_file){ + + error("""${c_back_bright_red}INPUT ERROR! + Please supply either an accession (OSD or Genelab number) or an input CSV file + by passing either to the --accession or --input_file parameter, respectively. + ${c_reset}""") + } // Software Version Capturing - runsheet software_versions_ch = Channel.empty() @@ -327,7 +332,7 @@ workflow { GET_RUNSHEET.out.version | mix(software_versions_ch) | set{software_versions_ch} }else{ - Channel.fromPath(params.csv_file, checkIfExists: true) + Channel.fromPath(params.input_file, checkIfExists: true) .splitCsv(header:true) .set{file_ch} } @@ -374,18 +379,37 @@ workflow { // Software Version Capturing - combining all captured sofware versions - nf_version = "Nextflow Version ".concat("${nextflow.version}\n<><><>\n") + nf_version = "Nextflow Version ".concat("${nextflow.version}") nextflow_version_ch = Channel.value(nf_version) + workflow_version = "MGIllimina ".concat("${workflow.manifest.version}") + workflow_version_ch = Channel.value(workflow_version) // Write software versions to file - software_versions_ch | map { it.text + "\n<><><>\n"} + software_versions_ch | map { it.text.strip() } | unique | mix(nextflow_version_ch) + | mix(workflow_version_ch) | collectFile(name: "${params.metadata_dir}/software_versions.txt", newLine: true, cache: false) | set{final_software_versions_ch} } + + workflow.onComplete { - log.info ( workflow.success ? "\nDone! Workflow completed without any error\n" : "Oops .. something went wrong" ) + + println("${c_bright_green}Pipeline completed at: $workflow.complete") + println("""Execution status: ${ workflow.success ? 'OK' : "${c_back_bright_red}failed" }""") + log.info ( workflow.success ? "\nDone! Workflow completed without any error\n" : "Oops .. 
something went wrong${c_reset}" ) + + if ( workflow.success ) { + + println("FastQC outputs location: ${params.fastqc_out_dir}") + println("Read-based Analysis: ${params.read_based_dir}") + println("Assembly-based Analysis: ${params.assembly_based_dir}") + println("Software versions location: ${params.metadata_dir}") + println("Pipeline tracing/visualization files location: ../Resource_Usage${c_reset}") + println() + } + } diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/assembly_based_processing.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/assembly_based_processing.nf index 86ee76df..fd01dadf 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/assembly_based_processing.nf +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/assembly_based_processing.nf @@ -65,7 +65,7 @@ workflow assembly_based { CALL_GENES.out.genes | REMOVE_LINEWRAPS genes_ch = REMOVE_LINEWRAPS.out.genes - if (ko_db_dir != null){ + if (ko_db_dir){ KO_ANNOTATION(assembly_ch.join(genes_ch), ko_db_dir) KO_ANNOTATION.out.temp_table | FILTER_KFAMSCAN @@ -81,14 +81,14 @@ workflow assembly_based { } - if (cat_db != null){ + if (cat_db){ TAX_CLASSIFICATION(assembly_ch.join(genes_ch), cat_db) taxonomy_ch = TAX_CLASSIFICATION.out.taxonomy }else{ - SETUP_CAT_DB(params.database.CAT_DB_LINK) + SETUP_CAT_DB(params.CAT_DB_LINK) SETUP_CAT_DB.out.version | mix(software_versions_ch) | set{software_versions_ch} TAX_CLASSIFICATION(assembly_ch.join(genes_ch), SETUP_CAT_DB.out.cat_db) taxonomy_ch = TAX_CLASSIFICATION.out.taxonomy @@ -100,7 +100,7 @@ workflow assembly_based { .join(genes_ch)) coverage_ch = GET_COV_AND_DET.out.coverages - // Combine contig annotation + // Combine contig annotations tax_and_cov_ch = COMBINE_GENE_ANNOTS_TAX_AND_COVERAGE(coverage_ch .join(annotations_ch) .join(taxonomy_ch) @@ -136,13 +136,13 @@ workflow assembly_based { // Check Bins and Summarize MAGs - if(gtdbtk_db_dir != null){ + if(gtdbtk_db_dir){ summarize_mags(summarize_bins.out.bins_checkm_results, bins_ch, gtdbtk_db_dir, use_gtdbtk_scratch_location, gene_coverage_annotation_and_tax_files_ch) }else{ - SETUP_GTDBTK_DB(params.database.GTDBTK_LINK) + SETUP_GTDBTK_DB(params.GTDBTK_LINK) SETUP_GTDBTK_DB.out.version | mix(software_versions_ch) | set{software_versions_ch} summarize_mags(summarize_bins.out.bins_checkm_results, bins_ch, @@ -155,7 +155,7 @@ workflow assembly_based { // Generating a file with sample ids on a new line file_ch.map{row -> "${row.sample_id}"} - .collectFile(name: "${baseDir}/unique-sample-IDs.txt", newLine: true) + .collectFile(name: "${launchDir}/unique-sample-IDs.txt", newLine: true) .set{sample_ids_ch} diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/coverage.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/coverage.nf index fb5aca11..9bfd9e62 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/coverage.nf +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/coverage.nf @@ -19,8 +19,6 @@ process GET_COV_AND_DET { path("versions.txt"), emit: version script: """ - # get-cov-and-depth.sh ${sample_id} ${assembly} ${nt} ${bam} ${params.pileup_mem} - # Only running if the assembly produced anything if [ -s ${assembly} ]; then diff --git 
a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/create_runsheet.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/create_runsheet.nf index cde64bcb..b1eb710e 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/create_runsheet.nf +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/create_runsheet.nf @@ -6,7 +6,7 @@ nextflow.enable.dsl = 2 process GET_RUNSHEET { - beforeScript "chmod +x ${baseDir}/bin/create_runsheet.sh" + beforeScript "chmod +x ${projectDir}/bin/create_runsheet.sh" tag "Downloading raw fastq files and runsheet for ${accession}..." input: diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/database_creation.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/database_creation.nf index 60ef9d03..5ee6cd91 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/database_creation.nf +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/database_creation.nf @@ -279,5 +279,5 @@ workflow make_databases { workflow { - make_databases(Channel.of(params.database.CAT_DB_LINK), Channel.of(params.database.GTDBTK_LINK)) + make_databases(Channel.of(params.CAT_DB_LINK), Channel.of(params.GTDBTK_LINK)) } diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/genelab.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/genelab.nf index d3929896..9cd52ec6 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/genelab.nf +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/genelab.nf @@ -2,7 +2,7 @@ nextflow.enable.dsl = 2 process CLEAN_FASTQC_PATHS { - tag "Purging genelab paths from MultiQC zip files in ${params.directories.FastQC_Outputs}" + tag "Purging genelab paths from MultiQC zip files in ${params.FastQC_Outputs}" input: path(FastQC_Outputs_dir) output: @@ -84,7 +84,7 @@ process PACKAGE_PROCESSING_INFO { process GENERATE_README { - beforeScript "chmod +x ${baseDir}/bin/*" + beforeScript "chmod +x ${projectDir}/bin/*" tag "Generating README for ${OSD_accession}" input: tuple val(name), val(email), val(output_prefix), @@ -185,7 +185,7 @@ process VALIDATE_PROCESSING { process GENERATE_CURATION_TABLE { - beforeScript "chmod +x ${baseDir}/bin/*" + beforeScript "chmod +x ${projectDir}/bin/*" tag "Generating a file association table for curation..." input: @@ -201,14 +201,14 @@ process GENERATE_CURATION_TABLE { // Directory paths tuple path(Assemblies), path(Genes), path(Mapping), path(Bins), path(MAGS), path(FastQC_Outputs) - path(assay_table) + path(input_table) path(runsheet) output: path("${GLDS_accession}_${output_prefix}-associated-file-names.tsv"), emit: curation_table script: - def INPUT_TABLE = "${params.files.assay_table}" == "" ? "--isa-zip ${assay_table}" : "--assay-table ${assay_table}" + def INPUT_TABLE = params.assay_table ? "--assay-table ${input_table}" : "--isa-zip ${input_table}" """ GL-gen-metagenomics-file-associations-table ${INPUT_TABLE} \\ --runsheet '${runsheet}' \\ @@ -268,7 +268,7 @@ process GENERATE_MD5SUMS { process GENERATE_PROTOCOL { - beforeScript "chmod +x ${baseDir}/bin/*" + beforeScript "chmod +x ${projectDir}/bin/*" tag "Generating your analysis protocol..." 
input: diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/quality_assessment.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/quality_assessment.nf index e286900b..2addbdfd 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/quality_assessment.nf +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/quality_assessment.nf @@ -9,7 +9,7 @@ nextflow.enable.dsl = 2 //params.prefix = "raw" // "filetered" //params.csv_file = "file.csv" //params.swift_1S = false -//params.adapters = "${baseDir}/config/bbtools_dapters.fa" +//params.adapters = "${projectDir}/config/bbtools_dapters.fa" //params.multiqc_config = "config/multiqc.config" process FASTQC { @@ -65,7 +65,7 @@ process BBDUK { tag "Quality filtering ${sample_id}-s reads.." - beforeScript "chmod +x ${baseDir}/bin/*" + beforeScript "chmod +x ${projectDir}/bin/*" input: tuple val(sample_id), path(reads), val(isPaired) @@ -123,7 +123,7 @@ workflow quality_check { workflow { - Channel.fromPath(params.csv_file) + Channel.fromPath(params.input_file) .splitCsv() .map{ row -> row.paired == 'true' ? tuple( "${row.sample_id}", [file("${row.forward}", checkIfExists: true), file("${row.reverse}", checkIfExists: true)], row.paired) : tuple( "${row.sample_id}", [file("${row.forward}", checkIfExists: true)], row.paired)} diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/read_based_processing.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/read_based_processing.nf index d27366fd..957b09a5 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/read_based_processing.nf +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/read_based_processing.nf @@ -299,9 +299,9 @@ workflow read_based { workflow { read_based(filtered_reads_ch, - params.database.chocophlan_dir, - params.database.uniref_dir, - params.database.metaphlan_db_dir, - params.database.utilities_dir) + params.chocophlan_dir, + params.uniref_dir, + params.metaphlan_db_dir, + params.utilities_dir) } diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/summarize_bins.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/summarize_bins.nf index 53d56da1..d5e0d06b 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/summarize_bins.nf +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/summarize_bins.nf @@ -43,16 +43,15 @@ process SUMMARIZE_BIN_ASSEMBLIES { } -// Runs checkm on recovered bins -process CHECKM_ON_BINS { +// Runs checkm on a recovered bin +process CHECKM_ON_BIN { - tag "Running checkm on the recovered bins..." - label "bins" + tag "Running checkm on a recovered bin if any..." 
input: - path(bins) + path(bin) output: - path("${params.additional_filename_prefix}bins-checkm-out.tsv"), emit: checkm_output + path("bin-checkm-out.tsv"), emit: checkm_output path("versions.txt"), emit: version script: """ @@ -64,7 +63,7 @@ process CHECKM_ON_BINS { if [ ${params.reduced_tree} == "True" ]; then checkm lineage_wf \\ - -f ${params.additional_filename_prefix}bins-checkm-out.tsv \\ + -f bin-checkm-out.tsv \\ --tab_table \\ -t ${task.cpus} \\ --reduced_tree \\ @@ -75,7 +74,7 @@ process CHECKM_ON_BINS { else checkm lineage_wf \\ - -f ${params.additional_filename_prefix}bins-checkm-out.tsv \\ + -f bin-checkm-out.tsv \\ --tab_table \\ -t ${task.cpus} \\ --pplacer_threads 1 \\ @@ -87,13 +86,44 @@ process CHECKM_ON_BINS { else printf "There were no bins recovered, so checkm was not run.\\n" \\ - > ${params.additional_filename_prefix}bins-checkm-out.tsv + > bin-checkm-out.tsv fi checkm | grep CheckM | head -n 1 | sed -E 's/.+(CheckM\\sv.+)\\s.+/\\1/' > versions.txt """ } +// Combines the outputs of running checkm on every bin into one file +process COMBINE_CHECKM { + tag "Combining CheckM results for all recovered bins..." + label "bins" + + input: + path(checkm_output, stageAs: "?/*") + output: + path("${params.additional_filename_prefix}bins-checkm-out.tsv"), emit: checkm_output + path("versions.txt"), emit: version + script: + """ + [ -f temp-bins-checkm-out.tsv ] && rm -rf temp-bins-checkm-out.tsv + + for bin_file in ${checkm_output}; do + + cat \${bin_file} >> temp-bins-checkm-out.tsv + + done + + (grep "^Bin Id" temp-bins-checkm-out.tsv | sort -u; \\ + grep -v "^Bin Id" temp-bins-checkm-out.tsv | sort -uV) \\ + > ${params.additional_filename_prefix}bins-checkm-out.tsv + + checkm | grep CheckM | head -n 1 | sed -E 's/.+(CheckM\\sv.+)\\s.+/\\1/' > versions.txt + """ + +} + + + process GENERATE_BINS_OVERVIEW_TABLE { tag "Generating an overall overview of the recovered bins..." 
@@ -153,15 +183,17 @@ workflow summarize_bins { SUMMARIZE_BIN_ASSEMBLIES(bins) bin_assembly_summaries_ch = SUMMARIZE_BIN_ASSEMBLIES.out.summary - CHECKM_ON_BINS(bins) - bins_checkm_results_ch = CHECKM_ON_BINS.out.checkm_output + CHECKM_ON_BIN(bins.flatten()) + COMBINE_CHECKM(CHECKM_ON_BIN.out.checkm_output.collect()) + bins_checkm_results_ch = COMBINE_CHECKM.out.checkm_output table = GENERATE_BINS_OVERVIEW_TABLE(bin_assembly_summaries_ch, bins_checkm_results_ch, bins) software_versions_ch = Channel.empty() ZIP_BINS.out.version | mix(software_versions_ch) | set{software_versions_ch} SUMMARIZE_BIN_ASSEMBLIES.out.version | mix(software_versions_ch) | set{software_versions_ch} - CHECKM_ON_BINS.out.version | mix(software_versions_ch) | set{software_versions_ch} + CHECKM_ON_BIN.out.version | mix(software_versions_ch) | set{software_versions_ch} + COMBINE_CHECKM.out.version | mix(software_versions_ch) | set{software_versions_ch} emit: bins_checkm_results = bins_checkm_results_ch diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/nextflow.config b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/nextflow.config index 22d73881..3ea34bc9 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/nextflow.config +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/nextflow.config @@ -1,19 +1,19 @@ //******** Global parameters *****************// params { - // input file + // Input file // a 3-column (single-end) or 4-column (paired-end) file - csv_file = "PE_file.csv" + input_file = null /* Run assembly-based workflow, read-based, or both (values need to be one of: "assembly-based", "read-based", or "both") It runs both by default */ - workflow = "both" + workflow = "both" assay_suffix = "_GLmetagenomics" - // additional prefix to add to output files that describe more than one sample (to make them unique compared to other datasets) + // Additional prefix to add to output files that describe more than one sample (to make them unique compared to other datasets) // leave as empty, i.e. "", if not wanted, include separator at end if adding one, e.g. 
"Swift1S_" additional_filename_prefix = "" @@ -22,14 +22,12 @@ params { // Quality trimmed/filtered suffixes filtered_R1_suffix = "_R1_filtered.fastq.gz" filtered_R2_suffix = "_R2_filtered.fastq.gz" + filtered_suffix = "_filtered.fastq.gz" // If single-end - // If single-end - filtered_suffix = "_filtered.fastq.gz" - - // Directories + //-------------------------- Directories ----------------------------------// // Raw reads directory (can be relative to workflow directory, or needs to be full path) - raw_reads_dir = "../Raw_Sequence_Data/" + raw_reads_dir = "../Raw_Sequence_Data/" // Output directories (all relative to processing directory, will be created) fastqc_out_dir = "../FastQC_Outputs/" filtered_reads_dir = "../Filtered_Sequence_Data/" @@ -47,36 +45,34 @@ params { metadata_dir = "../Metadata/" //************************* Databases **********************************// - database { - CAT_DB_LINK = "https://tbb.bio.uu.nl/bastiaan/CAT_prepare/CAT_prepare_20210107.tar.gz" - GTDBTK_LINK = "https://data.gtdb.ecogenomic.org/releases/release220/220.0/auxillary_files/gtdbtk_package/full_package/gtdbtk_r220_data.tar.gz" - cat_db = null // "/path/to/Reference_DBs/CAT_prepare_20210107/" - ko_db_dir = null // "/path/to/Reference_DBs/kofamscan_db/" - metaphlan_db_dir = null // "/path/to/Reference_DBs/metaphlan4-db/" - chocophlan_dir = null // "/path/to/Reference_DBs/humann3-db/chocophlan/" - uniref_dir = null // "/path/to/Reference_DBs/humann3-db/uniref/" - utilities_dir = null // "/path/to/Reference_DBs/humann3-db/utility_mapping/" - gtdbtk_db_dir = null // "/path/to/Reference_DBs/GTDB-tk-ref-db/" - } + CAT_DB_LINK = "https://tbb.bio.uu.nl/bastiaan/CAT_prepare/CAT_prepare_20210107.tar.gz" + GTDBTK_LINK = "https://data.gtdb.ecogenomic.org/releases/release220/220.0/auxillary_files/gtdbtk_package/full_package/gtdbtk_r220_data.tar.gz" + cat_db = null // "/path/to/Reference_DBs/CAT_prepare_20210107/" + ko_db_dir = null // "/path/to/Reference_DBs/kofamscan_db/" + metaphlan_db_dir = null // "/path/to/Reference_DBs/metaphlan4-db/" + chocophlan_dir = null // "/path/to/Reference_DBs/humann3-db/chocophlan/" + uniref_dir = null // "/path/to/Reference_DBs/humann3-db/uniref/" + utilities_dir = null // "/path/to/Reference_DBs/humann3-db/utility_mapping/" + gtdbtk_db_dir = null // "/path/to/Reference_DBs/GTDB-tk-ref-db/" // Quality assessment parameters swift_1S = false - adapters = "${baseDir}/config/bbtools_adapters.fa" - multiqc_config = "${baseDir}/config/multiqc.config" + adapters = "${projectDir}/config/bbtools_adapters.fa" + multiqc_config = "${projectDir}/config/multiqc.config" - // Assembly - max_mem = 100e9 // 100GB + // Assembly parameter + max_mem = 100e9 // 100GB - // Binning parameters - reduced_tree = "True" + // Binning parameter + reduced_tree = "True" // Annotation parameters - pileup_mem = "5g" // pileup.sh paramater for calculating contig coverage and depth - block_size = 4 // CAT blocksize + pileup_mem = "5g" // pileup.sh paramater for calculating contig coverage and depth + block_size = 4 // CAT blocksize //******************** CAT database directory strings ************************// - // The string below will be added to the end of the params.database.cat_db provided above + // The strings below will be added to the end of the "cat_db" parameter provided above // cat taxonomy directory with cat_db path provided above cat_taxonomy_dir = "2021-01-07_taxonomy/" cat_db_sub_dir = "2021-01-07_CAT_database/" @@ -96,28 +92,26 @@ params { use_gtdbtk_scratch_location = false - conda{ - // Specify 
paths to existing conda environments - genelab = null // "/path/to/envs/genelab-utils" - qc = null // "/path/to/envs/qc" - humann3 = null // "/path/to/envs/humann3" - cat = null // "/path/to/envs/genelab-utils/envs/CAT" - prodigal = null // "/path/to/envs/prodigal" - metabat = null // "/path/to/envs/metabat" - gtdbtk = null // "/path/to/envs/gtdbtk" - kegg_decoder = null // "/path/to/envs/kegg_decoder" - megahit = null // "/path/to/envs/megahit" - bit = null // "/path/to/envs/bit" - kofamscan = null // "/path/to/envs/kofamscan" - mapping = null // "/path/to/envs/mapping" - checkm = null // "/path/to/envs/checkm" - } - - accession = false // GLDS or OSD acession number for the data to be processed + // Specify paths to existing conda environments + conda_genelab = null // "/path/to/envs/genelab-utils" + conda_qc = null // "/path/to/envs/qc" + conda_humann3 = null // "/path/to/envs/humann3" + conda_cat = null // "/path/to/envs/genelab-utils/envs/CAT" + conda_prodigal = null // "/path/to/envs/prodigal" + conda_metabat = null // "/path/to/envs/metabat" + conda_gtdbtk = null // "/path/to/envs/gtdbtk" + conda_kegg_decoder = null // "/path/to/envs/kegg_decoder" + conda_megahit = null // "/path/to/envs/megahit" + conda_bit = null // "/path/to/envs/bit" + conda_kofamscan = null // "/path/to/envs/kofamscan" + conda_mapping = null // "/path/to/envs/mapping" + conda_checkm = null // "/path/to/envs/checkm" + + accession = null // GLDS or OSD accession number for the data to be processed // Pattern of files on OSDR for the GLDS_accession you want to process. - RawFilePattern = null // "_metaG", "_HRremoved" - errorStrategy = "terminate" - debug = false // should info about the parameters set by the user be shown when the workflow starts. + RawFilePattern = null // "_metaG", "_HRremoved" + errorStrategy = "terminate" // nextflow's error handling strategy + debug = false // should info about the parameters set by the user be shown when the workflow starts. } // Setting the default container engine to singularity @@ -138,21 +132,37 @@ profiles { } conda { - conda.enabled = true - params.use_conda = true + conda.enabled = true + params.use_conda = true + conda.channels = 'conda-forge,bioconda' + conda.cacheDir = 'conda/' // location of conda environments + conda.createTimeout = '2h' + } + + mamba { + conda.enabled = true + conda.useMamba = true + conda.channels = 'conda-forge,bioconda' + params.use_conda = true + conda.cacheDir = 'conda/' // location of conda environments + conda.createTimeout = '2h' } singularity { singularity.enabled = true singularity.autoMounts = true - singularity.cacheDir = "singularity/" // local singularity images location + + /* Uncomment the line below if you'd like to set the cache directory here, + as setting it here takes precedence over setting the nextflow variable + NXF_SINGULARITY_CACHEDIR=singularity/ in your run script + */ + //singularity.cacheDir = "singularity/" // location of singularity images params.containerEngine = "singularity" } docker { docker.enabled = true docker.runOptions = '-u $(id -u):$(id -g)' - docker.userEmulation = true params.containerEngine = "docker" } @@ -165,29 +175,18 @@ executor.queueSize = 20 Root directory where the databases will be downloaded if they don't exist. This should be provided as a full path (starting with '/'). Note that relative paths such as '~/' and '../' are not expanded - by nextflow's evaluation of files, so don't use that. + by nextflow's evaluation of files, so don't use those. 
*/ -params.DB_ROOT = "${baseDir.getParent()}/Reference_DBs" +params.DB_ROOT = "${launchDir.getParent()}/Reference_DBs" // Mount Humann databases to their predefined locations in the Biobakery container being used -if(params.database.chocophlan_dir == null || - params.database.uniref_dir == null || - params.database.metaphlan_db_dir == null || - params.database.utilities_dir == null) { - - //biobakery/humann:3.6 - replace /usr/local/lib/python3.6/dist-packages/humann/data/ - //chocophlan = "${params.DB_ROOT}/humann3-db/chocophlan/:/opt/conda/envs/humann3/lib/python3.10/site-packages/humann/data/chocophlan_DEMO" - //uniref = "${params.DB_ROOT}/humann3-db/uniref/:/opt/conda/envs/humann3/lib/python3.10/site-packages/humann/data/uniref_DEMO" - //utilities = "${params.DB_ROOT}/humann3-db/utility_mapping/:/opt/conda/envs/humann3/lib/python3.10/site-packages/humann/data/misc" +if(params.chocophlan_dir == null || params.uniref_dir == null || params.metaphlan_db_dir == null || params.utilities_dir == null) { + utilities = "${params.DB_ROOT}/humann3-db/utility_mapping/:/usr/local/lib/python3.6/dist-packages/humann/data/misc" }else{ - //biobakery/humann:3.6 - replace /usr/local/lib/python3.6/dist-packages/humann/data/ - //chocophlan = "${params.database.chocophlan_dir}:/opt/conda/envs/humann3/lib/python3.10/site-packages/humann/data/chocophlan_DEMO" - //uniref = "${params.database.uniref_dir}:/opt/conda/envs/humann3/lib/python3.10/site-packages/humann/data/uniref_DEMO" - //utilities = "${params.database.utilities_dir}:/opt/conda/envs/humann3/lib/python3.10/site-packages/humann/data/misc" - utilities = "${params.database.utilities_dir}:/usr/local/lib/python3.6/dist-packages/humann/data/misc" + utilities = "${params.utilities_dir}:/usr/local/lib/python3.6/dist-packages/humann/data/misc" } @@ -213,22 +212,21 @@ process { withLabel: genelab { - conda = {params.conda.genelab != null ? params.conda.genelab : "envs/genelab.yaml"} + conda = {params.conda_genelab ? params.conda_genelab : "${projectDir}/envs/genelab.yaml"} container = "olabiyi/genelab-utils:1.3.22" } - withLabel: bit { cpus = 2 - conda = {params.conda.bit != null ? params.conda.bit : "envs/bit.yaml"} + conda = {params.conda_bit ? params.conda_bit : "${projectDir}/envs/bit.yaml"} container = "olabiyi/bit-astrobiomike:1.0" memory = "5 GB" } //*************************************** Database set-up ********************************************// withLabel: humann_setup { - conda = {params.conda.humann3 != null ? params.conda.humann3 : "envs/humann3.yaml"} + conda = {params.conda_humann3 ? params.conda_humann3 : "${projectDir}/envs/humann3.yaml"} container = "biobakery/humann:3.9" } @@ -242,45 +240,45 @@ process { } withName: SETUP_CAT_DB { - conda = {params.conda.cat != null ? params.conda.cat : "envs/cat.yaml"} + conda = {params.conda_cat ? params.conda_cat : "${projectDir}/envs/cat.yaml"} container = "olabiyi/bit-astrobiomike:1.0" } withName: SETUP_KOFAMSCAN_DB { - conda = {params.conda.kofamscan != null ? params.conda.kofamscan : "envs/kofamscan.yaml"} + conda = {params.conda_kofamscan ? params.conda_kofamscan : "${projectDir}/envs/kofamscan.yaml"} container = "olabiyi/bit-astrobiomike:1.0" } withName: SETUP_GTDBTK_DB { - conda = {params.conda.gtdbtk != null ? params.conda.gtdbtk : "envs/gtdb-tk.yaml.yaml"} + conda = {params.conda_gtdbtk ? 
params.conda_gtdbtk : "${projectDir}/envs/gtdb-tk.yaml"} container = "quay.io/biocontainers/gtdbtk:2.4.0--pyhdfd78af_1" } //************************* GLDS_accession runsheet and input file retrieval **************************************// withName: GET_RUNSHEET { cpus = 10 - conda = {params.conda.genelab != null ? params.conda.genelab : "envs/genelab.yaml"} + conda = {params.conda_genelab ? params.conda_genelab : "${projectDir}/envs/genelab.yaml"} container = "olabiyi/genelab-utils:1.3.22" publishDir = [path: params.genelab_dir , mode: params.publishDir_mode] } //********************************** Read quality control and assesment ********************************************// withName: FASTQC { - conda = {params.conda.qc != null ? params.conda.qc : "envs/qc.yaml"} + conda = {params.conda_qc ? params.conda_qc : "${projectDir}/envs/qc.yaml"} container = "staphb/fastqc:0.12.1" cpus = 2 publishDir = [path: params.raw_reads_dir, mode: params.publishDir_mode] } withName: MULTIQC { - conda = {params.conda.qc != null ? params.conda.qc: "envs/qc.yaml"} + conda = {params.conda_qc ? params.conda_qc: "${projectDir}/envs/qc.yaml"} container = "staphb/multiqc:1.19" cpus = 2 publishDir = [path: params.fastqc_out_dir, mode: params.publishDir_mode] } withName: BBDUK { - conda = {params.conda.qc != null ? params.conda.qc: "envs/qc.yaml"} + conda = {params.conda_qc ? params.conda_qc: "${projectDir}/envs/qc.yaml"} container = "staphb/bbtools:38.86" cpus = 5 memory = "40 GB" @@ -292,8 +290,7 @@ process { //************************************ Read-based processing *********************************************************// withLabel: read_based { - conda = {params.conda.humann3 != null ? params.conda.humann3 : "envs/humann3.yaml"} - // this -> "biobakery/humann:3.9" is the latest version + conda = {params.conda_humann3 ? params.conda_humann3 : "${projectDir}/envs/humann3.yaml"} container = "biobakery/humann:3.9" } @@ -320,7 +317,7 @@ process { } withName: ASSEMBLE { - conda = {params.conda.megahit != null ? params.conda.megahit : "envs/megahit.yaml"} + conda = {params.conda_megahit ? params.conda_megahit : "${projectDir}/envs/megahit.yaml"} container = "biocontainers/megahit:1.2.9_cv1" cpus = 8 memory = "20 GB" @@ -335,7 +332,7 @@ process { withLabel: mapping { - conda = {params.conda.mapping != null ? params.conda.mapping : "envs/mapping.yaml"} + conda = {params.conda_mapping ? params.conda_mapping : "${projectDir}/envs/mapping.yaml"} cpus = 8 //errorStrategy = 'retry' //maxRetries = 2 @@ -353,7 +350,7 @@ process { } withName: CALL_GENES { - conda = {params.conda.prodigal != null ? params.conda.prodigal : "envs/prodigal.yaml"} + conda = {params.conda_prodigal ? params.conda_prodigal : "${projectDir}/envs/prodigal.yaml"} container = "quay.io/biocontainers/prodigal:2.6.3--h031d066_8" cpus = 8 publishDir = [path: params.genes_dir, pattern: "*-genes.gff", mode: params.publishDir_mode] @@ -368,16 +365,15 @@ process { } withName: KO_ANNOTATION { - conda = {params.conda.kofamscan != null ? params.conda.kofamscan : "envs/kofamscan.yaml"} + conda = {params.conda_kofamscan ? params.conda_kofamscan : "${projectDir}/envs/kofamscan.yaml"} container = "quay.io/biocontainers/kofamscan:1.3.0--hdfd78af_2" cpus = 8 memory = "10 GB" disk = "20 GB" - //publishDir = [path: params.annotations_and_tax_dir, mode: params.publishDir_mode] } withName: TAX_CLASSIFICATION { - conda = {params.conda.cat != null ? params.conda.cat : "envs/cat.yaml"} + conda = {params.conda_cat ? 
params.conda_cat : "${projectDir}/envs/cat.yaml"} container = "nanozoo/catbat:5.2.3--e9c0a44" cpus = 8 memory = "50 GB" @@ -385,7 +381,7 @@ process { } withName: GET_COV_AND_DET { - conda = {params.conda.mapping != null ? params.conda.mapping : "envs/mapping.yaml"} + conda = {params.conda_mapping ? params.conda_mapping : "${projectDir}/envs/mapping.yaml"} container = "staphb/bbtools:38.86" cpus = 8 memory = "20 GB" @@ -398,7 +394,7 @@ process { withName: METABAT_BINNING { - conda = {params.conda.metabat != null ? params.conda.metabat : "envs/metabat.yaml"} + conda = {params.conda_metabat ? params.conda_metabat : "${projectDir}/envs/metabat.yaml"} container = "nanozoo/metabat2:2.15--c1941c7" cpus = 8 publishDir = [path: params.mapping_dir, mode: params.publishDir_mode, pattern: "*-metabat-assembly-depth.tsv"] @@ -412,8 +408,8 @@ process { publishDir = [path: params.bins_dir, mode: params.publishDir_mode] } - withName: CHECKM_ON_BINS { - conda = {params.conda.checkm != null ? params.conda.checkm : "envs/checkm.yaml"} + withName: "CHECKM_ON_BIN|COMBINE_CHECKM" { + conda = {params.conda_checkm ? params.conda_checkm : "${projectDir}/envs/checkm.yaml"} container = "nanozoo/checkm:1.1.3--c79a047" cpus = 8 memory = "50 GB" @@ -430,7 +426,7 @@ process { withName: GTDBTK_ON_MAGS { - conda = {params.conda.gtdbtk != null ? params.conda.gtdbtk : "envs/gtdb-tk.yaml.yaml"} + conda = {params.conda_gtdbtk ? params.conda_gtdbtk : "${projectDir}/envs/gtdb-tk.yaml"} container = "quay.io/biocontainers/gtdbtk:2.4.0--pyhdfd78af_1" containerOptions = { params.containerEngine == "singularity" ? "-B \${PWD}:/data -B ${gtdbtk_db_dir}:/refdata" : "-v \${PWD}:/data -v ${gtdbtk_db_dir}:/refdata" } cpus = 8 @@ -439,7 +435,7 @@ process { } withName: SUMMARIZE_MAG_KO_ANNOTS_WITH_KEGG_DECODER { - conda = {params.conda.kegg_decoder != null ? params.conda.kegg_decoder : "envs/keggdecoder.yaml"} + conda = {params.conda_kegg_decoder ? params.conda_kegg_decoder : "${projectDir}/envs/keggdecoder.yaml"} container = "fmalmeida/keggdecoder:latest" cpus = 8 } @@ -482,6 +478,6 @@ manifest { description = 'Metagenomics workflow for pipeline document GL-DPPD-7107-A' mainScript = 'main.nf' defaultBranch = 'main' - nextflowVersion = '>=22.10.6' + nextflowVersion = '>=24.04.4' version = '1.0.0' } diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/post_processing.config b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/post_processing.config index 6260e97a..77be4dd7 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/post_processing.config +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/post_processing.config @@ -4,93 +4,97 @@ params { //-------- Parmeters used to generate README.txt ------------------// name = "FirstName M. LastName" // name of analyst email = "name@nasa.gov" // email of analyst - // Genelab pipeline document protocol id used to process the data - protocol_id = "GL-DPPD-7107-A" - GLDS_accession = "" // e.g. "GLDS-574" - OSD_accession = "" // e.g. "OSD-574" - assay_suffix = "_GLmetagenomics" + protocol_id = "GL-DPPD-7107-A" // Genelab pipeline document protocol id followed + GLDS_accession = null // e.g. "GLDS-574" //or any string + OSD_accession = null // e.g. 
"OSD-574" or any string + assay_suffix = "_GLmetagenomics" readme = "README${params.assay_suffix}.txt" - processing_zip_file = "processing_info${params.assay_suffix}.zip" - logs = "Logs/" // base directory name of directory containg sample logs from processing - should always end with '/' - + /* extra parameters and arguments to GL-gen-processed-metagenomics-data-readme command. - run `GL-gen-processed-metagenomics-readme --help` for extra parameters that can be set + run `bin/GL-gen-processed-metagenomics-readme --help` for extra parameters that can be set "--raw-reads-dir '../Raw_Sequence_Data/' " for Metagenomics */ - readme_extra = "" + readme_extra = "" - output_prefix = "" + output_prefix = "" V_V_guidelines_link = "https://genelab-tools.arc.nasa.gov/confluence/pages/viewpage.action?pageId=8225175" - // A comma separated list of files and/or directories to find in processing_info.zip - target_files = "main.nf,nextflow.config,unique-sample-IDs.txt,envs/,bin/,config/,modules/,${params.logs}" + + + // processing_zip_file and the files in it + processing_zip_file = "processing_info${params.assay_suffix}.zip" + runsheet_basename = null // "GLfile.csv" or "PE_file.csv" // only the base name is required here + runsheet = null // "../GeneLab/GLfile.csv" //or "PE_file.csv" // actual path (absolute or relative) is required here + logs_dir_basename = "Logs/" // base directory name of directory containing sample logs from processing - should always end with '/' + logs_dir = "../Logs/" // actual path (absolute or relative) is required here + // A comma separated list of file basenames to find in processing_info.zip + target_files = "command.txt,nextflow_processing_info_GLmetagenomics.txt,unique-sample-IDs.txt,software_versions.txt,${params.runsheet_basename},${params.logs_dir_basename}" + + // Suffixes - raw_suffix = "_HRremoved_raw.fastq.gz" - raw_R1_suffix = "_R1_HRremoved_raw.fastq.gz" - raw_R2_suffix = "_R2_HRremoved_raw.fastq.gz" - filtered_suffix = "_filtered.fastq.gz" - filtered_R1_suffix = "_R1_filtered.fastq.gz" - filtered_R2_suffix = "_R2_filtered.fastq.gz" + raw_suffix = "_HRremoved_raw.fastq.gz" + raw_R1_suffix = "_R1_HRremoved_raw.fastq.gz" + raw_R2_suffix = "_R2_HRremoved_raw.fastq.gz" + filtered_suffix = "_filtered.fastq.gz" + filtered_R1_suffix = "_R1_filtered.fastq.gz" + filtered_R2_suffix = "_R2_filtered.fastq.gz" + /* Extra parameters and arguments to GL-validate-processed-metagenomics-data command - run `GL-validate-processed-metagenomics-data --help` for extra parameters that can be set + run `bin/GL-validate-processed-metagenomics-data --help` for extra parameters that can be set "--single-ended" if data are single-ended "--R1-used-as-single-ended-data" if processing only R1 reads as single-end */ - validation_extra = "--skip_raw_multiqc" + validation_extra = "--skip_raw_multiqc" /* Extra parameters and arguments to GL-gen-metagenomics-file-associations-table command - run `GL-gen-metagenomics-file-associations-table --help` for extra parameters that can be set + run `bin/GL-gen-metagenomics-file-associations-table --help` for extra parameters that can be set "--single-ended" if data are single-ended "--R1-used-as-single-ended-data" if processing only R1 reads as single-end */ file_association_extra = "--use-sample-names-from-assay-table" - files { - main = "./main.nf" - config = "./nextflow.config" - samples = "./unique-sample-IDs.txt" - assay_table = "" //"../GeneLab/a_OSD-574_metagenomic-sequencing_whole-genome-shotgun-sequencing_illumina.txt" - isa_zip = "" 
//"../GeneLab/OSD-574_metadata_OSD-574-ISA.zip" - runsheet = "" // "../GeneLab/GLfile.csv" - software_versions = "" //"../Metadata/software_versions.txt" - } + // --------------------------- files --------------------------------- // + run_command = "./processing_scripts/command.txt" + processing_commands = "./processing_scripts/nextflow_processing_info_GLmetagenomics.txt" + samples = "./unique-sample-IDs.txt" + // You only need to supply one of assay_table or isa_zip + // If you supply both it will use only the assay_table + assay_table = null //"../GeneLab/a_OSD-574_metagenomic-sequencing_whole-genome-shotgun-sequencing_illumina.txt" + isa_zip = null //"../GeneLab/OSD-574_metadata_OSD-574-ISA.zip" + + software_versions = "../Metadata/software_versions.txt" + + //------------------------------ Directories -------------------------------------// // Make sure you always end the directory names with a forward slash "/" and that if you use // relative paths, they are located in the run directory (./) or in its parent (../) - directories { - bin = "./bin/" - envs = "./envs/" - config = "./config/" - modules = "./modules/" - logs = "../${params.logs}" - Raw_Sequence_Data = "../Raw_Sequence_Data" - FastQC_Outputs = "../FastQC_Outputs/" - Read_Based_Processing = "../Read-based_Processing/" - Filtered_Sequence_Data = "../Filtered_Sequence_Data/" - Assembly_Based_Processing = "../Assembly-based_Processing/" - Assemblies = "../Assembly-based_Processing/assemblies/" - Genes = "../Assembly-based_Processing/predicted-genes/" - Annotations_And_Tax = "../Assembly-based_Processing/annotations-and-taxonomy/" - Mapping = "../Assembly-based_Processing/read-mapping/" - Combined_Output = "../Assembly-based_Processing/combined-outputs/" - Bins = "../Assembly-based_Processing/bins/" - MAGS = "../Assembly-based_Processing/MAGs/" - Output_dir = "../Post_Processing/" - } - - conda{ - // Specify paths to existing conda environments - // Leave as is if you'd like to create a new conda environment - genelab = null // "/path/to/envs/genelab-utils" - } + Raw_Sequence_Data = "../Raw_Sequence_Data" + FastQC_Outputs = "../FastQC_Outputs/" + Read_Based_Processing = "../Read-based_Processing/" + Filtered_Sequence_Data = "../Filtered_Sequence_Data/" + Assembly_Based_Processing = "../Assembly-based_Processing/" + Assemblies = "../Assembly-based_Processing/assemblies/" + Genes = "../Assembly-based_Processing/predicted-genes/" + Annotations_And_Tax = "../Assembly-based_Processing/annotations-and-taxonomy/" + Mapping = "../Assembly-based_Processing/read-mapping/" + Combined_Output = "../Assembly-based_Processing/combined-outputs/" + Bins = "../Assembly-based_Processing/bins/" + MAGS = "../Assembly-based_Processing/MAGs/" + Output_dir = "../Post_Processing/" + + //Specify paths to existing conda environments + // Leave as is if you'd like to create a new conda environment + conda_genelab = null // "/path/to/envs/genelab-utils" + + debug = false // set to true if you'd like to see the parameters values printed to the terminal } -params.baseDir = "${baseDir}" -parent_dir = "${baseDir.getParent()}" +params.baseDir = "${launchDir}" +parent_dir = "${launchDir.getParent()}" // Setting the default container engine as singularity params.containerEngine = "singularity" // Conda shouldn't be used be default except when using conda-based profiles @@ -107,21 +111,37 @@ profiles { } conda { - conda.enabled = true - params.use_conda = true + conda.enabled = true + params.use_conda = true + conda.channels = 'conda-forge,bioconda' + 
conda.cacheDir = 'conda/' // location of conda environments + conda.createTimeout = '2h' + } + + mamba { + conda.enabled = true + conda.useMamba = true + conda.channels = 'conda-forge,bioconda' + params.use_conda = true + conda.cacheDir = 'conda/' // location of conda environments + conda.createTimeout = '2h' } singularity { singularity.enabled = true singularity.autoMounts = true - singularity.cacheDir = "singularity/" // local singularity images location + + /* Uncomment the line below if you'd like to set the cache directory here, + as setting it here takes precedence over setting the nextflow variable + NXF_SINGULARITY_CACHEDIR=singularity/ in your run script + */ + //singularity.cacheDir = "singularity/" // local singularity images location params.containerEngine = "singularity" } docker { docker.enabled = true docker.runOptions = '-u $(id -u):$(id -g)' - docker.userEmulation = true params.containerEngine = "docker" } } @@ -138,15 +158,15 @@ process { cpus = 2 memory = '5 GB' cache = 'lenient' - conda = {params.conda.genelab != null ? params.conda.genelab : "envs/genelab.yaml"} + conda = {params.conda_genelab ? params.conda_genelab : "${projectDir}/envs/genelab.yaml"} container = "olabiyi/genelab-utils:1.3.22" - publishDir = [path: params.directories.Output_dir, mode: params.publishDir_mode] + publishDir = [path: params.Output_dir, mode: params.publishDir_mode] // Mount Parent directory for processes that copy files withName: "PACKAGE_PROCESSING_INFO|GENERATE_MD5SUMS" { - containerOptions = { params.containerEngine == "singularity" ? "-B ${parent_dir}" : "-v ${parent_dir}" } + containerOptions = { params.containerEngine == "singularity" ? "-B ${parent_dir}" : "-v ${parent_dir}:${parent_dir}" } } } @@ -160,6 +180,6 @@ manifest { description = 'Metagenomics Illumina post-processing workflow' mainScript = 'post_processing.nf' defaultBranch = 'main' - nextflowVersion = '>=22.10.1' + nextflowVersion = '>=24.04.4' version = '1.0.0' } diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/post_processing.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/post_processing.nf index b9603162..18eb16ca 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/post_processing.nf +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/post_processing.nf @@ -1,14 +1,13 @@ #!/usr/bin/env nextflow nextflow.enable.dsl=2 -// color defs +// Terminal text color definitions c_back_bright_red = "\u001b[41;1m"; -c_bright_green = "\u001b[32;1m"; -c_blue = "\033[0;34m"; -c_reset = "\033[0m"; +c_bright_green = "\u001b[32;1m"; +c_blue = "\033[0;34m"; +c_reset = "\033[0m"; params.help = false -params.debug = false /************************************************** @@ -32,7 +31,8 @@ if(params.help){ println(" --OSD_accession [STRING] A Genelab OSD accession number. Example OSD-574. Default: empty string") println(" --name [STRING] The analyst's full name. E.g. 'FirstName A. LastName'. Default: FirstName A. LastName") println(" --email [STRING] The analyst's email address. E.g. 'mail@nasa.gov'. Default: mail@nasa.gov") - println(" --logs [STRING] Base directory name of directory containig per sample logs from processing - should always end with '/'. E.g. 'Logs/'. Default: Logs/") + println(" --logs_dir_basename [STRING] Base directory name of directory containing per sample logs from processing - should always end with '/'. E.g. 'Logs/'. 
Default: Logs/") + println(" --runsheet_basename [String] The runsheets base name. Example 'GLfile.csv' or 'PE_file.csv'. Default: null") println(" --assay_suffix [STRING] Genelab's assay suffix. Default: _GLmetagenomics.") println(" --output_prefix [STRING] Unique name to tag onto output files. Default: empty string.") println(" --V_V_guidelines_link [URL] Genelab metagenomics data validation and verification guidelines link. Default: https://genelab-tools.arc.nasa.gov/confluence/pages/viewpage.action?pageId=8225175.") @@ -52,41 +52,40 @@ if(params.help){ println(" --file_association_extra [STRING] Extra parameters and arguments to GL-gen-metagenomics-file-associations-table command. Run 'GL-gen-metagenomics-file-associations-table --help' for extra parameters that can be set. Example '--single-ended --R1-used-as-single-ended-data'. Default: '--use-sample-names-from-assay-table' ") println() println("Files:") - println(" --files.main [PATH] The main workflow script used for processing. Default: ./main.nf") - println(" --files.config [PATH] The main workflow configuration file used for processing. Default: ./nextflow.config") - println(" --files.samples [PATH] A single column file with sample ids on each line generated after running the processing pipeline. Default: ./unique-sample-IDs.txt") - println(" --files.assay_table [PATH] GLDS assay table generated after running the processing pipeline with accession number as input.") + println(" --run_command [PATH] File containing the nextflow run command used in processing. Default: ./processing_scripts/command.txt") + println(" --processing_commands [PATH] File containing all the process names and scripts used during processing. Default: ./processing_scripts/nextflow_processing_info_GLAmpliseq.txt") + println(" --samples [PATH] A single column file with sample ids on each line generated after running the processing pipeline. Default: ./unique-sample-IDs.txt") + println(" --assay_table [PATH] GLDS assay table generated after running the processing pipeline with accession number as input.") println(" Example, ../Genelab/a_OSD-574_metagenomic-sequencing_whole-genome-shotgun-sequencing_illumina.txt. Default: empty string") - println(" --files.isa_zip [PATH] Genelab ISA zip files containing an assay atable for the OSD accession. This is only required if --files.assay_table is not set.") + println(" --isa_zip [PATH] Genelab ISA zip files containing an assay atable for the OSD accession. This is only required if --files.assay_table is not set.") println(" Example, ../Genelab/OSD-574_metadata_OSD-574-ISA.zip. Default: empty string") - println(" --files.runsheet [PATH] A 3-column (single-end) or 4-column (paired-end) input file (sample_id, forward, [reverse,] paired) used to run the processing pipeline. This is the value set to the paremater --csv_file when run the processing pipeline with a csv file as input otherwise it is the GLfile.csv in the GeneLab directory if --GLDS_accession was used as input. Example '../GeneLab/GLfile.csv'. Default: empty string") - println(" --files.software_versions [PATH] A file generated after running the processing pipeline listing the software versions used. Default: ../Metadata/software_versions.txt") + println(" --runsheet [PATH] A 3-column (single-end) or 4-column (paired-end) input file (sample_id, forward, [reverse,] paired) used to run the processing pipeline. 
This is the value set to the parameter --input_file when running the processing pipeline with a csv file as input, otherwise it is the GLfile.csv in the GeneLab directory if --GLDS_accession was used as input. Example '../GeneLab/GLfile.csv'. Default: null") + + + println(" --software_versions [PATH] A file generated after running the processing pipeline listing the software versions used. Default: ../Metadata/software_versions.txt") println() println("Directories:") - println(" --directories.config [PATH] A directory containing configuration files used in the processing pipeline. Only relevent in Metagenomics and AmpIllumina workflows. Default: ./config/") - println(" --directories.bin [PATH] A directory containing scripts used by nextflow. Default: ./bin/") - println(" --directories.envs [PATH] A directory containing conda yaml files. Default: ./envs/") - println(" --directories.config [PATH] A directory containing config files. Default: ./config/") - println(" --directories.modules [PATH] A directory containing nextflow module scripts. Default: ./modules/") - println(" --directories.Raw_Sequence_Data [PATH] A directory containing raw sequence and raw sequence outputs. Default: ../Raw_Sequence_Data/") - println(" --directories.FastQC_Outputs [PATH] A directory containing fastqc and multiqc zip reports. Default: ../FastQC_Outputs/") - println(" --directories.Filtered_Sequence_Data [PATH] A directory containing the outputs of read filtering after running the processing pipeline. Default: ../Filtered_Sequence_Data/") - println(" --directories.Read_based_Processing [PATH] A directory containing the outputs of read based processing after running the processing pipeline. Default: ../Read_based_Processing/") - println(" --directories.Assembly_based_Processing [PATH] A directory containing the outputs of assembly based processing after running the processing pipeline. Default: ../Assembly_based_Processing/") - println(" --directories.Assemblies [PATH] A directory containing sample contig assemblies after running the processing pipeline. Default: ../Assembly_based_Processing/assemblies/") - println(" --directories.Genes [PATH] A directory containing sample predicted genes after running the processing pipeline. Default: ../Assembly_based_Processing/predicted-genes/") - println(" --directories.Annotations_And_Tax [PATH] A directory containing sample gene and contig annotations after running the processing pipeline. Default: ../Assembly_based_Processing/annotations-and-taxonomy/") - println(" --directories.Mapping [PATH] A directory containing sample read mapping (bam) files after running the processing pipeline. Default: ../Assembly_based_Processing/read-mapping/") - println(" --directories.Combined_Output [PATH] A directory containing assembly summaries and reports across samples after running the processing pipeline. Default: ../Assembly_based_Processing/combined-outputs/") - println(" --directories.Bins [PATH] A directory containing metagenome bins after running the processing pipeline. Default: ../Assembly_based_Processing/bins/") - println(" --directories.MAGS [PATH] A directory containing metagenome assembled genomes (MAGS) after running the processing pipeline. Default: ../Assembly_based_Processing/MAGs/") - println(" --directories.Output_dir [PATH] Specifies the directory where outputs of this post-processing workflow will be published. 
Default: ../Post_Processing/") + println(" --logs_dir [PATH] Full or relative path to directory name of directory containing per sample logs from processing - should always end with '/'. E.g. 'Logs/'. Default: Logs/") + println(" --Raw_Sequence_Data [PATH] A directory containing raw sequence and raw sequence outputs. Default: ../Raw_Sequence_Data/") + println(" --FastQC_Outputs [PATH] A directory containing fastqc and multiqc zip reports. Default: ../FastQC_Outputs/") + println(" --Filtered_Sequence_Data [PATH] A directory containing the outputs of read filtering after running the processing pipeline. Default: ../Filtered_Sequence_Data/") + println(" --Read_based_Processing [PATH] A directory containing the outputs of read based processing after running the processing pipeline. Default: ../Read_based_Processing/") + println(" --Assembly_based_Processing [PATH] A directory containing the outputs of assembly based processing after running the processing pipeline. Default: ../Assembly_based_Processing/") + println(" --Assemblies [PATH] A directory containing sample contig assemblies after running the processing pipeline. Default: ../Assembly_based_Processing/assemblies/") + println(" --Genes [PATH] A directory containing sample predicted genes after running the processing pipeline. Default: ../Assembly_based_Processing/predicted-genes/") + println(" --Annotations_And_Tax [PATH] A directory containing sample gene and contig annotations after running the processing pipeline. Default: ../Assembly_based_Processing/annotations-and-taxonomy/") + println(" --Mapping [PATH] A directory containing sample read mapping (bam) files after running the processing pipeline. Default: ../Assembly_based_Processing/read-mapping/") + println(" --Combined_Output [PATH] A directory containing assembly summaries and reports across samples after running the processing pipeline. Default: ../Assembly_based_Processing/combined-outputs/") + println(" --Bins [PATH] A directory containing metagenome bins after running the processing pipeline. Default: ../Assembly_based_Processing/bins/") + println(" --MAGS [PATH] A directory containing metagenome assembled genomes (MAGS) after running the processing pipeline. Default: ../Assembly_based_Processing/MAGs/") + println(" --Output_dir [PATH] Specifies the directory where outputs of this post-processing workflow will be published. Default: ../Post_Processing/") println() println("Optional arguments:") - println(" --help Print this help message and exit") + println(" --help [BOOLEAN] Print this help message and exit") + println(" --debug [BOOLEAN] Set to true if you'd like to see the values of your set parameters printed to the terminal. Default: false.") println() println("Paths to existing conda environments to use otherwise a new one will be created using the yaml file in envs/.") - println(" --conda.genelab [PATH] Path to a conda environment containing genelab-utils. Default: null.") + println(" --conda_genelab [PATH] Path to a conda environment containing genelab-utils. 
Default: null.") exit 0 } @@ -96,21 +95,24 @@ if(params.help){ *************************************************/ if(params.debug){ -log.info """ +log.info """${c_blue} GeneLab Post Processing Pipeline: $workflow.manifest.version You have set the following parameters: Profile: ${workflow.profile} - Analyst's Name : ${params.name} - Analyst's Email : ${params.email} - GLDS Accession : ${params.GLDS_accession} - OSD Accession : ${params.OSD_accession} + Analyst's Name: ${params.name} + Analyst's Email: ${params.email} + GLDS Accession: ${params.GLDS_accession} + OSD Accession: ${params.OSD_accession} Assay Suffix: ${params.assay_suffix} Output Prefix: ${params.output_prefix} - Logs: ${params.logs} V & V Link: ${params.V_V_guidelines_link} Target Files: ${params.target_files} Nextflow Directory publishing mode: ${params.publishDir_mode} + + Base Names: + Run sheet: ${params.runsheet_basename} + Log Directory: ${params.logs_dir_basename} Suffixes: Raw Suffix: ${params.raw_suffix} @@ -122,35 +124,32 @@ log.info """ Extra scripts parameters: Readme Script Extra: ${params.readme_extra} - Validation Script Extra : ${params.validation_extra} + Validation Script Extra: ${params.validation_extra} File association Script Extra: ${params.file_association_extra} Files: - Main Workflow Script: ${params.files.main} - Nextflow Config File: ${params.files.config} - Samples: ${params.files.samples} - Assay Table: ${params.files.assay_table} - ISA Zip: ${params.files.isa_zip} - Input Runsheet: ${params.files.runsheet} - Software Versions: ${params.files.software_versions} + Nextflow Command: ${params.run_command} + Processing Commands: ${params.processing_commands} + Samples: ${params.samples} + Assay Table: ${params.assay_table} + ISA Zip: ${params.isa_zip} + Input Runsheet: ${params.runsheet} + Software Versions: ${params.software_versions} Directories: - Config: ${params.directories.config} - Bin: ${params.directories.bin} - Conda Environments: ${params.directories.envs} - Modules: ${params.directories.modules} - Raw Reads Directory: ${params.directories.Raw_Sequence_Data} - Filtered Sequence Data: ${params.directories.Filtered_Sequence_Data} - FastQC Outputs: ${params.directories.FastQC_Outputs} - Read-based Processing: ${params.directories.Read_based_Processing} - Assemblies: ${params.directories.Assemblies} - Genes: ${params.directories.Genes} - Annotations And Taxonomy: ${params.directories.Annotations_And_Tax} - Mapping: ${params.directories.Mapping} - Combined Output: ${params.directories.Combined_Output} - Bins: ${params.directories.Bins} - MAGS: ${params.directories.MAGS} - Pipeline Outputs: ${params.directories.Output_dir} + Logs directory: ${params.logs_dir} + Raw Reads Directory: ${params.Raw_Sequence_Data} + Filtered Sequence Data: ${params.Filtered_Sequence_Data} + FastQC Outputs: ${params.FastQC_Outputs} + Read-based Processing: ${params.Read_based_Processing} + Assemblies: ${params.Assemblies} + Genes: ${params.Genes} + Annotations And Taxonomy: ${params.Annotations_And_Tax} + Mapping: ${params.Mapping} + Combined Output: ${params.Combined_Output} + Bins: ${params.Bins} + MAGS: ${params.MAGS} + Pipeline Outputs: ${params.Output_dir} """ } @@ -161,30 +160,39 @@ include { CLEAN_FASTQC_PATHS; PACKAGE_PROCESSING_INFO; GENERATE_README; VALIDATE workflow { + // Make sure accession numbers are set + if(!params.GLDS_accession || !params.OSD_accession){ + error("""${c_back_bright_red}ACCESSION ERROR!. + Please supply both --GLDS_accession and --OSD_accession. 
+ They can be any string you choose but they must be set. + ${c_reset}""") + } + + // ---------------------- Input channels -------------------------------- // // Input files - sample_ids_file = Channel.fromPath(params.files.samples, checkIfExists: true) - software_versions = Channel.fromPath(params.files.software_versions, checkIfExists: true) + sample_ids_file = Channel.fromPath(params.samples, checkIfExists: true) + software_versions = Channel.fromPath(params.software_versions, checkIfExists: true) // Directories - Bins = Channel.fromPath(params.directories.Bins, type: 'dir', checkIfExists: true) - MAGS = Channel.fromPath(params.directories.MAGS, type: 'dir', checkIfExists: true) + Bins = Channel.fromPath(params.Bins, type: 'dir', checkIfExists: true) + MAGS = Channel.fromPath(params.MAGS, type: 'dir', checkIfExists: true) // Input Value channels OSD_ch = Channel.of([params.name, params.email, params.output_prefix, params.OSD_accession, params.protocol_id, - params.directories.FastQC_Outputs, - params.directories.Filtered_Sequence_Data, - params.directories.Read_Based_Processing, - params.directories.Assembly_Based_Processing, - params.directories.Assemblies, - params.directories.Genes, - params.directories.Annotations_And_Tax, - params.directories.Mapping, - params.directories.Combined_Output]) + params.FastQC_Outputs, + params.Filtered_Sequence_Data, + params.Read_Based_Processing, + params.Assembly_Based_Processing, + params.Assemblies, + params.Genes, + params.Annotations_And_Tax, + params.Mapping, + params.Combined_Output]) GLDS_ch = Channel.of([params.GLDS_accession, params.V_V_guidelines_link, params.output_prefix, - params.target_files, params.assay_suffix, params.logs, + params.target_files, params.assay_suffix, params.logs_dir_basename, params.raw_suffix, params.raw_R1_suffix, params.raw_R2_suffix, params.filtered_suffix, params.filtered_R1_suffix, params.filtered_R2_suffix]) @@ -195,12 +203,12 @@ workflow { file_label_ch = Channel.of([params.processing_zip_file, params.readme]) // processed as paths but utilized as labels in the genberate curation association table script - dir_label_ch = Channel.of([params.directories.Raw_Sequence_Data, - params.directories.Filtered_Sequence_Data, - params.directories.Read_Based_Processing, - params.directories.Assembly_Based_Processing, - params.directories.Annotations_And_Tax, - params.directories.Combined_Output]) + dir_label_ch = Channel.of([params.Raw_Sequence_Data, + params.Filtered_Sequence_Data, + params.Read_Based_Processing, + params.Assembly_Based_Processing, + params.Annotations_And_Tax, + params.Combined_Output]) .collect() .map{ Raw_Sequence_Data, Filtered_Sequence_Data, Read_Based_Processing, Assembly_Based_Processing, Annotations_And_Tax, Combined_Output -> @@ -214,28 +222,24 @@ workflow { } // If the assay table is provided use it as the input table otherwise use the isa_zip - assay_table_ch = Channel.fromPath("${params.files.assay_table}" == "" ? "${params.files.isa_zip}" : "${params.files.assay_table}", + assay_table_ch = Channel.fromPath( params.assay_table ? 
params.assay_table : params.isa_zip, checkIfExists: true) // Runsheet used to execute the processing workflow - runsheet_ch = Channel.fromPath(params.files.runsheet) + runsheet_ch = Channel.fromPath(params.runsheet, checkIfExists: true) // Files and directories to be packaged in processing_info.zip - files_and_dirs_ch = Channel.of(params.directories.config, params.directories.logs, - params.directories.bin, params.directories.modules, - params.directories.envs, params.files.main, - params.files.config, params.files.samples) + files_and_dirs_ch = Channel.of(params.logs_dir, params.run_command, params.processing_commands, + params.software_versions, params.runsheet, params.samples) .collect() - .map{ config_dir, logs, bin, modules, envs, main, config_file, samples -> - tuple( file(config_dir, checkIfExists: true), - file(logs, checkIfExists: true), - file(bin, checkIfExists: true), - file(modules, checkIfExists: true), - file(envs, checkIfExists: true), - file(main, checkIfExists: true), - file(config_file, checkIfExists: true), + .map{ logs, run_command, processing_commands, software_versions, runsheet, samples -> + tuple( file(logs, checkIfExists: true), + file(run_command, checkIfExists: true), + file(processing_commands, checkIfExists: true), + file(software_versions, checkIfExists: true), + file(runsheet, checkIfExists: true), file(samples, checkIfExists: true) ) } @@ -246,20 +250,19 @@ workflow { GENERATE_README(OSD_ch, PACKAGE_PROCESSING_INFO.out.zip, Bins, MAGS) - FastQC_Outputs_dir = Channel.fromPath(params.directories.FastQC_Outputs, - type: 'dir', checkIfExists: true) + FastQC_Outputs_dir = Channel.fromPath(params.FastQC_Outputs, type: 'dir', checkIfExists: true) CLEAN_FASTQC_PATHS(FastQC_Outputs_dir) - validation_dirs_ch = Channel.of(params.directories.Filtered_Sequence_Data, - params.directories.Read_Based_Processing, - params.directories.Assembly_Based_Processing, - params.directories.Assemblies, - params.directories.Mapping, - params.directories.Genes, - params.directories.Annotations_And_Tax, - params.directories.Bins, - params.directories.MAGS, - params.directories.Combined_Output) + validation_dirs_ch = Channel.of(params.Filtered_Sequence_Data, + params.Read_Based_Processing, + params.Assembly_Based_Processing, + params.Assemblies, + params.Mapping, + params.Genes, + params.Annotations_And_Tax, + params.Bins, + params.MAGS, + params.Combined_Output) .concat(CLEAN_FASTQC_PATHS.out.clean_dir) .collect() .map{ filtered_sequence, read_based, assembly_based, assemblies, @@ -282,10 +285,11 @@ workflow { sample_ids_file, GENERATE_README.out.readme, PACKAGE_PROCESSING_INFO.out.zip) + // Generate md5sums - dirs_ch = Channel.of(params.directories.Read_Based_Processing, - params.directories.Filtered_Sequence_Data, - params.directories.Assembly_Based_Processing) + dirs_ch = Channel.of(params.Read_Based_Processing, + params.Filtered_Sequence_Data, + params.Assembly_Based_Processing) .concat(CLEAN_FASTQC_PATHS.out.clean_dir) .collect() .map{ read_based, filtered_sequence, assembly_based, fastqc -> @@ -298,12 +302,13 @@ workflow { GENERATE_MD5SUMS(PACKAGE_PROCESSING_INFO.out.zip, GENERATE_README.out.readme, dirs_ch) + // Generate curation file association table - curation_dirs_ch = Channel.of(params.directories.Assemblies, - params.directories.Genes, - params.directories.Mapping, - params.directories.Bins, - params.directories.MAGS) + curation_dirs_ch = Channel.of(params.Assemblies, + params.Genes, + params.Mapping, + params.Bins, + params.MAGS) .concat(CLEAN_FASTQC_PATHS.out.clean_dir) 
.collect() .map{ assemblies, genes, mapping, bins, mags, fastqc -> @@ -319,5 +324,23 @@ workflow { dir_label_ch, curation_dirs_ch, assay_table_ch, runsheet_ch) + + + // Write methods GENERATE_PROTOCOL(software_versions, params.protocol_id) } + + +workflow.onComplete { + + println("${c_bright_green}Pipeline completed at: $workflow.complete") + println("""Execution status: ${ workflow.success ? 'OK' : "${c_back_bright_red}failed" }""") + log.info ( workflow.success ? "\nDone! Workflow completed without any error\n" : "Oops .. something went wrong${c_reset}" ) + + if ( workflow.success ) { + + println("Post-processing Outputs: ${params.Output_dir} ${c_reset}") + println() + + } +} From 2a16111b08bf6edb6c86211e3cbc0426d0cea1af Mon Sep 17 00:00:00 2001 From: Barbara Novak <19824106+bnovak32@users.noreply.github.com> Date: Fri, 2 May 2025 17:03:50 -0700 Subject: [PATCH 46/48] Updated documentation (#150) - Updated Pipeline doc - added missing output files for assembly based processing - added links between steps for file references - fixed step references - Created examples/runsheet folder for CSV examples files - Added runsheet documentation - Updated ChangeLog - added ChangeLog documentation - reformatted references to pipeline and previous snakemake workflow - added closed issue references - added other workflow specific updates vs previous snakemake workflow (checkm implementation) - Updated Workflow README - add docker references since both Docker and Singularity are supported - regularize capitalization of Nextflow and Singularity to match the tools' documentation - reorganize post-processing output definition to match other assays - clarify installation instructions for conda in workflow document - Added missing assay suffix to "Failed" assembly file name - Rename NF_MGIllumina-A to NF_MGIllumina - Checked and updated all broken links --- .../GL-DPPD-7107-A.md | 204 +++++++++++++----- .../NF_MGIllumina-A/CHANGELOG.md | 6 - .../NF_MGIllumina/CHANGELOG.md | 28 +++ .../README.md | 109 +++++----- .../NF_MGIllumina/examples/runsheet/README.md | 23 ++ .../runsheet/paired_end_dataset}/PE_file.csv | 0 .../runsheet/single_end_dataset}/SE_file.csv | 0 ...L-gen-metagenomics-file-associations-table | 0 .../bin/GL-gen-processed-metagenomics-readme | 0 .../GL-validate-processed-metagenomics-data | 0 .../workflow_code/bin/clean-paths.sh | 0 .../bin/combine-all-gene-tables.py | 0 ...evel-coverages-annots-and-tax-per-group.py | 0 .../workflow_code/bin/create_runsheet.sh | 0 .../workflow_code/bin/download-GTDBTK-db.sh | 0 .../bin/format-contig-tax-classifications.sh | 0 .../bin/format-gene-tax-classifications.sh | 0 .../generate-assembly-based-overview-table.sh | 0 .../workflow_code/bin/generate_protocol.sh | 0 .../bin/get_MAGs_estimates_and_taxonomy.sh | 0 .../workflow_code/bin/parse-MAG-annots.py | 0 .../workflow_code/bin/prepull_singularity.sh | 0 .../workflow_code/bin/swap-MAG-IDs.py | 0 .../workflow_code/config/bbtools_adapters.fa | 0 .../workflow_code/config/multiqc.config | 0 .../workflow_code/envs/bit.yaml | 0 .../workflow_code/envs/cat.yaml | 0 .../workflow_code/envs/checkm.yaml | 0 .../workflow_code/envs/genelab.yaml | 0 .../workflow_code/envs/gtdb-tk.yaml | 0 .../workflow_code/envs/humann3.yaml | 0 .../workflow_code/envs/image_def.bit | 0 .../workflow_code/envs/image_def.genelab | 0 .../workflow_code/envs/keggdecoder.yaml | 0 .../workflow_code/envs/kofamscan.yaml | 0 .../workflow_code/envs/mapping.yaml | 0 .../workflow_code/envs/megahit.yaml | 0 .../workflow_code/envs/metabat.yaml | 0 
.../workflow_code/envs/prodigal.yaml | 0 .../workflow_code/envs/qc.yaml | 0 .../workflow_code/launch.sh | 0 .../workflow_code/launch.slurm | 0 .../workflow_code/main.nf | 2 +- .../workflow_code/modules/assembly.nf | 0 .../modules/assembly_annotation.nf | 0 .../modules/assembly_based_processing.nf | 2 +- .../workflow_code/modules/binning.nf | 0 .../modules/combine_contig_annotation.nf | 0 .../workflow_code/modules/coverage.nf | 0 .../workflow_code/modules/create_runsheet.nf | 0 .../modules/database_creation.nf | 0 .../workflow_code/modules/genelab.nf | 0 .../modules/quality_assessment.nf | 0 .../modules/read_based_processing.nf | 0 .../workflow_code/modules/read_mapping.nf | 0 .../workflow_code/modules/summarize_MAG.nf | 0 .../summarize_assembly-based_processing.nf | 0 .../workflow_code/modules/summarize_bins.nf | 0 .../workflow_code/modules/zip_fasta.nf | 0 .../workflow_code/nextflow.config | 0 .../workflow_code/post_processing.config | 0 .../workflow_code/post_processing.nf | 0 .../workflow_code/slurm_submit.slurm | 0 .../Illumina/Workflow_Documentation/README.md | 2 +- 64 files changed, 256 insertions(+), 120 deletions(-) delete mode 100644 Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/CHANGELOG.md create mode 100644 Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/CHANGELOG.md rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina-A => NF_MGIllumina}/README.md (52%) create mode 100644 Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/examples/runsheet/README.md rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina-A/workflow_code => NF_MGIllumina/examples/runsheet/paired_end_dataset}/PE_file.csv (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina-A/workflow_code => NF_MGIllumina/examples/runsheet/single_end_dataset}/SE_file.csv (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina-A => NF_MGIllumina}/workflow_code/bin/GL-gen-metagenomics-file-associations-table (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina-A => NF_MGIllumina}/workflow_code/bin/GL-gen-processed-metagenomics-readme (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina-A => NF_MGIllumina}/workflow_code/bin/GL-validate-processed-metagenomics-data (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina-A => NF_MGIllumina}/workflow_code/bin/clean-paths.sh (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina-A => NF_MGIllumina}/workflow_code/bin/combine-all-gene-tables.py (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina-A => NF_MGIllumina}/workflow_code/bin/combine-gene-level-coverages-annots-and-tax-per-group.py (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina-A => NF_MGIllumina}/workflow_code/bin/create_runsheet.sh (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina-A => NF_MGIllumina}/workflow_code/bin/download-GTDBTK-db.sh (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina-A => NF_MGIllumina}/workflow_code/bin/format-contig-tax-classifications.sh (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina-A => NF_MGIllumina}/workflow_code/bin/format-gene-tax-classifications.sh (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina-A => NF_MGIllumina}/workflow_code/bin/generate-assembly-based-overview-table.sh (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina-A => 
NF_MGIllumina}/workflow_code/bin/generate_protocol.sh (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina-A => NF_MGIllumina}/workflow_code/bin/get_MAGs_estimates_and_taxonomy.sh (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina-A => NF_MGIllumina}/workflow_code/bin/parse-MAG-annots.py (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina-A => NF_MGIllumina}/workflow_code/bin/prepull_singularity.sh (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina-A => NF_MGIllumina}/workflow_code/bin/swap-MAG-IDs.py (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina-A => NF_MGIllumina}/workflow_code/config/bbtools_adapters.fa (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina-A => NF_MGIllumina}/workflow_code/config/multiqc.config (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina-A => NF_MGIllumina}/workflow_code/envs/bit.yaml (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina-A => NF_MGIllumina}/workflow_code/envs/cat.yaml (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina-A => NF_MGIllumina}/workflow_code/envs/checkm.yaml (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina-A => NF_MGIllumina}/workflow_code/envs/genelab.yaml (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina-A => NF_MGIllumina}/workflow_code/envs/gtdb-tk.yaml (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina-A => NF_MGIllumina}/workflow_code/envs/humann3.yaml (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina-A => NF_MGIllumina}/workflow_code/envs/image_def.bit (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina-A => NF_MGIllumina}/workflow_code/envs/image_def.genelab (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina-A => NF_MGIllumina}/workflow_code/envs/keggdecoder.yaml (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina-A => NF_MGIllumina}/workflow_code/envs/kofamscan.yaml (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina-A => NF_MGIllumina}/workflow_code/envs/mapping.yaml (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina-A => NF_MGIllumina}/workflow_code/envs/megahit.yaml (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina-A => NF_MGIllumina}/workflow_code/envs/metabat.yaml (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina-A => NF_MGIllumina}/workflow_code/envs/prodigal.yaml (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina-A => NF_MGIllumina}/workflow_code/envs/qc.yaml (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina-A => NF_MGIllumina}/workflow_code/launch.sh (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina-A => NF_MGIllumina}/workflow_code/launch.slurm (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina-A => NF_MGIllumina}/workflow_code/main.nf (99%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina-A => NF_MGIllumina}/workflow_code/modules/assembly.nf (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina-A => NF_MGIllumina}/workflow_code/modules/assembly_annotation.nf (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina-A => NF_MGIllumina}/workflow_code/modules/assembly_based_processing.nf (97%) rename 
Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina-A => NF_MGIllumina}/workflow_code/modules/binning.nf (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina-A => NF_MGIllumina}/workflow_code/modules/combine_contig_annotation.nf (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina-A => NF_MGIllumina}/workflow_code/modules/coverage.nf (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina-A => NF_MGIllumina}/workflow_code/modules/create_runsheet.nf (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina-A => NF_MGIllumina}/workflow_code/modules/database_creation.nf (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina-A => NF_MGIllumina}/workflow_code/modules/genelab.nf (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina-A => NF_MGIllumina}/workflow_code/modules/quality_assessment.nf (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina-A => NF_MGIllumina}/workflow_code/modules/read_based_processing.nf (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina-A => NF_MGIllumina}/workflow_code/modules/read_mapping.nf (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina-A => NF_MGIllumina}/workflow_code/modules/summarize_MAG.nf (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina-A => NF_MGIllumina}/workflow_code/modules/summarize_assembly-based_processing.nf (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina-A => NF_MGIllumina}/workflow_code/modules/summarize_bins.nf (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina-A => NF_MGIllumina}/workflow_code/modules/zip_fasta.nf (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina-A => NF_MGIllumina}/workflow_code/nextflow.config (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina-A => NF_MGIllumina}/workflow_code/post_processing.config (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina-A => NF_MGIllumina}/workflow_code/post_processing.nf (100%) rename Metagenomics/Illumina/Workflow_Documentation/{NF_MGIllumina-A => NF_MGIllumina}/workflow_code/slurm_submit.slurm (100%) diff --git a/Metagenomics/Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md b/Metagenomics/Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md index 1709a87c..305a0a2b 100644 --- a/Metagenomics/Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md +++ b/Metagenomics/Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md @@ -52,7 +52,7 @@ Barbara Novak (GeneLab Data Processing Lead) - [10. Getting coverage information and filtering based on detection](#10-getting-coverage-information-and-filtering-based-on-detection) - [11. Combining gene-level coverage, taxonomy, and functional annotations into one table for each sample](#11-combining-gene-level-coverage-taxonomy-and-functional-annotations-into-one-table-for-each-sample) - [12. Combining contig-level coverage and taxonomy into one table for each sample](#12-combining-contig-level-coverage-and-taxonomy-into-one-table-for-each-sample) - - [13. Generating normalized, gene-level-coverage summary tables of KO-annotations and taxonomy across samples](#13-generating-normalized-gene-level-coverage-summary-tables-of-ko-annotations-and-taxonomy-across-samples) + - [13. 
Generating normalized, gene- and contig-level coverage summary tables of KO-annotations and taxonomy across samples](#13-generating-normalized-gene--and-contig-level-coverage-summary-tables-of-ko-annotations-and-taxonomy-across-samples) - [14. **M**etagenome-**A**ssembled **G**enome (MAG) recovery](#14-metagenome-assembled-genome-mag-recovery) - [15. Generating MAG-level functional summary overview](#15-generating-mag-level-functional-summary-overview) - [**Read-based processing**](#read-based-processing) @@ -97,7 +97,7 @@ fastqc -o raw_fastqc_output *raw.fastq.gz **Parameter Definitions:** * `-o` – the output directory to store results -* `*raw.fastq.gz` – the input reads are specified as a positional argument, and can be given all at once with wildcards like this, or as individual arguments with spaces in between them +* `*raw.fastq.gz` – the input reads are specified as a positional argument, and can be given all at once with wildcards like this, or as individual arguments with spaces in between them **Input data:** @@ -259,7 +259,7 @@ megahit -1 sample-1_R1_filtered.fastq.gz -2 sample-1_R2_filtered.fastq.gz \ **Input data:** -* *fastq.gz (filtered/trimmed reads) +* *fastq.gz (filtered/trimmed reads from [step 2](#2-quality-filteringtrimming) above) **Output data:** @@ -288,7 +288,7 @@ bit-rename-fasta-headers -i sample-1-assembly/final.contigs.fa -w c_sample-1 -o **Input data:** -* sample-1-assembly/final.contigs.fa (assembly file) +* sample-1-assembly/final.contigs.fa (assembly file from [step 4](#4-sample-assembly)) **Output files:** @@ -298,7 +298,7 @@ bit-rename-fasta-headers -i sample-1-assembly/final.contigs.fa -w c_sample-1 -o #### 5b. Summarizing assemblies ``` -bit-summarize-assembly -o assembly-summaries.tsv *assembly.fasta +bit-summarize-assembly -o assembly-summaries_GLmetagenomics.tsv *assembly.fasta ``` **Parameter Definitions:** @@ -310,7 +310,7 @@ bit-summarize-assembly -o assembly-summaries.tsv *assembly.fasta **Input data:** -* *-assembly.fasta (contig-renamed assembly files) +* *-assembly.fasta (contig-renamed assembly files from [step 5a](#5a-renaming-contig-headers)) **Output files:** @@ -345,7 +345,7 @@ prodigal -a sample-1-genes.faa -d sample-1-genes.fasta -f gff -p meta -c -q \ **Input data:** -* sample-1-assembly.fasta (assembly file) +* sample-1-assembly.fasta (contig-renamed assembly file from [step 5a](#5a-renaming-contig-headers)) **Output data:** @@ -392,12 +392,12 @@ exec_annotation -p profiles/ -k ko_list --cpu 15 -f detail-tsv -o sample-1-KO-ta * `--report-unannotated` – specifies to generate an output for each entry -* `sample-1-genes.faa` – the input file is specified as a positional argument +* `sample-1-genes.faa` – the input file is specified as a positional argument **Input data:** -* sample-1-genes.faa (amino-acid fasta file) +* sample-1-genes.faa (amino-acid fasta file, from [step 6](#6-gene-prediction)) * profiles/ (reference directory holding the KO HMMs) * ko_list (reference list of KOs to scan for) @@ -423,7 +423,7 @@ rm -rf sample-1-tmp-KO/ sample-1-KO-annots.tmp **Input data:** -* sample-1-KO-tab.tmp (table of KO annotations assigned to gene IDs) +* sample-1-KO-tab.tmp (table of KO annotations assigned to gene IDs from [step 7b](#7b-running-kegg-annotation)) **Output data:** @@ -445,7 +445,7 @@ tar -xvzf CAT_prepare_20200618.tar.gz ``` CAT contigs -c sample-1-assembly.fasta -d CAT_prepare_20200618/2020-06-18_database/ \ -t CAT_prepare_20200618/2020-06-18_taxonomy/ -p sample-1-genes.faa \ - -o sample-1-tax-out.tmp -n 15 -r 3 --top 4 
--I_know_what_Im_doing + -o sample-1-tax-out.tmp -n NumberOfThreads -r 3 --top 4 --I_know_what_Im_doing --no-stars ``` **Parameter Definitions:** @@ -460,7 +460,7 @@ CAT contigs -c sample-1-assembly.fasta -d CAT_prepare_20200618/2020-06-18_databa * `-o` – specifies the output prefix -* `-n` – specifies the number of cores to use +* `-n` – specifies the number of CPU cores to use * `-r` – specifies the number of top protein hits to consider in assigning tax @@ -468,11 +468,13 @@ CAT contigs -c sample-1-assembly.fasta -d CAT_prepare_20200618/2020-06-18_databa * `--I_know_what_Im_doing` – allows us to alter the `--top` parameter +* `--no-stars` - suppress marking of suggestive taxonomic assignments + **Input data:** -* sample-1-assembly.fasta (assembly file) -* sample-1-genes.faa (gene-calls amino-acid fasta file) +* sample-1-assembly.fasta (assembly file from [step 5a](#5a-renaming-contig-headers)) +* sample-1-genes.faa (gene-calls amino-acid fasta file from [step 6](#6-gene-prediction)) **Output data:** @@ -482,7 +484,7 @@ CAT contigs -c sample-1-assembly.fasta -d CAT_prepare_20200618/2020-06-18_databa #### 8c. Adding taxonomy info from taxids to genes ``` CAT add_names -i sample-1-tax-out.tmp.ORF2LCA.txt -o sample-1-gene-tax-out.tmp \ - -t CAT_prepare_20200618/2020-06-18_taxonomy/ --only_official + -t CAT_prepare_20200618/2020-06-18_taxonomy/ --only_official --exclude-scores ``` **Parameter Definitions:** @@ -495,9 +497,11 @@ CAT add_names -i sample-1-tax-out.tmp.ORF2LCA.txt -o sample-1-gene-tax-out.tmp \ * `--only_official` – specifies to add only standard taxonomic ranks +* `--exclude-scores` - specifies to exclude bit-score support scores in the lineage + **Input data:** -* sample-1-tax-out.tmp.ORF2LCA.txt (gene-calls taxonomy file) +* sample-1-tax-out.tmp.ORF2LCA.txt (gene-calls taxonomy file from [step 8b](#8b-running-taxonomic-classification)) **Output data:** @@ -508,7 +512,7 @@ CAT add_names -i sample-1-tax-out.tmp.ORF2LCA.txt -o sample-1-gene-tax-out.tmp \ #### 8d. Adding taxonomy info from taxids to contigs ``` CAT add_names -i sample-1-tax-out.tmp.contig2classification.txt -o sample-1-contig-tax-out.tmp \ - -t CAT-ref/2020-06-18_taxonomy/ --only_official + -t CAT-ref/2020-06-18_taxonomy/ --only_official --exclude-scores ``` **Parameter Definitions:** @@ -521,10 +525,12 @@ CAT add_names -i sample-1-tax-out.tmp.contig2classification.txt -o sample-1-cont * `--only_official` – specifies to add only standard taxonomic ranks +* `--exclude-scores` - specifies to exclude bit-score support scores in the lineage + **Input data:** -* sample-1-tax-out.tmp.contig2classification.txt (contig taxonomy file) +* sample-1-tax-out.tmp.contig2classification.txt (contig taxonomy file from [step 8b](#8b-running-taxonomic-classification)) **Output data:** @@ -533,21 +539,21 @@ CAT add_names -i sample-1-tax-out.tmp.contig2classification.txt -o sample-1-cont #### 8e. 
Formatting gene-level output with awk and sed ``` -awk -F $'\t' ' BEGIN { OFS=FS } { if ( $2 == "lineage" ) { print $1,$2,$4,$5,$6,$7,$8,$9,$10 } \ +awk -F $'\t' ' BEGIN { OFS=FS } { if ( $3 == "lineage" ) { print $1,$3,$5,$6,$7,$8,$9,$10,$11 } \ else if ( $2 == "ORF has no hit to database" || $2 ~ /^no taxid found/ ) \ - { print $1,"NA","NA","NA","NA","NA","NA","NA","NA" } else { n=split($2,lineage,";"); \ - print $1,lineage[n],$4,$5,$6,$7,$8,$9,$10 } } ' sample-1-gene-tax-out.tmp | \ - sed 's/not classified/NA/g' | sed 's/superkingdom/domain/' | sed 's/^# ORF/gene_ID/' | \ - sed 's/lineage/taxid/' | sed 's/\*//g' > sample-1-gene-tax-out.tsv + { print $1,"NA","NA","NA","NA","NA","NA","NA","NA" } else { n=split($3,lineage,";"); \ + print $1,lineage[n],$5,$6,$7,$8,$9,$10,$11 } } ' sample-1-gene-tax-out.tmp | \ + sed 's/no support/NA/g' | sed 's/superkingdom/domain/' | sed 's/# ORF/gene_ID/' | \ + sed 's/lineage/taxid/' > sample-1-gene-tax-out.tsv ``` #### 8f. Formatting contig-level output with awk and sed ``` awk -F $'\t' ' BEGIN { OFS=FS } { if ( $2 == "classification" ) { print $1,$4,$6,$7,$8,$9,$10,$11,$12 } \ - else if ( $2 == "unclassified" ) { print $1,"NA","NA","NA","NA","NA","NA","NA","NA" } \ + else if ( $2 == "no taxid assigned" ) { print $1,"NA","NA","NA","NA","NA","NA","NA","NA" } \ else { n=split($4,lineage,";"); print $1,lineage[n],$6,$7,$8,$9,$10,$11,$12 } } ' sample-1-contig-tax-out.tmp | \ - sed 's/not classified/NA/g' | sed 's/superkingdom/domain/' | sed 's/: [0-9\.]*//g' | sed 's/^# contig/contig_ID/' | \ - sed 's/lineage/taxid/' | sed 's/\*//g' > sample-1-contig-tax-out.tsv + sed 's/no support/NA/g' | sed 's/superkingdom/domain/' | sed 's/^# contig/contig_ID/' | \ + sed 's/lineage/taxid/' > sample-1-contig-tax-out.tsv # clearing intermediate files rm sample-1*.tmp* @@ -555,8 +561,8 @@ rm sample-1*.tmp* **Input data:** -* sample-1-gene-tax-out.tmp (gene-calls taxonomy file with lineage info added) -* sample-1-contig-tax-out.tmp (contig taxonomy file with lineage info added) +* sample-1-gene-tax-out.tmp (gene-calls taxonomy file with lineage info added from [step 8c](#8c-adding-taxonomy-info-from-taxids-to-genes)) +* sample-1-contig-tax-out.tmp (contig taxonomy file with lineage info added from [step 8d](#8d-adding-taxonomy-info-from-taxids-to-contigs)) **Output data:** @@ -585,17 +591,19 @@ bowtie2-build sample-1-assembly.fasta sample-1-assembly-bt-index #### 9b.
Performing mapping, conversion to bam, and sorting ``` bowtie2 --threads NumberOfThreads -x sample-1-assembly-bt-index -1 sample-1_R1_filtered.fastq.gz \ - -2 sample-1_R2_filtered.fastq.gz 2> sample-1-mapping-info.txt | samtools view -b | samtools sort -@ NumberOfThreads > sample-1.bam + -2 sample-1_R2_filtered.fastq.gz --no-unal 2> sample-1-mapping-info.txt | samtools view -b | samtools sort -@ NumberOfThreads > sample-1.bam ``` **Parameter Definitions:** * `--threads` – specifies the number of threads to run in parallel -* `-x` – specifies the prefix of the reference index files to map to (generated in the previous `bowtie2-build` step +* `-x` – specifies the prefix of the reference index files to map to (generated in [step 9a](#9a-building-reference-index)) * `-1 and -2` – specifies the forward and reverse reads to map (if single-end data, neither `-1` nor `-2` are provided, and the single-end reads are passed to `-r`) +* `--no-unal` - suppress SAM records for unaligned reads + * `2> sample-1-mapping-info.txt` – capture the printed summary results in a log file * `samtools view -b` – convert the output directly to bam format (compressed) @@ -612,12 +620,12 @@ samtools index -@ NumberOfThreads sample-1.bam **Parameter Definitions:** * `-@` – set number of threads to use -* `sample-1.bam` - input bam file is provided as a positional argument as generated from the above mapping step +* `sample-1.bam` - input bam file is provided as a positional argument as generated in [step 9b](#9b-performing-mapping-conversion-to-bam-and-sorting) **Input data:** -* sample-1-assembly.fasta (assembly file) -* *.fastq.gz (filtered/trimmed reads) +* sample-1-assembly.fasta (assembly file from [step 5](#5a-renaming-contig-headers)) +* *.fastq.gz (filtered/trimmed reads from [step 2](#2-quality-filteringtrimming)) **Output data:** @@ -674,8 +682,8 @@ rm sample-1-*.tmp **Input data:** -* sample-1.bam (mapping file) -* sample-1-genes.fasta (gene-calls nucleotide fasta file) +* sample-1.bam (mapping file from [step 9b](#9b-performing-mapping-conversion-to-bam-and-sorting)) +* sample-1-genes.fasta (gene-calls nucleotide fasta file from [step 6](#6-gene-prediction)) **Output data:** @@ -705,9 +713,9 @@ rm sample-1*tmp sample-1-gene-coverages.tsv sample-1-annotations.tsv sample-1-ge **Input data:** -* sample-1-gene-coverages.tsv (table with gene-level coverages from step 10) -* sample-1-annotations.tsv (table of KO annotations assigned to gene IDs from step 7) -* sample-1-gene-tax-out.tsv (gene-level taxonomic classifications from step 8) +* sample-1-gene-coverages.tsv (table with gene-level coverages from [step 10b](#10b-filtering-gene-coverage-based-on-requiring-50-detection-and-parsing-down-to-just-gene-id-and-coverage)) +* sample-1-annotations.tsv (table of KO annotations assigned to gene IDs from [step 7c](#7c-filtering-output-to-retain-only-those-passing-the-ko-specific-score-and-top-hits)) +* sample-1-gene-tax-out.tsv (gene-level taxonomic classifications from [step 8f](#8f-formatting-contig-level-output-with-awk-and-sed)) **Output data:** @@ -737,8 +745,8 @@ rm sample-1*tmp sample-1-contig-coverages.tsv sample-1-contig-tax-out.tsv **Input data:** -* sample-1-contig-coverages.tsv (table with contig-level coverages from step 10) -* sample-1-contig-tax-out.tsv (contig-level taxonomic classifications from step 8) +* sample-1-contig-coverages.tsv (table with contig-level coverages from [step 10b](#10b-filtering-gene-coverage-based-on-requiring-50-detection-and-parsing-down-to-just-gene-id-and-coverage)) +* 
sample-1-contig-tax-out.tsv (contig-level taxonomic classifications from [step 8f](#8f-formatting-contig-level-output-with-awk-and-sed)) **Output data:** @@ -749,11 +757,14 @@ rm sample-1*tmp sample-1-contig-coverages.tsv sample-1-contig-tax-out.tsv --- -### 13. Generating normalized, gene-level-coverage summary tables of KO-annotations and taxonomy across samples +### 13. Generating normalized, gene- and contig-level coverage summary tables of KO-annotations and taxonomy across samples + > **Notes** > * To combine across samples to generate these summary tables, we need the same "units". This is done for annotations based on the assigned KO terms, and all non-annotated functions are included together as "Not annotated". It is done for taxonomic classifications based on taxids (full lineages included in the table), and any not classified are included together as "Not classified". > * The values we are working with are coverage per gene (so they are number of bases recruited to the gene normalized by the length of the gene). These have been normalized by making the total coverage of a sample 1,000,000 and setting each individual gene-level coverage its proportion of that 1,000,000 total. So basically percent, but out of 1,000,000 instead of 100 to make the numbers more friendly. +#### 13a. Generating gene-level coverage summary tables + ``` bit-GL-combine-KO-and-tax-tables *-gene-coverage-annotation-and-tax.tsv -o Combined ``` @@ -767,12 +778,36 @@ bit-GL-combine-KO-and-tax-tables *-gene-coverage-annotation-and-tax.tsv -o Combi **Input data:** -* *-gene-coverage-annotation-and-tax.tsv (tables with combined gene coverage, annotation, and taxonomy info generated for individual samples from step 12) +* *-gene-coverage-annotation-and-tax.tsv (tables with combined gene coverage, annotation, and taxonomy info generated for individual samples from [step 11](#11-combining-gene-level-coverage-taxonomy-and-functional-annotations-into-one-table-for-each-sample)) **Output data:** * **Combined-gene-level-KO-function-coverages-CPM_GLmetagenomics.tsv** (table with all samples combined based on KO annotations; normalized to coverage per million genes covered) * **Combined-gene-level-taxonomy-coverages-CPM_GLmetagenomics.tsv** (table with all samples combined based on gene-level taxonomic classifications; normalized to coverage per million genes covered) +* **Combined-gene-level-KO-function-coverages_GLmetagenomics.tsv** (table with all samples combined based on KO annotations) +* **Combined-gene-level-taxonomy-coverages_GLmetagenomics.tsv** (table with all samples combined based on gene-level taxonomic classifications) + + +#### 13b. 
Generating contig-level coverage summary tables + +``` +bit-GL-combine-contig-tax-tables *-contig-coverage-and-tax.tsv -o Combined +``` +**Parameter Definitions:** + +* takes positional arguments specifying the input tsv files, can be provided as a space-delimited list of files, or with wildcards like above + +- `-o` – specifies the output prefix + + +**Input data:** + +* *-contig-coverage-and-tax.tsv (tables with combined contig coverage and taxonomy info generated for individual samples from [step 12](#12-combining-contig-level-coverage-and-taxonomy-into-one-table-for-each-sample)) + +**Output data:** + +* **Combined-contig-level-taxonomy-coverages-CPM_GLmetagenomics.tsv** (table with all samples combined based on contig-level taxonomic classifications; normalized to coverage per million genes covered) +* **Combined-contig-level-taxonomy-coverages_GLmetagenomics.tsv** (table with all samples combined based on contig-level taxonomic classifications)
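> *Illustration only (not a pipeline step):* the CPM normalization described in the notes at the start of this step (each gene's coverage expressed as its share of a per-sample total of 1,000,000) can be sketched by hand from a per-sample gene coverage table. The example below assumes a tab-separated, two-column table with a header row (gene ID and coverage, like the per-sample tables from step 10b); the output file name is made up for the example, and the actual cross-sample combining and normalization is done by the `bit-GL-combine-*` commands above.

```bash
# Illustrative sketch only: convert per-gene coverage to CPM (coverage per million),
# i.e. each gene's share of a per-sample total scaled to 1,000,000.
# Assumes a headered, tab-separated table: gene_ID <TAB> coverage (read twice).
awk 'BEGIN{ FS=OFS="\t" }
     NR==FNR { if (FNR > 1) total += $2; next }            # pass 1: sum all coverages
     FNR==1  { print $1, "CPM"; next }                     # pass 2: write header
     { print $1, (total > 0 ? $2 / total * 1000000 : 0) }' \
    sample-1-gene-coverages.tsv sample-1-gene-coverages.tsv > sample-1-gene-coverages-CPM.tsv
```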
@@ -795,11 +830,11 @@ zip -r sample-1-bins.zip sample-1-bins * `--outputDepth` – specifies the output depth file * `--percentIdentity` – minimum end-to-end percent identity of a mapped read to be included -* `--minContigLength` – minimum contig length to include +* `--minContigLength` – minimum contig length to include * `--minContigDepth` – minimum contig depth to include -* `--referenceFasta` – the assembly fasta file generated in step 4 +* `--referenceFasta` – the assembly fasta file generated in step 5a * `sample-1.bam` – final positional arguments are the bam files generated in step 9 -* `--inFile` - the assembly fasta file generated in step 4 +* `--inFile` - the assembly fasta file generated in step 5a * `--outFile` - the prefix of the identified bins output files * `--abdFile` - the depth file generated by the previous `jgi_summarize_bam_contig_depths` command * `-t` - specifies number of threads to use @@ -807,8 +842,8 @@ zip -r sample-1-bins.zip sample-1-bins **Input data:** -* sample-1-assembly.fasta (assembly fasta file created in step 4) -* sample-1.bam (bam file created in step 9) +* sample-1-assembly.fasta (assembly fasta file created in [step 5a](#5a-renaming-contig-headers)) +* sample-1.bam (bam file created in [step 9b](#9b-performing-mapping-conversion-to-bam-and-sorting)) **Output data:** @@ -834,7 +869,7 @@ checkm lineage_wf -f bins-overview_GLmetagenomics.tsv --tab_table -x fa ./ check **Input data:** -* bin fasta files generated by step 14a +* sample-1-bins/sample-1-bin\*.fasta (bin fasta files generated in [step 14a](#14a-binning-contigs)) **Output data:** @@ -868,7 +903,7 @@ done **Input data:** -* bins-overview_GLmetagenomics.tsv (tab-delimited file with quality estimates per bin) +* bins-overview_GLmetagenomics.tsv (tab-delimited file with quality estimates per bin from [step 14b](#14b-bin-quality-assessment)) **Output data:** @@ -894,12 +929,63 @@ gtdbtk classify_wf --genome_dir MAGs/ -x fa --out_dir gtdbtk-output-dir --skip_ **Input data:** -* **MAGs/\*.fasta (directory holding high-quality MAGs) +* MAGs/\*.fasta (directory holding high-quality MAGs from [step 14c](#14c-filtering-mags)) **Output data:** * gtdbtk-output-dir/gtdbtk.\*.summary.tsv (files with assigned taxonomy and info) +#### 14e. Generating overview table of all MAGs + +```bash +# combine summaries +for MAG in $(cut -f 1 assembly-summaries_GLmetagenomics.tsv | tail -n +2); do + + grep -w -m 1 "^${MAG}" checkm-MAGs-overview.tsv | cut -f 12,13,14 \ + >> checkm-estimates.tmp + + grep -w "^${MAG}" gtdbtk-output-dir/gtdbtk.*.summary.tsv | \ + cut -f 2 | sed 's/^.__//' | \ + sed 's/;.__/\t/g' | \ + awk 'BEGIN{ OFS=FS="\t" } { for (i=1; i<=NF; i++) if ( $i ~ /^ *$/ ) $i = "NA" }; 1' \ + >> gtdb-taxonomies.tmp + +done + +# Add headers +cat <(printf "est. completeness\test. redundancy\test. 
strain heterogeneity\n") checkm-estimates.tmp \ + > checkm-estimates-with-headers.tmp + +cat <(printf "domain\tphylum\tclass\torder\tfamily\\tgenus\tspecies\n") gtdb-taxonomies.tmp \ + > gtdb-taxonomies-with-headers.tmp + +paste assembly-summaries_GLmetagenomics.tsv \ +checkm-estimates-with-headers.tmp \ +gtdb-taxonomies-with-headers.tmp \ + > MAGs-overview.tmp + +# Ordering by taxonomy +head -n 1 MAGs-overview.tmp > MAGs-overview-header.tmp + +tail -n +2 MAGs-overview.tmp | sort -t \$'\t' -k 14,20 > MAGs-overview-sorted.tmp + +cat MAGs-overview-header.tmp MAGs-overview-sorted.tmp \ + > MAGs-overview_GLmetagenomics.tsv + +``` + +**Input data:** + +* assembly-summaries_GLmetagenomics.tsv (table of assembly summary statistics from [step 5b](#5b-summarizing-assemblies)) +* MAGs/\*.fasta (directory holding high-quality MAGs from [step 14c](#14c-filtering-mags)) +* checkm-MAGs-overview.tsv (tab-delimited file with quality estimates per MAG from [step 14c](#14c-filtering-mags)) +* gtdbtk-output-dir/gtdbtk.\*.summary.tsv (directory of files with assigned taxonomy and info from [step 14d](#14d-mag-taxonomic-classification)) + +**Output data:** + +* **MAGs-overview_GLmetagenomics.tsv** (a tab-delimited overview of all recovered MAGs) + +
--- @@ -907,7 +993,7 @@ gtdbtk classify_wf --genome_dir MAGs/ -x fa --out_dir gtdbtk-output-dir --skip_ ### 15. Generating MAG-level functional summary overview #### 15a. Getting KO annotations per MAG -This utilizes the helper script [`parse-MAG-annots.py`](../Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/parse-MAG-annots.py). +This utilizes the helper script [`parse-MAG-annots.py`](../Workflow_Documentation/NF_MGIllumina/workflow_code/bin/parse-MAG-annots.py) ```bash for file in $( ls MAGs/*.fasta ) @@ -929,7 +1015,7 @@ done **Parameter Definitions:** -* `-i` – specifies the input sample gene-coverage-annotation-and-tax.tsv file generated in step 11 above +* `-i` – specifies the input sample gene-coverage-annotation-and-tax.tsv file generated in step 11 * `-w` – specifies the appropriate temporary file holding all the contigs in the current MAG @@ -939,7 +1025,8 @@ done **Input data:** -* \*-gene-coverage-annotation-and-tax.tsv (sample gene-coverage-annotation-and-tax.tsv file generated in step 11 above) +* \*-gene-coverage-annotation-and-tax.tsv (sample gene-coverage-annotation-and-tax.tsv file generated in [step 11](#11-combining-gene-level-coverage-taxonomy-and-functional-annotations-into-one-table-for-each-sample)) +* MAGs/\*.fasta (directory holding high-quality MAGs from [step 14c](#14c-filtering-mags)) **Output data:** @@ -956,13 +1043,13 @@ KEGG-decoder -v interactive -i MAG-level-KO-annotations_GLmetagenomics.tsv -o MA * `-v interactive` – specifies to create an interactive html output -* `-i` – specifies the input MAG-level-KO-annotations_GLmetagenomics.tsv file generated in step 15a above +* `-i` – specifies the input MAG-level-KO-annotations_GLmetagenomics.tsv file generated in [step 15a](#15a-getting-ko-annotations-per-mag) * `-o` – specifies the output table **Input data:** -* MAG-level-KO-annotations_GLmetagenomics.tsv (tab-delimited table holding MAGs and their KO annotations, generated in step 15a above) +* MAG-level-KO-annotations_GLmetagenomics.tsv (tab-delimited table holding MAGs and their KO annotations, generated in [step 15a](#15a-getting-ko-annotations-per-mag)) **Output data:** @@ -971,7 +1058,6 @@ KEGG-decoder -v interactive -i MAG-level-KO-annotations_GLmetagenomics.tsv -o MA * **MAG-KEGG-Decoder-out_GLmetagenomics.html** (interactive heatmap html file of the above output table)
- --- ## Read-based processing @@ -1016,7 +1102,7 @@ humann --input sample-1-combined.fastq.gz --output sample-1-humann3-out-dir --th # they need to be in their own directories mkdir genefamily-results/ pathabundance-results/ pathcoverage-results/ - # copying results from previous running humann3 step (14a) to get them all together in their own directories (as is needed) + # copying results from previous running humann3 step (16a) to get them all together in their own directories (as is needed) cp *-humann3-out-dir/*genefamilies.tsv genefamily-results/ cp *-humann3-out-dir/*abundance.tsv pathabundance-results/ cp *-humann3-out-dir/*coverage.tsv pathcoverage-results/ @@ -1105,14 +1191,14 @@ merge_metaphlan_tables.py *-humann3-out-dir/*_humann_temp/*_metaphlan_bugs_list. **Parameter Definitions:** -* input metaphlan tables are provided as position arguments (produced during humann3 run above, step 14a) +* input metaphlan tables are provided as position arguments (produced during humann3 run in [step 16a](#16a-running-humann-which-also-runs-metaphlan) * `>` – output is redirected from stdout to a file **Input data:** -* *fastq.gz (filtered/trimmed reads from step 2, forward and reverse reads concatenated if paired-end) +* *fastq.gz (filtered/trimmed reads from [step 2](#2-quality-filteringtrimming), forward and reverse reads concatenated if paired-end) **Output data:** diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/CHANGELOG.md b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/CHANGELOG.md deleted file mode 100644 index 5a7ab315..00000000 --- a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/CHANGELOG.md +++ /dev/null @@ -1,6 +0,0 @@ -# Workflow change log - -> ***Note:** The initial GeneLab Illumina metagenomics sequencing data processing pipeline, [GL-DPPD-7101](../../Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107.md), was wrapped in a Snakemake workflow and can be found in the [SW_MGIllumina](../SW_MGIllumina) directory. The current pipeline version, [GL-DPPD-7101-A](../../Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md), is wrapped in a Nextflow workflow and can be found in the [NF_MGIllumina](./) directory. This change log details changes for the Nextflow workflow implementation only.* - -## [1.0.0](https://github.com/nasa/GeneLab_Data_Processing/tree/NF_MGIllumina_1.0.0/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina) -- workflow version that converted snakemake to nextflow diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/CHANGELOG.md b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/CHANGELOG.md new file mode 100644 index 00000000..37e39039 --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/CHANGELOG.md @@ -0,0 +1,28 @@ +# Workflow change log + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + + +## [1.0.0](https://github.com/nasa/GeneLab_Data_Processing/tree/NF_MGIllumina_1.0.0/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina) + +### Changed +- Update to the latest pipeline version [GL-DPPD-7101-A](../../Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md) +of the GeneLab Metagenomics consensus processing pipeline. +- Pipeline implementation as a Nextflow workflow [NF_MGIllumina](./) rather than Snakemake as in +previous workflow versions. 
+- Run CheckM separately on each bin and combine the results to improve performance + +### Fixed +- Allow explicit specification of the humann3 database location ([#62](https://github.com/nasa/GeneLab_Data_Processing/issues/62)) +- Package bin and MAGs fasta files into per-sample zip archives ([#76](https://github.com/nasa/GeneLab_Data_Processing/issues/76)) +
+ +--- + +> ***Note:** All previous workflow changes were associated with the previous version of the GeneLab Metagenomics Pipeline +[GL-DPPD-7101](../../Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107.md) and can be found in the +[change log of the Snakemake workflow (SW_MGIllumina)](../SW_MGIllumina/CHANGELOG.md).* \ No newline at end of file diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/README.md b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/README.md similarity index 52% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/README.md rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/README.md index 2c4d1468..93ae6f9b 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/README.md +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/README.md @@ -4,7 +4,7 @@ ### Implementation Tools -The current GeneLab Illumina metagenomics sequencing data processing pipeline (MGIllumina-A), [GL-DPPD-7107-A.md](../../Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md), is implemented as a [Nextflow](https://nextflow.io/) DSL2 workflow and utilizes [Singularity](https://docs.sylabs.io/guides/3.10/user-guide/introduction.html) containers or [conda](https://docs.conda.io/en/latest/) environments to install/run all tools. This workflow is run using the command line interface (CLI) of any unix-based system. While knowledge of creating workflows in nextflow is not required to run the workflow as is, [the Nextflow documentation](https://nextflow.io/docs/latest/index.html) is a useful resource for users who want to modify and/or extend this workflow. +The current GeneLab Illumina metagenomics sequencing data processing pipeline (MGIllumina-A), [GL-DPPD-7107-A.md](../../Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md), is implemented as a [Nextflow](https://nextflow.io/) DSL2 workflow and utilizes [Singularity](https://docs.sylabs.io/guides/3.10/user-guide/introduction.html) containers, [docker](https://docs.docker.com/get-started/) containers, or [conda](https://docs.conda.io/en/latest/) environments to install/run all tools. This workflow is run using the command line interface (CLI) of any unix-based system. While knowledge of creating workflows in Nextflow is not required to run the workflow as is, [the Nextflow documentation](https://nextflow.io/docs/latest/index.html) is a useful resource for users who want to modify and/or extend this workflow. > **Note on reference databases** > Many reference databases are relied upon throughout this workflow. They will be installed and setup automatically the first time the workflow is run. All together, after installed and unpacked, they will take up about about 340 GB of storage, but they may also require up to 500GB during installation and initial un-packing, so be sure there is enough room on your system before running the workflow. @@ -13,8 +13,8 @@ The current GeneLab Illumina metagenomics sequencing data processing pipeline (M ## Utilizing the Workflow -1. [Install Nextflow and Singularity](#1-install-nextflow-and-singularity) - 1a. [Install Nextflow](#1a-install-nextflow) +1. [Installing Nextflow, Singularity, and conda](#1-install-nextflow-and-singularity) + 1a. [Install Nextflow and conda](#1a-install-nextflow-and-conda) 1b. [Install Singularity](#1b-install-singularity) 2. [Download the workflow files](#2-download-the-workflow-files) @@ -22,10 +22,10 @@ The current GeneLab Illumina metagenomics sequencing data processing pipeline (M 3. 
[Fetch Singularity Images](#3-fetch-singularity-images) 4. [Run the workflow](#4-run-the-workflow) - 4a. [Approach 1: Run slurm jobs in singularity containers with OSD accession as input](#4a-approach-1-run-slurm-jobs-in-singularity-containers-with-osd-accession-as-input) - 4b. [Approach 2: Run slurm jobs in singularity containers with a csv file as input](#4b-approach-2-run-slurm-jobs-in-singularity-containers-with-a-csv-file-as-input) - 4c. [Approach 3: Run jobs locally in conda environments and specify the path to one or more existing conda environments](#4c-approach-run-jobs-locally-in-conda-environments-and-specify-the-path-to-one-or-more-existing-conda-environments) - 4d. [Modify parameters and cpu resources in the nextflow config file](#4d-modify-parameters-and-cpu-resources-in-the-nextflow-config-file) + 4a. [Approach 1: Run slurm jobs in Singularity containers with OSD or GLDS accession as input](#4a-approach-1-run-slurm-jobs-in-singularity-containers-with-osd-or-glds-accession-as-input) + 4b. [Approach 2: Run slurm jobs in Singularity containers with a csv file as input](#4b-approach-2-run-slurm-jobs-in-singularity-containers-with-a-csv-file-as-input) + 4c. [Approach 3: Run jobs locally in conda environments and specify the path to one or more existing conda environments](#4c-approach-3-run-jobs-locally-in-conda-environments-and-specify-the-path-to-one-or-more-existing-conda-environments) + 4d. [Modify parameters and cpu resources in the Nextflow config file](#4d-modify-parameters-and-cpu-resources-in-the-nextflow-config-file) 5. [Workflow outputs](#5-workflow-outputs) 5a. [Main outputs](#5a-main-outputs) @@ -37,13 +37,13 @@ The current GeneLab Illumina metagenomics sequencing data processing pipeline (M --- -### 1. Install Nextflow and Singularity +### 1. Installing Nextflow, Singularity, and conda -#### 1a. Install Nextflow +#### 1a. Install Nextflow and conda Nextflow can be installed either through [Anaconda](https://anaconda.org/bioconda/nextflow) or as documented on the [Nextflow documentation page](https://www.nextflow.io/docs/latest/getstarted.html). -> Note: If you want to install Anaconda, we recommend installing a Miniconda, Python3 version appropriate for your system, as instructed by [Happy Belly Bioinformatics](https://astrobiomike.github.io/unix/conda-intro#getting-and-installing-conda). +> Note: If you want to install Anaconda, we recommend installing a Miniconda version appropriate for your system, as instructed by [Happy Belly Bioinformatics](https://astrobiomike.github.io/unix/conda-intro#getting-and-installing-conda). > > Once conda is installed on your system, you can install the latest version of Nextflow by running the following commands: > @@ -51,10 +51,13 @@ Nextflow can be installed either through [Anaconda](https://anaconda.org/biocond > conda install -c bioconda nextflow > nextflow self-update > ``` -> You may also install [mamba](https://mamba.readthedocs.io/en/latest/index.html) which is a faster implementation of conda like so: +> You may also install [mamba](https://mamba.readthedocs.io/en/latest/index.html) first which is a faster implementation of conda and can be used as a drop-in replacement: > ```bash > conda install -c conda-forge mamba +> conda install -c bioconda nextflow +> nextflow self-update > ``` +
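> As an optional illustration (assuming the mamba install above succeeded), mamba can then stand in for conda directly:
> ```bash
> # illustration only: mamba used as a drop-in replacement for conda
> mamba install -c bioconda nextflow
> nextflow self-update
> ```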
#### 1b. Install Singularity @@ -65,17 +68,19 @@ We recommend installing Singularity on a system wide level as per the associated > Note: Singularity is also available through [Anaconda](https://anaconda.org/conda-forge/singularity). +> Note: Alternatively, Docker can be used in place of Singularity. To get started with Docker, see the [Docker CE installation documentation](https://docs.docker.com/engine/install/). +
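> As a quick sanity check before moving on (assuming standard installations), the versions of Nextflow and of the chosen container runtime can be confirmed from the command line:
> ```bash
> nextflow -version        # confirm Nextflow is installed and on the PATH
> singularity --version    # or: docker --version, if using Docker instead
> ```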
--- ### 2. Download the workflow files -All files required for utilizing the NF_MGIllumina-A GeneLab workflow for processing metagenomics Illumina data are in the [workflow_code](workflow_code) directory. To get a copy of latest *NF_MGIllumina-A* version on to your system, the code can be downloaded as a zip file from the release page then unzipped after downloading by running the following commands: +All files required for utilizing the NF_MGIllumina GeneLab workflow for processing metagenomics Illumina data are in the [workflow_code](workflow_code) directory. To get a copy of latest *NF_MGIllumina* version on to your system, the code can be downloaded as a zip file from the release page then unzipped after downloading by running the following commands: ```bash -wget https://github.com/nasa/GeneLab_Data_Processing/releases/download/NF_MGIllumina-A_1.0.0/NF_MGIllumina-A_1.0.0.zip -unzip NF_MGIllumina-A_1.0.0.zip && cd NF_MGIllumina-A_1.0.0 +wget https://github.com/nasa/GeneLab_Data_Processing/releases/download/NF_MGIllumina_1.0.0/NF_MGIllumina_1.0.0.zip +unzip NF_MGIllumina_1.0.0.zip && cd NF_MGIllumina_1.0.0 ```
@@ -86,9 +91,9 @@ unzip NF_MGIllumina-A_1.0.0.zip && cd NF_MGIllumina-A_1.0.0 Although Nextflow can fetch Singularity images from a url, doing so may cause issues as detailed [here](https://github.com/nextflow-io/nextflow/issues/1210). -To avoid this issue, run the following command to fetch the Singularity images prior to running the NF_MGIllumina-A workflow: +To avoid this issue, run the following command to fetch the Singularity images prior to running the NF_MGIllumina workflow: -> Note: This command should be run from within the `NF_MGIllumina-A_1.0.0` directory that was downloaded in [step 2](#2-download-the-workflow-files) above. +> Note: This command should be run from within the `NF_MGIllumina_1.0.0` directory that was downloaded in [step 2](#2-download-the-workflow-files) above. ```bash bash ./bin/prepull_singularity.sh nextflow.config @@ -106,7 +111,7 @@ export NXF_SINGULARITY_CACHEDIR=$(pwd)/singularity ### 4. Run the Workflow -> ***Note:** All the commands in this step must be run from within the `NF_MGIllumina-A_1.0.0` directory that was downloaded in [step 2](#2-download-the-workflow-files) above.* +> ***Note:** All the commands in this step must be run from within the `NF_MGIllumina_1.0.0` directory that was downloaded in [step 2](#2-download-the-workflow-files) above.* For options and detailed help on how to run the workflow, run the following command: @@ -114,11 +119,13 @@ For options and detailed help on how to run the workflow, run the following comm nextflow run main.nf --help ``` -> Note: Nextflow commands use both single hyphen arguments (e.g. -help) that denote general nextflow arguments and double hyphen arguments (e.g. --input_file) that denote workflow specific parameters. Take care to use the proper number of hyphens for each argument. +> Note: Nextflow commands use both single hyphen arguments (e.g. -help) that denote general Nextflow +arguments and double hyphen arguments (e.g. --input_file) that denote workflow specific parameters. +Take care to use the proper number of hyphens for each argument.
-#### 4a. Approach 1: Run slurm jobs in singularity containers with OSD or GLDS accession as input +#### 4a. Approach 1: Run slurm jobs in Singularity containers with OSD or GLDS accession as input ```bash nextflow run main.nf -resume -profile slurm,singularity --accession OSD-574 @@ -126,7 +133,7 @@ nextflow run main.nf -resume -profile slurm,singularity --accession OSD-574
-#### 4b. Approach 2: Run slurm jobs in singularity containers with a csv file as input +#### 4b. Approach 2: Run slurm jobs in Singularity containers with a csv file as input ```bash nextflow run main.nf -resume -profile slurm,singularity --input_file PE_file.csv @@ -144,21 +151,20 @@ nextflow run main.nf -resume -profile mamba --input_file SE_file.csv --conda_meg **Required Parameters For All Approaches:** -* `-run main.nf` - Instructs nextflow to run the NF_MGIllumina-A workflow +* `-run main.nf` - Instructs Nextflow to run the NF_MGIllumina workflow * `-resume` - Resumes workflow execution using previously cached results -* `-profile` – Specifies the configuration profile(s) to load, `singularity` instructs nextflow to setup and use singularity for all software called in the workflow - -*Required only if you would like to pull and process data directly from OSDR* +* `-profile` – Specifies the configuration profile(s) to load; `singularity` instructs Nextflow to setup and use Singularity for all software called in the workflow. + > Note: Use `docker` to instruct Nextflow to use the Docker container environment instead. * `--accession` – A Genelab / OSD accession number e.g. OSD-574. + > *Required only if you would like to pull and process data directly from OSDR* -*Required only if --accession is not passed as an argument* - -* `--input_file` – A single-end or paired-end input csv file containing assay metadata for each sample, including sample_id, forward, reverse, and/or paired. Please see the sample [SE_file.csv](workflow_code/SE_file.csv) and [PE_file.csv](workflow_code/PE_file.csv) in this repository for examples on how to format this file. +* `--input_file` – A single-end or paired-end input csv file containing assay metadata for each sample, including sample_id, forward, reverse, and/or paired. Please see the [runsheet documentation](./examples/runsheet) in this repository for examples on how to format this file. + > *Required only if --accession is not passed as an argument* -> See `nextflow run -h` and [Nextflow's CLI run command documentation](https://nextflow.io/docs/latest/cli.html#run) for more options and details on how to run nextflow. +> See `nextflow run -h` and [Nextflow's CLI run command documentation](https://nextflow.io/docs/latest/cli.html#run) for more options and details on how to run Nextflow.
@@ -176,16 +182,17 @@ Once you've downloaded the workflow template, you can modify the parameters in t #### 5a. Main outputs -The outputs from this pipeline are documented in the [GL-DPPD-7107-A](../../Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md) processing protocol. +> Note: The outputs from the GeneLab Illumina metagenomics sequencing data processing pipeline workflow are documented in the [GL-DPPD-7107-A.md](../../Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md) processing protocol. #### 5b. Resource logs -Standard nextflow resource usage logs are also produced as follows: +Standard Nextflow resource usage logs are also produced as follows: -- **Output:** - - Resource_Usage/execution_report_{timestamp}.html (an html report that includes metrics about the workflow execution including computational resources and exact workflow process commands) - - Resource_Usage/execution_timeline_{timestamp}.html (an html timeline for all processes executed in the workflow) - - Resource_Usage/execution_trace_{timestamp}.txt (an execution tracing file that contains information about each process executed in the workflow, including: submission time, start time, completion time, cpu and memory used, machine-readable output) +**Nextflow Resource Usage Logs** + - Output: + - Resource_Usage/execution_report_{timestamp}.html (an html report that includes metrics about the workflow execution including computational resources and exact workflow process commands) + - Resource_Usage/execution_timeline_{timestamp}.html (an html timeline for all processes executed in the workflow) + - Resource_Usage/execution_trace_{timestamp}.txt (an execution tracing file that contains information about each process executed in the workflow, including: submission time, start time, completion time, cpu and memory used, machine-readable output) > Further details about these logs can also found within [this Nextflow documentation page](https://www.nextflow.io/docs/latest/tracing.html#execution-report). @@ -195,33 +202,31 @@ Standard nextflow resource usage logs are also produced as follows: ### 6. Post Processing +The post-processing workflow generates a README file, a protocols file, an md5sums +table, and a file association table suitable for uploading to OSDR. 
+ For options and detailed help on how to run the post-processing workflow, run the following command: ```bash nextflow run post_processing.nf --help ``` -To generate a README file, a protocols file, a md5sums table and a file association table after running the processing workflow sucessfully, modify and set the parameters in [post_processing.config](workflow_code/post_processing.config) then run the following command: +To generate the post-processing files after running the main processing workflow successfully, modify and set the parameters in [post_processing.config](workflow_code/post_processing.config), then run the following command: ```bash nextflow -C post_processing.config run post_processing.nf -resume -profile slurm,singularity ``` -The outputs of the run will be in a directory called `Post_Processing` by default and they are as follows: - - - Post_processing/FastQC_Outputs/filtered_multiqc_GLmetagenomics_report.zip (Filtered sequence multiqc report with paths purged) - - - Post_processing/FastQC_Outputs/raw_multiqc_GLmetagenomics_report.zip (Raw sequence multiqc report with paths purged) - - - Post_processing/_-associated-file-names.tsv (File association table for curation) - - - Post_processing/_metagenomics-validation.log (Automatic verification and validation log file) - - - Post_processing/processed_md5sum_GLmetagenomics.tsv (md5sums for the files to be released on OSDR) - - - Post_processing/processing_info_GLmetagenomics.zip (Zip file containing all files used to run the workflow and required logs with paths purged) - - - Post_processing/protocol.txt (File describing the methods used by the workflow) - - - Post_processing/README_GLmetagenomics.txt (README file listing and describing the outputs of the workflow) +The outputs of the post-processing workflow are described below: + +**Post processing workflow** + - Output: + - Post_processing/FastQC_Outputs/filtered_multiqc_GLmetagenomics_report.zip (Filtered sequence multiqc report with paths purged) + - Post_processing/FastQC_Outputs/raw_multiqc_GLmetagenomics_report.zip (Raw sequence multiqc report with paths purged) + - Post_processing/_-associated-file-names.tsv (File association table for curation) + - Post_processing/_metagenomics-validation.log (Automated verification and validation log file) + - Post_processing/processed_md5sum_GLmetagenomics.tsv (md5sums for the files to be released on OSDR) + - Post_processing/processing_info_GLmetagenomics.zip (Zip file containing all files used to run the workflow and required logs with paths purged) + - Post_processing/protocol.txt (File describing the methods used by the workflow) + - Post_processing/README_GLmetagenomics.txt (README file listing and describing the outputs of the workflow) diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/examples/runsheet/README.md b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/examples/runsheet/README.md new file mode 100644 index 00000000..b91ea2a3 --- /dev/null +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/examples/runsheet/README.md @@ -0,0 +1,23 @@ +# Runsheet File Specification + +## Description + +* The runsheet is a comma-separated file that contains the metadata required for processing +metagenomics sequence datasets through the GeneLab Illumina metagenomics sequencing data +processing pipeline (MGIllumina). + + +## Examples + +1. Runsheet for an example [paired-end dataset](paired_end_dataset/PE_file.csv) +2. 
Runsheet for an example [single-end dataset](single_end_dataset/SE_file.csv) + + +## Required columns + +| Column Name | Type | Description | Example | +|:------------|:-----|:------------|:--------| +| sample_id | string | Unique Sample Name, added as a prefix to sample-specific processed data output files. Should not include spaces or weird characters. | RR23_FCS_FLT_F1 | +| forward | string (local path) | Location of the raw reads file. For paired-end data, this specifies the forward reads fastq.gz file. | /my/data/sample1_R1_HRremoved_raw.fastq.gz | +| reverse | string (local path) | Location of the raw reads file. For paired-end data, this specifies the reverse reads fastq.gz file. For single-end data, this column should be omitted. | /my/data/sample1_R2_HRremoved_raw.fastq.gz | +| paired | bool | Set to True if the samples were sequenced as paired-end. If set to False, samples are assumed to be single-end. | False | \ No newline at end of file diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/PE_file.csv b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/examples/runsheet/paired_end_dataset/PE_file.csv similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/PE_file.csv rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/examples/runsheet/paired_end_dataset/PE_file.csv diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/SE_file.csv b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/examples/runsheet/single_end_dataset/SE_file.csv similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/SE_file.csv rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/examples/runsheet/single_end_dataset/SE_file.csv diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/GL-gen-metagenomics-file-associations-table b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/GL-gen-metagenomics-file-associations-table similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/GL-gen-metagenomics-file-associations-table rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/GL-gen-metagenomics-file-associations-table diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/GL-gen-processed-metagenomics-readme b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/GL-gen-processed-metagenomics-readme similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/GL-gen-processed-metagenomics-readme rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/GL-gen-processed-metagenomics-readme diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/GL-validate-processed-metagenomics-data b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/GL-validate-processed-metagenomics-data similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/GL-validate-processed-metagenomics-data rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/GL-validate-processed-metagenomics-data diff --git 
a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/clean-paths.sh b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/clean-paths.sh similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/clean-paths.sh rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/clean-paths.sh diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/combine-all-gene-tables.py b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/combine-all-gene-tables.py similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/combine-all-gene-tables.py rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/combine-all-gene-tables.py diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/combine-gene-level-coverages-annots-and-tax-per-group.py b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/combine-gene-level-coverages-annots-and-tax-per-group.py similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/combine-gene-level-coverages-annots-and-tax-per-group.py rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/combine-gene-level-coverages-annots-and-tax-per-group.py diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/create_runsheet.sh b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/create_runsheet.sh similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/create_runsheet.sh rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/create_runsheet.sh diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/download-GTDBTK-db.sh b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/download-GTDBTK-db.sh similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/download-GTDBTK-db.sh rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/download-GTDBTK-db.sh diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/format-contig-tax-classifications.sh b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/format-contig-tax-classifications.sh similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/format-contig-tax-classifications.sh rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/format-contig-tax-classifications.sh diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/format-gene-tax-classifications.sh b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/format-gene-tax-classifications.sh similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/format-gene-tax-classifications.sh rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/format-gene-tax-classifications.sh diff --git 
a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/generate-assembly-based-overview-table.sh b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/generate-assembly-based-overview-table.sh similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/generate-assembly-based-overview-table.sh rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/generate-assembly-based-overview-table.sh diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/generate_protocol.sh b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/generate_protocol.sh similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/generate_protocol.sh rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/generate_protocol.sh diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/get_MAGs_estimates_and_taxonomy.sh b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/get_MAGs_estimates_and_taxonomy.sh similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/get_MAGs_estimates_and_taxonomy.sh rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/get_MAGs_estimates_and_taxonomy.sh diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/parse-MAG-annots.py b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/parse-MAG-annots.py similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/parse-MAG-annots.py rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/parse-MAG-annots.py diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/prepull_singularity.sh b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/prepull_singularity.sh similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/prepull_singularity.sh rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/prepull_singularity.sh diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/swap-MAG-IDs.py b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/swap-MAG-IDs.py similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/bin/swap-MAG-IDs.py rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/bin/swap-MAG-IDs.py diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/config/bbtools_adapters.fa b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/config/bbtools_adapters.fa similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/config/bbtools_adapters.fa rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/config/bbtools_adapters.fa diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/config/multiqc.config b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/config/multiqc.config similarity index 100% rename from 
Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/config/multiqc.config rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/config/multiqc.config diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/envs/bit.yaml b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/bit.yaml similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/envs/bit.yaml rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/bit.yaml diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/envs/cat.yaml b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/cat.yaml similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/envs/cat.yaml rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/cat.yaml diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/envs/checkm.yaml b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/checkm.yaml similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/envs/checkm.yaml rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/checkm.yaml diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/envs/genelab.yaml b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/genelab.yaml similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/envs/genelab.yaml rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/genelab.yaml diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/envs/gtdb-tk.yaml b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/gtdb-tk.yaml similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/envs/gtdb-tk.yaml rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/gtdb-tk.yaml diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/envs/humann3.yaml b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/humann3.yaml similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/envs/humann3.yaml rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/humann3.yaml diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/envs/image_def.bit b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/image_def.bit similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/envs/image_def.bit rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/image_def.bit diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/envs/image_def.genelab b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/image_def.genelab similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/envs/image_def.genelab rename to 
Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/image_def.genelab diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/envs/keggdecoder.yaml b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/keggdecoder.yaml similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/envs/keggdecoder.yaml rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/keggdecoder.yaml diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/envs/kofamscan.yaml b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/kofamscan.yaml similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/envs/kofamscan.yaml rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/kofamscan.yaml diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/envs/mapping.yaml b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/mapping.yaml similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/envs/mapping.yaml rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/mapping.yaml diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/envs/megahit.yaml b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/megahit.yaml similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/envs/megahit.yaml rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/megahit.yaml diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/envs/metabat.yaml b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/metabat.yaml similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/envs/metabat.yaml rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/metabat.yaml diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/envs/prodigal.yaml b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/prodigal.yaml similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/envs/prodigal.yaml rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/prodigal.yaml diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/envs/qc.yaml b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/qc.yaml similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/envs/qc.yaml rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/envs/qc.yaml diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/launch.sh b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/launch.sh similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/launch.sh rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/launch.sh diff --git 
a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/launch.slurm b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/launch.slurm similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/launch.slurm rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/launch.slurm diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/main.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/main.nf similarity index 99% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/main.nf rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/main.nf index c53a3d55..f7d97410 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/main.nf +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/main.nf @@ -30,7 +30,7 @@ if (params.help) { singularity, docker and conda will run the pipeline locally using singularity, docker, and conda, respectively. To combine profiles, separate two or more profiles with comma. For example, to combine slurm and singularity profiles, pass 'slurm,singularity' as argument. """) println("--input_file [PATH] A 3-column (single-end) or 4-column (paired-end) csv input file (sample_id, forward, [reverse,] paired). Required only if a GLDS accession is not provided. Default : null") - println(" Please see the files: SE_file.csv and PE_file.csv for single-end and paired-end examples, respectively.") + println(" Please see the files: examples/runsheet/single_end_dataset/SE_file.csv and examples/runsheet/paired_end_dataset/PE_file.csv for single-end and paired-end examples, respectively.") println(" The sample_id column should contain unique sample ids.") println(" The forward and reverse columns should contain the absolute or relative path to the sample's forward and reverse reads.") println(" The paired column should be true for paired-end or anything else for single-end reads.") diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/assembly.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/assembly.nf similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/assembly.nf rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/assembly.nf diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/assembly_annotation.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/assembly_annotation.nf similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/assembly_annotation.nf rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/assembly_annotation.nf diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/assembly_based_processing.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/assembly_based_processing.nf similarity index 97% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/assembly_based_processing.nf rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/assembly_based_processing.nf index fd01dadf..75ee1846 
100644 --- a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/assembly_based_processing.nf +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/assembly_based_processing.nf @@ -53,7 +53,7 @@ workflow assembly_based { failed_assemblies = RENAME_HEADERS.out.failed_assembly failed_assemblies .map{ it.text } - .collectFile(name: "${params.assemblies_dir}/Failed-assemblies.tsv", cache: false) + .collectFile(name: "${params.assemblies_dir}/Failed-assemblies${params.assay_suffix}.tsv", cache: false) // Map reads to assembly MAPPING(assembly_ch.join(filtered_ch)) diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/binning.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/binning.nf similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/binning.nf rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/binning.nf diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/combine_contig_annotation.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/combine_contig_annotation.nf similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/combine_contig_annotation.nf rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/combine_contig_annotation.nf diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/coverage.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/coverage.nf similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/coverage.nf rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/coverage.nf diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/create_runsheet.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/create_runsheet.nf similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/create_runsheet.nf rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/create_runsheet.nf diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/database_creation.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/database_creation.nf similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/database_creation.nf rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/database_creation.nf diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/genelab.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/genelab.nf similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/genelab.nf rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/genelab.nf diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/quality_assessment.nf 
b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/quality_assessment.nf similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/quality_assessment.nf rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/quality_assessment.nf diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/read_based_processing.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/read_based_processing.nf similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/read_based_processing.nf rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/read_based_processing.nf diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/read_mapping.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/read_mapping.nf similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/read_mapping.nf rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/read_mapping.nf diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/summarize_MAG.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/summarize_MAG.nf similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/summarize_MAG.nf rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/summarize_MAG.nf diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/summarize_assembly-based_processing.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/summarize_assembly-based_processing.nf similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/summarize_assembly-based_processing.nf rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/summarize_assembly-based_processing.nf diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/summarize_bins.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/summarize_bins.nf similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/summarize_bins.nf rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/summarize_bins.nf diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/zip_fasta.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/zip_fasta.nf similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/modules/zip_fasta.nf rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/modules/zip_fasta.nf diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/nextflow.config b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/nextflow.config similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/nextflow.config rename to 
Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/nextflow.config diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/post_processing.config b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/post_processing.config similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/post_processing.config rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/post_processing.config diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/post_processing.nf b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/post_processing.nf similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/post_processing.nf rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/post_processing.nf diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/slurm_submit.slurm b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/slurm_submit.slurm similarity index 100% rename from Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina-A/workflow_code/slurm_submit.slurm rename to Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/workflow_code/slurm_submit.slurm diff --git a/Metagenomics/Illumina/Workflow_Documentation/README.md b/Metagenomics/Illumina/Workflow_Documentation/README.md index eb2a7ad8..28ddd29c 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/README.md +++ b/Metagenomics/Illumina/Workflow_Documentation/README.md @@ -6,7 +6,7 @@ |Pipeline Version|Current Workflow Version (for respective pipeline version)|Nextflow Version| |:---------------|:---------------------------------------------------------|:---------------| -|*[GL-DPPD-7107-A.md](../Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md)|[NF_MGIllumina-A_1.0.0](NF_MGIllumina-A)|24.04.4| +|*[GL-DPPD-7107-A.md](../Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md)|[NF_MGIllumina_1.0.0](NF_MGIllumina)|24.04.4| |[GL-DPPD-7107.md](../Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107.md)|[SW_MGIllumina_2.0.4](SW_MGIllumina)|N/A (Snakemake v7.26.0)| From f9dec7c3eae8583461873e3e203cea6be8508c30 Mon Sep 17 00:00:00 2001 From: Barbara Novak <19824106+bnovak32@users.noreply.github.com> Date: Wed, 7 May 2025 17:03:06 -0700 Subject: [PATCH 47/48] [DEV_Metagenomics_Illumina] Minor documentation updates (#151) - Pipeline doc - fixed typo in sed command - NF workflow doc - added explicit definitions for all profiles available - removed slurm from example command-line calls - updated headings to adjust slurm removal - simplified instructions for different run approaches - added conda env configuration information as additional information under the conda profile definition. 
--- .../GL-DPPD-7107-A.md | 2 +- .../NF_MGIllumina/README.md | 62 +++++++++---------- 2 files changed, 30 insertions(+), 34 deletions(-) diff --git a/Metagenomics/Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md b/Metagenomics/Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md index 305a0a2b..31f6eaf9 100644 --- a/Metagenomics/Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md +++ b/Metagenomics/Illumina/Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md @@ -543,7 +543,7 @@ awk -F $'\t' ' BEGIN { OFS=FS } { if ( $3 == "lineage" ) { print $1,$3,$5,$6,$7, else if ( $2 == "ORF has no hit to database" || $2 ~ /^no taxid found/ ) \ { print $1,"NA","NA","NA","NA","NA","NA","NA","NA" } else { n=split($3,lineage,";"); \ print $1,lineage[n],$5,$6,$7,$8,$9,$10,$11 } } ' sample-1-gene-tax-out.tmp | \ - sed no support/NA/g' | sed 's/superkingdom/domain/' | sed 's/# ORF/gene_ID/' | \ + sed 's/no support/NA/g' | sed 's/superkingdom/domain/' | sed 's/# ORF/gene_ID/' | \ sed 's/lineage/taxid/' > sample-1-gene-tax-out.tsv ``` diff --git a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/README.md b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/README.md index 93ae6f9b..c82b13be 100644 --- a/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/README.md +++ b/Metagenomics/Illumina/Workflow_Documentation/NF_MGIllumina/README.md @@ -4,7 +4,7 @@ ### Implementation Tools -The current GeneLab Illumina metagenomics sequencing data processing pipeline (MGIllumina-A), [GL-DPPD-7107-A.md](../../Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md), is implemented as a [Nextflow](https://nextflow.io/) DSL2 workflow and utilizes [Singularity](https://docs.sylabs.io/guides/3.10/user-guide/introduction.html) containers, [docker](https://docs.docker.com/get-started/) containers, or [conda](https://docs.conda.io/en/latest/) environments to install/run all tools. This workflow is run using the command line interface (CLI) of any unix-based system. While knowledge of creating workflows in Nextflow is not required to run the workflow as is, [the Nextflow documentation](https://nextflow.io/docs/latest/index.html) is a useful resource for users who want to modify and/or extend this workflow. +The current GeneLab Illumina metagenomics sequencing data processing pipeline (MGIllumina-A), [GL-DPPD-7107-A.md](../../Pipeline_GL-DPPD-7107_Versions/GL-DPPD-7107-A.md), is implemented as a [Nextflow](https://nextflow.io/) DSL2 workflow and utilizes [Singularity](https://docs.sylabs.io/guides/3.10/user-guide/introduction.html) containers, [Docker](https://docs.docker.com/get-started/) containers, or [conda](https://docs.conda.io/en/latest/) environments to install/run all tools. This workflow is run using the command line interface (CLI) of any unix-based system. While knowledge of creating workflows in Nextflow is not required to run the workflow as is, [the Nextflow documentation](https://nextflow.io/docs/latest/index.html) is a useful resource for users who want to modify and/or extend this workflow. > **Note on reference databases** > Many reference databases are relied upon throughout this workflow. They will be installed and setup automatically the first time the workflow is run. All together, after installed and unpacked, they will take up about about 340 GB of storage, but they may also require up to 500GB during installation and initial un-packing, so be sure there is enough room on your system before running the workflow. 
@@ -13,24 +13,18 @@ The current GeneLab Illumina metagenomics sequencing data processing pipeline (M ## Utilizing the Workflow -1. [Installing Nextflow, Singularity, and conda](#1-install-nextflow-and-singularity) +1. [Installing Nextflow, Singularity, and conda](#1-installing-nextflow-singularity-and-conda) 1a. [Install Nextflow and conda](#1a-install-nextflow-and-conda) 1b. [Install Singularity](#1b-install-singularity) - 2. [Download the workflow files](#2-download-the-workflow-files) - 3. [Fetch Singularity Images](#3-fetch-singularity-images) - 4. [Run the workflow](#4-run-the-workflow) - 4a. [Approach 1: Run slurm jobs in Singularity containers with OSD or GLDS accession as input](#4a-approach-1-run-slurm-jobs-in-singularity-containers-with-osd-or-glds-accession-as-input) - 4b. [Approach 2: Run slurm jobs in Singularity containers with a csv file as input](#4b-approach-2-run-slurm-jobs-in-singularity-containers-with-a-csv-file-as-input) - 4c. [Approach 3: Run jobs locally in conda environments and specify the path to one or more existing conda environments](#4c-approach-3-run-jobs-locally-in-conda-environments-and-specify-the-path-to-one-or-more-existing-conda-environments) - 4d. [Modify parameters and cpu resources in the Nextflow config file](#4d-modify-parameters-and-cpu-resources-in-the-nextflow-config-file) - + 4a. [Approach 1: Start with OSD or GLDS accession as input](#4a-approach-1-start-with-an-osd-or-glds-accession-as-input) + 4b. [Approach 2: Start with a runsheet csv file as input](#4b-approach-2-start-with-a-runsheet-csv-file-as-input) + 4c. [Modify parameters and compute resources in the Nextflow config file](#4c-modify-parameters-and-compute-resources-in-the-nextflow-config-file) 5. [Workflow outputs](#5-workflow-outputs) 5a. [Main outputs](#5a-main-outputs) 5b. [Resource logs](#5b-resource-logs) - 6. [Post Processing](#6-post-processing)
@@ -125,26 +119,18 @@ Take care to use the proper number of hyphens for each argument.
-#### 4a. Approach 1: Run slurm jobs in Singularity containers with OSD or GLDS accession as input +#### 4a. Approach 1: Start with an OSD or GLDS accession as input ```bash -nextflow run main.nf -resume -profile slurm,singularity --accession OSD-574 +nextflow run main.nf -resume -profile singularity --accession OSD-574 ```
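For illustration, the same accession-based run can also be launched with the Docker profile instead of Singularity. This is a minimal sketch only; it assumes Docker is available on the host and relies on the `docker` profile defined in [nextflow.config](workflow_code/nextflow.config):

```bash
# Sketch: identical run to the command above, but resolving software
# through Docker containers rather than Singularity images.
nextflow run main.nf -resume -profile docker --accession OSD-574
```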
-#### 4b. Approach 2: Run slurm jobs in Singularity containers with a csv file as input +#### 4b. Approach 2: Start with a runsheet csv file as input ```bash -nextflow run main.nf -resume -profile slurm,singularity --input_file PE_file.csv -``` - -
- -#### 4c. Approach 3: Run jobs locally in conda environments and specify the path to one or more existing conda environment(s) - -```bash -nextflow run main.nf -resume -profile mamba --input_file SE_file.csv --conda_megahit +nextflow run main.nf -resume -profile singularity --input_file PE_file.csv ```
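As a minimal sketch of what such a runsheet can look like (the sample name and fastq.gz paths below are placeholders, not files shipped with the workflow), a paired-end `PE_file.csv` could be written and passed to the workflow like this:

```bash
# Hypothetical paired-end runsheet: a header line plus one row per sample,
# using the columns described in examples/runsheet (sample_id, forward, reverse, paired).
cat > PE_file.csv <<'EOF'
sample_id,forward,reverse,paired
Sample-A,/path/to/reads/Sample-A_R1_raw.fastq.gz,/path/to/reads/Sample-A_R2_raw.fastq.gz,true
EOF

# Then launch the workflow with the runsheet as input, as shown above.
nextflow run main.nf -resume -profile singularity --input_file PE_file.csv
```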
@@ -155,24 +141,34 @@ nextflow run main.nf -resume -profile mamba --input_file SE_file.csv --conda_meg * `-resume` - Resumes workflow execution using previously cached results -* `-profile` – Specifies the configuration profile(s) to load; `singularity` instructs Nextflow to setup and use Singularity for all software called in the workflow. - > Note: Use `docker` to instruct Nextflow to use the Docker container environment instead. +* `-profile` – Specifies the configuration profile(s) to load (multiple options can be provided as a comma-separated list) + * Software environment profile options (choose one): + * `singularity` - instructs Nextflow to use Singularity container environments + * `docker` - instructs Nextflow to use Docker container environments + * `conda` - instructs Nextflow to use conda environments via the conda package manager. By default, Nextflow will create environments at runtime using the yaml files in the [workflow_code/envs](workflow_code/envs/) folder. You can change this behavior by using the `--conda_*` workflow parameters or by editing the [nextflow.config](workflow_code/nextflow.config) file to specify a centralized conda environments directory via the `conda.cacheDir` parameter + * `mamba` - instructs Nextflow to use conda environments via the mamba package manager. + * Other option (can be combined with the software environment option above): + * `slurm` - instructs Nextflow to use the [Slurm cluster management and job scheduling system](https://slurm.schedmd.com/overview.html) to schedule and run the jobs on a Slurm HPC cluster. * `--accession` – A Genelab / OSD accession number e.g. OSD-574. - > *Required only if you would like to pull and process data directly from OSDR* + > *Required only if you would like to download and process data directly from OSDR* + +* `--input_file` – A single-end or paired-end runsheet csv file containing assay metadata for each sample, including sample_id, forward, reverse, and/or paired. Please see the [runsheet documentation](./examples/runsheet) in this repository for examples on how to format this file. + > *Required only if `--accession` is not passed as an argument* + +
-* `--input_file` – A single-end or paired-end input csv file containing assay metadata for each sample, including sample_id, forward, reverse, and/or paired. Please see the [runsheet documentation](./examples/runsheet) in this repository for examples on how to format this file. - > *Required only if --accession is not passed as an argument* +> See `nextflow run -h` and [Nextflow's CLI run command documentation](https://nextflow.io/docs/latest/cli.html#run) for more options and details on how to run Nextflow. +> For additional information on editing the `nextflow.config` file, see [Step 4c](#4c-modify-parameters-and-compute-resources-in-the-nextflow-config-file) below. -> See `nextflow run -h` and [Nextflow's CLI run command documentation](https://nextflow.io/docs/latest/cli.html#run) for more options and details on how to run Nextflow.
-#### 4d. Modify parameters and cpu resources in the nextflow config file +#### 4c. Modify parameters and compute resources in the Nextflow config file -Additionally, the parameters and workflow resources can be directly specified in the nextflow.config file. For detailed instructions on how to modify and set parameters in the nextflow.config file, please see the [documentation here](https://www.nextflow.io/docs/latest/config.html). +Additionally, all parameters and workflow resources can be directly specified in the [nextflow.config](./workflow_code/nextflow.config) file. For detailed instructions on how to modify and set parameters in the config file, please see the [documentation here](https://www.nextflow.io/docs/latest/config.html). -Once you've downloaded the workflow template, you can modify the parameters in the `params` scope and cpus/memory requirements in the `process` scope in your downloaded version of the [nextflow.config](workflow_code/nextflow.config) file as needed in order to match your dataset and system setup. Additionally, if necessary, you'll need to modify each variable in the [nextflow.config](workflow_code/nextflow.config) file to be consistent with the study you want to process and the machine you're using. +Once you've downloaded the workflow template, you can modify the parameters in the `params` scope and cpus/memory requirements in the `process` scope in your downloaded version of the [nextflow.config](workflow_code/nextflow.config) file as needed in order to match your dataset and system setup. Additionally, if necessary, you can modify each variable in the [nextflow.config](workflow_code/nextflow.config) file to be consistent with the study you want to process and the computer you're using for processing.
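As an alternative to editing the shipped config in place, site-specific settings can be layered on top of it at run time. The sketch below is hypothetical: the file name `my_overrides.config` and the specific values are placeholders, and the lowercase `-c` option simply merges the extra file into the workflow's configuration:

```bash
# Hypothetical override config: pin a default accession and adjust the
# per-process CPU/memory limits; the values here are placeholders only.
cat > my_overrides.config <<'EOF'
params {
    accession = "OSD-574"
}
process {
    cpus   = 8
    memory = '20 GB'
}
EOF

# -c (lowercase) merges my_overrides.config with the workflow's nextflow.config.
nextflow -c my_overrides.config run main.nf -resume -profile singularity
```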
@@ -214,7 +210,7 @@ nextflow run post_processing.nf --help To generate the post-processing files after running the main processing workflow successfully, modify and set the parameters in [post_processing.config](workflow_code/post_processing.config), then run the following command: ```bash -nextflow -C post_processing.config run post_processing.nf -resume -profile slurm,singularity +nextflow -C post_processing.config run post_processing.nf -resume -profile singularity ``` The outputs of the post-processing workflow are described below: From eee1c4c2e7b78a759623e0adba3b130765d1b588 Mon Sep 17 00:00:00 2001 From: Barbara Novak <19824106+bnovak32@users.noreply.github.com> Date: Thu, 8 May 2025 10:23:37 -0700 Subject: [PATCH 48/48] revert permissions on SW_MGIllumina scripts --- .../workflow_code/scripts/combine-all-gene-tables.py | 0 .../SW_MGIllumina/workflow_code/scripts/combine-benchmarks.sh | 0 .../combine-gene-level-coverages-annots-and-tax-per-group.py | 0 .../workflow_code/scripts/format-contig-tax-classifications.sh | 0 .../workflow_code/scripts/format-gene-tax-classifications.sh | 0 .../scripts/generate-assembly-based-overview-table.sh | 0 .../SW_MGIllumina/workflow_code/scripts/parse-MAG-annots.py | 0 .../SW_MGIllumina/workflow_code/scripts/slurm-status.py | 0 .../SW_MGIllumina/workflow_code/scripts/swap-MAG-IDs.py | 0 9 files changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/combine-all-gene-tables.py mode change 100644 => 100755 Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/combine-benchmarks.sh mode change 100644 => 100755 Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/combine-gene-level-coverages-annots-and-tax-per-group.py mode change 100644 => 100755 Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/format-contig-tax-classifications.sh mode change 100644 => 100755 Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/format-gene-tax-classifications.sh mode change 100644 => 100755 Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/generate-assembly-based-overview-table.sh mode change 100644 => 100755 Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/parse-MAG-annots.py mode change 100644 => 100755 Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/slurm-status.py mode change 100644 => 100755 Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/swap-MAG-IDs.py diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/combine-all-gene-tables.py b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/combine-all-gene-tables.py old mode 100644 new mode 100755 diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/combine-benchmarks.sh b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/combine-benchmarks.sh old mode 100644 new mode 100755 diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/combine-gene-level-coverages-annots-and-tax-per-group.py b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/combine-gene-level-coverages-annots-and-tax-per-group.py old mode 100644 new mode 100755 diff --git 
a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/format-contig-tax-classifications.sh b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/format-contig-tax-classifications.sh old mode 100644 new mode 100755 diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/format-gene-tax-classifications.sh b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/format-gene-tax-classifications.sh old mode 100644 new mode 100755 diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/generate-assembly-based-overview-table.sh b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/generate-assembly-based-overview-table.sh old mode 100644 new mode 100755 diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/parse-MAG-annots.py b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/parse-MAG-annots.py old mode 100644 new mode 100755 diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/slurm-status.py b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/slurm-status.py old mode 100644 new mode 100755 diff --git a/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/swap-MAG-IDs.py b/Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code/scripts/swap-MAG-IDs.py old mode 100644 new mode 100755
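The mode changes above only flip the executable bit back on the SW_MGIllumina helper scripts. As a hedged illustration (not a record of the exact commands used for this commit), a permissions-only change like this can be produced with:

```bash
# Restore the executable bit on the helper scripts and commit the
# resulting mode-only change; paths match the files listed above.
cd Metagenomics/Illumina/Workflow_Documentation/SW_MGIllumina/workflow_code
chmod +x scripts/*.py scripts/*.sh
git add scripts
git commit -m "revert permissions on SW_MGIllumina scripts"
```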