
Commit 9fc0765

Added figshare links and documentation
1 parent 0b779ec commit 9fc0765

File tree

4 files changed: +60 additions, -65 deletions


Amplicon/Illumina/Workflow_Documentation/SW_AmpIllumina-B/workflow_code/bin/Illumina-PE-R-processing.R

Lines changed: 7 additions & 8 deletions
@@ -4,7 +4,7 @@
 ## Developed by Michael D. Lee (Mike.Lee@nasa.gov) ##
 ##################################################################################
 
-# as called from the associated Snakefile, this expects to be run as: Rscript full-R-processing.R <left_trunc> <right_trunc> <left_maxEE> <right_maxEE> <TRUE/FALSE - GL trimmed primers or not> <unique-sample-IDs-file> <starting_reads_dir_for_R> <filtered_reads_dir> <input_file_R1_suffix> <input_file_R2_suffix> <filtered_filename_R1_suffix> <filtered_filename_R2_suffix> <final_outputs_directory> <output_prefix> <target_region> <concatenate_reads_only> <assay_suffix>
+# as called from the associated process, this expects to be run as: Rscript full-R-processing.R <left_trunc> <right_trunc> <left_maxEE> <right_maxEE> <TRUE/FALSE - GL trimmed primers or not> <unique-sample-IDs-file> <starting_reads_dir_for_R> <filtered_reads_dir> <input_file_R1_suffix> <input_file_R2_suffix> <filtered_filename_R1_suffix> <filtered_filename_R2_suffix> <final_outputs_directory> <output_prefix> <target_region> <concatenate_reads_only> <assay_suffix>
 # where <left_trim> and <right_trim> are the values to be passed to the truncLen parameter of dada2's filterAndTrim()
 # and <left_maxEE> and <right_maxEE> are the values to be passed to the maxEE parameter of dada2's filterAndTrim()
 
@@ -161,8 +161,7 @@ dna <- DNAStringSet(getSequences(seqtab.nochim))
 # downloading reference R taxonomy object (at some point this will be stored somewhere on GeneLab's server and we won't download it, but should leave the code here, just commented out)
 cat("\n\n Downloading reference database...\n\n")
 if ( target_region == "16S" ) {
-#download.file("http://www2.decipher.codes/Classification/TrainingSets/SILVA_SSU_r138_2019.RData", "SILVA_SSU_r138_2019.RData")
-download.file("https://figshare.com/ndownloader/files/23739737", "SILVA_SSU_r138_2019.RData")
+download.file("https://figshare.com/ndownloader/files/46245217", "SILVA_SSU_r138_2019.RData")
 # loading reference taxonomy object
 load("SILVA_SSU_r138_2019.RData")
 # removing downloaded file
@@ -171,22 +170,22 @@ if ( target_region == "16S" ) {
 
 } else if (target_region == "ITS" ) {
 
-download.file("http://www2.decipher.codes/Classification/TrainingSets/UNITE_v2023_July2023.RData", "UNITE_v2023_July2023.RData")
+download.file("https://figshare.com/ndownloader/files/46245586", "UNITE_v2020_February2020.RData")
 # loading reference taxonomy object
-load("UNITE_v2023_July2023.RData")
+load("UNITE_v2020_February2020.RData")
 # removing downloaded file
-file.remove("UNITE_v2023_July2023.RData")
+#file.remove("UNITE_v2020_February2020.RData")
 
 ranks <- c("kingdom", "phylum", "class", "order", "family", "genus", "species")
 
 } else if (target_region == "18S" ) {
 
-download.file("http://www2.decipher.codes/Classification/TrainingSets/PR2_v4_13_March2021.RData", "PR2_v4_13_March2021.RData")
+download.file("https://figshare.com/ndownloader/files/46241917", "PR2_v4_13_March2021.RData")
 # https://github.com/pr2database/pr2database/releases/download/v4.14.0/pr2_version_4.14.0_SSU.decipher.trained.rds
 # loading reference taxonomy object
 load("PR2_v4_13_March2021.RData")
 # removing downloaded file
-file.remove("PR2_v4_13_March2021.RData")
+#file.remove("PR2_v4_13_March2021.RData")
 
 ranks <- c("kingdom", "division", "phylum", "class", "order", "family", "genus", "species")
 
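All three branches above follow the same download, load, and (optionally) remove pattern, now pointed at figshare mirrors instead of www2.decipher.codes. A minimal sketch of that pattern for the 16S branch is shown below; the status check and the mode = "wb" argument are illustrative additions and are not part of the committed script.

# sketch: fetch the DECIPHER-formatted SILVA training set from the figshare mirror used in this commit
ref_url  <- "https://figshare.com/ndownloader/files/46245217"
ref_file <- "SILVA_SSU_r138_2019.RData"

status <- download.file(ref_url, ref_file, mode = "wb")   # download.file() returns 0 on success
if ( status != 0 ) stop("Download of the reference database failed: ", ref_url)

load(ref_file)          # loads the reference taxonomy object into the session
file.remove(ref_file)   # note the ITS and 18S branches now leave this step commented out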
Amplicon/Illumina/Workflow_Documentation/SW_AmpIllumina-B/workflow_code/bin/Illumina-SE-R-processing.R

Lines changed: 8 additions & 9 deletions
@@ -5,7 +5,7 @@
 ## Developed by Michael D. Lee (Mike.Lee@nasa.gov) ##
 ##################################################################################
 
-# as called from the associated Snakefile, this expects to be run as: Rscript full-R-processing.R <left_trunc> <left_maxEE> <TRUE/FALSE - GL trimmed primers or not> <unique-sample-IDs-file> <starting_reads_dir_for_R> <filtered_reads_dir> <input_file_R1_suffix> <filtered_filename_R1_suffix> <final_outputs_directory> <output_prefix> <target_region> <assay_suffix>
+# as called from the associated process, this expects to be run as: Rscript full-R-processing.R <left_trunc> <left_maxEE> <TRUE/FALSE - GL trimmed primers or not> <unique-sample-IDs-file> <starting_reads_dir_for_R> <filtered_reads_dir> <input_file_R1_suffix> <filtered_filename_R1_suffix> <final_outputs_directory> <output_prefix> <target_region> <assay_suffix>
 # where <left_trim> is the value to be passed to the truncLen parameter of dada2's filterAndTrim()
 # and <left_maxEE> is the value to be passed to the maxEE parameter of dada2's filterAndTrim()
 
@@ -130,8 +130,7 @@ dna <- DNAStringSet(getSequences(seqtab.nochim))
 # downloading reference R taxonomy object (at some point this will be stored somewhere on GeneLab's server and we won't download it, but should leave the code here, just commented out)
 cat("\n\n Downloading reference database...\n\n")
 if ( target_region == "16S" ) {
-#download.file("http://www2.decipher.codes/Classification/TrainingSets/SILVA_SSU_r138_2019.RData", "SILVA_SSU_r138_2019.RData")
-download.file("https://figshare.com/ndownloader/files/23739737", "SILVA_SSU_r138_2019.RData")
+download.file("https://figshare.com/ndownloader/files/46245217", "SILVA_SSU_r138_2019.RData")
 # loading reference taxonomy object
 load("SILVA_SSU_r138_2019.RData")
 # removing downloaded file
@@ -140,22 +139,22 @@ if ( target_region == "16S" ) {
 
 } else if (target_region == "ITS" ) {
 
-download.file("http://www2.decipher.codes/Classification/TrainingSets/UNITE_v2023_July2023.RData", "UNITE_v2023_July2023.RData")
+download.file("https://figshare.com/ndownloader/files/46245586", "UNITE_v2020_February2020.RData")
 # loading reference taxonomy object
-load("UNITE_v2023_July2023.RData")
+load("UNITE_v2020_February2020.RData")
 # removing downloaded file
-file.remove("UNITE_v2023_July2023.RData")
+#file.remove("UNITE_v2020_February2020.RData")
 
 ranks <- c("kingdom", "phylum", "class", "order", "family", "genus", "species")
 
 } else if (target_region == "18S" ) {
 
-download.file("http://www2.decipher.codes/Classification/TrainingSets/PR2_v4_13_March2021.RData", "PR2_v4_13_March2021.RData")
+download.file("https://figshare.com/ndownloader/files/46241917", "PR2_v4_13_March2021.RData")
+# https://github.com/pr2database/pr2database/releases/download/v4.14.0/pr2_version_4.14.0_SSU.decipher.trained.rds
 # loading reference taxonomy object
 load("PR2_v4_13_March2021.RData")
 # removing downloaded file
-file.remove("PR2_v4_13_March2021.RData")
-
+#file.remove("PR2_v4_13_March2021.RData")
 ranks <- c("kingdom", "division", "phylum", "class", "order", "family", "genus", "species")
 
 } else {
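Downstream of these hunks (not shown in this diff), each loaded .RData supplies a DECIPHER training set that is used to classify the ASV sequences in dna (built from seqtab.nochim, as the hunk headers show), and the ranks vectors label the resulting taxonomy table. Below is a minimal sketch of that step, assuming the standard DADA2 + DECIPHER pattern in which the loaded object is named trainingSet and classification is done with DECIPHER::IdTaxa(); the exact call used by these scripts is outside the scope of this commit.

library(DECIPHER)   # provides IdTaxa()
library(dada2)      # provides getSequences()

# classify the ASV sequences against the loaded training set
tax_info <- IdTaxa(dna, trainingSet, strand = "both", processors = NULL)

# build a taxonomy matrix labeled with the ranks vector defined above (one row per ASV)
tax_tab <- t(sapply(tax_info, function(x) {
    taxa <- x$taxon[-1]                                    # drop the leading "Root" level
    head(c(taxa, rep(NA, length(ranks))), length(ranks))   # pad or trim to length(ranks)
}))
colnames(tax_tab) <- ranks
rownames(tax_tab) <- getSequences(seqtab.nochim)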

Amplicon/Illumina/Workflow_Documentation/SW_AmpIllumina-B/workflow_code/main.nf

Lines changed: 38 additions & 37 deletions
@@ -20,72 +20,73 @@ if (params.help) {
 println("Example 2: : Submit and run jobs with slurm in conda environments.")
 println(" > nextflow run main.nf -resume -profile slurm_conda --csv_file SE_file.csv --target_region 1TS --F_primer AGAGTTTGATCCTGGCTCAG --R_primer CTGCCTCCCGTAGGAGT")
 println()
-println("Example 3: Run jobs locally in conda environments, supplying a GLDS accession, and specify the path to an existing conda environment")
-println(" > nextflow run main.nf -resume -profile conda --GLDS_accession OSD-256 --target_region 18S --F_primer AGAGTTTGATCCTGGCTCAG --R_primer CTGCCTCCCGTAGGAGT --conda.qc <path/to/existing/conda/environment>")
+println("Example 3: Run jobs locally in conda environments, supplying a GLDS accession, and specifying the path to an existing conda environment")
+println(" > nextflow run main.nf -resume -profile conda --GLDS_accession GLDS-487 --target_region 16S --F_primer AGAGTTTGATCCTGGCTCAG --R_primer CTGCCTCCCGTAGGAGT --conda.qc <path/to/existing/conda/environment>")
 println()
 println("Required arguments:")
-println("""-profile [STRING] What profile should be used be use to run the workflow. Options are [singularity, docker, conda, slurm_sing, slurm_conda].
+println("""-profile [STRING] What profile should be used to run the workflow. Options are [singularity, docker, conda, slurm_sing, slurm_conda].
 singularity, docker and conda will run the pipelne locally using singularity, docker, and conda, respectively.
 slurm_sing and slurm_conda will submit and run jobs using slurm in singularity containers and conda environments, respectively. """)
 println("--csv_file [PATH] A 3-column (single-end) or 4-column (paired-end) input file (sample_id, forward, [reverse,] paired). Mandatory if a GLDS accession is not provided.")
 println(" Please see the files: SE_file.csv and PE_file.csv for single-end and paired-end examples, respectively.")
 println(" The sample_id column should contain unique sample ids.")
 println(" The forward and reverse columns should contain the absolute or relative path to the sample's forward and reverse reads.")
 println(" The paired column should be true for paired-end or anything else for single-end reads.")
-println("--target_region [STRING] What is the amplicon target region to be analyzed. Options are one of [16S, 18S, ITS]. Default: 16S")
-println("--trim_primers [BOOLEAN] Should primers be trimmed? true or false. Default: true")
-println("--raw_R1_suffix [STRING] Raw forward reads suffix (region following the unique part of the sample names). e.g. _R1_raw.fastq.gz")
-println("--raw_R2_suffix [STRING] Raw reverse reads suffix (region following the unique part of the sample names). e.g. _R2_raw.fastq.gz")
+println("--target_region [STRING] What is the amplicon target region to be analyzed. Options are one of [16S, 18S, ITS]. Default: 16S.")
+println("--trim_primers [BOOLEAN] Should primers be trimmed? true or false. Default: true.")
+println("PLEASE NOTE: This workflow assumes that all your raw reads end with the same suffix. If they don't please modify your filenames to have the same suffix as shown below.")
+println("--raw_R1_suffix [STRING] Raw forward reads suffix (region following the unique part of the sample names). e.g. _R1_raw.fastq.gz.")
+println("--raw_R2_suffix [STRING] Raw reverse reads suffix (region following the unique part of the sample names). e.g. _R2_raw.fastq.gz.")
 
 println("Cutadapt (trimming) parameters:")
-println(" --F_primer [STRING] Forward primer sequence e.g. AGAGTTTGATCCTGGCTCAG")
-println(" --R_primer [STRING] Reverse primer sequence e.g. CTGCCTCCCGTAGGAGT")
-println(" --min_cutadapt_len [int] What should be the minimum read length after quality trimming with cutadapt. Default: 130")
-println(" --primers_linked [STRING] Are the primers linked?. https://cutadapt.readthedocs.io/en/stable/recipes.html#trimming-amplicon-primers-from-paired-end-reads. Default: TRUE ")
-println(" --discard_untrimmed [STRING] Should untrimmed reads be discarded? Any supplied string except TRUE will not discard them. Default: TRUE")
+println(" --F_primer [STRING] Forward primer sequence e.g. AGAGTTTGATCCTGGCTCAG. Default: emptry string.")
+println(" --R_primer [STRING] Reverse primer sequence e.g. CTGCCTCCCGTAGGAGT. Default: emptry string.")
+println(" --min_cutadapt_len [INTEGER] What should be the minimum read length after quality trimming with cutadapt. Default: 130.")
+println(" --primers_linked [STRING] Are the primers linked?. https://cutadapt.readthedocs.io/en/stable/recipes.html#trimming-amplicon-primers-from-paired-end-reads. Default: TRUE. ")
+println(" --discard_untrimmed [STRING] Should untrimmed reads be discarded? Any supplied string except TRUE will not discard them. Default: TRUE.")
 
 println("Optional arguments:")
-println(" --help Print this help message and exit")
-println(" --publishDir_mode [STRING] How should nextflow publish file outputs. Options can be found here https://www.nextflow.io/docs/latest/process.html#publishdir Default: link.")
-println(" --errorStrategy [STRING] How should nextflow handle errors. Options can be found here https://www.nextflow.io/docs/latest/process.html#errorstrategy. Default: ignore")
-println(" --enable_visualizations [BOOLEAN] Should ASV plots be made? true or false. if true supply a path to the ruhnsheet for plotting to the --runsheet option. Default: false")
-println(" --runsheet [PATH] A 4-column file with these exact headers [ Sample Name, read1_path, raw_R1_suffix, groups] for plotting. Only relevant if --enable_visualizations is true. Default: null")
-println(" --multiqc_config [PATH] Path to a custome multiqc config file. Default: config/multiqc.config")
+println(" --help Print this help message and exit.")
+println(" --publishDir_mode [STRING] How should nextflow publish file outputs. Options can be found here https://www.nextflow.io/docs/latest/process.html#publishdir. Default: link.")
+println(" --errorStrategy [STRING] How should nextflow handle errors. Options can be found here https://www.nextflow.io/docs/latest/process.html#errorstrategy. Default: terminate")
+println(" --enable_visualizations [BOOLEAN] Should ASV plots be made? true or false. if true supply a path to the ruhnsheet for plotting to the --runsheet option. Default: false.")
+println(" --runsheet [PATH] A 4-column file with these exact headers [Sample Name, read1_path, raw_R1_suffix, groups] for plotting. Only relevant if --enable_visualizations is true. Default: null.")
+println(" --multiqc_config [PATH] Path to a custome multiqc config file. Default: config/multiqc.config.")
 
 println("Dada2 parameters passed to filterAndTrim() function:")
-println(" --left_trunc [INT] truncate the sequences to the left by this number of bases. Default: 0")
-println(" --right_trunc [INT] truncate the sequences to the right by this number of bases. Default: 0")
-println(" --left_maxEE [INT] Maximum allowed errors to the left. Default: 1")
-println(" --right_maxEE [INT] Maximum allowed errors to the right. Default: 1")
+println(" --left_trunc [INTEGER] truncate the sequences to the left by this number of bases. Default: 0.")
+println(" --right_trunc [INTEGER] truncate the sequences to the right by this number of bases. Default: 0.")
+println(" --left_maxEE [INTEGER] Maximum allowed errors to the left. Default: 1.")
+println(" --right_maxEE [INTEGER] Maximum allowed errors to the right. Default: 1.")
 println(" --concatenate_reads_only [STRING] Concatenate only with dada2 instead of merging paired reads if TRUE.")
 println(" This is typically used with primers like 515-926, that captured 18S fragments that are typically too long to merge.")
-println(" Note that 16S and 18S should have been separated already prior to running this workflow. This should likely be left as FALSE for any option other than 18S above")
-println(" Values are TRUE or FALSE Default: FALSE")
+println(" Note that 16S and 18S should have been separated already prior to running this workflow. This should likely be left as FALSE for any option other than 18S above.")
+println(" Values are TRUE or FALSE Default: FALSE.")
 
 println("File Suffixes:")
-println(" --primer_trimmed_R1_suffix [STRING] Suffix to use for naming your primer trimmed forward reads. Default: _R1_trimmed.fastq.gz")
-println(" --primer_trimmed_R2_suffix [STRING] Suffix to use for naming your primer trimmed reverse reads. Default: _R2_trimmed.fastq.gz")
-println(" --filtered_R1_suffix [STRING] Suffix to use for naming your quality filtered forward reads. Default: _R1_filtered.fastq.gz")
-println(" --filtered_R2_suffix [STRING] Suffix to use for naming your quality filtered reverse reads. Default: _R2_filtered.fastq.gz")
+println(" --primer_trimmed_R1_suffix [STRING] Suffix to use for naming your primer trimmed forward reads. Default: _R1_trimmed.fastq.gz.")
+println(" --primer_trimmed_R2_suffix [STRING] Suffix to use for naming your primer trimmed reverse reads. Default: _R2_trimmed.fastq.gz.")
+println(" --filtered_R1_suffix [STRING] Suffix to use for naming your quality filtered forward reads. Default: _R1_filtered.fastq.gz.")
+println(" --filtered_R2_suffix [STRING] Suffix to use for naming your quality filtered reverse reads. Default: _R2_filtered.fastq.gz.")
 println("Output directories:")
-println(" --raw_reads_dir [PATH] Where should the fastqc report of the raw reads be stored. Default: Raw_Sequence_Data/")
-println(" --fastqc_out_dir [PATH] Where should multiqc outputs be stored. Default: workflow_output/FastQC_Outputs/")
-println(" --trimmed_reads_dir [PATH] Where should your cutadapt trimmed reads be stored. Default: workflow_output/Trimmed_Sequence_Data/")
-println(" --filtered_reads_dir [PATH] Where should your filtered reads be stored. Default: workflow_output/Filtered_Sequence_Data/")
-println(" --info_out_dir [PATH] Where should output metadata be stored. Default: workflow_output/Metadata/")
-println(" --plots_dir [PATH] Where should your plots be stored if visualization is enabled. Default: workflow_output/Final_Outputs/Plots/")
-println(" --final_outputs_dir [PATH] Where should most outputs and summary reports be stored. Default: workflow_output/Final_Outputs/")
+println(" --raw_reads_dir [PATH] Where should the fastqc report of the raw reads be stored. Default: Raw_Sequence_Data/.")
+println(" --fastqc_out_dir [PATH] Where should multiqc outputs be stored. Default: workflow_output/FastQC_Outputs/.")
+println(" --trimmed_reads_dir [PATH] Where should your cutadapt trimmed reads be stored. Default: workflow_output/Trimmed_Sequence_Data/.")
+println(" --filtered_reads_dir [PATH] Where should your filtered reads be stored. Default: workflow_output/Filtered_Sequence_Data/.")
+println(" --info_out_dir [PATH] Where should output metadata be stored. Default: workflow_output/Metadata/.")
+println(" --plots_dir [PATH] Where should your plots be stored if visualization is enabled. Default: workflow_output/Final_Outputs/Plots/.")
+println(" --final_outputs_dir [PATH] Where should most outputs and summary reports be stored. Default: workflow_output/Final_Outputs/.")
 println("Genelab specific arguements:")
 println(" --GLDS_accession [STRING] A Genelab accession number if the --csv_file parameter is not set. If this parameter is set, it will ignore the --csv_file parameter.")
 println(" --assay_suffix [STRING] Genelabs assay suffix. Default: GLAmpSeq.")
-println(" --output_prefix [STRING] Unique name to tag onto output files. Default: ''")
+println(" --output_prefix [STRING] Unique name to tag onto output files. Default: empty string.")
 println("Paths to existing conda environments to use otherwise a new one will be created using the yaml file in envs/.")
 println(" --conda.qc [PATH] Path to a conda environment containing fastqc, multiqc, zip and python. Default: null.")
 println(" --conda.R [PATH] Path to a conda environment containing R along with the packages decipher and biomformat installed. Default: null.")
 println(" --conda.genelab [PATH] Path to a conda environment containing genlab-utils. Default: null.")
 println(" --conda.cutadapt [PATH] Path to a conda environment containing cutadapt. Default: null.")
 println(" --conda.R_visualizations [PATH] Path to a conda environment containing R packages required for plotting. Default: null.")
-print("Advanced users can edit the nextflow.config file for more control over default settings such container choice, number cpus, memory per task etc.")
+print("Advanced users can edit the nextflow.config file for more control over default settings such container choice, number of cpus, memory per task etc.")
 exit 0
 }
 