add bowtie2 alignment

torres-alexis · torres-alexis · commit 20161fa805a5 · 2025-01-27T12:26:59.000-08:00
diff --git a/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/conf/by_docker_image.config b/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/conf/by_docker_image.config
@@ -44,6 +44,16 @@ process {
         container = "quay.io/biocontainers/star:2.7.11b--h43eeafb_1"
     }
 
+    withName: 'BUILD_BOWTIE2_INDEX|ALIGN_BOWTIE2' {
+        // Bowtie2 2.5.4 (12/10/2024); one image for both processes, so the index is built and read by the same Bowtie2 release
+        container = "quay.io/biocontainers/bowtie2:2.5.4--he96a11b_5"
+    }
+
+    withName: 'FEATURECOUNTS' {
+        // featureCounts 2.0.8 (12/10/2024), taken from the Subread biocontainer image
+        container = "quay.io/biocontainers/subread:2.0.8--h577a1d6_0"
+    }
+
     withName: 'BUILD_RSEM_INDEX|COUNT_ALIGNED' {
         // RSEM (02/14/2020)
         // Known issue: version is printed as 1.31 https://github.com/deweylab/RSEM/issues/153 
diff --git a/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/main.nf b/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/main.nf
@@ -58,7 +58,7 @@ if ((params.accession) || params.runsheet_path || params.isa_archive_path) {
 }
 
 include { RNASEQ } from './workflows/rnaseq.nf'
-
+include { RNASEQ_MICROBES } from './workflows/rnaseq_microbes.nf'
 // Validate accession format. Must be OSD-#. 
 if (params.accession && !params.accession.matches(/^(OSD|GLDS)-\d+$/)) {
     log.error "Invalid accession format. Expected format: OSD-# or GLDS-#"
@@ -102,7 +102,22 @@ ch_reference_gtf = params.reference_gtf ? Channel.fromPath(params.reference_gtf)
 // Main workflows
 workflow {
     if (params.mode == 'microbes') {
-        //RNASEQ_MICROBES() // Uncomment after implemented
+        RNASEQ_MICROBES(
+            ch_dp_tools_plugin,
+            ch_reference_table,
+            ch_accession,
+            ch_isa_archive,
+            ch_runsheet,
+            ch_api_url,
+            ch_force_single_end,
+            ch_truncate_to,
+            ch_reference_source,
+            ch_reference_version,
+            ch_reference_fasta,
+            ch_reference_gtf,
+            ch_reference_store_path,
+            ch_derived_store_path
+        )
     } else {
         RNASEQ(
             ch_dp_tools_plugin,
diff --git a/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/modules/align_bowtie2.nf b/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/modules/align_bowtie2.nf
@@ -0,0 +1,37 @@
+process ALIGN_BOWTIE2 {
+  // Aligns one sample's reads to a prebuilt Bowtie2 index; the BAM, Bowtie2 log and unaligned reads are written under a directory named meta.id
+  tag "Sample: ${ meta.id }"
+
+  input:
+    tuple val(meta), path(reads) // meta.id names the outputs; meta.paired_end selects -1/-2 versus -U
+    path(bowtie2_index_dir) // directory holding the Bowtie2 index files
+
+  output:
+    path("${ meta.id }/${ meta.id }*"), emit: publishables // used to ensure direct files are available for publishing directive
+    path("${ meta.id }/${ meta.id }.bowtie2.log"), emit: alignment_logs
+    tuple val(meta), path("${ meta.id }/${meta.id}.bam"), emit: bam
+    path("${ meta.id }/${ meta.id }.unmapped.fastq"), emit: unmapped_reads
+    path("versions.yml"), emit: versions
+
+  script:
+    def readArgs = meta.paired_end ? "-1 ${ reads[0] } -2 ${ reads[1] }" : "-U ${ reads }"
+    // BUILD_BOWTIE2_INDEX writes the index as <dir>/<dir>.*, so the index basename is the staged directory's own name
+    def indexPrefix = "${ bowtie2_index_dir }/${ bowtie2_index_dir.name }"
+    // NOTE(review): --un keeps only unpaired unaligned reads (paired-end runs may need --un-conc), and samtools must exist in this process's container; confirm both
+    """
+    export BOWTIE2_INDEXES=${ bowtie2_index_dir }
+    mkdir -p ${ meta.id }
+    bowtie2 -x ${ indexPrefix } \
+    ${readArgs} \
+    --threads ${ task.cpus } \
+    --minins 0 \
+    --maxins 500 \
+    -k 1 \
+    --un ${ meta.id }/${ meta.id }.unmapped.fastq \
+    2> ${ meta.id }/${ meta.id }.bowtie2.log \
+    | samtools view -bS --threads ${ task.cpus } -o ${ meta.id }/${ meta.id }.bam -
+
+    echo '"${task.process}":' > versions.yml
+    echo "    bowtie2: \$(echo \$(bowtie2 --version 2>&1) | sed 's/^.*bowtie2-align-s version //; s/ .*\$//')" >> versions.yml
+    """
+}
diff --git a/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/modules/build_bowtie2_index.nf b/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/modules/build_bowtie2_index.nf
@@ -0,0 +1,25 @@
+process BUILD_BOWTIE2_INDEX {
+  // Builds Bowtie 2 index, this is ercc-spike-in and organism specific
+  // storeDir keeps the built index under derived_store_path, keyed by reference source (plus version for Ensembl) and organism
+  tag "Refs: ${ genome_fasta }, ${ genome_gtf }, Source: ${reference_source}${reference_source.toLowerCase().contains('ensembl') ? ', Version: ' + reference_version : ''}${params.genome_subsample ? ', GenomeSubsample: ' + params.genome_subsample : ''}"
+  storeDir "${ derived_store_path }/Bowtie2_Indices/${ reference_source }/${reference_source.toLowerCase().contains('ensembl') ? reference_version + '/' : ''}${ organism_sci }"
+
+  input:
+    val(derived_store_path) // root of the storeDir location
+    val(organism_sci) // last path component of the storeDir location
+    val(reference_source) // shown in the tag and used in the storeDir location
+    val(reference_version) // only used when reference_source contains 'ensembl'
+    tuple path(genome_fasta), path(genome_gtf) // the GTF only appears in the tag; the index is built from the FASTA alone
+    val(meta) // not referenced by this process; kept so existing callers stay valid
+
+  output:
+    path("${ genome_fasta.baseName }"), emit: index_dir // index files inside share the directory's name as basename
+
+  script:
+    """
+    mkdir -p ${ genome_fasta.baseName }
+    bowtie2-build --threads ${task.cpus} \
+      -f ${ genome_fasta } \
+      ${ genome_fasta.baseName }/${ genome_fasta.baseName }
+    """
+}
diff --git a/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/modules/dge_deseq2.nf b/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/modules/dge_deseq2.nf
@@ -3,8 +3,7 @@
  */
 // ERCC counts are removed before normalization
 
-process DESEQ2_DGE {
-    tag "Dataset-wide"
+process DGE_DESEQ2 {
 
     input:
         val(meta)
diff --git a/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/modules/featurecounts.nf b/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/modules/featurecounts.nf
@@ -0,0 +1,31 @@
+process FEATURECOUNTS {
+  // Counts reads per gene (exon features grouped by gene_id) across all supplied BAM files in one featureCounts call
+
+  input:
+    val(meta) // meta.paired_end switches on paired-end counting
+    tuple path(genomeFasta), path(genomeGtf) // only the GTF is passed to featureCounts
+    val(strandedness) // "unstranded" -> -s 0, "sense" -> -s 1, any other value -> -s 2
+    path(bam_files)
+
+  output:
+    tuple path("FeatureCounts_GLbulkRNAseq.csv"), path("FeatureCounts_GLbulkRNAseq.csv.summary"), emit: publishables
+    path("versions.yml"), emit: versions
+  script:
+    // Since Subread 2.0.2, -p only declares the input as paired-end; --countReadPairs is needed to count fragments rather than individual reads
+    def pairedOption = meta.paired_end ? "-p --countReadPairs" : ""
+    def strandOption = (strandedness == "unstranded") ? 0 : (strandedness == "sense") ? 1 : 2
+    def bamList = bam_files.join(' ')
+    """
+    featureCounts ${pairedOption} \
+    -T ${ task.cpus } \
+    -a ${ genomeGtf } \
+    -s ${strandOption} \
+    -t exon \
+    -g gene_id \
+    -o "FeatureCounts_GLbulkRNAseq.csv" \
+    ${bamList}
+
+    echo '"${task.process}":' > versions.yml
+    echo "    featurecounts: \$(echo \$(featureCounts -v 2>&1) | sed 's/^.*featureCounts v//; s/ .*\$//')" >> versions.yml
+    """
+}
diff --git a/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/modules/gtf_to_bed.nf b/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/modules/gtf_to_bed.nf
@@ -0,0 +1,19 @@
+process GTF_TO_BED {
+    // Turns the reference GTF into a BED file named after it; storeDir places the result under derived_store_path
+    storeDir "${derived_store_path}/Genome_GTF_BED_Files/${ reference_source }/${ reference_source.toLowerCase().contains('ensembl') ? reference_version + '/' : '' }${ organism_sci }/microbes"
+
+    input:
+        val(derived_store_path) // base directory of the storeDir location
+        val(organism_sci) // organism component of the storeDir location
+        val(reference_source) // source component of the storeDir location
+        val(reference_version) // version component, added only for sources containing 'ensembl'
+        path(genome_gtf)
+
+    output:
+        path("${genome_gtf.baseName}.bed"), emit: genome_bed
+    script:
+    def bedName = "${genome_gtf.baseName}.bed"
+    """
+    gtf_to_bed.py ${genome_gtf} ${bedName}
+    """
+}
diff --git a/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/workflows/rnaseq.nf b/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/workflows/rnaseq.nf
@@ -45,10 +45,10 @@ include { MULTIQC as ALL_MULTIQC } from '../modules/multiqc.nf' addParams(MQCLab
 //include { QUALIMAP_BAM_QC } from '../modules/qualimap.nf' not implemented
 //include { QUALIMAP_RNASEQ_QC } from '../modules/qualimap.nf' not implemented
 
-include { DESEQ2_DGE } from '../modules/deseq2_dge.nf'
+include { DGE_DESEQ2 } from '../modules/dge_deseq2.nf'
 include { ADD_GENE_ANNOTATIONS } from '../modules/add_gene_annotations.nf'
-include { EXTEND_DGE_TABLE } from '../modules/extend_dge_table.nf'
-include { GENERATE_PCA_TABLE } from '../modules/generate_pca_table.nf'
+// include { EXTEND_DGE_TABLE } from '../modules/extend_dge_table.nf'
+// include { GENERATE_PCA_TABLE } from '../modules/generate_pca_table.nf'
 
 
 include { VV_RAW_READS;
@@ -282,8 +282,8 @@ workflow RNASEQ {
         
 
         // Normalize counts, DGE 
-        DESEQ2_DGE( ch_meta, runsheet_path, COUNT_ALIGNED.out.genes_results | toSortedList )
-        dge_table = DESEQ2_DGE.out.dge_table
+        DGE_DESEQ2( ch_meta, runsheet_path, COUNT_ALIGNED.out.genes_results | toSortedList )
+        dge_table = DGE_DESEQ2.out.dge_table
         // Add annotations to DGE table
         ADD_GENE_ANNOTATIONS( ch_meta, gene_annotations_url, dge_table )
         annotated_dge_table = ADD_GENE_ANNOTATIONS.out.annotated_dge_table
@@ -292,7 +292,7 @@ workflow RNASEQ {
         //EXTEND_DGE_TABLE( annotated_dge_table )
         // Generate PCA table from normalized counts 
         // Step being removed on update
-        //GENERATE_PCA_TABLE ( DESEQ2_DGE.out.norm_counts | map { it[1] })
+        //GENERATE_PCA_TABLE ( DGE_DESEQ2.out.norm_counts | map { it[1] })
 
         // Parse QC metrics
         all_multiqc_output = RAW_READS_MULTIQC.out.data
@@ -380,12 +380,12 @@ workflow RNASEQ {
         //     QUANTIFY_RSEM_GENES.out.publishables,
         //     COUNT_MULTIQC.out.zipped_report,
         //     COUNT_MULTIQC.out.unzipped_report,
-        //     DESEQ2_DGE.out.norm_counts,
-        //     DESEQ2_DGE.out.contrasts
-        //         .mix( DESEQ2_DGE.out.sample_table )
+        //     DGE_DESEQ2.out.norm_counts,
+        //     DGE_DESEQ2.out.contrasts
+        //         .mix( DGE_DESEQ2.out.sample_table )
         //         .mix( annotated_dge_table )
         //         .mix( GENERATE_PCA_TABLE.out.pca_table ),
-        //     DESEQ2_DGE.out.norm_counts_ercc | ifEmpty( { file("NO_FILES.placeholder") }),
+        //     DGE_DESEQ2.out.norm_counts_ercc | ifEmpty( { file("NO_FILES.placeholder") }),
         //     DGE_BY_DESEQ2.out.dge_ercc | ifEmpty( { file("NO_FILES.placeholder") }),
         //     "${ projectDir }/bin/dp_tools__NF_RCP" // dp_tools plugin
         //             )
diff --git a/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/workflows/rnaseq_microbes.nf b/RNAseq/Workflow_Documentation/NF_RCP/workflow_code/workflows/rnaseq_microbes.nf