IARCbioinfo
diff --git a/‎.circleci/config.yml
Lines changed: 1 addition & 0 deletions b/‎.circleci/config.yml
Lines changed: 1 addition & 0 deletions
diff --git a/‎Dockerfile
Lines changed: 5 additions & 16 deletions b/‎Dockerfile
Lines changed: 5 additions & 16 deletions
diff --git a/‎README.md
Lines changed: 9 additions & 2 deletions b/‎README.md
Lines changed: 9 additions & 2 deletions
diff --git a/‎RNAseq.nf
Lines changed: 33 additions & 30 deletions b/‎RNAseq.nf
Lines changed: 33 additions & 30 deletions
diff --git a/‎Singularity/Singularity.v2.3
Lines changed: 9 additions & 0 deletions b/‎Singularity/Singularity.v2.3
Lines changed: 9 additions & 0 deletions
diff --git a/‎dag_STAR.html
Lines changed: 3 additions & 3 deletions b/‎dag_STAR.html
Lines changed: 3 additions & 3 deletions
diff --git a/‎dag_STAR.png
-12.1 KB b/‎dag_STAR.png
-12.1 KB
diff --git a/‎dag_STAR_sjtrim.html
Lines changed: 4 additions & 4 deletions b/‎dag_STAR_sjtrim.html
Lines changed: 4 additions & 4 deletions
diff --git a/‎dag_STAR_sjtrim.png
-11.6 KB b/‎dag_STAR_sjtrim.png
-11.6 KB
@@ -16,6 +16,7 @@ jobs:
                         - run: cd ; nextflow run ~/project/RNAseq.nf -with-docker iarcbioinfo/rnaseq-nf --input_folder ~/data_test/BAM/ --output_folder BAM_realigned_sjtrim --ref_folder  ~/data_test/REF --gtf ~/data_test/REF/TP53_small.gtf --bed ~/data_test/BED/TP53_small.bed --cpu 2 --mem 4 --sjtrim --ref ~/data_test/REF/17_7572000-7591000.fasta -with-dag dag_STAR_sjtrim.html
                         - run: cd ; nextflow run ~/project/RNAseq.nf -with-docker iarcbioinfo/rnaseq-nf --input_folder ~/data_test/BAM/ --output_folder BAM_realigned_sjtrim --ref_folder  ~/data_test/REF --gtf ~/data_test/REF/TP53_small.gtf --bed ~/data_test/BED/TP53_small.bed --cpu 2 --mem 4 --sjtrim --recalibration --snp_vcf ~/data_test/REF/dbsnp_138.17_7572000-7591000.vcf.gz --indel_vcf ~/data_test/REF/1000G_phase1.indels.17_7572000-7591000.sites.vcf.gz --ref ~/data_test/REF/17_7572000-7591000.fasta -with-dag dag_STAR_sjtrim_bqsr.png
                         - run: cd ; nextflow run ~/project/RNAseq.nf -with-docker iarcbioinfo/rnaseq-nf --input_folder ~/data_test/BAM/ --output_folder BAM_realigned_sjtrim --ref_folder  ~/data_test/REF --gtf ~/data_test/REF/TP53_small.gtf --bed ~/data_test/BED/TP53_small.bed --cpu 2 --mem 4 --sjtrim --recalibration --snp_vcf ~/data_test/REF/dbsnp_138.17_7572000-7591000.vcf.gz --indel_vcf ~/data_test/REF/1000G_phase1.indels.17_7572000-7591000.sites.vcf.gz --ref ~/data_test/REF/17_7572000-7591000.fasta -with-dag dag_STAR_sjtrim_bqsr.html
+                        - run: cd ; echo -e 'SM\tRG\tpair1\tpair2\nNA06984\t\tdata_test/FASTQ/NA06984_T_1.fastq.gz\tdata_test/FASTQ/NA06984_T_2.fastq.gz\nNA06984_2RG\tRG1\tdata_test/FASTQ/NA06984_T_RG1_1.fastq.gz\tdata_test/FASTQ/NA06984_T_RG1_2.fastq.gz\nNA06984_2RG\tRG2\tdata_test/FASTQ/NA06984_T_RG2_1.fastq.gz\tdata_test/FASTQ/NA06984_T_RG2_2.fastq.gz' > input.txt ; nextflow run ~/project/RNAseq.nf -with-docker iarcbioinfo/rnaseq-nf --input_file input.txt --output_folder BAM_inputfile --ref_folder  ~/data_test/REF --gtf ~/data_test/REF/TP53_small.gtf --bed ~/data_test/BED/TP53_small.bed --fastq_ext fastq.gz --cpu 2 --cpu_trim 2 --mem 4 --sjtrim --recalibration --cutadapt --snp_vcf ~/data_test/REF/dbsnp_138.17_7572000-7591000.vcf.gz --indel_vcf ~/data_test/REF/1000G_phase1.indels.17_7572000-7591000.sites.vcf.gz --ref ~/data_test/REF/17_7572000-7591000.fasta
                         - run: cd ; cp ~/dag* ~/project/.
                         - deploy:
                                 branch: [master, dev]
 
@@ -1,13 +1,12 @@
 ################## BASE IMAGE #####################
-FROM nfcore/base
-
+FROM continuumio/miniconda3:4.7.12
 
 ################## METADATA #######################
 
-LABEL base_image="nfcore/base"
-LABEL version="1.0"
+LABEL base_image="continuumio/miniconda3"
+LABEL version="4.7.12"
 LABEL software="rnaseq-nf"
-LABEL software.version="2.2"
+LABEL software.version="2.3"
 LABEL about.summary="Container image containing all requirements for rnaseq-nf"
 LABEL about.home="http://github.com/IARCbioinfo/RNAseq-nf"
 LABEL about.documentation="http://github.com/IARCbioinfo/RNAseq-nf/README.md"
@@ -17,18 +16,8 @@ LABEL about.license="GNU-3.0"
 ################## MAINTAINER ######################
 MAINTAINER **nalcala** <**alcalan@fellows.iarc.fr**>
 
-
-#RUN mkdir -p /var/cache/apt/archives/partial && \
-#	touch /var/cache/apt/archives/lock && \
-#	chmod 640 /var/cache/apt/archives/lock && \
-#	apt-get update -y &&\
-#	apt-get install -y gnupg2
-#	RUN	apt-key adv --keyserver keyserver.ubuntu.com --recv-keys F76221572C52609D && \
-#	apt-get clean && \
-#	apt-get update -y && \
-
 ################## INSTALLATION ######################
 COPY environment.yml /
+RUN apt-get update && apt-get install -y --no-install-recommends procps && apt-get clean && rm -rf /var/lib/apt/lists/*
 RUN conda env create -n rnaseq-nf -f /environment.yml && conda clean -a
 ENV PATH /opt/conda/envs/rnaseq-nf/bin:$PATH
-#RUN echo ". /opt/conda/etc/profile.d/conda.sh"  >> ~/.bashrc 
@@ -20,6 +20,9 @@ Nextflow pipeline for RNA sequencing mapping, quality control, reads counting, a
 5. [*STAR*](https://github.com/alexdobin/STAR/blob/master/doc/STARmanual.pdf)
 6. [*htseq*](http://www-huber.embl.de/HTSeq/doc/install.html#install); the python script htseq-count must also be in the PATH
 
+**A singularity container is available with all the tools needed to run the pipeline (see "Usage")**
+
+### References
 A bundle with reference genome and corresponding annotations for STAR is available at https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/.
 
 Alternatively, STAR genome indices can be generated from a genome fasta file ref.fa and a splice junction annotation file ref.gtf using the following command:
@@ -66,7 +69,7 @@ In order to perform the optional base quality score recalibration, several files
   |--input_folder    | a folder with fastq files or bam files |
   |--input_file |  input tabulation-separated values file with columns SM (sample name), RG (read group), pair1 (first fastq pair file), and pair2 (second fastq pair file) |
 
-  Note that there are two input methods--folder and file. Although the input folder method is the easiest because it does not require to create an input file with the right format, the input file mode is recommended in cases when a single sample has multiple paired files (e.g., due to multiplexed sequencing); in that case, users should have one line per pair of file and put a same SM identifier so that the workflow can group them into the same output bam file.
+  Note that there are two input methods: folder and file. Although the input folder method is the easiest because it does not require creating an input file with the right format, the input file mode is recommended in cases when a single sample has multiple paired files (e.g., due to multiplexed sequencing); in that case, users should have one line per pair of files and use the same SM identifier so that the workflow can group them into the same output bam file.
 
 
 ## Parameters
@@ -95,9 +98,11 @@ In order to perform the optional base quality score recalibration, several files
 |--ref |    ref.fa | reference genome fasta file for GATK |
 |--snp_vcf |  dbsnp.vcf | VCF file with known variants for GATK BQSR |
 |--indel_vcf |  Mills_100G_indels.vcf | VCF file with known indels for GATK BQSR |
+|--STAR_mapqUnique | 255  | STAR default mapping quality for unique mappers |
 |--RG          |  PL:ILLUMINA | string to be added to read group information in BAM file |
 |--stranded   |  no | Strand information for counting with htseq [no, yes, reverse] | 
 |--hisat2_idx   |  genome_tran | index filename prefix for hisat2 | 
+|--htseq_maxreads | 30000000 | Maximum number of reads taken into account by htseq-count |
 |--multiqc_config   |  null | config yaml file for multiqc | 
 
 
@@ -115,8 +120,10 @@ In order to perform the optional base quality score recalibration, several files
 ## Usage
 To run the pipeline on a series of paired-end fastq files (with suffixes *_1* and *_2*) in folder *fastq*, a reference genome with indexes in folder *ref_genome*, an annotation file ref.gtf, and a bed file ref.bed, one can type:
 ```bash
-nextflow run iarcbioinfo/RNAseq-nf --input_folder fastq --ref_folder ref_genome --gtf ref.gtf --bed ref.bed
+nextflow run iarcbioinfo/RNAseq-nf -r v2.2 -profile singularity --input_folder fastq --ref_folder ref_genome --gtf ref.gtf --bed ref.bed
 ``` 
+To run the pipeline without singularity, just remove "-profile singularity".
+
 ### Use hisat2 for mapping
 To use hisat2 instead of STAR for the reads mapping, you must add the ***--hisat2* option**, specify the path to the folder containing the hisat2 index files (genome_tran.1.ht2 to genome_tran.8.ht2), as well as satisfy the requirements above mentionned. For example:
 ```bash
 
@@ -48,7 +48,7 @@ params.help         = null
 
 log.info "" 
 log.info "--------------------------------------------------------"
-log.info "  RNAseq-nf 2.1.0: alignment, QC, and reads counting workflow for RNA sequencing "
+log.info "  RNAseq-nf 2.3.0: alignment, QC, and reads counting workflow for RNA sequencing "
 log.info "--------------------------------------------------------"
 log.info "Copyright (C) IARC/WHO"
 log.info "This program comes with ABSOLUTELY NO WARRANTY; for details see LICENSE"
@@ -184,7 +184,7 @@ if(params.input_file){
 	    if (file(params.input_folder).listFiles().findAll { it.name ==~ /.*bam/ }.size() > 0){
 	        println "BAM files found, proceed with realignment"; mode ='bam'
 		files = Channel.fromPath( params.input_folder+'/*.bam' )
-		               .map {  path -> [ path.name.replace(".bam",""), path.name.replace(".bam","") ,  path ] }
+		               .map {  path -> [ path.name.replace(".bam",""), "" ,  path ] }
 	    }else{
 	        println "ERROR: input folder contains no fastq nor BAM files"; System.exit(0)
 	    }
@@ -208,9 +208,7 @@ if(mode=='bam'){
 
         '''
         set -o pipefail
-        samtools collate -uOn 128 !{file_tag}.bam tmp_!{file_tag} | samtools fastq -1 !{file_tag}_1.fq -2 !{file_tag}_2.fq -
-	gzip !{file_tag}_1.fq
-	gzip !{file_tag}_2.fq
+        samtools collate -uOn 128 !{file_tag}.bam tmp_!{file_tag} | samtools fastq -1 !{file_tag}!{params.suffix1}.!{params.fastq_ext} -2 !{file_tag}!{params.suffix2}.!{params.fastq_ext} -
         '''
     }
     readPairs0.into{ readPairs ; readPairs2}
@@ -224,6 +222,10 @@ if(mode=='fastq'){
                                                                                                                .collect { it.replace("${params.suffix2}.${params.fastq_ext}",'') }
     if ( !(keys1.containsAll(keys2)) || !(keys2.containsAll(keys1)) ) {println "\n ERROR : There is not at least one fastq without its mate, please check your fastq files."; System.exit(0)}
 
+readPairs0 = Channel.fromFilePairs(params.input_folder +"/*{${params.suffix1},${params.suffix2}}" +'.'+ params.fastq_ext)
+			      .map { row -> [ row[0] , "" , row[1][0], row[1][1] ] }
+                  .subscribe{ row -> println "${row}" }
+
 // Gather files ending with _1 suffix
    reads1 = Channel
     .fromPath( params.input_folder+'/*'+params.suffix1+'.'+params.fastq_ext )
@@ -237,13 +239,12 @@ if(mode=='fastq'){
 // Match the pairs on two channels having the same 'key' (name) and emit a new pair containing the expected files
    reads1
     .phase(reads2)
-    .map { pair1, pair2 -> [ pair1[0] , pair1[0] , pair1[1], pair2[1] ] }
+    .map { pair1, pair2 -> [ pair1[0] , "" , pair1[1], pair2[1] ] }
     .into{ readPairs ; readPairs2}
 }
 }
 
 
-
 // pre-trimming QC
 process fastqc_pretrim {
 	cpus params.cpu
@@ -259,39 +260,39 @@ process fastqc_pretrim {
 	publishDir "${params.output_folder}/QC/fastq", mode: 'copy', pattern: '{*fastqc.zip}'
 
 	shell:
-	//basename1=pair1.baseName.split("\\.")[0]
-	//basename2=pair2.baseName.split("\\.")[0]
-        '''
+	basename1=pair1.name.replace(".${params.fastq_ext}","") //baseName.split("\\.")[0]
+	basename2=pair2.name.replace(".${params.fastq_ext}","") //baseName.split("\\.")[0]
+    '''
 	fastqc -t !{task.cpus} !{pair1} !{pair2}
-	mv !{file_tag}!{params.suffix1}_fastqc.zip !{file_tag}!{params.suffix1}_pretrim_fastqc.zip
-	mv !{file_tag}!{params.suffix2}_fastqc.zip !{file_tag}!{params.suffix2}_pretrim_fastqc.zip 
-        '''
+	mv !{basename1}_fastqc.zip !{file_tag}!{params.suffix1}!{rg}_pretrim_fastqc.zip
+	mv !{basename2}_fastqc.zip !{file_tag}!{params.suffix2}!{rg}_pretrim_fastqc.zip 
+    '''
 }
 
 // adapter sequence trimming and post trimming QC
 if(params.cutadapt!=null){
 	process adapter_trimming {
-            cpus params.cpu_trim
-            memory params.mem_QC+'GB'
-            tag { file_tag }
+        cpus params.cpu_trim
+        memory params.mem_QC+'GB'
+        tag { file_tag +rg }
 
-            input:
+        input:
 	    set val(file_tag), val(rg), file(pair1), file(pair2) from readPairs2
 
-            output:
-            set val(file_tag), val(rg) , file("${file_tag}*val_1.fq.gz"), file("${file_tag}*val_2.fq.gz")  into readPairs3
+        output:
+        set val(file_tag), val(rg) , file("${file_tag}${rg}*val_1.fq.gz"), file("${file_tag}${rg}*val_2.fq.gz")  into readPairs3
 	    file("*_val_*_fastqc.zip") into fastqc_postpairs
 	    file("*trimming_report.txt") into trimming_reports
 
 	    publishDir "${params.output_folder}/QC/adapter_trimming", mode: 'copy', pattern: '{*report.txt,*fastqc.zip}'
 
-            shell:
+        shell:
 	    cpu_tg = params.cpu_trim -1
 	    cpu_tg2 = cpu_tg.div(3.5)
 	    cpu_tg3 = Math.round(Math.ceil(cpu_tg2))
-            '''
-	    trim_galore --paired --fastqc --gzip --basename !{file_tag}_!{rg} -j !{cpu_tg3} !{pair1} !{pair2}
-            '''
+        '''
+	    trim_galore --paired --fastqc --gzip --basename !{file_tag}!{rg} -j !{cpu_tg3} !{pair1} !{pair2}
+        '''
 	}
 }else{
 	readPairs3 = readPairs2
@@ -337,15 +338,18 @@ process alignment {
       sort_mem     = params.mem.intdiv(4)
       input_f1="${pair1[0]}"
       input_f2="${pair2[0]}"
-      rgline="ID:${file_tag}_${rg[0]} SM:${file_tag} ${params.RG}"
+      rgtmp="${rg[0]}"
+      if(rgtmp=="") rgtmp="${file_tag}"
+      rgline="ID:${rgtmp} SM:${file_tag} ${params.RG}"
       for( p1tmp in pair1.drop(1) ){
 	input_f1=input_f1+",${p1tmp}"
       }
       for( p2tmp in pair2.drop(1) ){
         input_f2=input_f2+",${p2tmp}"
       }
       for( rgtmp in rg.drop(1) ){
-        rgline=rgline+" , ID:${file_tag}_${rgtmp} SM:${file_tag} ${params.RG}"
+	if(rgtmp=="") rgtmp="${file_tag}"
+        rgline=rgline+" , ID:${rgtmp} SM:${file_tag} ${params.RG}"
       }
       MQ=""
       '''
@@ -515,11 +519,10 @@ if( recal_bam_files4QCsplit4test.ifEmpty(0)==0 ){
 
 //Quantification
 process quantification{
+	cpus params.cpu
     	if( (params.sjtrim)||(params.recalibration) ){
-		cpus params.cpu
 		memory params.mem+'GB'
 	}else{
-		cpus '1'
 		memory params.mem_QC+'GB'
 	}
 
@@ -541,11 +544,11 @@ process quantification{
 	'''
 	mv !{file_tag}.bam !{file_tag}_coordinate_sorted.bam
 	sambamba sort -n -t !{task.cpus} -m !{params.mem}G --tmpdir=!{file_tag}_tmp -o !{file_tag}.bam !{file_tag}_coordinate_sorted.bam
-	htseq-count -r name -s !{params.stranded} -f bam !{file_tag}.bam !{gtf} !{buffer} --additional-attr=gene_name > !{file_tag}_count.txt 
+	htseq-count -n !{task.cpus} -r name -s !{params.stranded} -f bam !{file_tag}.bam !{gtf} !{buffer} --additional-attr=gene_name > !{file_tag}_count.txt 
 	'''
 	}else{
 	 	'''
-		htseq-count -r pos -s !{params.stranded} -f bam !{file_tag}.bam !{gtf} !{buffer} --additional-attr=gene_name > !{file_tag}_count.txt 
+		htseq-count -n !{task.cpus} -r pos -s !{params.stranded} -f bam !{file_tag}.bam !{gtf} !{buffer} --additional-attr=gene_name > !{file_tag}_count.txt 
     		'''
 	}
 }
@@ -607,7 +610,7 @@ process multiqc_posttrim {
 	opt = "--config ${multiqc_config}"
     }
     '''
-    for f in $(find *fastqc.zip -type l);do cp --remove-destination $(readlink $f) $f;done;
+    if compgen -G "*fastqc.zip" > /dev/null; then for f in $(find *fastqc.zip -type l);do cp --remove-destination "$(readlink "$f")" "$f";done; fi;
     multiqc . -n multiqc_posttrim_report.html -m fastqc -m cutadapt -m star -m rseqc -m htseq !{opt} --comment "RNA-seq Post-trimming QC report"
     '''
 }
 
@@ -0,0 +1,9 @@
+Bootstrap: docker
+From: iarcbioinfo/rnaseq-nf:v2.3
+
+# Image metadata; the tools themselves come from the Docker image named in the header.
+%labels
+    MAINTAINER **alcalan** <**alcalan@fellows.iarc.fr**>
+    DESCRIPTION Container image containing all requirements for pipeline RNAseq-nf
+    VERSION 2.3
+
@@ -187,8 +187,8 @@
 { data: { source: 'p16', target: 'p17'} },
 { data: { source: 'p17', target: 'p18', label: 'files' } },
 { data: { source: 'p18', target: 'p19', label: 'readPairs0' } },
-{ data: { source: 'p19', target: 'p23', label: 'readPairs2' } },
 { data: { source: 'p19', target: 'p20', label: 'readPairs' } },
+{ data: { source: 'p19', target: 'p23', label: 'readPairs2' } },
 { data: { source: 'p20', target: 'p41', label: 'fastqc_pairs' } },
 { data: { source: 'p21', target: 'p53', label: 'fastqc_postpairs' } },
 { data: { source: 'p22', target: 'p51', label: 'trimming_reports' } },
@@ -199,15 +199,15 @@
 { data: { source: 'p26', target: 'p46', label: 'align_out' } },
 { data: { source: 'p26', target: 'p28', label: 'SJ_out' } },
 { data: { source: 'p26', target: 'p27', label: 'SJ_out_others' } },
-{ data: { source: 'p29', target: 'p40', label: 'recal_bam_files4quant' } },
 { data: { source: 'p29', target: 'p31', label: 'recal_bam_files4QC' } },
 { data: { source: 'p29', target: 'p32', label: 'recal_bam4QCsplittmp' } },
+{ data: { source: 'p29', target: 'p40', label: 'recal_bam_files4quant' } },
 { data: { source: 'p30', target: 'p31', label: 'bed' } },
 { data: { source: 'p31', target: 'p49', label: 'rseqc_files' } },
 { data: { source: 'p31', target: 'p48', label: 'rseqc_clip_files' } },
 { data: { source: 'p31', target: 'p50', label: 'rseqc_jsat_files' } },
-{ data: { source: 'p32', target: 'p34', label: 'recal_bam_files4QCsplit0' } },
 { data: { source: 'p32', target: 'p33', label: 'simple' } },
+{ data: { source: 'p32', target: 'p34', label: 'recal_bam_files4QCsplit0' } },
 { data: { source: 'p34', target: 'p36', label: 'recal_bam_files4QCsplit' } },
 { data: { source: 'p34', target: 'p37', label: 'recal_bam_files4QCsplit4test' } },
 { data: { source: 'p35', target: 'p36', label: 'bed' } },
 
@@ -191,8 +191,8 @@
 { data: { source: 'p16', target: 'p17'} },
 { data: { source: 'p17', target: 'p18', label: 'files' } },
 { data: { source: 'p18', target: 'p19', label: 'readPairs0' } },
-{ data: { source: 'p19', target: 'p20', label: 'readPairs' } },
 { data: { source: 'p19', target: 'p23', label: 'readPairs2' } },
+{ data: { source: 'p19', target: 'p20', label: 'readPairs' } },
 { data: { source: 'p20', target: 'p45', label: 'fastqc_pairs' } },
 { data: { source: 'p21', target: 'p57', label: 'fastqc_postpairs' } },
 { data: { source: 'p22', target: 'p55', label: 'trimming_reports' } },
@@ -207,17 +207,17 @@
 { data: { source: 'p30', target: 'p32', label: 'fasta_ref_fai' } },
 { data: { source: 'p31', target: 'p32', label: 'fasta_ref_dict' } },
 { data: { source: 'p32', target: 'p33', label: 'bam_files2' } },
-{ data: { source: 'p33', target: 'p36', label: 'recal_bam4QCsplittmp' } },
 { data: { source: 'p33', target: 'p35', label: 'recal_bam_files4QC' } },
 { data: { source: 'p33', target: 'p44', label: 'recal_bam_files4quant' } },
+{ data: { source: 'p33', target: 'p36', label: 'recal_bam4QCsplittmp' } },
 { data: { source: 'p34', target: 'p35', label: 'bed' } },
 { data: { source: 'p35', target: 'p53', label: 'rseqc_files' } },
 { data: { source: 'p35', target: 'p52', label: 'rseqc_clip_files' } },
 { data: { source: 'p35', target: 'p54', label: 'rseqc_jsat_files' } },
-{ data: { source: 'p36', target: 'p37', label: 'simple' } },
 { data: { source: 'p36', target: 'p38', label: 'recal_bam_files4QCsplit0' } },
-{ data: { source: 'p38', target: 'p41', label: 'recal_bam_files4QCsplit4test' } },
+{ data: { source: 'p36', target: 'p37', label: 'simple' } },
 { data: { source: 'p38', target: 'p40', label: 'recal_bam_files4QCsplit' } },
+{ data: { source: 'p38', target: 'p41', label: 'recal_bam_files4QCsplit4test' } },
 { data: { source: 'p39', target: 'p40', label: 'bed' } },
 { data: { source: 'p40', target: 'p59', label: 'rseqc_files_split' } },
 { data: { source: 'p41', target: 'p42'} },