maxplanck-ie
diff --git a/‎.ci_stuff/test_dag.sh
Lines changed: 35 additions & 35 deletions b/‎.ci_stuff/test_dag.sh
Lines changed: 35 additions & 35 deletions
diff --git a/‎conda-recipe/meta.yaml
Lines changed: 1 addition & 1 deletion b/‎conda-recipe/meta.yaml
Lines changed: 1 addition & 1 deletion
diff --git a/‎docs/content/News.rst
Lines changed: 3 additions & 0 deletions b/‎docs/content/News.rst
Lines changed: 3 additions & 0 deletions
diff --git a/‎docs/content/workflows/ChIPseq.rst
Lines changed: 25 additions & 34 deletions b/‎docs/content/workflows/ChIPseq.rst
Lines changed: 25 additions & 34 deletions
diff --git a/‎pyproject.toml
Lines changed: 1 addition & 1 deletion b/‎pyproject.toml
Lines changed: 1 addition & 1 deletion
diff --git a/‎snakePipes/common_functions.py
Lines changed: 4 additions & 1 deletion b/‎snakePipes/common_functions.py
Lines changed: 4 additions & 1 deletion
diff --git a/‎snakePipes/shared/rscripts/CSAW.R
Lines changed: 7 additions & 3 deletions b/‎snakePipes/shared/rscripts/CSAW.R
Lines changed: 7 additions & 3 deletions
diff --git a/‎snakePipes/shared/rscripts/chipqc.R
Lines changed: 121 additions & 0 deletions b/‎snakePipes/shared/rscripts/chipqc.R
Lines changed: 121 additions & 0 deletions
diff --git a/‎snakePipes/shared/rscripts/clean_histoneHMM_result.R
Lines changed: 27 additions & 0 deletions b/‎snakePipes/shared/rscripts/clean_histoneHMM_result.R
Lines changed: 27 additions & 0 deletions
@@ -1,4 +1,4 @@
-{% set version = "3.1.0" %}
+{% set version = "3.2.0" %}
 
 package:
   name: snakepipes
 
@@ -4,7 +4,10 @@ snakePipes News
 snakePipes 3.2.0
 ________________
 
+
+* QC in the ChIPseq workflow is now performed with the ChIPQC R package
 * added allelic-whatshap mode to mRNA seq
+* fixes #1048
 * fixes #1085
 * fixes #1083
 * fixes #1082
 
@@ -163,41 +163,28 @@ Understanding the outputs
 The ChIPseq pipeline will generate additional output as follows::
 
     .
+    ├── AnnotatedResults_MACS2_diffChIP_k4me3
+    ├── Annotation
+    ├── bamCoverage
+    ├── CSAW_MACS2_diffChIP_k4me3
     ├── deepTools_ChIP
     │   ├── bamCompare
-    │   │   ├── sample1.filtered.log2ratio.over_SRR6761502.bw
-    │   │   ├── sample1.filtered.subtract.SRR6761502.bw
-    │   │   ├── sample2.filtered.log2ratio.over_SRR6761502.bw
-    │   │   └── sample2.filtered.subtract.SRR6761502.bw
     │   └── plotFingerprint
-    │       ├── plotFingerprint.metrics.txt
-    │       └── plotFingerprint.png
+    ├── deepTools_qc
+    │   ├── bamPEFragmentSize
+    │   ├── estimateReadFiltering
+    │   ├── multiBamSummary
+    │   ├── plotCorrelation
+    │   ├── plotCoverage
+    │   └── plotPCA
+    ├── filtered_bam
     ├── histoneHMM
-    │   ├── sample2.filtered.histoneHMM-em-posterior.txt.gz
-    │   ├── sample2.filtered.histoneHMM-regions.gff.gz
-    │   ├── sample2.filtered.histoneHMM-regions.gff.gz.tbi
-    │   ├── sample2.filtered.histoneHMM.txt.gz
-    │   ├── sample2.filtered.histoneHMM-zinba-emfit.pdf
-    │   ├── sample2.filtered.histoneHMM-zinba-params-em.RData
-    │   └── sample2.filtered.histoneHMM-zinba-params-em.txt
-    ├── Genrich
-    │   └── sample2.narrowPeak
-    └── MACS2
-        ├── sample1.filtered.BAM_peaks.narrowPeak
-        ├── sample1.filtered.BAM_peaks.qc.txt
-        ├── sample1.filtered.BAM_peaks.xls
-        ├── sample1.filtered.BAMPE_peaks.narrowPeak
-        ├── sample1.filtered.BAMPE_peaks.xls
-        ├── sample1.filtered.BAMPE_summits.bed
-        ├── sample1.filtered.BAM_summits.bed
-        ├── sample2.filtered.BAM_peaks.broadPeak
-        ├── sample2.filtered.BAM_peaks.gappedPeak
-        ├── sample2.filtered.BAM_peaks.qc.txt
-        ├── sample2.filtered.BAM_peaks.xls
-        ├── sample2.filtered.BAMPE_peaks.broadPeak
-        ├── sample2.filtered.BAMPE_peaks.gappedPeak
-        └── sample2.filtered.BAMPE_peaks.xls
-    
+    ├── histoneHMM_chipqc
+    ├── logs
+    ├── MACS2
+    ├── MACS2_chipqc
+    ├── QC_report
+    ├── Sambamba
 
 
 Following up on the DNAmapping module results (see :doc:`DNAmapping`), the workflow produces the following output directories :
@@ -206,12 +193,16 @@ Following up on the DNAmapping module results (see :doc:`DNAmapping`), the workf
 
 * **Genrich**: This folder contains the output of `Genrich <https://github.com/jsh58/Genrich>`__. This will only exist IF you specified ``--peakCaller Genrich`` and you have samples with non-broad peaks. The output is in narrowPeak format, like that from MACS2.
 
-* **MACS2**: This folder contains the output of `MACS2 <https://github.com/taoliu/MACS>`__ on the ChIP samples, MACS2 would perform either a **narrow** or **broad** peak calling on the samples, as indicated by the ChIP sample configuration file (see :ref:`ChIPconfig`). The outputs files would contain the respective tags (**narrowPeak** or **broadPeak**). This folder will only exist if you have non-broad marks and use MACS2 for peak calling
+* **MACS2**: This folder contains the output of `MACS2 <https://github.com/taoliu/MACS>`__ on the ChIP samples. MACS2 performs either **narrow** or **broad** peak calling on the samples, as indicated by the ChIP sample configuration file (see :ref:`ChIPconfig`). The output files contain the respective tags (**narrowPeak** or **broadPeak**). This folder will only exist if you have non-broad marks and use MACS2 for peak calling (default).
+
+* **MACS2_chipqc**: This folder contains the output of `ChIPQC <https://bioconductor.org/packages/release/bioc/vignettes/ChIPQC/inst/doc/ChIPQC.pdf>`__ analysis of the peaks called by MACS2. If you used a different peak caller, a chipqc output folder with the peak caller in its name will be listed.
 
 * **histoneHMM**: This folder contains the output of `histoneHMM <https://github.com/matthiasheinig/histoneHMM>`__. This folder will only exist if you have broad marks.
 
-* **CSAW_sampleSheet**: This folder is created optionally, if you provide a sample sheet for differential binding analysis. (see :ref:`diffBinding`) CSAW will be run using peaks called by the chosen peak caller, and the output folder will be named accordingly.
-* **AnnotatedResults_sampleSheet**: This folder is created optionally, if you provide a sample sheet for differential binding analysis. (see :ref:`diffBinding`). Differentially bound regions annotated with distance to nearest gene are stored here.
+* **histoneHMM_chipqc**: This folder contains the output of `ChIPQC <https://bioconductor.org/packages/release/bioc/vignettes/ChIPQC/inst/doc/ChIPQC.pdf>`__ analysis of the peaks called by histoneHMM. This folder will only exist if you have broad marks.
+
+* **CSAW_peakCaller_sampleSheet**: This folder is created optionally, if you provide a sample sheet for differential binding analysis (see :ref:`diffBinding`). CSAW will be run using peaks called by the chosen peak caller, and the output folder will be named accordingly.
+* **AnnotatedResults_peakCaller_sampleSheet**: This folder is created optionally, if you provide a sample sheet for differential binding analysis. (see :ref:`diffBinding`). Differentially bound regions annotated with distance to nearest gene are stored here.
 
 .. note:: Although in case of broad marks, we also perform the MACS2 `broadpeak` analysis (output available as ``MACS2/<sample>.filtered.BAM_peaks.broadPeak``), we would recommend using the histoneHMM outputs in these cases, since histoneHMM produces better results than MACS2 for broad peaks.
 
 
@@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta"
 name = "snakePipes"
 description = 'Snakemake workflows and wrappers for NGS data processing from the MPI-IE'
 readme = "README.md"
-version = "3.1.0"
+version = "3.2.0"
 keywords = [
     "DNAmapping",
     "ChIPSeq",
 
@@ -46,7 +46,8 @@ def set_env_yamls():
             'CONDA_SEACR_ENV': 'envs/chip_seacr.yaml',
             'CONDA_FQLINT_ENV': 'envs/fqlint.yaml',
             'CONDA_WHATSHAP_ENV': 'envs/whatshap.yaml',
-            'CONDA_PICARD_ENV': 'envs/picard.yaml'
+            'CONDA_PICARD_ENV': 'envs/picard.yaml',
+            'CONDA_CHIPQC_ENV': 'envs/chipqc.yaml'
             }
 
 
@@ -783,6 +784,8 @@ def runAndCleanup(args, cmd, logfile_name):
     for _l in p.stdout:
         sys.stdout.write(_l.strip() + '\n')
         f.write(_l.strip() + '\n')
+        sys.stdout.flush()
+        f.flush()
     p.wait()
 
     # Exit with an error if snakemake encountered an error
 
@@ -118,9 +118,13 @@ if (! external_bed) {
         })
     } else {
         allpeaks = lapply(snakemake@input[['peaks']], function(x) {
-            bed = read.delim(paste0("../", x), header=FALSE)
-            bed.gr = GRanges(seqnames = bed$V1, ranges = IRanges(start = bed$V2, end = bed$V3), name = bed$V4)
-            return(bed.gr)
+            peakfile<-paste0("../", x)
+            if(file.exists(peakfile) & file.info(peakfile)$size > 0){
+                bed = read.delim(peakfile, header=FALSE)
+                bed.gr = GRanges(seqnames = bed$V1, ranges = IRanges(start = bed$V2, end = bed$V3), name = bed$V4)
+                }else{message(paste0("Skipping peakfile ",peakfile))
+                      bed.gr=GRanges(c(seqnames=NULL,ranges=NULL,strand=NULL,name=NULL))}
+                return(bed.gr)
         })
     }
     # merge
 
@@ -0,0 +1,121 @@
+#!/usr/bin/env Rscript
+
+library(GenomicRanges)
+library(rtracklayer)
+library(ChIPQC)
+library(yaml)
+library(stringr)
+library(purrr)
+
+
+# Parallel backend: register a MulticoreParam with 8 workers and echo the
+# registered backend so that it shows up in the log.
+#options(MulticoreParam=MulticoreParam(workers=8))
+register(MulticoreParam(8))
+registered()$MulticoreParam
+
+# Inputs handed over by snakemake.
+bamdir <- unlist(snakemake@params[["bams"]])
+peakdir <- unlist(snakemake@params[["peaks"]])
+# Drop everything from the first underscore onwards in the genome name.
+genome <- gsub("_.+", "", snakemake@params[["genome"]])
+wdir <- snakemake@params[["outdir"]]
+blacklist <- snakemake@params[["blacklist"]]
+chipdict <- snakemake@input[["chipdict"]]
+
+# All relative paths used below resolve against the output directory.
+setwd(wdir)
+
+# `ms` is the tag embedded in BAM / MACS2 file names: "host" for spike-in
+# runs, "filtered" otherwise.
+spikein <- toupper(snakemake@params[["useSpikeinForNorm"]])
+message(paste0("useSpikeinForNorm is set to: ", spikein))
+ms <- if (spikein) "host" else "filtered"
+
+
+sampleSheet <- snakemake@input[["sampleSheet"]]
+
+# Samples are the union of the narrow and the broad samples, narrow first.
+
+#yaml<-read_yaml(chipdict,as.named.list=TRUE) #not used due to buggy conversion of yaml -> list with NULL entries -> data.frame: all-NULL entries are dropped entirely
+narrow_samples <- unlist(snakemake@params[["narrow_samples"]])
+broad_samples <- unlist(snakemake@params[["broad_samples"]])
+samples <- c(narrow_samples, broad_samples)
+# One row per sample, flagged TRUE when the sample came from the broad list.
+ydat <- data.frame(
+  "sample" = samples,
+  "broad" = rep(
+    c(FALSE, TRUE),
+    times = c(length(narrow_samples), length(broad_samples))
+  )
+)
+rownames(ydat) <- ydat$sample
+
+print(ydat)
+
+# List of supported factors. Each sample is labelled with the first entry
+# whose name occurs (case-insensitively) in the sample name.
+supported_marks <- c(
+  "H3K4me1", "H3K4me2", "H3K4me3", "H3K27ac", "H3K27me3", "H3K9me3",
+  "H3K36me3", "H4K16ac", "RAD21", "CTCF", "MSL2", "BMAL1", "CLOCK"
+)
+# Resolve the factor sample by sample so that `markv` always has one entry
+# per sample, in sample order. The mark name is taken verbatim from the list
+# above, so trailing digits such as the "3" in "H3K4me3" or the "1" in
+# "RAD21" are never stripped. Samples without a recognised mark fall back to
+# "All".
+markv <- vapply(
+  samples,
+  function(sample_name) {
+    is_hit <- vapply(
+      supported_marks,
+      function(mark) grepl(mark, sample_name, ignore.case = TRUE),
+      logical(1)
+    )
+    if (any(is_hit)) supported_marks[is_hit][[1]] else "All"
+  },
+  character(1),
+  USE.NAMES = FALSE
+)
+
+# Replicate numbers: parsed from a "rep<digits>" tag when every sample name
+# contains "rep", otherwise every sample is treated as replicate 1.
+has_rep_tag <- grepl("rep", samples)
+if (!all(has_rep_tag)) {
+  repv <- rep(1, length(samples))
+} else {
+  #regres<-regexpr("rep[0-9]?",samples)
+  rep_tag <- str_extract(samples, "rep[0-9]+")
+  repv <- as.numeric(gsub("rep", "", rep_tag))
+}
+
+# Conditions: without a sample sheet all samples share the condition "All".
+# With a sample sheet (tab-separated, with header) the condition is looked up
+# by sample name. The check that the sample sheet file exists is taken care
+# of by the python wrapper.
+if (is.null(sampleSheet)) {
+  condv <- rep("All", length(samples))
+} else {
+  sampleinfo <- read.table(sampleSheet, header = TRUE, sep = "\t", quote = "")
+  condv <- sampleinfo$condition[match(samples, sampleinfo$name)]
+}
+
+
+# Assemble the ChIPQC sample table: one row per sample, in `samples` order.
+sampledat <- data.frame(
+  "SampleID" = samples,
+  "Condition" = condv,
+  "Factor" = markv,
+  "Replicate" = repv
+)
+
+# Ensure that samples, bamdir and peakdir are in the same order: files are
+# looked up by sample name rather than taken positionally.
+
+# BAM files are named <sample>.<ms>.bam; the suffix is matched literally.
+bam_ids <- sub(paste0(".", ms, ".bam"), "", basename(bamdir), fixed = TRUE)
+sampledat$bamReads <- bamdir[match(samples, bam_ids)]
+# Collapse first: message() would otherwise glue the repeated strings
+# together without any separator.
+message(sprintf("Provided peak files: %s", paste(unlist(peakdir), collapse = ", ")))
+## Derive the sample name from each peak file name.
+if (all(grepl("histoneHMM", peakdir))) {
+  peak_ids <- sub("_avgp0.5.bed", "", basename(peakdir), fixed = TRUE)
+} else {
+  # Accept the current tag as well as "filtered", consistent with the MACS2
+  # renaming below, which already builds its suffix from `ms`.
+  # NOTE(review): assumes spike-in peak files are named <sample>.host.* -
+  # confirm against the peak calling rules.
+  peak_ids <- sub(paste0("\\.(filtered|", ms, ").+"), "", basename(peakdir))
+}
+sampledat$Peaks <- peakdir[match(samples, peak_ids)]
+
+# Point out unmatched files here instead of leaving NA paths to be
+# discovered further downstream.
+if (anyNA(sampledat$bamReads)) {
+  warning(
+    "No BAM file matched for sample(s): ",
+    paste(samples[is.na(sampledat$bamReads)], collapse = ", "),
+    call. = FALSE
+  )
+}
+if (anyNA(sampledat$Peaks)) {
+  warning(
+    "No peak file matched for sample(s): ",
+    paste(samples[is.na(sampledat$Peaks)], collapse = ", "),
+    call. = FALSE
+  )
+}
+
+sampledat$PeakCaller <- "bed"
+sampledat$PeakFormat <- "bed"
+## for MACS2, modify input peak files: .xls -> .narrowPeak, .broadPeak
+if (all(grepl("MACS2", sampledat$Peaks))) {
+  # `ydat` and `sampledat` are both built from `samples`, so their rows are
+  # in the same order.
+  is_broad <- ydat$broad
+  xls_suffix <- paste0(".", ms, ".BAM_peaks.xls")
+  sampledat$Peaks[is_broad] <- gsub(
+    xls_suffix,
+    paste0(".", ms, ".BAM_peaks.broadPeak"),
+    sampledat$Peaks[is_broad],
+    fixed = TRUE
+  )
+  sampledat$Peaks[!is_broad] <- gsub(
+    xls_suffix,
+    paste0(".", ms, ".BAM_peaks.narrowPeak"),
+    sampledat$Peaks[!is_broad],
+    fixed = TRUE
+  )
+  sampledat$PeakFormat[!is_broad] <- "narrow"
+  sampledat$PeakCaller[!is_broad] <- "narrow"
+}
+
+print(sampledat)
+
+## Annotation: translate the provided genome name into one of the annotation
+## names accepted below, or stop if there is no match.
+message(paste0("Provided genome: ", genome))
+supported_annotations <- c("hg19", "hg18", "mm10", "mm9", "ce6", "dm3")
+# Aliases are paired with `supported_annotations` by position.
+# NOTE(review): this pairing resolves GRCh38 to hg19 and GRCh37 to hg18 -
+# confirm that this is intended.
+extended_annotations <- c("GRCh38", "GRCh37", "GRCm38", "GRCm37", "ce6", "dm3")
+alias_idx <- match(genome, extended_annotations)
+if (genome %in% supported_annotations) {
+  # Already an accepted name: use it unchanged.
+  annotation <- genome
+} else if (!is.na(alias_idx)) {
+  # Known alias: use the annotation at the same position.
+  annotation <- supported_annotations[alias_idx]
+} else {
+  stop("No matching annotation was found.")
+}
+
+# Use the blacklist only if it is a single, non-empty path to an existing
+# file. A missing (NULL), empty or NA value falls back to no blacklist
+# instead of aborting on a zero-length `if` condition.
+has_blacklist <- is.character(blacklist) &&
+  length(blacklist) == 1 &&
+  !is.na(blacklist) &&
+  nzchar(blacklist) &&
+  file.exists(blacklist)
+blist <- if (has_blacklist) blacklist else NULL
+
+message(paste0("Using blacklist: ", blist))
+# Run the QC on the assembled sample table and write the report into the
+# current working directory.
+QC <- ChIPQC(sampledat, annotation = annotation, mapQCth = 3, blacklist = blist)
+ChIPQCreport(QC, reportFolder = ".", facet = FALSE, colourBy = "Factor")
+
+# Record the session info. Capturing the printed output does not depend on
+# top-level auto-printing and cannot leave an output diversion open.
+writeLines(capture.output(print(sessionInfo())), "sessionInfo.txt")
@@ -0,0 +1,27 @@
+library(GenomicRanges)
+
+# Filter histoneHMM regions by average posterior probability (>= 0.5) and
+# write, per input file, a histogram of the probabilities plus the filtered
+# regions as GFF3 and BED.
+wdir <- snakemake@params[["outdir"]]
+setwd(wdir)
+
+# unlist() so that a list of paths is handled like a character vector.
+input_peaks <- unlist(snakemake@params[["input_peaks"]])
+
+# Regions kept must reach this average posterior probability.
+min_avg_posterior <- 0.5
+
+reslist <- lapply(input_peaks, function(X) rtracklayer::import.gff(X))
+# Result names are the input file names with the histoneHMM suffix removed
+# (matched literally).
+names(reslist) <- gsub(
+  ".filtered.histoneHMM-regions.gff", "", basename(input_peaks),
+  fixed = TRUE
+)
+
+# One histogram per input, with the filter threshold marked in red.
+for (i in seq_along(reslist)) {
+  png(paste0(names(reslist)[i], "_avg_posterior.hist.png"))
+  hist(
+    as.numeric(mcols(reslist[[i]])$avg_posterior),
+    main = names(reslist)[i],
+    xlab = "Average posterior probability"
+  )
+  abline(v = min_avg_posterior, col = "red")
+  dev.off()
+}
+
+# Compare numerically: the rest of this script converts avg_posterior with
+# as.numeric(), and a comparison on character values would be lexical (e.g.
+# "1e-05" >= "0.5" is TRUE). Regions without a usable value are dropped.
+filtlist <- lapply(reslist, function(X) {
+  avg_posterior <- as.numeric(mcols(X)$avg_posterior)
+  X[which(avg_posterior >= min_avg_posterior)]
+})
+for (i in seq_along(filtlist)) {
+  rtracklayer::export.gff3(filtlist[[i]], paste0(names(filtlist)[i], "_avgp0.5.gff"))
+  # The BED export carries the average posterior in the score column.
+  a <- filtlist[[i]]
+  mcols(a)$score <- as.numeric(mcols(a)$avg_posterior)
+  rtracklayer::export.bed(a, paste0(names(filtlist)[i], "_avgp0.5.bed"))
+}
+
+# Record the session info. Capturing the printed output does not depend on
+# top-level auto-printing and cannot leave an output diversion open.
+writeLines(capture.output(print(sessionInfo())), "sessionInfo.txt")
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-{% set version = "3.1.0" %}`
	`1`	`+{% set version = "3.2.0" %}`
`2`	`2`
`3`	`3`	`package:`
`4`	`4`	`name: snakepipes`