Skip to content

Commit 55e560b

Browse files
authored
Merge pull request #1067 from maxplanck-ie/pairtools_merge
Pairtools merge
2 parents ae1c940 + cf2d4fe commit 55e560b

File tree

16 files changed

+333
-133
lines changed

16 files changed

+333
-133
lines changed

.github/workflows/linux.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ jobs:
1212
runs-on: ubuntu-latest
1313
strategy:
1414
matrix:
15-
python-version: ['3.11', '3.12']
15+
python-version: ['3.11', '3.12' ,'3.13']
1616
optdeps: [".", ".[actions]", ".[docs]"]
1717
steps:
1818
- uses: actions/checkout@v4

.github/workflows/osx.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,6 @@ jobs:
4646
pip install .
4747
- name: createEnvsOSX
4848
run: |
49-
conda config --add subdirs osx-64
49+
conda config --set subdir osx-64
5050
snakePipes createEnvs --only ${{matrix.envs}}
5151

conda-recipe/meta.yaml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
{% set version = "3.0.0" %}
1+
{% set version = "3.1.0" %}
22

33
package:
44
name: snakepipes
@@ -14,9 +14,10 @@ build:
1414

1515
requirements:
1616
host:
17-
- python >=3
17+
- python >=3, < 3.13
1818
- pip
1919
- seaborn
20+
- setuptools
2021
run:
2122
- python >=3.11
2223
- snakemake >=8

docs/conf.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@
1515
import sys
1616
import os
1717
from importlib.metadata import version as importlibversion
18-
import sphinx_rtd_theme
1918

2019
# to allow readthedocs to compile without installing some dependencies
2120
import mock
@@ -150,7 +149,7 @@
150149

151150
# import them both locally and on rtd
152151
html_theme = 'sphinx_rtd_theme' # 'alabaster' 'sphinx_rtd_theme'
153-
html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
152+
# html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
154153

155154
# Theme options are theme-specific and customize the look and feel of a theme
156155
# further. For a list of options available for each theme, see the

docs/content/News.rst

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,20 @@
11
snakePipes News
22
===============
33

4+
snakePipes 3.1.0
5+
________________
6+
7+
* installation test for Python 3.13
8+
* bulk mode in the makePairs workflow
9+
10+
snakePipes 3.0.0
11+
________________
12+
13+
* clusteryaml omitted for profiles
14+
* All workflows have '-' removed from their name
15+
* toml file installation
16+
* makePairs mode introduced
17+
418
snakePipes 2.9.0
519
________________
620

docs/content/workflows/makePairs.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@ makePairs
66
What it does
77
------------
88

9-
The snakePipes makePairs workflow allows users to process their HiC data from raw fastq files to HiC matrices in
10-
an allele-specific manner. The workflow utilized mapping by bwa, followed by analysis
9+
The snakePipes makePairs workflow allows users to process their HiC/uC data from raw fastq files to HiC matrices (in
10+
an allele-specific manner). The workflow utilizes mapping by bwa, followed by analysis
1111
using `pairtools <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9949071/>`__ . The workflow follows the `example workflow described in the documentation of pairtools <https://pairtools.readthedocs.io/en/latest/examples/pairtools_phase_walkthrough.html>`__ ,
1212
which explains each step in detail and is a useful starting point for new users.
1313
Currently the output matrices are produced in the `.pairs <https://pairtools.readthedocs.io/en/latest/formats.html>`__ format.

pyproject.toml

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta"
66
name = "snakePipes"
77
description = 'Snakemake workflows and wrappers for NGS data processing from the MPI-IE'
88
readme = "README.md"
9-
version = "3.0.0"
9+
version = "3.1.0"
1010
keywords = [
1111
"DNAmapping",
1212
"ChIPSeq",
@@ -24,8 +24,7 @@ authors = [
2424
]
2525

2626
classifiers = [
27-
"Intended Audience :: Bioinformaticians",
28-
"Intended Audience :: Biologists",
27+
"Intended Audience :: Science/Research",
2928
"License :: OSI Approved :: MIT License",
3029
"Programming Language :: Python :: 3",
3130
]

snakePipes/parserCommon.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,7 @@ def snpArguments(defaults):
139139
snpargs = parser.add_argument_group('Allele-specific mapping arguments')
140140
snpargs.add_argument("--VCFfile",
141141
default='',
142-
help="VCF file to create N-masked genomes (default: 'None')")
142+
help="VCF file to create N-masked genomes (default: 'None'). Note that for the makePairs workflow this file is assumed to be gzipped and indexed (with tabix).")
143143

144144
snpargs.add_argument("--strains",
145145
default='',

snakePipes/shared/defaults.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,4 +19,4 @@ max_thread: 25
1919
#print tools versions
2020
toolsVersion: True
2121
oldConfig:
22-
configMode: manual
22+
configMode: manual

snakePipes/shared/rules/envs/makePairs.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,9 @@ name: pairtools_phased
22
channels:
33
- conda-forge
44
- bioconda
5-
- defaults
65
dependencies:
76
- bwa
7+
- cooler
88
- samtools
99
- tabix
1010
- bcftools
Lines changed: 67 additions & 99 deletions
Original file line numberDiff line numberDiff line change
@@ -1,199 +1,167 @@
1-
# based on https://github.com/caballero/snakemake-pairtools-phased/tree/df410ff
1+
rule generate_chromsizes:
2+
input:
3+
genome_index
4+
output:
5+
'genome/genome.chromsizes'
6+
threads: 1
7+
shell:'''
8+
cut -f1,2 {input} > {output}
9+
'''
210

3-
4-
# Define function that returns pair files (phased or unphased), based on the reference.
5-
def ret_pair(wildcards):
6-
if "diploid_genome" in wildcards.ref:
7-
# Phased setting
8-
return f"pairs/{wildcards.sample}.{wildcards.ref}_phased.pairs.gz"
9-
else:
10-
return f"pairs/{wildcards.sample}.{wildcards.ref}.pairs.gz"
11-
12-
13-
# different to bwa.snakefile
14-
# here we skip the expensive sorting with samtools after bwa mem
15-
# consider making this optional in bwa.snakefile
1611
rule bwa_mapping:
1712
input:
1813
fq1="FASTQ_fastp/{sample}_R1.fastq.gz",
19-
fq2="FASTQ_fastp/{sample}_R2.fastq.gz",
20-
ix="genome/{ref}.fa.gz.bwt",
14+
fq2="FASTQ_fastp/{sample}_R2.fastq.gz"
2115
output:
22-
bam="bam/{sample}.{ref}.bam",
16+
bam="bam/{sample}.bam",
2317
threads: 30
2418
params:
25-
bwathreads=config["alignerThreads"],
2619
bwaparams=config["alignerOptions"],
27-
fna=lambda wildcards, input: Path(input.ix).with_suffix(""),
20+
bwa_index = bwa_index
2821
resources:
2922
mem_mb=3000,
3023
benchmark:
31-
"bam/.benchmark/bwa_mapping.{sample}.{ref}.benchmark"
24+
"bam/.benchmark/bwa_mapping.{sample}.benchmark"
3225
conda:
3326
CONDA_MAKEPAIRS_ENV
3427
shell:
3528
"""
3629
bwa mem \
3730
{params.bwaparams} \
38-
-t {params.bwathreads} \
39-
{params.fna} \
31+
-t 22 \
32+
{params.bwa_index} \
4033
{input.fq1} \
4134
{input.fq2} \
4235
| samtools view -@ 8 -b \
4336
> {output.bam}
4437
"""
4538

46-
4739
rule pairtools_parse:
4840
input:
49-
bam="bam/{sample}.{ref}.bam",
50-
chr_sizes="genome/{ref}.chromsizes",
41+
bam="bam/{sample}.bam",
42+
chr_sizes='genome/genome.chromsizes'
5143
output:
52-
pairs="pairs/{sample}.{ref}.pairs.gz",
44+
pairs=temp("pairs/{sample}.unsorted.pairs.gz"),
5345
params:
54-
minmapq=40,
55-
cols=lambda wildcards: (
56-
"--add-columns XB,AS,XS" if "diploid_genome" in wildcards.ref else ""
57-
),
46+
minmapq=40
5847
threads: 12
5948
benchmark:
60-
"pairs/.benchmark/pairtools_parse.{sample}.{ref}.benchmark"
49+
"pairs/.benchmark/pairtools_parse.{sample}.benchmark"
6150
conda:
6251
CONDA_MAKEPAIRS_ENV
6352
shell:
6453
"""
6554
pairtools parse \
6655
--min-mapq {params.minmapq} \
67-
{params.cols} \
6856
--drop-sam \
6957
--walks-policy 5unique \
7058
-c {input.chr_sizes} \
7159
{input.bam} \
7260
-o {output.pairs}
7361
"""
7462

75-
76-
rule pairtools_phase:
77-
input:
78-
pairs="pairs/{sample}.diploid_genome.pairs.gz",
79-
output:
80-
pairs="pairs/{sample}.diploid_genome_phased.pairs.gz",
81-
params:
82-
hap1=strains[0],
83-
hap2=strains[1],
84-
threads: 12
85-
benchmark:
86-
"pairs/.benchmark/pairtools_phase.{sample}.benchmark"
87-
conda:
88-
CONDA_MAKEPAIRS_ENV
89-
shell:
90-
"""
91-
pairtools phase \
92-
--phase-suffixes _{params.hap1} _{params.hap2} \
93-
--tag-mode XB \
94-
--clean-output \
95-
{input.pairs} -o {output.pairs}
96-
"""
97-
98-
9963
rule pairtools_sort:
10064
input:
101-
ret_pair,
65+
pairs = "pairs/{sample}.unsorted.pairs.gz",
10266
output:
103-
pairs="pairs/{sample}.{ref}.pairs.sorted.gz",
67+
pairs = "pairs/{sample}.pairs.gz",
10468
threads: 20
10569
benchmark:
106-
"pairs/.benchmark/pairtools_sort.{sample}.{ref}.benchmark"
70+
"pairs/.benchmark/pairtools_sort.{sample}.benchmark"
10771
conda:
10872
CONDA_MAKEPAIRS_ENV
10973
shell:
11074
"""
11175
pairtools sort \
112-
{input} \
76+
{input.pairs} \
11377
-o {output.pairs} \
11478
--memory 20G
11579
"""
11680

117-
11881
rule pairtools_dedup:
11982
input:
120-
pairs="pairs/{sample}.{ref}.pairs.sorted.gz",
83+
pairs="pairs/{sample}.pairs.gz",
12184
output:
122-
pairs="pairs/{sample}.{ref}.pairs.dedup.gz",
123-
stats="pairs/{sample}.{ref}.pairs.dedup.stats",
124-
params:
125-
extra_cols=lambda wildcards: (
126-
"--extra-col-pair phase1 phase2" if "diploid" in wildcards.ref else ""
127-
),
85+
pairs="pairs/{sample}.pairs.dedup.gz",
86+
stats="pairs/{sample}.pairs.dedup.stats"
12887
threads: 12
12988
benchmark:
130-
"pairs/.benchmark/pairtools_dedup.{sample}.{ref}.benchmark"
89+
"pairs/.benchmark/pairtools_dedup.{sample}.benchmark"
13190
conda:
13291
CONDA_MAKEPAIRS_ENV
13392
shell:
13493
"""
13594
pairtools dedup \
13695
--mark-dups \
137-
{params.extra_cols} \
13896
--output-dups - \
13997
--output-unmapped - \
14098
--output-stats {output.stats} \
14199
-o {output.pairs} \
142100
{input.pairs}
143101
"""
144102

103+
rule pairix:
104+
input:
105+
pairs = 'pairs/{sample}.pairs.dedup.gz'
106+
output:
107+
ix = 'pairs/{sample}.pairs.dedup.gz.px2'
108+
threads: 2
109+
conda:
110+
CONDA_MAKEPAIRS_ENV
111+
shell:
112+
"""
113+
pairix -f -p pairs {input.pairs}
114+
"""
145115

146-
rule pairtools_filter_phased:
116+
rule cooler:
147117
input:
148-
pairs="pairs/{sample}.diploid_genome.pairs.dedup.gz",
118+
pairs = 'pairs/{sample}.pairs.dedup.gz',
119+
ix = 'pairs/{sample}.pairs.dedup.gz.px2',
120+
chromsizes = 'genome/genome.chromsizes'
149121
output:
150-
stats="phase_stats/{sample}.diploid_genome_{phasetype}.pairs.stats",
151-
pairs="phase_stats/{sample}.diploid_genome_{phasetype}.pairs.gz",
152-
params:
153-
filterparam=lambda wildcards: PHASEDIC[wildcards.phasetype],
154-
resources:
155-
mem_mb=1000,
156-
benchmark:
157-
"phase_stats/.benchmark/pairtools_filter_phased.{sample}.diploid_genome_{phasetype}.benchmark"
158-
threads: 12
122+
cool = 'cooler/{sample}.5000.cool'
123+
threads: 20
159124
conda:
160125
CONDA_MAKEPAIRS_ENV
161126
shell:
162127
"""
163-
pairtools select \
164-
'{params.filterparam}' \
165-
{input.pairs} \
166-
-o {output.pairs}
167-
pairtools stats {output.pairs} -o {output.stats}
128+
cooler cload pairix -p {threads} {input.chromsizes}:5000 {input.pairs} {output.cool}
129+
cooler balance --nproc {threads} {output.cool}
168130
"""
169131

132+
rule mcool:
133+
input:
134+
cool = 'cooler/{sample}.5000.cool'
135+
output:
136+
mcool = 'cooler/{sample}.5000.mcool'
137+
threads: 20
138+
conda:
139+
CONDA_MAKEPAIRS_ENV
140+
shell:
141+
"""
142+
cooler zoomify --resolutions 5000,10000,20000,40000,80000,120000 --balance --nproc {threads} {input.cool}
143+
"""
170144

171145
rule multiqc:
172146
input:
173-
stats=expand(
174-
"pairs/{sample}.{ref}.pairs.dedup.stats", sample=samples, ref=REFERENCES
175-
),
176-
phasedstats=expand(
177-
"phase_stats/{sample}.diploid_genome_{phasetype}.pairs.stats",
178-
sample=samples,
179-
phasetype=PHASEDIC.keys(),
180-
),
147+
cools=expand(
148+
'cooler/{sample}.5000.mcool', sample=samples
149+
)
181150
output:
182-
html="multiqc/multiqc_report.html",
151+
html="multiQC/multiqc_report.html",
183152
params:
184-
odir="multiqc",
153+
odir="multiQC",
185154
benchmark:
186-
"multiqc/.benchmark/multiqc.benchmark"
155+
"multiQC/.benchmark/multiqc.benchmark"
187156
threads: 1
188157
conda:
189158
CONDA_MAKEPAIRS_ENV
190159
shell:
191160
"""
192-
echo input: {input.phasedstats}
193161
multiqc \
194162
--module pairtools \
195163
--module fastqc \
196164
--module fastp \
197165
-o {params.odir} \
198166
.
199-
"""
167+
"""

0 commit comments

Comments
 (0)