Skip to content

Commit 20c3cca

Browse files
committed
Update version to 0.1.9
1 parent 6cd7de0 commit 20c3cca

21 files changed

+1853
-240
lines changed

README.rst

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,15 +7,24 @@ A toolkit for working with ATAC-seq data.
77
What's in the box?
88
==================
99

10-
Programs we've found useful in ATAC-seq pipelines
11-
-------------------------------------------------
10+
Programs we've found useful in analyzing ATAC-seq data
11+
------------------------------------------------------
1212

13-
* ``trim_adapters``: based on Jason Buenrostro's utility for trimming
14-
Illumina adapters by aligning paired reads to each other.
1513
* ``make_cut_matrix``: useful in conjunction with CENTIPEDE, and in
1614
generating plots of transcription factor binding sites.
17-
* ``plot_aggregate_matrix.R``: generates plots for motifs given the
15+
* ``make_midpoint_matrix``: useful in conjunction with CENTIPEDE, and in
16+
generating plots of transcription factor binding sites.
17+
* ``measure_signal``: measures size and position of ATAC-seq fragments
18+
overlapping genomic features.
19+
* ``measure_features``: given a bigWig file of coverage counts and a BED
20+
file of features, calculates a requested statistic for each feature.
21+
* ``plot_aggregate_cut_matrix.R``: generates plots for motifs given the
1822
aggregate output of `make_cut_matrix`
23+
* ``plot_aggregate_midpoint_matrix.R``: generates plots for motifs given the
24+
aggregate output of `make_midpoint_matrix`
25+
* ``plot_signal.R``: plots the output of `measure_signal`
26+
* ``trim_adapters``: based on Jason Buenrostro's utility for trimming
27+
Illumina adapters by aligning paired reads to each other.
1928

2029
A Python library you can use in your own tools for processing ATAC-seq data
2130
---------------------------------------------------------------------------
@@ -31,7 +40,8 @@ you might find your pipeline can be simplified too.
3140
Requirements
3241
============
3342

34-
* Python. We've run it successfully under versions 2.7.10 and 3.4.3.
43+
* Python. We've run it successfully under versions 2.7 and 3.5.
44+
* pyBigWig
3545
* pysam
3646
* python-levenshtein
3747
* sexpdata
@@ -44,6 +54,10 @@ At the command line::
4454
git clone https://github.com/ParkerLab/atactk
4555
pip install ./atactk
4656

57+
Or in one step, if you don't want a local copy::
58+
59+
pip install git+https://github.com/ParkerLab/atactk
60+
4761
Documentation
4862
=============
4963

atactk/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
#
22
# atactk: ATAC-seq toolkit
33
#
4-
# Copyright 2015 The Parker Lab at the University of Michigan
4+
# Copyright 2015 Stephen Parker
55
#
66
# Licensed under Version 3 of the GPL or any later version
77
#
88

99

1010
__author__ = 'The Parker Lab'
1111
__email__ = 'parkerlab-software@umich.edu'
12-
__version__ = '0.1.6'
12+
__version__ = '0.1.9'

atactk/command.py

Lines changed: 41 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#
22
# atactk: ATAC-seq toolkit
33
#
4-
# Copyright 2015 The Parker Lab at the University of Michigan
4+
# Copyright 2015 Stephen Parker
55
#
66
# Licensed under Version 3 of the GPL or any later version
77
#
@@ -14,10 +14,13 @@
1414
from __future__ import print_function
1515

1616
import argparse
17+
import logging
1718
import sys
1819

1920
import sexpdata
2021

22+
import atactk
23+
2124

2225
def check_bins_for_overlap(bins):
2326
"""
@@ -39,28 +42,51 @@ def check_bins_for_overlap(bins):
3942
for bin in bins:
4043
start, end, resolution = bin
4144
if start <= last_end:
42-
raise argparse.ArgumentTypeError("Bin %d-%d overlaps %d-%d." % (start, end, last_start, last_end))
45+
raise argparse.ArgumentTypeError("Bin {:d}-{:d} overlaps {:d}-{:d}.".format(start, end, last_start, last_end))
4346
last_start, last_end = start, end
4447

4548

49+
def check_bin_resolutions(bins, extension):
50+
"""
51+
Check the resolutions of a flattened list of bins.
52+
53+
Look for inconsistent resolutions, which tend to give suboptimal
54+
results, or resolutions that don't divide the extended region
55+
evenly.
56+
"""
57+
logger = logging.getLogger("{}.{}".format(__name__, check_bin_resolutions.__name__))
58+
59+
resolutions = set(bin[2] for bin in bins)
60+
if len(resolutions) > 1:
61+
logger.warn("""We've found we usually get better results using the same resolution in each bin.""")
62+
63+
for bin in bins:
64+
start, end, resolution = bin
65+
if extension % resolution != 0:
66+
logger.warn("Bin {:d}-{:d} resolution {:d} is not a divisor of extension {:d}".format(start, end, resolution, extension))
67+
68+
4669
def parse_bins(bins_string):
4770
"""
48-
Parse the string representing template size groups.
71+
Parse the string representing fragment size groups.
4972
5073
The bins are specified as a list of groups, each group comprising
5174
one or more bins, and ending with a resolution value, which
5275
controls how many individual cuts in the extended region around
5376
the feature are aggregated. Within the feature itself, we always
54-
count the cut points for each base. A complete example:
77+
count the cut points for each base.
5578
56-
(36-149 1) (150-224 225-324 2) (325-400 5)
79+
We recommend using the same resolution in all bins. The following
80+
example only shows different resolutions to be thorough.
81+
82+
Assume a --bins value of '(36-149 1) (150-224 225-324 2) (325-400 5)'.
5783
5884
With a resolution of 1, every base in the extended region
59-
around motifs overlapping templates of length 36-149 would be
85+
around motifs overlapping fragments of length 36-149 would be
6086
scored independently; each base's cut count would be added to
6187
the matrix.
6288
63-
The second group, for templates of length 150-224 or 225-324,
89+
The second group, for fragments of length 150-224 or 225-324,
6490
with a resolution of 2, would result in every two bases in the
6591
extended region around motifs being added together. Then the
6692
aggregate scores of the two bins in the group would be summed,
@@ -72,7 +98,7 @@ def parse_bins(bins_string):
7298
7399
To illustrate, assume these settings and an imaginary motif 5
74100
bases long, with a 10-base extended region on either side, and
75-
for the sake of example pretend that each template length bin
101+
for the sake of example pretend that each fragment length bin
76102
had the same counts of cut points around the motif, shown
77103
here::
78104
@@ -120,7 +146,7 @@ def parse_bins(bins_string):
120146
if resolution < 1:
121147
raise ValueError
122148
except ValueError:
123-
raise argparse.ArgumentTypeError("Resolution in bin group %s is not a positive integer." % g)
149+
raise argparse.ArgumentTypeError("Resolution in bin group {} is not a positive integer.".format(g))
124150

125151
for i, bin_string in enumerate(bin_group):
126152
bin_string = bin_string.value()
@@ -131,14 +157,18 @@ def parse_bins(bins_string):
131157
start, end = [int(s) for s in bin]
132158
if start > end:
133159
start, end = end, start
134-
print("Bin %s specified backward; corrected to %d-%d" % (bin_string, start, end), file=sys.stderr)
160+
print("Bin {} specified backward; corrected to {:d}-{:d}".format(bin_string, start, end), file=sys.stderr)
135161

136162
group.append((start, end, resolution))
137163
except ValueError:
138-
raise argparse.ArgumentTypeError("Bin %s in group %s is malformed." % (i, g))
164+
raise argparse.ArgumentTypeError("Bin {} in group {} is malformed.".format(i, g))
139165
groups.append(group)
140166

141167
# flatten groups to just a list of bins, sort, check for overlaps
142168
bins = sorted([b for bins in groups for b in bins])
143169
check_bins_for_overlap(bins)
144170
return groups
171+
172+
173+
def version(program_name=''):
174+
return program_name + ' ' + atactk.__version__

atactk/data.py

Lines changed: 32 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#
22
# atactk: ATAC-seq toolkit
33
#
4-
# Copyright 2015 The Parker Lab at the University of Michigan
4+
# Copyright 2015 Stephen Parker
55
#
66
# Licensed under Version 3 of the GPL or any later version
77
#
@@ -11,8 +11,6 @@
1111
Code for reading and manipulating data commonly used in ATAC-seq pipelines.
1212
"""
1313

14-
from __future__ import print_function
15-
1614
import csv
1715
import gzip
1816
import pysam
@@ -80,15 +78,12 @@ def __init__(self, reference=None, start=None, end=None, name=None, score=0, str
8078
self.feature_end = int(end)
8179

8280
# optional BED fields
83-
self.name = name
84-
self.score = float(score)
85-
self.strand = strand
81+
self.name = name or None
82+
self.score = score and float(score) or None
83+
self.strand = strand or None
8684

8785
# region adjustments
8886
self.extension = int(extension)
89-
self.is_reverse = strand == '-'
90-
self.region_start = self.feature_start - self.extension
91-
self.region_end = self.feature_end + self.extension
9287

9388
def __str__(self):
9489
return '\t'.join(str(attribute or '') for attribute in [
@@ -101,13 +96,30 @@ def __str__(self):
10196
self.extension,
10297
])
10398

99+
@property
100+
def center(self):
101+
return self.feature_start + int(round(self.feature_length / 2.0))
102+
104103
@property
105104
def feature_length(self):
106105
return self.feature_end - self.feature_start
107106

107+
@property
108+
def is_reverse(self):
109+
return self.strand == '-'
110+
111+
@property
112+
def region_end(self):
113+
return self.center + self.extension
114+
108115
@property
109116
def region_length(self):
110-
return self.region_end - self.region_start
117+
return self.extension * 2
118+
119+
@property
120+
def region_start(self):
121+
return self.center - self.extension
122+
111123

112124
def complement(seq):
113125
"""
@@ -175,8 +187,9 @@ def open_maybe_gzipped(filename):
175187

176188
def count_features(filename):
177189
count = 0
178-
for line in open_maybe_gzipped(filename):
179-
count += 1
190+
with open_maybe_gzipped(filename) as f:
191+
for line in f:
192+
count += 1
180193
return count
181194

182195

@@ -203,32 +216,24 @@ def read_features(filename, extension=100, feature_class=ExtendedFeature):
203216
An :class:`ExtendedFeature` instance for each row of the file.
204217
"""
205218

206-
if filename == '-':
207-
source = sys.stdin
208-
else:
209-
source = open_maybe_gzipped(filename)
210-
reader = csv.DictReader(source, fieldnames=FEATURE_FIELDNAMES, restkey='extra_fields', dialect='excel-tab')
211-
for row in reader:
212-
if 'extra_fields' in row:
213-
del row['extra_fields']
214-
yield feature_class(extension=extension, **row)
219+
with filename == '-' and sys.stdin or open_maybe_gzipped(filename) as source:
220+
reader = csv.DictReader(source, fieldnames=FEATURE_FIELDNAMES, restkey='extra_fields', dialect='excel-tab')
215221

216-
217-
ALIGNMENT_FILE_CACHE = {}
222+
for row in reader:
223+
if 'extra_fields' in row:
224+
del row['extra_fields']
225+
yield feature_class(extension=extension, **row)
218226

219227

220228
def open_alignment_file(alignment_filename):
221-
if alignment_filename in ALIGNMENT_FILE_CACHE:
222-
return ALIGNMENT_FILE_CACHE[alignment_filename]
223-
224229
alignment_file = pysam.AlignmentFile(alignment_filename, 'rb')
225230
try:
226231
alignment_file.check_index()
227232
except AttributeError:
228233
raise AttributeError('The alignments file {} is not in BAM format. Please supply an indexed BAM file.'.format(alignment_filename))
229234
except ValueError:
230235
raise ValueError('The alignment file {} is not usable. Please supply an indexed BAM file.'.format(alignment_filename))
231-
ALIGNMENT_FILE_CACHE[alignment_filename] = alignment_file
236+
232237
return alignment_file
233238

234239

atactk/metrics/__init__.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
#
2+
# atactk: ATAC-seq toolkit
3+
#
4+
# Copyright 2015 Stephen Parker
5+
#
6+
# Licensed under Version 3 of the GPL or any later version
7+
#

atactk/metrics/common.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
#
2+
# atactk: ATAC-seq toolkit
3+
#
4+
# Copyright 2015 Stephen Parker
5+
#
6+
# Licensed under Version 3 of the GPL or any later version
7+
#
8+
9+
10+
import atactk.data
11+
import atactk.util
12+
13+
14+
DEFAULT_CUT_POINT_OFFSET = 4
15+
16+
17+
def reduce_scores(scores, resolution):
18+
"""
19+
Reduce a sequence of scores by summing every `resolution` values.
20+
21+
Called with `scores` of [0, 1, 1, 4, 2], you'd get the following
22+
results at various resolutions:
23+
24+
========== ======
25+
Resolution Result
26+
========== ======
27+
1 [0, 1, 1, 4, 2]
28+
2 [1, 5, 2]
29+
3 [2, 6]
30+
4 [6, 2]
31+
10 [8]
32+
========== ======
33+
"""
34+
if resolution == 1:
35+
return scores
36+
return [sum(chunk) for chunk in atactk.util.partition(resolution, scores)]

0 commit comments

Comments
 (0)