Skip to content

Commit 45c1fcc

Browse files
Dom LaetschDom Laetsch
authored andcommitted
release 0.9.4
1 parent c2c800b commit 45c1fcc

File tree

7 files changed

+185
-21
lines changed

7 files changed

+185
-21
lines changed

bam2cov.py

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
#!/usr/bin/env python
2+
# -*- coding: utf-8 -*-
3+
4+
"""usage: blobtools bam2cov -i FASTA -b BAM [-h|--help]
5+
6+
Options:
7+
-h --help show this
8+
-i, --infile FASTA FASTA file of assembly. Headers are split at whitespaces.
9+
-b, --bam <BAM> BAM file (requires samtools in $PATH)
10+
"""
11+
12+
from __future__ import division
13+
import lib.BtLog as BtLog
14+
from docopt import docopt
15+
import re
16+
import subprocess
17+
import os
18+
19+
class Fasta():
20+
def __init__(self, name, seq):
21+
self.name = name
22+
self.length = len(seq)
23+
self.n_count = seq.count('N')
24+
self.agct_count = self.length - self.n_count
25+
self.cov = 0.0
26+
27+
def which(program):
28+
def is_exe(fpath):
29+
return os.path.isfile(fpath) and os.access(fpath, os.X_OK)
30+
fpath, fname = os.path.split(program)
31+
if fpath:
32+
if is_exe(program):
33+
return program
34+
else:
35+
for path in os.environ["PATH"].split(os.pathsep):
36+
path = path.strip('"')
37+
exe_file = os.path.join(path, program)
38+
if is_exe(exe_file):
39+
return exe_file
40+
return None
41+
42+
def runCmd(command):
43+
cmd = command.split() # sanitation
44+
p = subprocess.Popen(cmd,
45+
stdout=subprocess.PIPE,
46+
stderr=subprocess.STDOUT)
47+
return iter(p.stdout.readline, b'')
48+
49+
def readFasta(infile):
50+
with open(infile) as fh:
51+
header, seqs = '', []
52+
for l in fh:
53+
if l[0] == '>':
54+
if (header):
55+
yield header, ''.join(seqs)
56+
header, seqs = l[1:-1].split()[0], [] # Header is split at first whitespace
57+
else:
58+
seqs.append(l[:-1])
59+
yield header, ''.join(seqs)
60+
61+
def parseFasta(infile):
62+
fasta_dict = {}
63+
for name, seq in readFasta(infile):
64+
fasta = Fasta(name, seq)
65+
fasta_dict[fasta.name] = fasta
66+
return fasta_dict
67+
68+
def checkBam(infile):
69+
print BtLog.status_d['10']
70+
if not (which('samtools')):
71+
BtLog.error('7')
72+
reads_mapped_re = re.compile(r"(\d+)\s\+\s\d+\smapped")
73+
reads_total_re = re.compile(r"(\d+)\s\+\s\d+\sin total")
74+
reads_total, reads_mapped = 0, 0
75+
output = ''
76+
command = "samtools flagstat " + infile
77+
for line in runCmd(command):
78+
output += line
79+
reads_mapped = int(reads_mapped_re.search(output).group(1))
80+
reads_total = int(reads_total_re.search(output).group(1))
81+
print BtLog.status_d['11'] % ('{:,}'.format(reads_mapped), '{:,}'.format(reads_total), '{0:.1%}'.format(reads_mapped/reads_total))
82+
return reads_total, reads_mapped
83+
84+
def readBam(infile, fasta_headers):
85+
reads_total, reads_mapped = checkBam(infile)
86+
progress_unit = int(int(reads_total)/1000)
87+
base_cov_dict = {}
88+
cigar_match_re = re.compile(r"(\d+)M") # only gets digits before M's
89+
# execute samtools to get only mapped reads
90+
command = "samtools view -F 4 " + infile
91+
# only one counter since only yields mapped reads
92+
parsed_reads = 0
93+
for line in runCmd(command):
94+
match = line.split("\t")
95+
if match >= 11:
96+
seq_name = match[2]
97+
base_cov = sum([int(matching) for matching in cigar_match_re.findall(match[5])])
98+
if (base_cov):
99+
parsed_reads += 1
100+
if seq_name not in fasta_headers:
101+
print BtLog.warn_d['2'] % (seq_name, infile)
102+
else:
103+
base_cov_dict[seq_name] = base_cov_dict.get(seq_name, 0) + base_cov
104+
BtLog.progress(parsed_reads, progress_unit, reads_total)
105+
BtLog.progress(reads_total, progress_unit, reads_total)
106+
107+
if not int(reads_mapped) == int(parsed_reads):
108+
print warn_d['3'] % (reads_mapped, parsed_reads)
109+
return base_cov_dict, reads_total, parsed_reads
110+
111+
def parseBam(bam_f, fasta_dict):
112+
base_cov_dict, reads_total, reads_mapped = readBam(bam_f, set(fasta_dict.keys()))
113+
if reads_total == 0:
114+
print BtLog.warn_d['4'] % bam_f
115+
for name, base_cov in base_cov_dict.items():
116+
fasta_dict[name].cov = base_cov / fasta_dict[name].agct_count
117+
return fasta_dict
118+
119+
def writeCov(fasta_dict, out_f):
120+
with open(out_f, 'w') as fh:
121+
for name, fasta_obj in fasta_dict.items():
122+
fh.write("%s\t%s\n" % (name, fasta_obj.cov))
123+
124+
if __name__ == '__main__':
125+
args = docopt(__doc__)
126+
127+
fasta_f = args['--infile']
128+
bam_f = args['--bam']
129+
out_f = os.path.basename(bam_f) + ".cov"
130+
131+
fasta_dict = parseFasta(fasta_f)
132+
fasta_dict = parseBam(bam_f, fasta_dict)
133+
writeCov(fasta_dict, out_f)
134+

blobtools

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ commands:
99
view print BlobDB
1010
plot plot BlobDB as a blobplot
1111
12+
bam2cov generate cov file from bam file
13+
1214
-h --help show this
1315
1416
"""
@@ -22,7 +24,7 @@ from docopt import docopt
2224
if __name__ == '__main__':
2325
main_dir = join(dirname(__file__), '')
2426
args = docopt(__doc__,
25-
version='version 0.1',
27+
version='version 0.9.4',
2628
options_first=True)
2729
#print(args)
2830

@@ -33,5 +35,7 @@ if __name__ == '__main__':
3335
exit(call(['python', main_dir + 'view.py'] + argv))
3436
elif args['<command>'] == 'plot':
3537
exit(call(['python', main_dir + 'plot.py'] + argv))
38+
elif args['<command>'] == 'bam2cov':
39+
exit(call(['python', main_dir + 'bam2cov.py'] + argv))
3640
else:
3741
exit("%r is not a blobtools command. See 'blobtools -h'." % args['<command>'])

lib/BtCore.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ def load(self, BlobDb_f):
115115
self.hitLibs = blobDict['hitLibs']
116116
self.taxrules = blobDict['taxrules']
117117

118-
def getPlotData(self, rank, min_length, hide_nohits, taxrule, c_index):
118+
def getPlotData(self, rank, min_length, hide_nohits, taxrule, c_index, catcolour_dict):
119119
data_dict = {}
120120
read_cov_dict = {}
121121
max_cov = 0.0
@@ -125,7 +125,9 @@ def getPlotData(self, rank, min_length, hide_nohits, taxrule, c_index):
125125
for blob in self.dict_of_blobs.values():
126126
name, gc, length, group = blob['name'], blob['gc'], blob['length'], ''
127127

128-
if (c_index): # annotation with c_index instead of taxonomic group
128+
if (catcolour_dict): # annotation with categories specified in catcolour
129+
group = str(catcolour_dict[name])
130+
elif (c_index): # annotation with c_index instead of taxonomic group
129131
group = str(blob['taxonomy'][taxrule][rank]['c_index'])
130132
else: # annotation with taxonomic group
131133
group = str(blob['taxonomy'][taxrule][rank]['tax'])

lib/BtIO.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212
from __future__ import division
1313
import re
1414
import subprocess
15-
import commands
1615
from os.path import basename, isfile, abspath
1716
import os
1817
import lib.BtLog as BtLog

lib/BtLog.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -51,9 +51,10 @@ def progress(iteration, steps, max_value):
5151
'18' : '[ERROR:18]\t: Please provide a tax file in BLAST format.',
5252
'19' : '[ERROR:19]\t: Sequence %s in file %s is not part of the assembly.',
5353
'20' : '[ERROR:20]\t: Please add "clc_mapping_info" to you PATH variable.',
54-
'21' : '[ERROR:21]\t: Refcov FILE does not seem to have the right format.',
55-
'22' : '[ERROR:22]\t: Tax file %s seems to have no taxids.'
56-
54+
'21' : '[ERROR:21]\t: Refcov file %s does not seem to have the right format.',
55+
'22' : '[ERROR:22]\t: Tax file %s seems to have no taxids.',
56+
'23' : '[ERROR:23]\t: Catcolour file %s does not seem to have the right format.',
57+
'24' : '[ERROR:24]\t: Catcolour file incompatible with c-index colouring.'
5758
}
5859

5960
warn_d = {

lib/BtPlot.py

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
mat.rcParams['lines.antialiased'] = True
2929

3030
FONTSIZE = 24
31-
COLOURMAP = "rainbow" # "Set1", "Paired", "Set2", "Spectral"
31+
COLOURMAP = "Spectral" # "Set1", "Paired", "Set2", "Spectral"
3232
BLACK, GREY, BGGREY, WHITE, DGREY = unicode('#262626'), unicode('#d3d3d3'), unicode('#F0F0F5'), unicode('#ffffff'), unicode('#4d4d4d')
3333
nullfmt = NullFormatter()
3434

@@ -58,9 +58,21 @@ def parseRefCov(refcov_f):
5858
'reads_mapped' : int(reads_mapped_ref)
5959
}
6060
except:
61-
BtLog.error('21')
61+
BtLog.error('21', refcov_f)
6262
return refcov_dict
6363

64+
def parseCatColour(catcolour_f):
65+
catcolour_dict = {}
66+
with open(catcolour_f) as fh:
67+
for l in fh:
68+
try:
69+
seq_name, category = l.rstrip("\n").split(",")
70+
catcolour_dict[seq_name] = category
71+
except:
72+
BtLog.error('23', catcolour_f)
73+
return catcolour_dict
74+
75+
6476
def getSortedGroups(data_dict, sort_order):
6577
""" Returns list of sorted groups based on span or count. """
6678
sorted_groups = []
@@ -301,7 +313,7 @@ def relabel_and_colour(self, colour_f, user_labels):
301313
self.plot_order.append('other')
302314

303315
def plotReadCov(self, refcov_dict):
304-
mat.rcParams.update({'font.size': 18})
316+
mat.rcParams.update({'font.size': 24})
305317
plot_data = {}
306318

307319
main_columns = 2
@@ -418,7 +430,7 @@ def plotBlobs(self, cov_lib, info_flag):
418430
group_number_of_seqs = self.stats[group]['count_visible']
419431
group_n50 = self.stats[group]['n50']
420432
blob_size_array = []
421-
s, lw, alpha, colour = 15, 0.5, 1, self.colours[group]
433+
s, lw, alpha, colour = 15, 0.5, 0.5, self.colours[group]
422434
if (self.ignore_contig_length):
423435
if not group == "no-hit":
424436
s = 65
@@ -448,7 +460,7 @@ def plotBlobs(self, cov_lib, info_flag):
448460
if (self.multiplot):
449461
axLegend.legend(legend_handles, legend_labels, loc=6, numpoints=1, fontsize=FONTSIZE, frameon=True)
450462
plot_ref_legend(axScatter)
451-
m_out_f = "%s.%s.%s.blobs.%s" % (self.out_f, i, group, self.format)
463+
m_out_f = "%s.%s.%s.blobs.%s" % (self.out_f, i, group.replace("/", "_").replace(" ", "_"), self.format)
452464
print BtLog.status_d['8'] % m_out_f
453465
plt.savefig(m_out_f, format=self.format)
454466
if not (self.ignore_contig_length):

plot.py

Lines changed: 21 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@
55
[-r RANK] [-x TAXRULE] [--label GROUPS...]
66
[-o PREFIX] [-m] [--sort ORDER] [--hist HIST] [--title]
77
[--colours FILE] [--include FILE] [--exclude FILE]
8-
[--format FORMAT] [--noblobs] [--noreads] [--refcov FILE]
8+
[--format FORMAT] [--noblobs] [--noreads]
9+
[--refcov FILE] [--catcolour FILE]
910
[-h|--help]
1011
1112
Options:
@@ -41,8 +42,11 @@
4142
--noblobs Omit blobplot [default: False]
4243
--noreads Omit plot of reads mapping [default: False]
4344
--refcov FILE File containing number of "total" and "mapped" reads
44-
per coverage file. (e.g.: bam0,900,100). If provided, info
45-
will be used in read coverage plot(s).
45+
per coverage file. (e.g.: bam0,900,100). If provided, info
46+
will be used in read coverage plot(s).
47+
--catcolour FILE Colour plot based on categories from FILE
48+
(format : "seq\tcategory").
49+
4650
"""
4751

4852
from __future__ import division
@@ -79,6 +83,7 @@
7983
no_plot_blobs = args['--noblobs']
8084
no_plot_reads = args['--noreads']
8185
refcov_f = args['--refcov']
86+
catcolour_f = args['--catcolour']
8287

8388
# Does blobdb_f exist ?
8489
if not isfile(blobdb_f):
@@ -112,6 +117,14 @@
112117
if (refcov_f):
113118
refcov_dict = BtPlot.parseRefCov(refcov_f)
114119

120+
catcolour_dict = {}
121+
if (catcolour_f) and (c_index):
122+
BtLog.error('24')
123+
elif (catcolour_f):
124+
catcolour_dict = BtPlot.parseCatColour(catcolour_f)
125+
else:
126+
pass
127+
115128
# Load BlobDb
116129
print BtLog.status_d['9'] % blobdb_f
117130
blobDB = bt.BlobDb('new')
@@ -124,10 +137,8 @@
124137
# Is taxrule sane and was it computed?
125138
if taxrule not in blobDB.taxrules:
126139
BtLog.error('11', taxrule, blobDB.taxrules)
127-
128-
# Get arrays and filter_dict (filter_dict lists, span/count passing filter) for those groups passing min_length, rank, hide_nohits ...
129-
# make it part of core , get data by group ... should be used by stats, generalise ...
130-
data_dict, max_cov, cov_libs, cov_libs_total_reads = blobDB.getPlotData(rank, min_length, hide_nohits, taxrule, c_index)
140+
141+
data_dict, max_cov, cov_libs, cov_libs_total_reads = blobDB.getPlotData(rank, min_length, hide_nohits, taxrule, c_index, catcolour_dict)
131142
plotObj = BtPlot.PlotObj(data_dict, cov_libs, cov_libs_total_reads)
132143
plotObj.exclude_groups = exclude_groups
133144
plotObj.format = format
@@ -139,8 +150,7 @@
139150
plotObj.max_group_plot = max_group_plot
140151
plotObj.group_order = BtPlot.getSortedGroups(data_dict, sort_order)
141152
plotObj.labels.update(plotObj.group_order)
142-
#if len(plotObj.group_order) > plotObj.max_group_plot:
143-
# plotObj.labels.add('other')
153+
144154
if (user_labels):
145155
for group, label in user_labels.items():
146156
plotObj.labels.add(label)
@@ -157,6 +167,8 @@
157167
out_f = "%s.%s.%s.p%s" % (title, hist_type, rank, max_group_plot)
158168
if out_prefix:
159169
out_f = "%s.%s" % (out_prefix, out_f)
170+
if catcolour_dict:
171+
out_f = "%s.%s" % (out_f, "catcolour")
160172
if ignore_contig_length:
161173
out_f = "%s.%s" % (out_f, "noscale")
162174
if c_index:

0 commit comments

Comments
 (0)