Skip to content

Commit 4aaf05d

Browse files
authored
Merge pull request #27 from DRL/bugfix
Bugfix
2 parents 8bad6a5 + ba704f9 commit 4aaf05d

13 files changed

+108
-109
lines changed

bloblib/BtCore.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -437,7 +437,7 @@ def calculateGC(self, seq):
437437
if self.agct_count > 0 else 0.0)
438438

439439
def addCov(self, lib_name, cov):
440-
self.covs[lib_name] = cov
440+
self.covs[lib_name] = float("{0:.3f}".format(cov)) # changed to three decimal digits
441441

442442
def addReadCov(self, lib_name, read_cov):
443443
self.read_cov[lib_name] = read_cov

bloblib/BtIO.py

Lines changed: 30 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -9,20 +9,18 @@
99
from __future__ import division
1010
import re
1111
import subprocess
12-
from os.path import basename, isfile, abspath, splitext, join, isdir
13-
import shutil
1412
import os
15-
import sys
13+
from os.path import basename, isfile, splitext, join, isdir
14+
import shutil
1615
import bloblib.BtLog as BtLog
17-
from collections import deque
1816

1917

2018
def create_dir(directory="", overwrite=True):
21-
if (directory):
19+
if directory:
2220
if not isdir(directory):
2321
os.makedirs(directory)
2422
else:
25-
if (overwrite):
23+
if overwrite:
2624
shutil.rmtree(directory) #removes all the subdirectories!
2725
os.makedirs(directory)
2826
return directory
@@ -31,7 +29,7 @@ def create_dir(directory="", overwrite=True):
3129

3230
def parseList(infile):
3331
if not isfile(infile):
34-
BtLog.error('0', infile)
32+
BtLog.error('0', infile)
3533
with open(infile) as fh:
3634
items = []
3735
for l in fh:
@@ -40,7 +38,7 @@ def parseList(infile):
4038

4139
def parseReferenceCov(infile):
4240
refcov_dict = {}
43-
if (infile):
41+
if infile:
4442
if not isfile(infile):
4543
BtLog.error('0', infile)
4644
with open(infile) as fh:
@@ -55,7 +53,7 @@ def parseReferenceCov(infile):
5553

5654
def parseCmdlist(temp):
5755
_list = []
58-
if (temp):
56+
if temp:
5957
if "," in temp:
6058
_list = temp.split(",")
6159
else:
@@ -65,7 +63,7 @@ def parseCmdlist(temp):
6563
def parseCmdLabels(labels):
6664
label_d = {}
6765
name, groups = '', ''
68-
if (labels):
66+
if labels:
6967
try:
7068
for label in labels:
7169
name, groups = str(label).split("=")
@@ -80,7 +78,7 @@ def parseCmdLabels(labels):
8078

8179
def parseCatColour(infile):
8280
catcolour_dict = {}
83-
if (infile):
81+
if infile:
8482
if not isfile(infile):
8583
BtLog.error('0', infile)
8684
with open(infile) as fh:
@@ -94,7 +92,7 @@ def parseCatColour(infile):
9492

9593
def parseDict(infile, key, value):
9694
items = {}
97-
if (infile):
95+
if infile:
9896
if not isfile(infile):
9997
BtLog.error('0', infile)
10098
with open(infile) as fh:
@@ -108,7 +106,7 @@ def parseDict(infile, key, value):
108106

109107
def parseColours(infile):
110108
items = {}
111-
if (infile):
109+
if infile:
112110
if not isfile(infile):
113111
BtLog.error('0', infile)
114112
with open(infile) as fh:
@@ -119,7 +117,7 @@ def parseColours(infile):
119117

120118
def parseSet(infile):
121119
if not isfile(infile):
122-
BtLog.error('0', infile)
120+
BtLog.error('0', infile)
123121
with open(infile) as fh:
124122
items = set()
125123
for l in fh:
@@ -134,12 +132,12 @@ def parseFastaNameOrder(infile):
134132

135133
def readFasta(infile):
136134
if not isfile(infile):
137-
BtLog.error('0', infile)
135+
BtLog.error('0', infile)
138136
with open(infile) as fh:
139137
header, seqs = '', []
140138
for l in fh:
141139
if l[0] == '>':
142-
if (header):
140+
if header:
143141
yield header, ''.join(seqs)
144142
header, seqs = l[1:-1].split()[0], [] # Header is split at first whitespace
145143
else:
@@ -173,8 +171,8 @@ def is_exe(fpath):
173171
def checkBam(infile):
174172
print BtLog.status_d['10']
175173
if not isfile(infile):
176-
BtLog.error('0', infile)
177-
if not (which('samtools')):
174+
BtLog.error('0', infile)
175+
if not which('samtools'):
178176
BtLog.error('7')
179177
reads_mapped_re = re.compile(r"(\d+)\s\+\s\d+\smapped")
180178
reads_secondary_re = re.compile(r"(\d+)\s\+\s\d+\ssecondary")
@@ -189,14 +187,15 @@ def checkBam(infile):
189187
reads_mapped = reads_mapped - reads_secondary
190188
reads_total = int(reads_total_re.search(output).group(1))
191189
# check whether there are reads in BAM
192-
if not (reads_total) or not (reads_mapped):
193-
BtLog.error('29' % infile)
194-
print BtLog.status_d['11'] % ('{:,}'.format(reads_mapped), '{:,}'.format(reads_total), '{0:.1%}'.format(reads_mapped/reads_total))
190+
if not reads_total or not reads_mapped:
191+
BtLog.error('29' % infile)
192+
print BtLog.status_d['11'] % ('{:,}'.format(reads_mapped), \
193+
'{:,}'.format(reads_total), '{0:.1%}'.format(reads_mapped/reads_total))
195194
return reads_total, reads_mapped
196195

197196
def parseSam(infile, set_of_blobs, no_base_cov_flag):
198197
if not isfile(infile):
199-
BtLog.error('0', infile)
198+
BtLog.error('0', infile)
200199
base_cov_dict = {blob : [] for blob in set_of_blobs}
201200
read_cov_dict = {blob : 0 for blob in set_of_blobs}
202201
cigar_match_re = re.compile(r"(\d+)M|X|=") # only gets digits before M,X,='s
@@ -241,15 +240,15 @@ def parseBam(infile, set_of_blobs, no_base_cov_flag):
241240
242241
'''
243242
if not isfile(infile):
244-
BtLog.error('0', infile)
243+
BtLog.error('0', infile)
245244
reads_total, reads_mapped = checkBam(infile)
246245
progress_unit = int(reads_mapped/1000)
247246
base_cov_dict = {blob : [] for blob in set_of_blobs}
248247
#base_cov_dict = {blob : 0 for blob in set_of_blobs}
249248
read_cov_dict = {blob : 0 for blob in set_of_blobs}
250249
cigar_match_re = re.compile(r"(\d+)M|X|=") # only gets digits before M,X,='s
251250
# execute samtools to get only mapped reads (no optial duplicates, no 2nd-ary alignment)
252-
command = "samtools view -F 1028 -F 4 -F 256 " + infile
251+
command = "samtools view -F 1024 -F 4 -F 256 " + infile
253252
seen_reads = 0
254253
#import time
255254
#start = time.time()
@@ -308,7 +307,7 @@ def parseCovFromHeader(fasta_type, header):
308307

309308
def parseCov(infile, set_of_blobs):
310309
if not isfile(infile):
311-
BtLog.error('0', infile)
310+
BtLog.error('0', infile)
312311
old_cov_line_re = re.compile(r"^(\S+)\t(\d+\.*\d*)")
313312
base_cov_dict = {}
314313

@@ -361,7 +360,7 @@ def parseCov(infile, set_of_blobs):
361360
def checkCas(infile):
362361
print BtLog.status_d['12']
363362
if not isfile(infile):
364-
BtLog.error('0', infile)
363+
BtLog.error('0', infile)
365364
if not (which('clc_mapping_info')):
366365
BtLog.error('20')
367366
seqs_total_re = re.compile(r"\s+Contigs\s+(\d+)")
@@ -380,7 +379,7 @@ def checkCas(infile):
380379

381380
def parseCas(infile, order_of_blobs):
382381
if not isfile(infile):
383-
BtLog.error('0', infile)
382+
BtLog.error('0', infile)
384383
seqs_total, reads_total, reads_mapped = checkCas(infile)
385384
progress_unit = int(len(order_of_blobs)/100)
386385
cas_line_re = re.compile(r"\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+.\d{2})\s+(\d+)\s+(\d+.\d{2})")
@@ -413,7 +412,7 @@ def readTax(infile, set_of_blobs):
413412
- add as key-value pairs to hitDict
414413
'''
415414
if not isfile(infile):
416-
BtLog.error('0', infile)
415+
BtLog.error('0', infile)
417416
hit_line_re = re.compile(r"^(\S+)\s+(\d+)[\;?\d+]*\s+(\d+\.*\d*)") # TEST TEST , if not split it afterwards
418417
with open(infile) as fh:
419418
for line in fh:
@@ -507,7 +506,7 @@ def readNamesNodes(names_f, nodes_f):
507506
for line in fh:
508507
names_col = line.split("\t")
509508
if names_col[6] == "scientific name":
510-
nodesDB[names_col[0]]['name'] = names_col[2]
509+
nodesDB[names_col[0]]['name'] = names_col[2]
511510
nodesDB['nodes_count'] = nodes_count
512511
return nodesDB
513512

@@ -545,7 +544,7 @@ def byteify(input):
545544
http://stackoverflow.com/a/13105359
546545
'''
547546
if isinstance(input, dict):
548-
return {byteify(key):byteify(value) for key,value in input.iteritems()}
547+
return {byteify(key):byteify(value) for key, value in input.iteritems()}
549548
elif isinstance(input, list):
550549
return [byteify(element) for element in input]
551550
elif isinstance(input, unicode):
@@ -580,7 +579,7 @@ def parseJsonGzip(infile):
580579
def parseJson(infile):
581580
'''http://artem.krylysov.com/blog/2015/09/29/benchmark-python-json-libraries/'''
582581
if not isfile(infile):
583-
BtLog.error('0', infile)
582+
BtLog.error('0', infile)
584583
import time
585584
start = time.time()
586585
json_parser = ''

bloblib/BtLog.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,8 @@ def progress(iteration, steps, max_value, no_limit=False):
8686
'6' : '[WARN]\t\t: Sum of coverage in cov lib %s is 0.0. Please ignore this warning if "--no_base_cov" was specified.',
8787
'7' : '[WARN]\t\t: No taxonomy information found.',
8888
'8' : '[WARN]\t\t: Duplicated sequences found :\n\t\t\t%s',
89-
'9' : '[WARN]\t\t: Taxrule "%s" was not computed for this BlobDb. Available taxrule(s) : %s. Will proceed without taxonomic annotation ...'
89+
'9' : '[WARN]\t\t: Taxrule "%s" was not computed for this BlobDb. Available taxrule(s) : %s. Will proceed without taxonomic annotation ...',
90+
'10' : '[WARN]\t\t: Line %s: sequence "%s" already has TaxID "%s". Skipped. (use --force to overwrite)'
9091

9192
}
9293
status_d = {

bloblib/BtTax.py

Lines changed: 29 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -11,42 +11,42 @@
1111
TAXRULES = ['bestsum', 'bestsumorder'] # this should be re-named colour rules at one point
1212

1313
def noHit():
14-
return {rank : {'tax' : 'no-hit', 'score' : 0.0, 'c_index' : None} for rank in RANKS}
14+
return {rank : {'tax' : 'no-hit', 'score' : 0.0, 'c_index' : None} for rank in RANKS}
1515

1616
def getTreeList(taxIds, nodesDB):
1717
known_tree_lists = {}
1818
for taxId in taxIds:
19-
if not taxId in known_tree_lists:
20-
tree_list = []
21-
nextTaxId = [taxId]
22-
while nextTaxId:
23-
thisTaxId = nextTaxId.pop(0)
24-
if (not thisTaxId == '1') and (thisTaxId in nodesDB):
25-
parent = nodesDB[thisTaxId]['parent']
26-
nextTaxId.append(parent)
27-
tree_list.append(thisTaxId)
28-
else:
29-
tree_list.append('1')
30-
known_tree_lists[taxId] = tree_list
19+
if not taxId in known_tree_lists:
20+
tree_list = []
21+
nextTaxId = [taxId]
22+
while nextTaxId:
23+
thisTaxId = nextTaxId.pop(0)
24+
if (not thisTaxId == '1') and (thisTaxId in nodesDB):
25+
parent = nodesDB[thisTaxId]['parent']
26+
nextTaxId.append(parent)
27+
tree_list.append(thisTaxId)
28+
else:
29+
tree_list.append('1')
30+
known_tree_lists[taxId] = tree_list
3131
return known_tree_lists
3232

3333
def getLineages(tree_lists, nodesDB):
34-
lineage = {}
35-
for tree_list_id, tree_list in tree_lists.items():
36-
lineage[tree_list_id] = {rank : 'undef' for rank in RANKS}
37-
for taxId in tree_list:
38-
node = nodesDB[taxId]
39-
if node['rank'] in RANKS:
40-
lineage[tree_list_id][node['rank']] = node['name']
41-
# traverse ranks again so that undef is "higher_def_rank" + "-" + undef
42-
def_rank = ''
43-
for rank in reversed(list(RANKS)):
44-
if not lineage[tree_list_id][rank] == 'undef':
45-
def_rank = lineage[tree_list_id][rank]
46-
else:
47-
if (def_rank):
48-
lineage[tree_list_id][rank] = def_rank + "-" + lineage[tree_list_id][rank]
49-
return lineage
34+
lineage = {}
35+
for tree_list_id, tree_list in tree_lists.items():
36+
lineage[tree_list_id] = {rank : 'undef' for rank in RANKS}
37+
for taxId in tree_list:
38+
node = nodesDB[taxId]
39+
if node['rank'] in RANKS:
40+
lineage[tree_list_id][node['rank']] = node['name']
41+
# traverse ranks again so that undef is "higher_def_rank" + "-" + undef
42+
def_rank = ''
43+
for rank in reversed(list(RANKS)):
44+
if not lineage[tree_list_id][rank] == 'undef':
45+
def_rank = lineage[tree_list_id][rank]
46+
else:
47+
if (def_rank):
48+
lineage[tree_list_id][rank] = def_rank + "-" + lineage[tree_list_id][rank]
49+
return lineage
5050

5151
def taxRuleBestSum(taxDict, taxonomy, min_bitscore_diff, tax_collision_random):
5252
tempTax = { rank : {} for rank in RANKS }

bloblib/blobplot.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,8 @@
3434
span : span-weighted histograms
3535
count : count histograms
3636
-r, --rank <RANK> Taxonomic rank used for colouring of blobs [default: phylum]
37-
(Supported: species, genus, family, order, phylum, superkingdom)
37+
(Supported: species, genus, family, order,
38+
phylum, superkingdom)
3839
-x, --taxrule <TAXRULE> Taxrule which has been used for computing taxonomy
3940
(Supported: bestsum, bestsumorder) [default: bestsum]
4041
--format FORMAT Figure format for plot (png, pdf, eps, jpeg,

bloblib/covplot.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,8 @@
4040
span : span-weighted histograms
4141
count : count histograms
4242
-r, --rank <RANK> Taxonomic rank used for colouring of blobs [default: phylum]
43-
(Supported: species, genus, family, order, phylum, superkingdom)
43+
(Supported: species, genus, family, order,
44+
phylum, superkingdom)
4445
-x, --taxrule <TAXRULE> Taxrule which has been used for computing taxonomy
4546
(Supported: bestsum, bestsumorder) [default: bestsum]
4647
--format FORMAT Figure format for plot (png, pdf, eps, jpeg,
@@ -57,7 +58,7 @@
5758
per coverage file. (e.g.: bam0,900,100). If provided, info
5859
will be used in read coverage plot(s).
5960
--catcolour <FILE> Colour plot based on categories from FILE
60-
(format : "seq\tcategory").
61+
(format : "seq,category").
6162
"""
6263

6364
from __future__ import division

bloblib/create.py

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,20 +4,23 @@
44
"""usage: blobtools create -i FASTA [-y FASTATYPE] [-o PREFIX] [--title TITLE]
55
[-b BAM...] [-s SAM...] [-a CAS...] [-c COV...]
66
[--nodes <NODES>] [--names <NAMES>] [--db <NODESDB>]
7-
[-t TAX...] [-x TAXRULE...] [-m INT] [--tax_collision_random]
7+
[-t HITS...] [-x TAXRULE...] [-m INT] [--tax_collision_random]
88
[-h|--help]
99
1010
Options:
1111
-h --help show this
1212
-i, --infile FASTA FASTA file of assembly. Headers are split at whitespaces.
1313
-y, --type FASTATYPE Assembly program used to create FASTA. If specified,
1414
coverage will be parsed from FASTA header.
15-
(Parsing supported for 'spades', 'soap', 'velvet', 'abyss', 'platanus')
16-
-t, --taxfile TAX... Taxonomy file in format (qseqid\\ttaxid\\tbitscore)
15+
(Parsing supported for 'spades', 'velvet', 'platanus')
16+
-t, --hitsfile HITS... Hits file in format (qseqid\\ttaxid\\tbitscore)
1717
(e.g. BLAST output "--outfmt '6 qseqid staxids bitscore'")
18-
-x, --taxrule <TAXRULE>... Taxrule determines how taxonomy of blobs is computed [default: bestsum]
19-
"bestsum" : sum bitscore across all hits for each taxonomic rank
20-
"bestsumorder" : sum bitscore across all hits for each taxonomic rank.
18+
-x, --taxrule <TAXRULE>... Taxrule determines how taxonomy of blobs
19+
is computed [default: bestsum]
20+
"bestsum" : sum bitscore across all
21+
hits for each taxonomic rank
22+
"bestsumorder" : sum bitscore across all
23+
hits for each taxonomic rank.
2124
- If first <TAX> file supplies hits, bestsum is calculated.
2225
- If no hit is found, the next <TAX> file is used.
2326
-m, --min_diff <FLOAT> Minimal score difference between highest scoring
@@ -30,7 +33,7 @@
3033
-b, --bam <BAM>... BAM file(s) (requires samtools in $PATH)
3134
-s, --sam <SAM>... SAM file(s)
3235
-a, --cas <CAS>... CAS file(s) (requires clc_mapping_info in $PATH)
33-
-c, --cov <COV>... TAB separated. (seqID\\tcoverage)
36+
-c, --cov <COV>... COV file(s)
3437
-o, --out <PREFIX> BlobDB output prefix
3538
--title TITLE Title of BlobDB [default: output prefix)
3639
"""
@@ -57,7 +60,7 @@ def main():
5760
bam_fs = args['--bam']
5861
cov_fs = args['--cov']
5962
cas_fs = args['--cas']
60-
hit_fs = args['--taxfile']
63+
hit_fs = args['--hitsfile']
6164
prefix = args['--out']
6265
nodesDB_f = args['--db']
6366
names_f = args['--names']

0 commit comments

Comments
 (0)