Skip to content

Commit 4632465

Browse files
committed
Merge pull request #1 from DRL/develop
Develop
2 parents 3727043 + 5dc1e89 commit 4632465

File tree

10 files changed

+741
-418
lines changed

10 files changed

+741
-418
lines changed

blobtools

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
usage: blobtools <command> [<args>...] [--help]
66
77
commands:
8-
forge create a BlobDB
8+
create create a BlobDB
99
view print BlobDB
1010
plot plot BlobDB as a blobplot
1111
@@ -27,8 +27,8 @@ if __name__ == '__main__':
2727
#print(args)
2828

2929
argv = [args['<command>']] + args['<args>']
30-
if args['<command>'] == 'forge':
31-
exit(call(['python', main_dir + 'forge.py'] + argv))
30+
if args['<command>'] == 'create':
31+
exit(call(['python', main_dir + 'create.py'] + argv))
3232
elif args['<command>'] == 'view':
3333
exit(call(['python', main_dir + 'view.py'] + argv))
3434
elif args['<command>'] == 'plot':

forge.py renamed to create.py

Lines changed: 32 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,34 +1,34 @@
11
#!/usr/bin/env python
22
# -*- coding: utf-8 -*-
33

4-
"""usage: blobtools forge --i <FASTA> [--type <ASSEMBLY>] [--out <OUT>] [--title <TITLE>]
5-
[--bam <BAM>...] [--sam <SAM>...] [--cas <CAS>...] [--cov <COV>...]
4+
"""usage: blobtools create -i FASTA [-y FASTATYPE] [-o OUTFILE] [--title TITLE]
5+
[-b BAM...] [-s SAM...] [-a CAS...] [-c COV...]
66
[--nodes <NODES>] [--names <NAMES>] [--db <NODESDB>]
7-
[--tax <TAX>...] [--taxrule <TAXRULE>...]
8-
[--h|--help]
7+
[-t TAX...] [-r TAXRULE...]
8+
[-h|--help]
99
1010
Options:
11-
--h --help show this
12-
--i <FASTA> FASTA file of assembly
13-
--type ASSEMBLY Assembly program used to create FASTA. If specified,
14-
coverage will be parsed from FASTA header.
15-
(Parsing supported for 'spades', 'soap', 'velvet', 'abyss')
16-
--tax <TAX>... Taxonomy file in format (qseqid\\ttaxid\\tbitscore)
17-
(e.g. BLAST output "--outfmt '6 std'")
18-
--taxrule <TAXRULE>... Taxrule determines how taxonomy of blobs is computed [default: bestsum]
19-
"bestsum" : sum bitscore across all hits for each taxonomic rank
20-
"bestsumorder" : sum bitscore across all hits for each taxonomic rank.
21-
- If first <TAX> file supplies hits these are used.
11+
-h --help show this
12+
-i, --infile FASTA FASTA file of assembly. Headers are split at whitespaces.
13+
-y, --type FASTATYPE Assembly program used to create FASTA. If specified,
14+
coverage will be parsed from FASTA header.
15+
(Parsing supported for 'spades', 'soap', 'velvet', 'abyss')
16+
-t, --taxfile TAX... Taxonomy file in format (qseqid\\ttaxid\\tbitscore)
17+
(e.g. BLAST output "--outfmt '6 qseqid staxids bitscore'")
18+
-r, --taxrule <TAXRULE>... Taxrule determines how taxonomy of blobs is computed [default: bestsum]
19+
"bestsum" : sum bitscore across all hits for each taxonomic rank
20+
"bestsumorder" : sum bitscore across all hits for each taxonomic rank.
21+
- If first <TAX> file supplies hits, bestsum is calculated.
2222
- If no hit is found, the next <TAX> file is used.
23-
--nodes <NODES> NCBI nodes.dmp file. Not required if '--db'
24-
--names <NAMES> NCBI names.dmp file. Not required if '--db'
25-
--db <NODESDB> NodesDB file [default: data/nodesDB.txt].
26-
--bam <BAM>... BAM file (requires samtools in $PATH)
27-
--sam <SAM>... SAM file
28-
--cas <CAS>... CAS file (requires clc_mapping_info in $PATH)
29-
--cov <COV>... TAB separated. (seqID\\tcoverage)
30-
--out <OUT> BlobDB output file [default: blobDb.json]
31-
--title TITLE Title of BlobDB [default: FASTA)
23+
--nodes <NODES> NCBI nodes.dmp file. Not required if '--db'
24+
--names <NAMES> NCBI names.dmp file. Not required if '--db'
25+
--db <NODESDB> NodesDB file [default: data/nodesDB.txt].
26+
-b, --bam <BAM>... BAM file (requires samtools in $PATH)
27+
-s, --sam <SAM>... SAM file
28+
-a, --cas <CAS>... CAS file (requires clc_mapping_info in $PATH)
29+
-c, --cov <COV>... TAB separated. (seqID\\tcoverage)
30+
-o, --out <OUT> BlobDB output prefix
31+
--title TITLE Title of BlobDB [default: FASTA)
3232
"""
3333

3434
from __future__ import division
@@ -45,17 +45,20 @@
4545
#print data_dir
4646
args = docopt(__doc__)
4747
#print args
48-
49-
fasta_f = args['--i']
48+
fasta_f = args['--infile']
5049
fasta_type = args['--type']
5150

5251
sam_fs = args['--sam']
5352
bam_fs = args['--bam']
5453
cov_fs = args['--cov']
5554
cas_fs = args['--cas']
56-
hit_fs = args['--tax']
55+
hit_fs = args['--taxfile']
5756

5857
out_f = args['--out']
58+
if (out_f):
59+
out_f = "%s.%s" % (out_f, "BlobDB.json")
60+
else:
61+
out_f = "%s" % ("BlobDB.json")
5962
nodesDB_f = args['--db']
6063
names_f = args['--names']
6164
nodes_f = args['--nodes']
@@ -90,7 +93,7 @@
9093
[bt.CovLibObj('sam' + str(idx), 'sam', lib_f) for idx, lib_f in enumerate(sam_fs)] + \
9194
[bt.CovLibObj('cas' + str(idx), 'cas', lib_f) for idx, lib_f in enumerate(cas_fs)] + \
9295
[bt.CovLibObj('cov' + str(idx), 'cov', lib_f) for idx, lib_f in enumerate(cov_fs)]
93-
96+
9497
# Create BlobDB object
9598
blobDb = bt.BlobDb(title)
9699

@@ -115,6 +118,6 @@
115118
print BtLog.status_d['6'] % ",".join(taxrules)
116119
blobDb.computeTaxonomy(taxrules, nodesDB)
117120

118-
# Writing BlobDB to file
121+
# Generating BlobDB and writing to file
119122
print BtLog.status_d['7'] % out_f
120123
BtIO.writeJson(blobDb.dump(), out_f)

data/colours

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
#48a365=Nematoda
2+
#d0694a=Arthropoda
3+
#ffb917=Proteobacteria
4+
#926eb3=Actinobacteria
5+
#a6cee3=Ascomycota
6+
#d3d3d3=no-hit
7+
#ec84ba=Chordata
8+
#e0d799=Cnidaria
9+
#b89c75=Platyhelminthes
10+
#fdb761=Bacteriodetes
11+
#ffffff=other

lib/BtCore.py

Lines changed: 96 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -107,54 +107,84 @@ def load(self, BlobDb_f):
107107
self.lineages = blobDict['lineages']
108108
self.set_of_taxIds = blobDict['lineages'].keys()
109109
self.order_of_blobs = blobDict['order_of_blobs']
110-
self.dict_of_blobs = blobDict['dict_of_blobs'] # this will probably not work
110+
self.dict_of_blobs = blobDict['dict_of_blobs']
111111
self.length = int(blobDict['length'])
112112
self.seqs = int(blobDict['seqs'])
113113
self.n_count = int(blobDict['n_count'])
114114
self.covLibs = blobDict['covLibs']
115115
self.hitLibs = blobDict['hitLibs']
116116
self.taxrules = blobDict['taxrules']
117117

118-
def getArrays(self, rank, min_length, hide_nohits, taxrule, c_index, label_d):
119-
from numpy import array
120-
summary_dict = {}
121-
data_list = []
122-
cov_dict = {covLib : [] for covLib in self.covLibs}
118+
def getPlotData(self, rank, min_length, hide_nohits, taxrule, c_index):
119+
data_dict = {}
120+
read_cov_dict = {}
121+
max_cov = 0.0
122+
cov_libs = self.covLibs.keys()
123+
cov_libs_reads_total = {cov_lib : data['reads_total'] for cov_lib, data in self.covLibs.items()}
124+
123125
for blob in self.dict_of_blobs.values():
124-
name = blob['name']
125-
gc = blob['gc']
126-
length = blob['length']
127-
tax = ''
128-
if (c_index):
129-
tax = str(blob['taxonomy'][taxrule][rank]['c_index'])
130-
else:
131-
tax = blob['taxonomy'][taxrule][rank]['tax']
132-
if label_d and tax in label_d:
133-
tax = label_d[tax]
134-
if not tax in summary_dict:
135-
summary_dict[tax] = {'count_total' : 0,
136-
'count_hidden' : 0,
137-
'count_visible' : 0,
138-
'span_total': 0,
139-
'span_hidden' : 0,
140-
'span_visible' : 0}
141-
if ((hide_nohits) and tax == 'no-hit') or length < min_length:
142-
summary_dict[tax]['count_hidden'] = summary_dict[tax].get('count_hidden', 0) + 1
143-
summary_dict[tax]['span_hidden'] = summary_dict[tax].get('span_hidden', 0) + length
144-
else:
145-
data_list.append([(name), (length), (gc), (tax)])
146-
for covLib in self.covLibs:
147-
cov = float(blob['covs'][covLib])
148-
if cov < 0.1:
149-
cov = 0.1
150-
cov_dict[covLib].append(cov)
151-
summary_dict[tax]['count_visible'] = summary_dict[tax].get('count_visible', 0) + 1
152-
summary_dict[tax]['span_visible'] = summary_dict[tax].get('span_visible', 0) + int(length)
153-
summary_dict[tax]['count_total'] = summary_dict[tax].get('count_total', 0) + 1
154-
summary_dict[tax]['span_total'] = summary_dict[tax].get('span_total', 0) + int(length)
155-
data_array = array(data_list)
156-
cov_arrays = {covLib: array(cov) for covLib, cov in cov_dict.items()}
157-
return data_array, cov_arrays, summary_dict
126+
name, gc, length, group = blob['name'], blob['gc'], blob['length'], ''
127+
128+
if (c_index): # annotation with c_index instead of taxonomic group
129+
group = str(blob['taxonomy'][taxrule][rank]['c_index'])
130+
else: # annotation with taxonomic group
131+
group = str(blob['taxonomy'][taxrule][rank]['tax'])
132+
133+
if not group in data_dict:
134+
data_dict[group] = {
135+
'name' : [],
136+
'length' : [],
137+
'gc' : [],
138+
'covs' : {covLib : [] for covLib in cov_libs},
139+
'reads_mapped' : {covLib : 0 for covLib in cov_libs},
140+
'count' : 0,
141+
'count_hidden' : 0,
142+
'count_visible' : 0,
143+
'span': 0,
144+
'span_hidden' : 0,
145+
'span_visible' : 0,
146+
}
147+
if len(cov_libs) > 1:
148+
data_dict[group]['covs']['sum'] = []
149+
data_dict[group]['reads_mapped']['sum'] = 0
150+
151+
if ((hide_nohits) and group == 'no-hit') or length < min_length: # hidden
152+
data_dict[group]['count_hidden'] = data_dict[group].get('count_hidden', 0) + 1
153+
data_dict[group]['span_hidden'] = data_dict[group].get('span_hidden', 0) + int(length)
154+
else: # visible
155+
data_dict[group]['count_visible'] = data_dict[group].get('count_visible', 0) + 1
156+
data_dict[group]['span_visible'] = data_dict[group].get('span_visible', 0) + int(length)
157+
158+
data_dict[group]['name'].append(name)
159+
data_dict[group]['length'].append(length)
160+
data_dict[group]['gc'].append(gc)
161+
162+
cov_sum = 0.0
163+
reads_mapped_sum = 0
164+
for cov_lib in sorted(cov_libs):
165+
cov = float(blob['covs'][cov_lib])
166+
cov_sum += cov
167+
cov = cov if cov > 0.02 else 0.02
168+
if cov > max_cov:
169+
max_cov = cov
170+
data_dict[group]['covs'][cov_lib].append(cov)
171+
if cov_lib in blob['read_cov']:
172+
reads_mapped = blob['read_cov'][cov_lib]
173+
reads_mapped_sum += reads_mapped
174+
data_dict[group]['reads_mapped'][cov_lib] += reads_mapped
175+
176+
if len(cov_libs) > 1:
177+
cov_sum = cov_sum if cov_sum > 0.02 else 0.02
178+
data_dict[group]['covs']['sum'].append(cov_sum)
179+
if cov > max_cov:
180+
max_cov = cov
181+
if (reads_mapped_sum):
182+
data_dict[group]['reads_mapped']['sum'] += reads_mapped_sum
183+
184+
data_dict[group]['count'] = data_dict[group].get('count', 0) + 1
185+
data_dict[group]['span'] = data_dict[group].get('span', 0) + int(length)
186+
187+
return data_dict, max_cov, cov_libs, cov_libs_reads_total
158188

159189
def addCovLib(self, covLib):
160190
self.covLibs[covLib.name] = covLib
@@ -166,8 +196,7 @@ def parseFasta(self, fasta_f, fasta_type):
166196
self.assembly_f = abspath(fasta_f)
167197
if (fasta_type):
168198
# Set up CovLibObj for coverage in assembly header
169-
cov_lib = CovLibObj(fasta_type, fasta_type, fasta_f)
170-
self.covLibs[covLib.name] = covLib
199+
self.covLibs[fasta_type] = CovLibObj(fasta_type, fasta_type, fasta_f)
171200

172201
for name, seq in BtIO.readFasta(fasta_f):
173202
blObj = BlObj(name, seq)
@@ -178,7 +207,7 @@ def parseFasta(self, fasta_f, fasta_type):
178207

179208
if (fasta_type):
180209
cov = BtIO.parseCovFromHeader(fasta_type, blObj.name)
181-
covLib.cov_sum += cov
210+
self.covLibs[fasta_type].cov_sum += cov
182211
blObj.addCov(fasta_type, cov)
183212

184213
self.order_of_blobs.append(blObj.name)
@@ -196,32 +225,45 @@ def parseCovs(self, covLibObjs):
196225
if covLib.fmt == 'bam' or covLib.fmt == 'sam':
197226
base_cov_dict = {}
198227
if covLib.fmt == 'bam':
199-
base_cov_dict, covLib.total_reads, covLib.mapped_reads, covLib.read_cov_dict = BtIO.readBam(covLib.f, set(self.dict_of_blobs))
228+
base_cov_dict, covLib.reads_total, covLib.reads_mapped, read_cov_dict = BtIO.readBam(covLib.f, set(self.dict_of_blobs))
200229
else:
201-
base_cov_dict, covLib.total_reads, covLib.mapped_reads, covLib.read_cov_dict = BtIO.readSam(covLib.f, set(self.dict_of_blobs))
230+
base_cov_dict, covLib.reads_total, covLib.reads_mapped, read_cov_dict = BtIO.readSam(covLib.f, set(self.dict_of_blobs))
231+
if covLib.reads_total == 0:
232+
print BtLog.warn_d['4'] % covLib.f
202233
for name, base_cov in base_cov_dict.items():
203234
cov = base_cov / self.dict_of_blobs[name].agct_count
204235
covLib.cov_sum += cov
205236
self.dict_of_blobs[name].addCov(covLib.name, cov)
237+
self.dict_of_blobs[name].read_cov = {covLib.name : read_cov_dict[name]}
206238
elif covLib.fmt == 'cas':
207-
for name, cov in BtIO.readCas(covLib.f, self.order_of_blobs):
239+
cov_dict, covLib.reads_total, covLib.reads_mapped, read_cov_dict = BtIO.readCas(covLib.f, self.order_of_blobs)
240+
if covLib.reads_total == 0:
241+
print BtLog.warn_d['4'] % covLib.f
242+
for name, cov in cov_dict.items():
208243
covLib.cov_sum += cov
209244
self.dict_of_blobs[name].addCov(covLib.name, cov)
245+
self.dict_of_blobs[name].read_cov = {covLib.name : read_cov_dict[name]}
210246
elif covLib.fmt == 'cov':
211-
for name, cov in BtIO.readCov(covLib.f, set(self.dict_of_blobs)):
212-
covLib.cov_sum += cov
213-
self.dict_of_blobs[name].addCov(covLib.name, cov)
247+
cov_dict = BtIO.readCov(covLib.f, set(self.dict_of_blobs))
248+
if not len(cov_dict) == self.seqs:
249+
print BtLog.warn_d['4'] % covLib.f
250+
covLib.cov_sum += cov
251+
self.dict_of_blobs[name].addCov(covLib.name, cov)
214252
else:
215253
pass
216254
covLib.mean_cov = covLib.cov_sum/self.seqs
217255
self.covLibs[covLib.name] = covLib
218256

257+
219258
def parseHits(self, hitLibs):
220259
for hitLib in hitLibs:
221260
self.hitLibs[hitLib.name] = hitLib
222261
print BtLog.status_d['1'] % (hitLib.name, hitLib.f)
223262
# only accepts format 'seqID\ttaxID\tscore'
224263
for hitDict in BtIO.readTax(hitLib.f, set(self.dict_of_blobs)):
264+
if ";" in hitDict['taxId']:
265+
hitDict['taxId'] = hitDict['taxId'].split(";")[0]
266+
print BtLog.warn['5'] % (hitDict['name'], hitLib)
225267
self.set_of_taxIds.add(hitDict['taxId'])
226268
self.dict_of_blobs[hitDict['name']].addHits(hitLib.name, hitDict)
227269

@@ -246,8 +288,8 @@ def counts(self):
246288
'Ns' : self.n_count,
247289
'AvgCov' : {lib : round(covlibObj.cov_sum/self.seqs, 2) for lib, covlibObj in self.covLibs.items()},
248290
'GC' : round(sum([blObj.gc for blObj in self.dict_of_blobs.values()])/self.seqs, 2),
249-
'MappedReads' : {lib : (covlibObj.mapped_reads) for lib, covlibObj in self.covLibs.items()},
250-
'TotalReads' : {lib : (covlibObj.total_reads) for lib, covlibObj in self.covLibs.items()}
291+
'MappedReads' : {lib : (covlibObj.reads_mapped) for lib, covlibObj in self.covLibs.items()},
292+
'TotalReads' : {lib : (covlibObj.reads_total) for lib, covlibObj in self.covLibs.items()}
251293
}
252294
print count_dict
253295

@@ -263,6 +305,7 @@ def __init__(self, name, seq):
263305
self.agct_count = self.length - self.n_count
264306
self.gc = round(self.calculateGC(seq), 4)
265307
self.covs = {}
308+
self.read_cov = {}
266309
self.hits = {}
267310
self.taxonomy = {}
268311

@@ -284,9 +327,8 @@ def __init__(self, name, fmt, f):
284327
self.fmt = fmt
285328
self.f = abspath(f)
286329
self.cov_sum = 0
287-
self.total_reads = 0
288-
self.mapped_reads = 0
289-
self.read_cov_dict = {}
330+
self.reads_total = 0
331+
self.reads_mapped = 0
290332
self.mean_cov = 0.0
291333

292334
class hitLibObj():

0 commit comments

Comments
 (0)