Skip to content

Commit daff0b2

Browse files
committed
Merge pull request #11 from DRL/development
Development
2 parents 3be2abb + 05cd61f commit daff0b2

File tree

10 files changed

+237
-151
lines changed

10 files changed

+237
-151
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
testing/
2+
testing_fly/
23
*.json
34
*.pyc
45
*.txt

comparecov.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,7 @@
2121
2222
-p, --plotgroups INT Number of (taxonomic) groups to plot, remaining
2323
groups are placed in 'other' [default: 7]
24-
-r, --rank RANK Taxonomic rank used for colouring of blobs [default: superkingdom]
25-
(Supported: superkingdom)
24+
-r, --rank RANK Taxonomic rank used for colouring of blobs [default: phylum]
2625
-x, --taxrule TAXRULE Taxrule which has been used for computing taxonomy
2726
(Supported: bestsum, bestsumorder) [default: bestsum]
2827
--sort <ORDER> Sort order for plotting [default: span]

create.py

Lines changed: 4 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -36,64 +36,19 @@
3636
import lib.BtCore as bt
3737
import lib.BtLog as BtLog
3838
import lib.BtIO as BtIO
39+
import lib.BtInput as BtInput
3940
import os.path
4041

4142

4243
if __name__ == '__main__':
43-
ASSEMBLY_TYPES = [None, 'spades', 'soap', 'abyss', 'velvet']
44+
4445
main_dir = os.path.dirname(__file__)
4546
#print data_dir
4647
args = docopt(__doc__)
4748
#print args
48-
fasta_f = args['--infile']
49-
fasta_type = args['--type']
5049

51-
sam_fs = args['--sam']
52-
bam_fs = args['--bam']
53-
cov_fs = args['--cov']
54-
cas_fs = args['--cas']
55-
hit_fs = args['--taxfile']
56-
57-
out_f = args['--out']
58-
if (out_f):
59-
out_f = "%s.%s" % (os.path.basename(out_f), "BlobDB.json")
60-
else:
61-
out_f = "%s" % ("BlobDB.json")
62-
nodesDB_f = args['--db']
63-
names_f = args['--names']
64-
nodes_f = args['--nodes']
65-
taxrules = args['--taxrule']
66-
title = args['--title'] if (args['--title']) else out_f
67-
68-
69-
# Do files exist ?
70-
files = [x for x in list([fasta_f] + sam_fs + bam_fs + cov_fs + cas_fs + [names_f] + [nodes_f] + hit_fs) if x is not None]
71-
for f in files:
72-
if not os.path.isfile(f):
73-
BtLog.error('0', f)
74-
75-
# Is taxonomy provided?
76-
if nodesDB_f == "data/nodesDB.txt":
77-
nodesDB_f = os.path.join(main_dir, nodesDB_f)
78-
if not os.path.isfile(nodesDB_f) and not ((names_f) and (nodes_f)):
79-
BtLog.error('3')
80-
81-
if not (hit_fs):
82-
BtLog.error('18')
83-
84-
# can FASTA parser deal with assemblies
85-
if not fasta_type in ASSEMBLY_TYPES:
86-
BtLog.error('2', ",".join(ASSEMBLY_TYPES[1:]))
87-
88-
# Is coverage provided?
89-
if not (fasta_type) and not bam_fs and not sam_fs and not cov_fs and not cas_fs:
90-
BtLog.error('1')
50+
title, fasta_f, fasta_type, cov_libs, hit_libs, taxrules, nodesDB_f, nodes_f, names_f, out_f = BtInput.validate_input_create(main_dir, args)
9151

92-
cov_libs = [bt.CovLibObj('bam' + str(idx), 'bam', lib_f) for idx, lib_f in enumerate(bam_fs)] + \
93-
[bt.CovLibObj('sam' + str(idx), 'sam', lib_f) for idx, lib_f in enumerate(sam_fs)] + \
94-
[bt.CovLibObj('cas' + str(idx), 'cas', lib_f) for idx, lib_f in enumerate(cas_fs)] + \
95-
[bt.CovLibObj('cov' + str(idx), 'cov', lib_f) for idx, lib_f in enumerate(cov_fs)]
96-
9752
# Create BlobDB object
9853
blobDb = bt.BlobDb(title)
9954

@@ -103,8 +58,7 @@
10358
blobDb.parseCovs(cov_libs)
10459

10560
# Parse Tax
106-
hitLibs = [bt.hitLibObj('tax' + str(idx), 'tax', lib_f) for idx, lib_f in enumerate(hit_fs)]
107-
blobDb.parseHits(hitLibs)
61+
blobDb.parseHits(hit_libs)
10862

10963
# Parse nodesDB
11064
nodesDB, nodesDB_f = BtIO.getNodesDB(nodes=nodes_f, names=names_f, nodesDB=nodesDB_f)

lib/BtCore.py

Lines changed: 100 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -107,26 +107,60 @@ def dump(self):
107107

108108
def load(self, BlobDb_f):
109109
blobDict = BtIO.readJson(BlobDb_f)
110-
self.title = blobDict['title']
111-
self.assembly_f = blobDict['assembly_f']
112-
self.nodesDB_f = blobDict['nodesDB_f']
113-
self.lineages = blobDict['lineages']
110+
for k, v in blobDict.items():
111+
setattr(self, k, v)
114112
self.set_of_taxIds = blobDict['lineages'].keys()
115-
self.order_of_blobs = blobDict['order_of_blobs']
116-
self.dict_of_blobs = blobDict['dict_of_blobs']
117-
self.length = int(blobDict['length'])
118-
self.seqs = int(blobDict['seqs'])
119-
self.n_count = int(blobDict['n_count'])
120-
self.covLibs = blobDict['covLibs']
121-
self.hitLibs = blobDict['hitLibs']
122-
self.taxrules = blobDict['taxrules']
113+
#for k, v in self.__dict__.items():
114+
# print k, type(v), v # this seems to work
115+
116+
#self.title = blobDict['title']
117+
#self.assembly_f = blobDict['assembly_f']
118+
#self.nodesDB_f = blobDict['nodesDB_f']
119+
#self.lineages = blobDict['lineages']
120+
#self.set_of_taxIds = blobDict['lineages'].keys()
121+
#self.order_of_blobs = blobDict['order_of_blobs']
122+
#self.dict_of_blobs = blobDict['dict_of_blobs']
123+
#self.length = int(blobDict['length'])
124+
#self.seqs = int(blobDict['seqs'])
125+
#self.n_count = int(blobDict['n_count'])
126+
#self.covLibs = blobDict['covLibs']
127+
#self.hitLibs = blobDict['hitLibs']
128+
#self.taxrules = blobDict['taxrules']
129+
130+
# self.title = title
131+
# self.assembly_f = ''
132+
# self.dict_of_blobs = {}
133+
# self.order_of_blobs = []
134+
# self.set_of_taxIds = set()
135+
# self.lineages = {}
136+
# self.length = 0
137+
# self.seqs = 0
138+
# self.n_count = 0
139+
# self.covLibs = {}
140+
# self.hitLibs = {}
141+
# self.nodesDB_f = ''
142+
# self.taxrules = []
123143

124144
def getPlotData(self, rank, min_length, hide_nohits, taxrule, c_index, catcolour_dict):
125145
data_dict = {}
126146
read_cov_dict = {}
127147
max_cov = 0.0
128-
cov_libs = self.covLibs.keys()
129-
cov_libs_reads_total = {cov_lib : data['reads_total'] for cov_lib, data in self.covLibs.items()}
148+
print self.covLibs
149+
cov_lib_dict = self.covLibs
150+
#print cov_lib_dict
151+
cov_lib_names_l = self.covLibs.keys() # does not include cov_sum
152+
153+
if len(cov_lib_names_l) > 1:
154+
# more than one cov_lib, cov_sum_lib has to be created
155+
cov_lib_dict['sum'] = CovLibObj('sum', 'sum', None).__dict__ # ugly
156+
cov_lib_dict['sum']['reads_total'] = sum([cov_lib_dict[x]['reads_total'] for x in cov_lib_dict])
157+
cov_lib_dict['sum']['reads_mapped'] = sum([cov_lib_dict[x]['reads_mapped'] for x in cov_lib_dict])
158+
159+
#print self.covLibs
160+
#cov_libs_reads_total = {cov_lib : data['reads_total'] for cov_lib, data in self.covLibs.items()}
161+
#print cov_libs_reads_total # correct
162+
#cov_libs_reads_mapped = {cov_lib : data['reads_mapped'] for cov_lib, data in self.covLibs.items()}
163+
#print cov_libs_reads_mapped # correct
130164

131165
for blob in self.dict_of_blobs.values():
132166
name, gc, length, group = blob['name'], blob['gc'], blob['length'], ''
@@ -143,21 +177,19 @@ def getPlotData(self, rank, min_length, hide_nohits, taxrule, c_index, catcolour
143177
'name' : [],
144178
'length' : [],
145179
'gc' : [],
146-
'covs' : {covLib : [] for covLib in cov_libs},
147-
'reads_mapped' : {covLib : 0 for covLib in cov_libs},
180+
'covs' : {covLib : [] for covLib in cov_lib_dict.keys()}, # includes cov_sum if it exists
181+
'reads_mapped' : {covLib : 0 for covLib in cov_lib_dict.keys()}, # includes cov_sum if it exists
148182
'count' : 0,
149183
'count_hidden' : 0,
150184
'count_visible' : 0,
151185
'span': 0,
152186
'span_hidden' : 0,
153187
'span_visible' : 0,
154188
}
155-
if len(cov_libs) > 1:
156-
data_dict[group]['covs']['cov_sum'] = []
157-
data_dict[group]['reads_mapped']['cov_sum'] = 0
158189

159190
data_dict[group]['count'] = data_dict[group].get('count', 0) + 1
160191
data_dict[group]['span'] = data_dict[group].get('span', 0) + int(length)
192+
161193
if ((hide_nohits) and group == 'no-hit') or length < min_length: # hidden
162194
data_dict[group]['count_hidden'] = data_dict[group].get('count_hidden', 0) + 1
163195
data_dict[group]['span_hidden'] = data_dict[group].get('span_hidden', 0) + int(length)
@@ -170,35 +202,45 @@ def getPlotData(self, rank, min_length, hide_nohits, taxrule, c_index, catcolour
170202

171203
cov_sum = 0.0
172204
reads_mapped_sum = 0
173-
for cov_lib in sorted(cov_libs):
174-
cov = float(blob['covs'][cov_lib])
175-
cov_sum += cov
176-
cov = cov if cov > 0.02 else 0.02
205+
for cov_lib in sorted(cov_lib_names_l):
206+
cov = float(blob['covs'][cov_lib])
207+
if cov < 0.02:
208+
cov = 0.02
209+
#cov = cov if cov > 0.02 else 0.02
210+
# increase max_cov
177211
if cov > max_cov:
178212
max_cov = cov
179-
data_dict[group]['covs'][cov_lib].append(cov)
180-
if cov_lib in blob['read_cov']:
213+
214+
# add cov of blob to group
215+
data_dict[group]['covs'][cov_lib].append(cov)
216+
217+
cov_sum += cov
218+
219+
# add readcov
220+
if cov_lib in blob['read_cov']:
181221
reads_mapped = blob['read_cov'][cov_lib]
182-
reads_mapped_sum += reads_mapped
183222
data_dict[group]['reads_mapped'][cov_lib] += reads_mapped
223+
reads_mapped_sum += reads_mapped
184224

185-
if len(cov_libs) > 1:
186-
cov_sum = cov_sum if cov_sum > 0.02 else 0.02
187-
data_dict[group]['covs']['cov_sum'].append(cov_sum)
188-
if cov > max_cov:
189-
max_cov = cov
225+
if len(cov_lib_names_l) > 1:
226+
if cov_sum < 0.02 :
227+
cov_sum = 0.02
228+
data_dict[group]['covs']['sum'].append(cov_sum)
229+
if cov_sum > max_cov:
230+
max_cov = cov_sum
190231
if (reads_mapped_sum):
191-
data_dict[group]['reads_mapped']['cov_sum'] += reads_mapped_sum
192-
193-
#data_dict[group]['count'] = data_dict[group].get('count', 0) + 1
194-
#data_dict[group]['span'] = data_dict[group].get('span', 0) + int(length)
232+
data_dict[group]['reads_mapped']['sum'] += reads_mapped_sum
233+
#if len(cov_lib_names_l) > 1:
234+
# for cov_lib, data in self.covLibs.items():
235+
# cov_libs_reads_total['cov_sum'] = cov_libs_reads_total.get('cov_sum', 0) + data['reads_total']
236+
195237

196-
if len(cov_libs) > 1:
197-
cov_libs.append('cov_sum')
198-
for cov_lib, data in self.covLibs.items():
199-
cov_libs_reads_total['cov_sum'] = cov_libs_reads_total.get('cov_sum', 0) + data['reads_total']
238+
#for group in data_dict:
239+
# print "#", group
240+
# for cat in data_dict[group]:
241+
# print cat, data_dict[group][cat]
200242

201-
return data_dict, max_cov, cov_libs, cov_libs_reads_total
243+
return data_dict, max_cov, cov_lib_dict
202244

203245
def addCovLib(self, covLib):
204246
self.covLibs[covLib.name] = covLib
@@ -237,26 +279,31 @@ def parseCovs(self, covLibObjs):
237279
self.addCovLib(covLib)
238280
print BtLog.status_d['1'] % (covLib.name, covLib.f)
239281
if covLib.fmt == 'bam' or covLib.fmt == 'sam':
282+
240283
base_cov_dict = {}
241284
if covLib.fmt == 'bam':
242285
base_cov_dict, covLib.reads_total, covLib.reads_mapped, read_cov_dict = BtIO.readBam(covLib.f, set(self.dict_of_blobs))
243286
else:
244-
base_cov_dict, covLib.reads_total, covLib.reads_mapped, read_cov_dict = BtIO.readSam(covLib.f, set(self.dict_of_blobs))
287+
base_cov_dict, covLib.reads_total, covLib.reads_mapped, read_cov_dict = BtIO.readSam(covLib.f, set(self.dict_of_blobs))
288+
245289
if covLib.reads_total == 0:
246290
print BtLog.warn_d['4'] % covLib.f
291+
247292
for name, base_cov in base_cov_dict.items():
248293
cov = base_cov / self.dict_of_blobs[name].agct_count
249294
covLib.cov_sum += cov
250295
self.dict_of_blobs[name].addCov(covLib.name, cov)
251-
self.dict_of_blobs[name].read_cov = {covLib.name : read_cov_dict[name]}
296+
self.dict_of_blobs[name].addReadCov(covLib.name, read_cov_dict[name])
297+
252298
elif covLib.fmt == 'cas':
253299
cov_dict, covLib.reads_total, covLib.reads_mapped, read_cov_dict = BtIO.readCas(covLib.f, self.order_of_blobs)
254300
if covLib.reads_total == 0:
255301
print BtLog.warn_d['4'] % covLib.f
256302
for name, cov in cov_dict.items():
257303
covLib.cov_sum += cov
258304
self.dict_of_blobs[name].addCov(covLib.name, cov)
259-
self.dict_of_blobs[name].read_cov = {covLib.name : read_cov_dict[name]}
305+
self.dict_of_blobs[name].addReadCov(covLib.name, read_cov_dict[name])
306+
260307
elif covLib.fmt == 'cov':
261308
cov_dict = BtIO.readCov(covLib.f, set(self.dict_of_blobs))
262309
if not len(cov_dict) == self.seqs:
@@ -267,6 +314,8 @@ def parseCovs(self, covLibObjs):
267314
else:
268315
pass
269316
covLib.mean_cov = covLib.cov_sum/self.seqs
317+
if covLib.cov_sum == 0.0:
318+
BtLog.error('26', covLib.name)
270319
self.covLibs[covLib.name] = covLib
271320

272321

@@ -295,18 +344,6 @@ def computeTaxonomy(self, taxrules, nodesDB):
295344
blObj.taxonomy[taxrule] = BtTax.taxRule(taxrule, blObj.hits, self.lineages)
296345
else:
297346
blObj.taxonomy[taxrule] = BtTax.noHit()
298-
299-
def counts(self):
300-
count_dict = {
301-
'seqs' : self.seqs,
302-
'length' : self.length,
303-
'Ns' : self.n_count,
304-
'AvgCov' : {lib : round(covlibObj.cov_sum/self.seqs, 2) for lib, covlibObj in self.covLibs.items()},
305-
'GC' : round(sum([blObj.gc for blObj in self.dict_of_blobs.values()])/self.seqs, 2),
306-
'MappedReads' : {lib : (covlibObj.reads_mapped) for lib, covlibObj in self.covLibs.items()},
307-
'TotalReads' : {lib : (covlibObj.reads_total) for lib, covlibObj in self.covLibs.items()}
308-
}
309-
print count_dict
310347

311348
def getBlobs(self):
312349
for blObj in [self.dict_of_blobs[key] for key in self.order_of_blobs]:
@@ -328,8 +365,11 @@ def calculateGC(self, seq):
328365
return float((seq.count('G') + seq.count('C') ) / self.agct_count \
329366
if self.agct_count > 0 else 0.0)
330367

331-
def addCov(self, name, cov):
332-
self.covs[name] = cov
368+
def addCov(self, lib_name, cov):
369+
self.covs[lib_name] = cov
370+
371+
def addReadCov(self, lib_name, read_cov):
372+
self.read_cov[lib_name] = read_cov
333373

334374
def addHits(self, hitLibName, hitDict):
335375
if not hitLibName in self.hits:
@@ -340,8 +380,8 @@ class CovLibObj():
340380
def __init__(self, name, fmt, f):
341381
self.name = name
342382
self.fmt = fmt
343-
self.f = abspath(f)
344-
self.cov_sum = 0
383+
self.f = abspath(f) if (f) else ''
384+
self.cov_sum = 0.0
345385
self.reads_total = 0
346386
self.reads_mapped = 0
347387
self.mean_cov = 0.0
@@ -350,7 +390,7 @@ class hitLibObj():
350390
def __init__(self, name, fmt, f):
351391
self.name = name
352392
self.fmt = fmt
353-
self.f = abspath(f)
393+
self.f = abspath(f) if (f) else ''
354394

355395
if __name__ == '__main__':
356396
pass

0 commit comments

Comments
 (0)