Skip to content

Commit 40d370b

Browse files
Dom LaetschDom Laetsch
authored andcommitted
Bug fixes
- Fixed read coverage plots (all but last plotted lib were showing wrong percentages)
1 parent 680f2f4 commit 40d370b

File tree

6 files changed

+129
-73
lines changed

6 files changed

+129
-73
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
testing/
2+
testing_fly/
23
*.json
34
*.pyc
45
*.txt

lib/BtCore.py

Lines changed: 89 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -107,26 +107,60 @@ def dump(self):
107107

108108
def load(self, BlobDb_f):
109109
blobDict = BtIO.readJson(BlobDb_f)
110-
self.title = blobDict['title']
111-
self.assembly_f = blobDict['assembly_f']
112-
self.nodesDB_f = blobDict['nodesDB_f']
113-
self.lineages = blobDict['lineages']
110+
for k, v in blobDict.items():
111+
setattr(self, k, v)
114112
self.set_of_taxIds = blobDict['lineages'].keys()
115-
self.order_of_blobs = blobDict['order_of_blobs']
116-
self.dict_of_blobs = blobDict['dict_of_blobs']
117-
self.length = int(blobDict['length'])
118-
self.seqs = int(blobDict['seqs'])
119-
self.n_count = int(blobDict['n_count'])
120-
self.covLibs = blobDict['covLibs']
121-
self.hitLibs = blobDict['hitLibs']
122-
self.taxrules = blobDict['taxrules']
113+
#for k, v in self.__dict__.items():
114+
# print k, type(v), v # this seems to work
115+
116+
#self.title = blobDict['title']
117+
#self.assembly_f = blobDict['assembly_f']
118+
#self.nodesDB_f = blobDict['nodesDB_f']
119+
#self.lineages = blobDict['lineages']
120+
#self.set_of_taxIds = blobDict['lineages'].keys()
121+
#self.order_of_blobs = blobDict['order_of_blobs']
122+
#self.dict_of_blobs = blobDict['dict_of_blobs']
123+
#self.length = int(blobDict['length'])
124+
#self.seqs = int(blobDict['seqs'])
125+
#self.n_count = int(blobDict['n_count'])
126+
#self.covLibs = blobDict['covLibs']
127+
#self.hitLibs = blobDict['hitLibs']
128+
#self.taxrules = blobDict['taxrules']
129+
130+
# self.title = title
131+
# self.assembly_f = ''
132+
# self.dict_of_blobs = {}
133+
# self.order_of_blobs = []
134+
# self.set_of_taxIds = set()
135+
# self.lineages = {}
136+
# self.length = 0
137+
# self.seqs = 0
138+
# self.n_count = 0
139+
# self.covLibs = {}
140+
# self.hitLibs = {}
141+
# self.nodesDB_f = ''
142+
# self.taxrules = []
123143

124144
def getPlotData(self, rank, min_length, hide_nohits, taxrule, c_index, catcolour_dict):
125145
data_dict = {}
126146
read_cov_dict = {}
127147
max_cov = 0.0
128-
cov_libs = self.covLibs.keys()
129-
cov_libs_reads_total = {cov_lib : data['reads_total'] for cov_lib, data in self.covLibs.items()}
148+
149+
cov_lib_dict = self.covLibs
150+
#print cov_lib_dict
151+
cov_lib_names_l = self.covLibs.keys() # does not include cov_sum
152+
153+
if len(cov_lib_names_l) > 1:
154+
# more than one cov_lib, cov_sum_lib has to be created
155+
cov_lib_dict['sum'] = CovLibObj('sum', 'sum', None).__dict__ # ugly
156+
cov_lib_dict['sum']['reads_total'] = sum([cov_lib_dict[x]['reads_total'] for x in cov_lib_dict])
157+
cov_lib_dict['sum']['reads_mapped'] = sum([cov_lib_dict[x]['reads_mapped'] for x in cov_lib_dict])
158+
159+
#print self.covLibs
160+
#cov_libs_reads_total = {cov_lib : data['reads_total'] for cov_lib, data in self.covLibs.items()}
161+
#print cov_libs_reads_total # correct
162+
#cov_libs_reads_mapped = {cov_lib : data['reads_mapped'] for cov_lib, data in self.covLibs.items()}
163+
#print cov_libs_reads_mapped # correct
130164

131165
for blob in self.dict_of_blobs.values():
132166
name, gc, length, group = blob['name'], blob['gc'], blob['length'], ''
@@ -143,21 +177,19 @@ def getPlotData(self, rank, min_length, hide_nohits, taxrule, c_index, catcolour
143177
'name' : [],
144178
'length' : [],
145179
'gc' : [],
146-
'covs' : {covLib : [] for covLib in cov_libs},
147-
'reads_mapped' : {covLib : 0 for covLib in cov_libs},
180+
'covs' : {covLib : [] for covLib in cov_lib_dict.keys()}, # includes cov_sum if it exists
181+
'reads_mapped' : {covLib : 0 for covLib in cov_lib_dict.keys()}, # includes cov_sum if it exists
148182
'count' : 0,
149183
'count_hidden' : 0,
150184
'count_visible' : 0,
151185
'span': 0,
152186
'span_hidden' : 0,
153187
'span_visible' : 0,
154188
}
155-
if len(cov_libs) > 1:
156-
data_dict[group]['covs']['cov_sum'] = []
157-
data_dict[group]['reads_mapped']['cov_sum'] = 0
158189

159190
data_dict[group]['count'] = data_dict[group].get('count', 0) + 1
160191
data_dict[group]['span'] = data_dict[group].get('span', 0) + int(length)
192+
161193
if ((hide_nohits) and group == 'no-hit') or length < min_length: # hidden
162194
data_dict[group]['count_hidden'] = data_dict[group].get('count_hidden', 0) + 1
163195
data_dict[group]['span_hidden'] = data_dict[group].get('span_hidden', 0) + int(length)
@@ -170,35 +202,43 @@ def getPlotData(self, rank, min_length, hide_nohits, taxrule, c_index, catcolour
170202

171203
cov_sum = 0.0
172204
reads_mapped_sum = 0
173-
for cov_lib in sorted(cov_libs):
205+
for cov_lib in sorted(cov_lib_names_l):
174206
cov = float(blob['covs'][cov_lib])
175-
cov_sum += cov
176207
cov = cov if cov > 0.02 else 0.02
208+
# increase max_cov
177209
if cov > max_cov:
178210
max_cov = cov
179-
data_dict[group]['covs'][cov_lib].append(cov)
180-
if cov_lib in blob['read_cov']:
211+
212+
# add cov of blob to group
213+
data_dict[group]['covs'][cov_lib].append(cov)
214+
215+
cov_sum += cov
216+
217+
# add readcov
218+
if cov_lib in blob['read_cov']:
181219
reads_mapped = blob['read_cov'][cov_lib]
182-
reads_mapped_sum += reads_mapped
183220
data_dict[group]['reads_mapped'][cov_lib] += reads_mapped
221+
reads_mapped_sum += reads_mapped
184222

185-
if len(cov_libs) > 1:
223+
if len(cov_lib_names_l) > 1:
186224
cov_sum = cov_sum if cov_sum > 0.02 else 0.02
187-
data_dict[group]['covs']['cov_sum'].append(cov_sum)
225+
data_dict[group]['covs']['sum'].append(cov_sum)
188226
if cov > max_cov:
189227
max_cov = cov
190228
if (reads_mapped_sum):
191-
data_dict[group]['reads_mapped']['cov_sum'] += reads_mapped_sum
192-
193-
#data_dict[group]['count'] = data_dict[group].get('count', 0) + 1
194-
#data_dict[group]['span'] = data_dict[group].get('span', 0) + int(length)
229+
data_dict[group]['reads_mapped']['sum'] += reads_mapped_sum
230+
231+
#if len(cov_lib_names_l) > 1:
232+
# for cov_lib, data in self.covLibs.items():
233+
# cov_libs_reads_total['cov_sum'] = cov_libs_reads_total.get('cov_sum', 0) + data['reads_total']
234+
195235

196-
if len(cov_libs) > 1:
197-
cov_libs.append('cov_sum')
198-
for cov_lib, data in self.covLibs.items():
199-
cov_libs_reads_total['cov_sum'] = cov_libs_reads_total.get('cov_sum', 0) + data['reads_total']
236+
#for group in data_dict:
237+
# print "#", group
238+
# for cat in data_dict[group]:
239+
# print cat, data_dict[group][cat]
200240

201-
return data_dict, max_cov, cov_libs, cov_libs_reads_total
241+
return data_dict, max_cov, cov_lib_dict
202242

203243
def addCovLib(self, covLib):
204244
self.covLibs[covLib.name] = covLib
@@ -237,18 +277,22 @@ def parseCovs(self, covLibObjs):
237277
self.addCovLib(covLib)
238278
print BtLog.status_d['1'] % (covLib.name, covLib.f)
239279
if covLib.fmt == 'bam' or covLib.fmt == 'sam':
280+
240281
base_cov_dict = {}
241282
if covLib.fmt == 'bam':
242283
base_cov_dict, covLib.reads_total, covLib.reads_mapped, read_cov_dict = BtIO.readBam(covLib.f, set(self.dict_of_blobs))
243284
else:
244-
base_cov_dict, covLib.reads_total, covLib.reads_mapped, read_cov_dict = BtIO.readSam(covLib.f, set(self.dict_of_blobs))
285+
base_cov_dict, covLib.reads_total, covLib.reads_mapped, read_cov_dict = BtIO.readSam(covLib.f, set(self.dict_of_blobs))
286+
245287
if covLib.reads_total == 0:
246288
print BtLog.warn_d['4'] % covLib.f
289+
247290
for name, base_cov in base_cov_dict.items():
248291
cov = base_cov / self.dict_of_blobs[name].agct_count
249292
covLib.cov_sum += cov
250293
self.dict_of_blobs[name].addCov(covLib.name, cov)
251-
self.dict_of_blobs[name].read_cov = {covLib.name : read_cov_dict[name]}
294+
self.dict_of_blobs[name].addReadCov(covLib.name, read_cov_dict[name])
295+
252296
elif covLib.fmt == 'cas':
253297
cov_dict, covLib.reads_total, covLib.reads_mapped, read_cov_dict = BtIO.readCas(covLib.f, self.order_of_blobs)
254298
if covLib.reads_total == 0:
@@ -257,6 +301,7 @@ def parseCovs(self, covLibObjs):
257301
covLib.cov_sum += cov
258302
self.dict_of_blobs[name].addCov(covLib.name, cov)
259303
self.dict_of_blobs[name].read_cov = {covLib.name : read_cov_dict[name]}
304+
260305
elif covLib.fmt == 'cov':
261306
cov_dict = BtIO.readCov(covLib.f, set(self.dict_of_blobs))
262307
if not len(cov_dict) == self.seqs:
@@ -316,8 +361,11 @@ def calculateGC(self, seq):
316361
return float((seq.count('G') + seq.count('C') ) / self.agct_count \
317362
if self.agct_count > 0 else 0.0)
318363

319-
def addCov(self, name, cov):
320-
self.covs[name] = cov
364+
def addCov(self, lib_name, cov):
365+
self.covs[lib_name] = cov
366+
367+
def addReadCov(self, lib_name, read_cov):
368+
self.read_cov[lib_name] = read_cov
321369

322370
def addHits(self, hitLibName, hitDict):
323371
if not hitLibName in self.hits:
@@ -328,7 +376,7 @@ class CovLibObj():
328376
def __init__(self, name, fmt, f):
329377
self.name = name
330378
self.fmt = fmt
331-
self.f = abspath(f)
379+
self.f = abspath(f) if (f) else ''
332380
self.cov_sum = 0
333381
self.reads_total = 0
334382
self.reads_mapped = 0
@@ -338,7 +386,7 @@ class hitLibObj():
338386
def __init__(self, name, fmt, f):
339387
self.name = name
340388
self.fmt = fmt
341-
self.f = abspath(f)
389+
self.f = abspath(f) if (f) else ''
342390

343391
if __name__ == '__main__':
344392
pass

lib/BtIO.py

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
import subprocess
1515
from os.path import basename, isfile, abspath
1616
import os
17+
import sys
1718
import lib.BtLog as BtLog
1819

1920
def parseList(infile):
@@ -100,7 +101,7 @@ def readSam(infile, set_of_blobs):
100101

101102
def readBam(infile, set_of_blobs):
102103
reads_total, reads_mapped = checkBam(infile)
103-
progress_unit = int(int(reads_mapped)/1000) + 1 # lazy fix
104+
progress_unit = int(reads_mapped/1000)
104105
base_cov_dict = {}
105106
read_cov_dict = {}
106107
cigar_match_re = re.compile(r"(\d+)M") # only gets digits before M's
@@ -109,21 +110,21 @@ def readBam(infile, set_of_blobs):
109110
# ADD flag picard -F 1028 to not consider optical duplicates
110111
#command = "samtools view -F 1028 " + infile
111112
# only one counter since only yields mapped reads
113+
seen_reads = 0
112114
parsed_reads = 0
113115
for line in runCmd(command):
114116
match = line.split("\t")
115-
if match >= 11:
116-
seq_name = match[2]
117-
base_cov = sum([int(matching) for matching in cigar_match_re.findall(match[5])])
118-
if (base_cov):
119-
parsed_reads += 1
120-
if seq_name not in set_of_blobs:
121-
print BtLog.warn_d['2'] % (seq_name, infile)
122-
else:
123-
base_cov_dict[seq_name] = base_cov_dict.get(seq_name, 0) + base_cov
124-
read_cov_dict[seq_name] = read_cov_dict.get(seq_name, 0) + 1
125-
BtLog.progress(parsed_reads, progress_unit, reads_total)
126-
BtLog.progress(reads_total, progress_unit, reads_total)
117+
seen_reads += 1
118+
seq_name = match[2]
119+
base_cov = sum([int(matching) for matching in cigar_match_re.findall(match[5])])
120+
if (base_cov):
121+
parsed_reads += 1
122+
if seq_name not in set_of_blobs:
123+
print BtLog.warn_d['2'] % (seq_name, infile)
124+
else:
125+
base_cov_dict[seq_name] = base_cov_dict.get(seq_name, 0) + base_cov
126+
read_cov_dict[seq_name] = read_cov_dict.get(seq_name, 0) + 1
127+
BtLog.progress(seen_reads, progress_unit, reads_mapped)
127128
if not int(reads_mapped) == int(parsed_reads):
128129
print warn_d['3'] % (reads_mapped, parsed_reads)
129130
return base_cov_dict, reads_total, parsed_reads, read_cov_dict

lib/BtLog.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,12 @@ def progress(iteration, steps, max_value):
2323
if int(iteration == max_value):
2424
sys.stdout.write('\r')
2525
print "[PROGRESS]\t: %d%%" % (100)
26+
elif int(iteration) % int(steps) == 0:
27+
sys.stdout.write('\r')
28+
print "[PROGRESS]\t: %d%%" % (float(int(iteration)/int(max_value))*100),
29+
sys.stdout.flush()
2630
else:
27-
if int(iteration) % int(steps) == 0:
28-
sys.stdout.write('\r')
29-
print "[PROGRESS]\t: %d%%" % (float(int(iteration)/int(max_value)*100)),
30-
sys.stdout.flush()
31+
pass
3132

3233
error_d = {
3334
'0' : '[ERROR:0]\t: File %s does not exist.',

0 commit comments

Comments
 (0)