Skip to content

Commit 446d7d6

Browse files
Dom Laetsch
authored and committed
Bugfix
Fixed error related to parsing Names.dmp|Nodes.dmp
1 parent 649f130 commit 446d7d6

File tree

3 files changed

+105
-104
lines changed

3 files changed

+105
-104
lines changed

create.py

Lines changed: 21 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -2,33 +2,33 @@
22
# -*- coding: utf-8 -*-
33

44
"""usage: blobtools create -i FASTA [-y FASTATYPE] [-o OUTFILE] [--title TITLE]
5-
[-b BAM...] [-s SAM...] [-a CAS...] [-c COV...]
6-
[--nodes <NODES>] [--names <NAMES>] [--db <NODESDB>]
5+
[-b BAM...] [-s SAM...] [-a CAS...] [-c COV...]
6+
[--nodes <NODES>] [--names <NAMES>] [--db <NODESDB>]
77
[-t TAX...] [-x TAXRULE...]
8-
[-h|--help]
9-
8+
[-h|--help]
9+
1010
Options:
1111
-h --help show this
12-
-i, --infile FASTA FASTA file of assembly. Headers are split at whitespaces.
13-
-y, --type FASTATYPE Assembly program used to create FASTA. If specified,
14-
coverage will be parsed from FASTA header.
12+
-i, --infile FASTA FASTA file of assembly. Headers are split at whitespaces.
13+
-y, --type FASTATYPE Assembly program used to create FASTA. If specified,
14+
coverage will be parsed from FASTA header.
1515
(Parsing supported for 'spades', 'soap', 'velvet', 'abyss')
16-
-t, --taxfile TAX... Taxonomy file in format (qseqid\\ttaxid\\tbitscore)
16+
-t, --taxfile TAX... Taxonomy file in format (qseqid\\ttaxid\\tbitscore)
1717
(e.g. BLAST output "--outfmt '6 qseqid staxids bitscore'")
1818
-x, --taxrule <TAXRULE>... Taxrule determines how taxonomy of blobs is computed [default: bestsum]
1919
"bestsum" : sum bitscore across all hits for each taxonomic rank
20-
"bestsumorder" : sum bitscore across all hits for each taxonomic rank.
21-
- If first <TAX> file supplies hits, bestsum is calculated.
22-
- If no hit is found, the next <TAX> file is used.
20+
"bestsumorder" : sum bitscore across all hits for each taxonomic rank.
21+
- If first <TAX> file supplies hits, bestsum is calculated.
22+
- If no hit is found, the next <TAX> file is used.
2323
--nodes <NODES> NCBI nodes.dmp file. Not required if '--db'
24-
--names <NAMES> NCBI names.dmp file. Not required if '--db'
25-
--db <NODESDB> NodesDB file [default: data/nodesDB.txt].
24+
--names <NAMES> NCBI names.dmp file. Not required if '--db'
25+
--db <NODESDB> NodesDB file [default: data/nodesDB.txt].
2626
-b, --bam <BAM>... BAM file(s) (requires samtools in $PATH)
2727
-s, --sam <SAM>... SAM file(s)
2828
-a, --cas <CAS>... CAS file(s) (requires clc_mapping_info in $PATH)
2929
-c, --cov <COV>... TAB separated. (seqID\\tcoverage)
30-
-o, --out <OUT> BlobDB output prefix
31-
--title TITLE Title of BlobDB [default: output prefix)
30+
-o, --out <OUT> BlobDB output prefix
31+
--title TITLE Title of BlobDB [default: output prefix)
3232
"""
3333

3434
from __future__ import division
@@ -41,15 +41,15 @@
4141

4242

4343
if __name__ == '__main__':
44-
44+
4545
main_dir = os.path.dirname(__file__)
4646
#print data_dir
4747
args = docopt(__doc__)
4848
#print args
49-
49+
5050
title, fasta_f, fasta_type, cov_libs, hit_libs, taxrules, nodesDB_f, nodes_f, names_f, out_f = BtInput.validate_input_create(main_dir, args)
5151

52-
# Create BlobDB object
52+
# Create BlobDB object
5353
blobDb = bt.BlobDb(title)
5454

5555
# Parse FASTA
@@ -59,11 +59,11 @@
5959

6060
# Parse Tax
6161
blobDb.parseHits(hit_libs)
62-
62+
6363
# Parse nodesDB
6464
nodesDB, nodesDB_f = BtIO.getNodesDB(nodes=nodes_f, names=names_f, nodesDB=nodesDB_f)
6565
blobDb.nodesDB_f = nodesDB_f
66-
66+
6767
if not os.path.isfile(nodesDB_f):
6868
print BtLog.status_d['5'] % nodesDB_f
6969
BtIO.writeNodesDB(nodesDB, nodesDB_f)
@@ -74,4 +74,4 @@
7474

7575
# Generating BlobDB and writing to file
7676
print BtLog.status_d['7'] % out_f
77-
BtIO.writeJson(blobDb.dump(), out_f)
77+
BtIO.writeJson(blobDb.dump(), out_f)

lib/BtIO.py

Lines changed: 32 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
"""
55
File : BtIO.py
66
Version : 0.1
7-
Author : Dominik R. Laetsch, dominik.laetsch at gmail dot com
7+
Author : Dominik R. Laetsch, dominik.laetsch at gmail dot com
88
Bugs : ?
99
To do : ?
1010
"""
@@ -25,9 +25,9 @@ def parseList(infile):
2525
return seqs
2626

2727
def readFasta(infile):
28-
with open(infile) as fh:
28+
with open(infile) as fh:
2929
header, seqs = '', []
30-
for l in fh:
30+
for l in fh:
3131
if l[0] == '>':
3232
if (header):
3333
yield header, ''.join(seqs)
@@ -89,15 +89,15 @@ def readSam(infile, set_of_blobs):
8989
if match >= 11:
9090
reads_total += 1
9191
seq_name = match[2]
92-
if not seq_name == '*':
92+
if not seq_name == '*':
9393
if seq_name not in set_of_blobs:
9494
print BtLog.warn_d['2'] % (seq_name, infile)
9595
base_cov = sum([int(matching) for matching in cigar_match_re.findall(match[5])])
9696
if (base_cov):
9797
reads_mapped += 1
98-
base_cov_dict[seq_name] = base_cov_dict.get(seq_name, 0) + base_cov
99-
read_cov_dict[seq_name] = read_cov_dict.get(seq_name, 0) + 1
100-
return base_cov_dict, reads_total, reads_mapped, read_cov_dict
98+
base_cov_dict[seq_name] = base_cov_dict.get(seq_name, 0) + base_cov
99+
read_cov_dict[seq_name] = read_cov_dict.get(seq_name, 0) + 1
100+
return base_cov_dict, reads_total, reads_mapped, read_cov_dict
101101

102102
def readBam(infile, set_of_blobs):
103103
reads_total, reads_mapped = checkBam(infile)
@@ -111,7 +111,7 @@ def readBam(infile, set_of_blobs):
111111
#command = "samtools view -F 1028 " + infile
112112
# only one counter since only yields mapped reads
113113
seen_reads = 0
114-
parsed_reads = 0
114+
parsed_reads = 0
115115
for line in runCmd(command):
116116
match = line.split("\t")
117117
seen_reads += 1
@@ -122,16 +122,16 @@ def readBam(infile, set_of_blobs):
122122
if seq_name not in set_of_blobs:
123123
print BtLog.warn_d['2'] % (seq_name, infile)
124124
else:
125-
base_cov_dict[seq_name] = base_cov_dict.get(seq_name, 0) + base_cov
126-
read_cov_dict[seq_name] = read_cov_dict.get(seq_name, 0) + 1
125+
base_cov_dict[seq_name] = base_cov_dict.get(seq_name, 0) + base_cov
126+
read_cov_dict[seq_name] = read_cov_dict.get(seq_name, 0) + 1
127127
BtLog.progress(seen_reads, progress_unit, reads_mapped)
128128
if not int(reads_mapped) == int(parsed_reads):
129129
print warn_d['3'] % (reads_mapped, parsed_reads)
130130
return base_cov_dict, reads_total, parsed_reads, read_cov_dict
131131

132132
def parseCovFromHeader(fasta_type, header ):
133-
'''
134-
Returns the coverage from the header of a FASTA
133+
'''
134+
Returns the coverage from the header of a FASTA
135135
sequence depending on the assembly type
136136
'''
137137
if fasta_type == 'spades':
@@ -187,12 +187,12 @@ def readCas(infile, order_of_blobs):
187187
command = "clc_mapping_info -n " + infile
188188
cov_dict = {}
189189
read_cov_dict = {}
190-
seqs_parsed = 0
190+
seqs_parsed = 0
191191
if (runCmd(command)):
192192
for line in runCmd(command):
193193
cas_line_match = cas_line_re.search(line)
194194
if cas_line_match:
195-
idx = int(cas_line_match.group(1)) - 1 # -1 because index of contig list starts with zero
195+
idx = int(cas_line_match.group(1)) - 1 # -1 because index of contig list starts with zero
196196
try:
197197
name = order_of_blobs[idx]
198198
reads = int(cas_line_match.group(3))
@@ -239,26 +239,23 @@ def parseColourDict(infile):
239239

240240
def getNodesDB(**kwargs):
241241
'''
242-
Parsing names.dmp and nodes.dmp into the 'nodes_db' dict of dicts that
243-
gets JSON'ed into blobtools/data/nodes_db.json if this file
244-
does not exist. This file is used if neither "--names" and "--nodes"
242+
Parsing names.dmp and nodes.dmp into the 'nodes_db' dict of dicts that
243+
gets JSON'ed into blobtools/data/nodes_db.json if this file
244+
does not exist. This file is used if neither "--names" and "--nodes"
245245
nor "--db" is specified.
246246
'''
247247
nodesDB = {}
248-
nodesDB_f = ''
249-
if (kwargs['nodesDB']):
250-
print BtLog.status_d['4'] % (kwargs['nodesDB'])
251-
nodesDB = readNodesDB(kwargs['nodesDB'])
252-
nodesDB_f = kwargs['nodesDB']
253-
elif (kwargs['names'] and kwargs['nodes']):
248+
nodesDB_f = ''
249+
250+
if (kwargs['names'] and kwargs['nodes']):
254251
print BtLog.status_d['3'] % (kwargs['nodes'], kwargs['names'])
255252
nodesDB = {}
256253
nodes_count = 0
257254
with open(kwargs['nodes']) as fh:
258255
for line in fh:
259256
nodes_col = line.split("\t")
260257
node = {}
261-
node_id = nodes_col[0]
258+
node_id = nodes_col[0]
262259
node['parent'] = nodes_col[2]
263260
node['rank'] = nodes_col[4]
264261
nodesDB[node_id] = node
@@ -270,6 +267,10 @@ def getNodesDB(**kwargs):
270267
nodesDB[names_col[0]]['name'] = names_col[2]
271268
nodesDB_f = kwargs['nodesDB']
272269
nodesDB['nodes_count'] = nodes_count
270+
elif (kwargs['nodesDB']):
271+
print BtLog.status_d['4'] % (kwargs['nodesDB'])
272+
nodesDB = readNodesDB(kwargs['nodesDB'])
273+
nodesDB_f = kwargs['nodesDB']
273274
else:
274275
BtLog.error('3')
275276
return nodesDB, nodesDB_f
@@ -293,9 +294,9 @@ def writeNodesDB(nodesDB, nodesDB_f):
293294
nodes_count = nodesDB['nodes_count']
294295
i = 0
295296
with open(nodesDB_f, 'w') as fh:
296-
fh.write("# nodes_count = %s\n" % nodes_count)
297+
fh.write("# nodes_count = %s\n" % nodes_count)
297298
for node in nodesDB:
298-
if not node == "nodes_count":
299+
if not node == "nodes_count":
299300
i += 1
300301
BtLog.progress(i, 1000, nodes_count)
301302
fh.write("%s\t%s\t%s\t%s\n" % (node, nodesDB[node]['rank'], nodesDB[node]['name'], nodesDB[node]['parent']))
@@ -316,26 +317,26 @@ def byteify(input):
316317
def writeJsonGzip(obj, outfile):
317318
import json
318319
import gzip
319-
with gzip.open(outfile, 'wb') as fh:
320+
with gzip.open(outfile, 'wb') as fh:
320321
json.dump(obj, fh)
321322

322323
def writeJson(obj, outfile):
323324
import json
324-
with open(outfile, 'w') as fh:
325+
with open(outfile, 'w') as fh:
325326
json.dump(obj, fh)
326327

327328
def readJsonGzip(infile):
328329
import json
329330
import gzip
330-
with gzip.open(infile, 'rb') as fh:
331+
with gzip.open(infile, 'rb') as fh:
331332
obj = json.loads(fh.read().decode("ascii"))
332333
return byteify(obj)
333334

334335
def readJson(infile):
335336
import json
336-
with open(infile, 'r') as fh:
337+
with open(infile, 'r') as fh:
337338
obj = json.loads(fh.read().decode("ascii"))
338339
return byteify(obj)
339340

340-
if __name__ == "__main__":
341+
if __name__ == "__main__":
341342
pass

0 commit comments

Comments
 (0)