DRL
diff --git a/bloblib/BtCore.py b/bloblib/BtCore.py
Lines changed: 1 addition & 1 deletion
diff --git a/bloblib/BtIO.py b/bloblib/BtIO.py
Lines changed: 30 additions & 31 deletions
diff --git a/bloblib/BtLog.py b/bloblib/BtLog.py
Lines changed: 2 additions & 1 deletion
diff --git a/bloblib/BtTax.py b/bloblib/BtTax.py
Lines changed: 29 additions & 29 deletions
diff --git a/bloblib/blobplot.py b/bloblib/blobplot.py
Lines changed: 2 additions & 1 deletion
diff --git a/bloblib/covplot.py b/bloblib/covplot.py
Lines changed: 3 additions & 2 deletions
diff --git a/bloblib/create.py b/bloblib/create.py
Lines changed: 11 additions & 8 deletions
@@ -437,7 +437,7 @@ def calculateGC(self, seq):
                      if self.agct_count > 0 else 0.0)
 
     def addCov(self, lib_name, cov):
-        self.covs[lib_name] = cov
+        self.covs[lib_name] = float("{0:.3f}".format(cov)) # changed to three decimal digits
 
     def addReadCov(self, lib_name, read_cov):
         self.read_cov[lib_name] = read_cov
 
@@ -9,20 +9,18 @@
 from __future__ import division
 import re
 import subprocess
-from os.path import basename, isfile, abspath, splitext, join, isdir
-import shutil
 import os
-import sys
+from os.path import basename, isfile, splitext, join, isdir
+import shutil
 import bloblib.BtLog as BtLog
-from collections import deque
 
 
 def create_dir(directory="", overwrite=True):
-    if (directory):
+    if directory:
         if not isdir(directory):
             os.makedirs(directory)
         else:
-            if (overwrite):
+            if overwrite:
                 shutil.rmtree(directory)           #removes all the subdirectories!
                 os.makedirs(directory)
         return directory
@@ -31,7 +29,7 @@ def create_dir(directory="", overwrite=True):
 
 def parseList(infile):
     if not isfile(infile):
-         BtLog.error('0', infile)
+        BtLog.error('0', infile)
     with open(infile) as fh:
         items = []
         for l in fh:
@@ -40,7 +38,7 @@ def parseList(infile):
 
 def parseReferenceCov(infile):
     refcov_dict = {}
-    if (infile):
+    if infile:
         if not isfile(infile):
             BtLog.error('0', infile)
         with open(infile) as fh:
@@ -55,7 +53,7 @@ def parseReferenceCov(infile):
 
 def parseCmdlist(temp):
     _list = []
-    if (temp):
+    if temp:
         if "," in temp:
             _list = temp.split(",")
         else:
@@ -65,7 +63,7 @@ def parseCmdlist(temp):
 def parseCmdLabels(labels):
     label_d = {}
     name, groups = '', ''
-    if (labels):
+    if labels:
         try:
             for label in labels:
                 name, groups = str(label).split("=")
@@ -80,7 +78,7 @@ def parseCmdLabels(labels):
 
 def parseCatColour(infile):
     catcolour_dict = {}
-    if (infile):
+    if infile:
         if not isfile(infile):
             BtLog.error('0', infile)
         with open(infile) as fh:
@@ -94,7 +92,7 @@ def parseCatColour(infile):
 
 def parseDict(infile, key, value):
     items = {}
-    if (infile):
+    if infile:
         if not isfile(infile):
             BtLog.error('0', infile)
         with open(infile) as fh:
@@ -108,7 +106,7 @@ def parseDict(infile, key, value):
 
 def parseColours(infile):
     items = {}
-    if (infile):
+    if infile:
         if not isfile(infile):
             BtLog.error('0', infile)
         with open(infile) as fh:
@@ -119,7 +117,7 @@ def parseColours(infile):
 
 def parseSet(infile):
     if not isfile(infile):
-         BtLog.error('0', infile)
+        BtLog.error('0', infile)
     with open(infile) as fh:
         items = set()
         for l in fh:
@@ -134,12 +132,12 @@ def parseFastaNameOrder(infile):
 
 def readFasta(infile):
     if not isfile(infile):
-         BtLog.error('0', infile)
+        BtLog.error('0', infile)
     with open(infile) as fh:
         header, seqs = '', []
         for l in fh:
             if l[0] == '>':
-                if (header):
+                if header:
                     yield header, ''.join(seqs)
                 header, seqs = l[1:-1].split()[0], [] # Header is split at first whitespace
             else:
@@ -173,8 +171,8 @@ def is_exe(fpath):
 def checkBam(infile):
     print BtLog.status_d['10']
     if not isfile(infile):
-         BtLog.error('0', infile)
-    if not (which('samtools')):
+        BtLog.error('0', infile)
+    if not which('samtools'):
         BtLog.error('7')
     reads_mapped_re = re.compile(r"(\d+)\s\+\s\d+\smapped")
     reads_secondary_re = re.compile(r"(\d+)\s\+\s\d+\ssecondary")
@@ -189,14 +187,15 @@ def checkBam(infile):
     reads_mapped = reads_mapped - reads_secondary
     reads_total = int(reads_total_re.search(output).group(1))
     # check whether there are reads in BAM
-    if not (reads_total) or not (reads_mapped):
-          BtLog.error('29' % infile)
-    print BtLog.status_d['11'] % ('{:,}'.format(reads_mapped), '{:,}'.format(reads_total), '{0:.1%}'.format(reads_mapped/reads_total))
+    if not reads_total or not reads_mapped:
+        BtLog.error('29' % infile)
+    print BtLog.status_d['11'] % ('{:,}'.format(reads_mapped), \
+        '{:,}'.format(reads_total), '{0:.1%}'.format(reads_mapped/reads_total))
     return reads_total, reads_mapped
 
 def parseSam(infile, set_of_blobs, no_base_cov_flag):
     if not isfile(infile):
-         BtLog.error('0', infile)
+        BtLog.error('0', infile)
     base_cov_dict = {blob : [] for blob in set_of_blobs}
     read_cov_dict = {blob : 0 for blob in set_of_blobs}
     cigar_match_re = re.compile(r"(\d+)M|X|=") # only gets digits before M,X,='s
@@ -241,15 +240,15 @@ def parseBam(infile, set_of_blobs, no_base_cov_flag):
 
     '''
     if not isfile(infile):
-         BtLog.error('0', infile)
+        BtLog.error('0', infile)
     reads_total, reads_mapped = checkBam(infile)
     progress_unit = int(reads_mapped/1000)
     base_cov_dict = {blob : [] for blob in set_of_blobs}
     #base_cov_dict = {blob : 0 for blob in set_of_blobs}
     read_cov_dict = {blob : 0 for blob in set_of_blobs}
     cigar_match_re = re.compile(r"(\d+)M|X|=") # only gets digits before M,X,='s
     # execute samtools to get only mapped reads (no optical duplicates, no 2nd-ary alignment)
-    command = "samtools view -F 1028 -F 4 -F 256 " + infile
+    command = "samtools view -F 1024 -F 4 -F 256 " + infile
     seen_reads = 0
     #import time
     #start = time.time()
@@ -308,7 +307,7 @@ def parseCovFromHeader(fasta_type, header):
 
 def parseCov(infile, set_of_blobs):
     if not isfile(infile):
-         BtLog.error('0', infile)
+        BtLog.error('0', infile)
     old_cov_line_re = re.compile(r"^(\S+)\t(\d+\.*\d*)")
     base_cov_dict = {}
 
@@ -361,7 +360,7 @@ def parseCov(infile, set_of_blobs):
 def checkCas(infile):
     print BtLog.status_d['12']
     if not isfile(infile):
-         BtLog.error('0', infile)
+        BtLog.error('0', infile)
     if not (which('clc_mapping_info')):
         BtLog.error('20')
     seqs_total_re = re.compile(r"\s+Contigs\s+(\d+)")
@@ -380,7 +379,7 @@ def checkCas(infile):
 
 def parseCas(infile, order_of_blobs):
     if not isfile(infile):
-         BtLog.error('0', infile)
+        BtLog.error('0', infile)
     seqs_total, reads_total, reads_mapped = checkCas(infile)
     progress_unit = int(len(order_of_blobs)/100)
     cas_line_re = re.compile(r"\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+.\d{2})\s+(\d+)\s+(\d+.\d{2})")
@@ -413,7 +412,7 @@ def readTax(infile, set_of_blobs):
         - add as key-value pairs to hitDict
     '''
     if not isfile(infile):
-         BtLog.error('0', infile)
+        BtLog.error('0', infile)
     hit_line_re = re.compile(r"^(\S+)\s+(\d+)[\;?\d+]*\s+(\d+\.*\d*)") # TEST TEST , if not split it afterwards
     with open(infile) as fh:
         for line in fh:
@@ -507,7 +506,7 @@ def readNamesNodes(names_f, nodes_f):
         for line in fh:
             names_col = line.split("\t")
             if names_col[6] == "scientific name":
-               nodesDB[names_col[0]]['name'] = names_col[2]
+                nodesDB[names_col[0]]['name'] = names_col[2]
     nodesDB['nodes_count'] = nodes_count
     return nodesDB
 
@@ -545,7 +544,7 @@ def byteify(input):
     http://stackoverflow.com/a/13105359
     '''
     if isinstance(input, dict):
-        return {byteify(key):byteify(value) for key,value in input.iteritems()}
+        return {byteify(key):byteify(value) for key, value in input.iteritems()}
     elif isinstance(input, list):
         return [byteify(element) for element in input]
     elif isinstance(input, unicode):
@@ -580,7 +579,7 @@ def parseJsonGzip(infile):
 def parseJson(infile):
     '''http://artem.krylysov.com/blog/2015/09/29/benchmark-python-json-libraries/'''
     if not isfile(infile):
-         BtLog.error('0', infile)
+        BtLog.error('0', infile)
     import time
     start = time.time()
     json_parser = ''
 
@@ -86,7 +86,8 @@ def progress(iteration, steps, max_value, no_limit=False):
     '6' : '[WARN]\t\t: Sum of coverage in cov lib %s is 0.0. Please ignore this warning if "--no_base_cov" was specified.',
     '7' : '[WARN]\t\t: No taxonomy information found.',
     '8' : '[WARN]\t\t: Duplicated sequences found :\n\t\t\t%s',
-    '9' : '[WARN]\t\t: Taxrule "%s" was not computed for this BlobDb. Available taxrule(s) : %s. Will proceed without taxonomic annotation ...'
+    '9' : '[WARN]\t\t: Taxrule "%s" was not computed for this BlobDb. Available taxrule(s) : %s. Will proceed without taxonomic annotation ...',
+    '10' : '[WARN]\t\t: Line %s: sequence "%s" already has TaxID "%s". Skipped. (use --force to overwrite)'
 
 }
 status_d = {
 
@@ -11,42 +11,42 @@
 TAXRULES = ['bestsum', 'bestsumorder'] # this should be re-named colour rules at one point
 
 def noHit():
-	return {rank : {'tax' : 'no-hit', 'score' : 0.0, 'c_index' : None} for rank in RANKS}
+    return {rank : {'tax' : 'no-hit', 'score' : 0.0, 'c_index' : None} for rank in RANKS}
 
 def getTreeList(taxIds, nodesDB):
     known_tree_lists = {}
     for taxId in taxIds:
-    	if not taxId in known_tree_lists:
-    		tree_list = []
-    		nextTaxId = [taxId]
-    		while nextTaxId:
-        		thisTaxId = nextTaxId.pop(0)
-        		if (not thisTaxId == '1') and (thisTaxId in nodesDB):
-        			parent = nodesDB[thisTaxId]['parent']
-        			nextTaxId.append(parent)
-        			tree_list.append(thisTaxId)
-        		else:
-    				tree_list.append('1')
-    		known_tree_lists[taxId] = tree_list
+        if not taxId in known_tree_lists:
+            tree_list = []
+            nextTaxId = [taxId]
+            while nextTaxId:
+                thisTaxId = nextTaxId.pop(0)
+                if (not thisTaxId == '1') and (thisTaxId in nodesDB):
+                    parent = nodesDB[thisTaxId]['parent']
+                    nextTaxId.append(parent)
+                    tree_list.append(thisTaxId)
+                else:
+                    tree_list.append('1')
+            known_tree_lists[taxId] = tree_list
     return known_tree_lists
 
 def getLineages(tree_lists, nodesDB):
-	lineage = {}
-	for tree_list_id, tree_list in tree_lists.items():
-		lineage[tree_list_id] = {rank : 'undef' for rank in RANKS}
-		for taxId in tree_list:
-			node = nodesDB[taxId]
-			if node['rank'] in RANKS:
-				lineage[tree_list_id][node['rank']] = node['name']
-		# traverse ranks again so that undef is "higher_def_rank" + "-" + undef
-		def_rank = ''
-		for rank in reversed(list(RANKS)):
-			if not lineage[tree_list_id][rank] == 'undef':
-				def_rank = lineage[tree_list_id][rank]
-			else:
-				if (def_rank):
-					lineage[tree_list_id][rank] = def_rank + "-" + lineage[tree_list_id][rank]
-	return lineage
+    lineage = {}
+    for tree_list_id, tree_list in tree_lists.items():
+        lineage[tree_list_id] = {rank : 'undef' for rank in RANKS}
+        for taxId in tree_list:
+            node = nodesDB[taxId]
+            if node['rank'] in RANKS:
+                lineage[tree_list_id][node['rank']] = node['name']
+        # traverse ranks again so that undef is "higher_def_rank" + "-" + undef
+        def_rank = ''
+        for rank in reversed(list(RANKS)):
+            if not lineage[tree_list_id][rank] == 'undef':
+                def_rank = lineage[tree_list_id][rank]
+            else:
+                if (def_rank):
+                    lineage[tree_list_id][rank] = def_rank + "-" + lineage[tree_list_id][rank]
+    return lineage
 
 def taxRuleBestSum(taxDict, taxonomy, min_bitscore_diff, tax_collision_random):
     tempTax = { rank : {} for rank in RANKS }
 
@@ -34,7 +34,8 @@
                                      span  : span-weighted histograms
                                      count : count histograms
         -r, --rank <RANK>           Taxonomic rank used for colouring of blobs [default: phylum]
-                                     (Supported: species, genus, family, order, phylum, superkingdom)
+                                     (Supported: species, genus, family, order,
+                                        phylum, superkingdom)
         -x, --taxrule <TAXRULE>     Taxrule which has been used for computing taxonomy
                                      (Supported: bestsum, bestsumorder) [default: bestsum]
         --format FORMAT             Figure format for plot (png, pdf, eps, jpeg,
 
@@ -40,7 +40,8 @@
                                      span  : span-weighted histograms
                                      count : count histograms
         -r, --rank <RANK>           Taxonomic rank used for colouring of blobs [default: phylum]
-                                     (Supported: species, genus, family, order, phylum, superkingdom)
+                                     (Supported: species, genus, family, order,
+                                        phylum, superkingdom)
         -x, --taxrule <TAXRULE>     Taxrule which has been used for computing taxonomy
                                      (Supported: bestsum, bestsumorder) [default: bestsum]
         --format FORMAT             Figure format for plot (png, pdf, eps, jpeg,
@@ -57,7 +58,7 @@
                                      per coverage file. (e.g.: bam0,900,100). If provided, info
                                      will be used in read coverage plot(s).
         --catcolour <FILE>            Colour plot based on categories from FILE
-                                     (format : "seq\tcategory").
+                                     (format : "seq,category").
 """
 
 from __future__ import division
 
@@ -4,20 +4,23 @@
 """usage: blobtools create     -i FASTA [-y FASTATYPE] [-o PREFIX] [--title TITLE]
                               [-b BAM...] [-s SAM...] [-a CAS...] [-c COV...]
                               [--nodes <NODES>] [--names <NAMES>] [--db <NODESDB>]
-                              [-t TAX...] [-x TAXRULE...] [-m INT] [--tax_collision_random]
+                              [-t HITS...] [-x TAXRULE...] [-m INT] [--tax_collision_random]
                               [-h|--help]
 
     Options:
         -h --help                       show this
         -i, --infile FASTA              FASTA file of assembly. Headers are split at whitespaces.
         -y, --type FASTATYPE            Assembly program used to create FASTA. If specified,
                                         coverage will be parsed from FASTA header.
-                                        (Parsing supported for 'spades', 'soap', 'velvet', 'abyss', 'platanus')
-        -t, --taxfile TAX...            Taxonomy file in format (qseqid\\ttaxid\\tbitscore)
+                                        (Parsing supported for 'spades', 'velvet', 'platanus')
+        -t, --hitsfile HITS...          Hits file in format (qseqid\\ttaxid\\tbitscore)
                                         (e.g. BLAST output "--outfmt '6 qseqid staxids bitscore'")
-        -x, --taxrule <TAXRULE>...      Taxrule determines how taxonomy of blobs is computed [default: bestsum]
-                                        "bestsum"       : sum bitscore across all hits for each taxonomic rank
-                                        "bestsumorder"  : sum bitscore across all hits for each taxonomic rank.
+        -x, --taxrule <TAXRULE>...      Taxrule determines how taxonomy of blobs
+                                        is computed [default: bestsum]
+                                        "bestsum"       : sum bitscore across all
+                                                          hits for each taxonomic rank
+                                        "bestsumorder"  : sum bitscore across all
+                                                          hits for each taxonomic rank.
                                                   - If first <TAX> file supplies hits, bestsum is calculated.
                                                   - If no hit is found, the next <TAX> file is used.
         -m, --min_diff <FLOAT>          Minimal score difference between highest scoring
@@ -30,7 +33,7 @@
         -b, --bam <BAM>...              BAM file(s) (requires samtools in $PATH)
         -s, --sam <SAM>...              SAM file(s)
         -a, --cas <CAS>...              CAS file(s) (requires clc_mapping_info in $PATH)
-        -c, --cov <COV>...              TAB separated. (seqID\\tcoverage)
+        -c, --cov <COV>...              COV file(s)
         -o, --out <PREFIX>              BlobDB output prefix
         --title TITLE                   Title of BlobDB [default: output prefix)
 """
@@ -57,7 +60,7 @@ def main():
     bam_fs = args['--bam']
     cov_fs = args['--cov']
     cas_fs = args['--cas']
-    hit_fs = args['--taxfile']
+    hit_fs = args['--hitsfile']
     prefix = args['--out']
     nodesDB_f = args['--db']
     names_f = args['--names']
Original file line number	Diff line number	Diff line change
`@@ -86,7 +86,8 @@ def progress(iteration, steps, max_value, no_limit=False):`
`86`	`86`	`'6' : '[WARN]\t\t: Sum of coverage in cov lib %s is 0.0. Please ignore this warning if "--no_base_cov" was specified.',`
`87`	`87`	`'7' : '[WARN]\t\t: No taxonomy information found.',`
`88`	`88`	`'8' : '[WARN]\t\t: Duplicated sequences found :\n\t\t\t%s',`
`89`		`- '9' : '[WARN]\t\t: Taxrule "%s" was not computed for this BlobDb. Available taxrule(s) : %s. Will proceed without taxonomic annotation ...'`
	`89`	`+ '9' : '[WARN]\t\t: Taxrule "%s" was not computed for this BlobDb. Available taxrule(s) : %s. Will proceed without taxonomic annotation ...',`
	`90`	`+ '10' : '[WARN]\t\t: Line %s: sequence "%s" already has TaxID "%s". Skipped. (use --force to overwrite)'`
`90`	`91`
`91`	`92`	`}`
`92`	`93`	`status_d = {`