DRL
diff --git a/blobtools b/blobtools
Lines changed: 3 additions & 3 deletions
diff --git a/forge.py b/create.py
rename from forge.py
rename to create.py
Lines changed: 32 additions & 29 deletions
diff --git a/data/colours b/data/colours
Lines changed: 11 additions & 0 deletions
diff --git a/lib/BtCore.py b/lib/BtCore.py
Lines changed: 96 additions & 54 deletions
@@ -5,7 +5,7 @@
 usage: blobtools <command> [<args>...] [--help]
 
 commands:
-  forge     create a BlobDB
+  create    create a BlobDB
   view      print BlobDB
   plot      plot BlobDB as a blobplot
 
@@ -27,8 +27,8 @@ if __name__ == '__main__':
     #print(args)
 
     argv = [args['<command>']] + args['<args>']
-    if args['<command>'] == 'forge':
-        exit(call(['python', main_dir + 'forge.py'] + argv))
+    if args['<command>'] == 'create':
+        exit(call(['python', main_dir + 'create.py'] + argv))
     elif args['<command>'] == 'view':
         exit(call(['python', main_dir + 'view.py'] + argv))
     elif args['<command>'] == 'plot':
 
@@ -1,34 +1,34 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
-"""usage: blobtools forge     --i <FASTA> [--type <ASSEMBLY>] [--out <OUT>] [--title <TITLE>]
-                              [--bam <BAM>...] [--sam <SAM>...] [--cas <CAS>...] [--cov <COV>...]  
+"""usage: blobtools create    -i FASTA [-y FASTATYPE] [-o OUTFILE] [--title TITLE]
+                              [-b BAM...] [-s SAM...] [-a CAS...] [-c COV...]  
                               [--nodes <NODES>] [--names <NAMES>] [--db <NODESDB>] 
-                              [--tax <TAX>...] [--taxrule <TAXRULE>...]
-                              [--h|--help] 
+                              [-t TAX...] [-r TAXRULE...]
+                              [-h|--help] 
     
     Options:
-        --h --help              show this
-        --i <FASTA>             FASTA file of assembly 
-        --type ASSEMBLY         Assembly program used to create FASTA. If specified, 
-                                coverage will be parsed from FASTA header. 
-                                (Parsing supported for 'spades', 'soap', 'velvet', 'abyss')
-        --tax <TAX>...          Taxonomy file in format (qseqid\\ttaxid\\tbitscore) 
-                                (e.g. BLAST output "--outfmt '6 std'")
-        --taxrule <TAXRULE>...  Taxrule determines how taxonomy of blobs is computed [default: bestsum]
-                                "bestsum"       : sum bitscore across all hits for each taxonomic rank
-                                "bestsumorder"  : sum bitscore across all hits for each taxonomic rank. 
-                                                  - If first <TAX> file supplies hits these are used. 
+        -h --help                   show this
+        -i, --infile FASTA          FASTA file of assembly. Headers are split at whitespace.
+        -y, --type FASTATYPE        Assembly program used to create FASTA. If specified, 
+                                    coverage will be parsed from FASTA header. 
+                                    (Parsing supported for 'spades', 'soap', 'velvet', 'abyss')
+        -t, --taxfile TAX...        Taxonomy file in format (qseqid\\ttaxid\\tbitscore) 
+                                    (e.g. BLAST output "--outfmt '6 qseqid staxids bitscore'")
+        -r, --taxrule <TAXRULE>...  Taxrule determines how taxonomy of blobs is computed [default: bestsum]
+                                    "bestsum"       : sum bitscore across all hits for each taxonomic rank
+                                    "bestsumorder"  : sum bitscore across all hits for each taxonomic rank. 
+                                                  - If first <TAX> file supplies hits, bestsum is calculated. 
                                                   - If no hit is found, the next <TAX> file is used.                                 
-        --nodes <NODES>         NCBI nodes.dmp file. Not required if '--db'
-        --names <NAMES>         NCBI names.dmp file. Not required if '--db' 
-        --db <NODESDB>          NodesDB file [default: data/nodesDB.txt]. 
-        --bam <BAM>...          BAM file (requires samtools in $PATH)
-        --sam <SAM>...          SAM file
-        --cas <CAS>...          CAS file (requires clc_mapping_info in $PATH)
-        --cov <COV>...          TAB separated. (seqID\\tcoverage)
-        --out <OUT>             BlobDB output file [default: blobDb.json]
-        --title TITLE           Title of BlobDB [default: FASTA)  
+        --nodes <NODES>             NCBI nodes.dmp file. Not required if '--db'
+        --names <NAMES>             NCBI names.dmp file. Not required if '--db' 
+        --db <NODESDB>              NodesDB file [default: data/nodesDB.txt]. 
+        -b, --bam <BAM>...          BAM file (requires samtools in $PATH)
+        -s, --sam <SAM>...          SAM file
+        -a, --cas <CAS>...          CAS file (requires clc_mapping_info in $PATH)
+        -c, --cov <COV>...          TAB separated. (seqID\\tcoverage)
+        -o, --out <OUT>             BlobDB output prefix 
+        --title TITLE               Title of BlobDB (default: FASTA)
 """
 
 from __future__ import division
@@ -45,17 +45,20 @@
     #print data_dir
     args = docopt(__doc__)
     #print args
-
-    fasta_f = args['--i']
+    fasta_f = args['--infile']
     fasta_type = args['--type']
 
     sam_fs = args['--sam']
     bam_fs = args['--bam']
     cov_fs = args['--cov']
     cas_fs = args['--cas']
-    hit_fs = args['--tax']
+    hit_fs = args['--taxfile']
 
     out_f = args['--out']
+    if (out_f):
+        out_f = "%s.%s" % (out_f, "BlobDB.json")
+    else:
+        out_f = "%s" % ("BlobDB.json")
     nodesDB_f = args['--db']
     names_f = args['--names']
     nodes_f = args['--nodes']
@@ -90,7 +93,7 @@
                [bt.CovLibObj('sam' + str(idx), 'sam', lib_f) for idx, lib_f in enumerate(sam_fs)] + \
                [bt.CovLibObj('cas' + str(idx), 'cas', lib_f) for idx, lib_f in enumerate(cas_fs)] + \
                [bt.CovLibObj('cov' + str(idx), 'cov', lib_f) for idx, lib_f in enumerate(cov_fs)] 
-
+               
     # Create BlobDB object              
     blobDb = bt.BlobDb(title)
 
@@ -115,6 +118,6 @@
     print BtLog.status_d['6'] % ",".join(taxrules)
     blobDb.computeTaxonomy(taxrules, nodesDB)
 
-    # Writing BlobDB to file
+    # Generating BlobDB and writing to file
     print BtLog.status_d['7'] % out_f
     BtIO.writeJson(blobDb.dump(), out_f)
@@ -0,0 +1,11 @@
+#48a365=Nematoda
+#d0694a=Arthropoda
+#ffb917=Proteobacteria
+#926eb3=Actinobacteria
+#a6cee3=Ascomycota
+#d3d3d3=no-hit
+#ec84ba=Chordata
+#e0d799=Cnidaria
+#b89c75=Platyhelminthes
+#fdb761=Bacteroidetes
+#ffffff=other
@@ -107,54 +107,84 @@ def load(self, BlobDb_f):
         self.lineages = blobDict['lineages']
         self.set_of_taxIds = blobDict['lineages'].keys()
         self.order_of_blobs = blobDict['order_of_blobs']
-        self.dict_of_blobs = blobDict['dict_of_blobs'] # this will probably not work
+        self.dict_of_blobs = blobDict['dict_of_blobs'] 
         self.length = int(blobDict['length'])
         self.seqs = int(blobDict['seqs'])
         self.n_count = int(blobDict['n_count'])
         self.covLibs = blobDict['covLibs']
         self.hitLibs = blobDict['hitLibs']
         self.taxrules = blobDict['taxrules']
 
-    def getArrays(self, rank, min_length, hide_nohits, taxrule, c_index, label_d):
-        from numpy import array
-        summary_dict = {}
-        data_list = []
-        cov_dict = {covLib : [] for covLib in self.covLibs}
+    def getPlotData(self, rank, min_length, hide_nohits, taxrule, c_index):
+        data_dict = {}
+        read_cov_dict = {}
+        max_cov = 0.0
+        cov_libs = self.covLibs.keys()
+        cov_libs_reads_total = {cov_lib : data['reads_total'] for cov_lib, data in self.covLibs.items()}
+
         for blob in self.dict_of_blobs.values():
-            name = blob['name']
-            gc = blob['gc']
-            length = blob['length']
-            tax = ''
-            if (c_index):
-                tax = str(blob['taxonomy'][taxrule][rank]['c_index'])
-            else:
-                tax = blob['taxonomy'][taxrule][rank]['tax']
-                if label_d and tax in label_d:
-                    tax = label_d[tax] 
-            if not tax in summary_dict:
-                summary_dict[tax] = {'count_total' : 0,
-                                     'count_hidden' : 0,
-                                     'count_visible' : 0,
-                                     'span_total': 0, 
-                                     'span_hidden' : 0, 
-                                     'span_visible' : 0}
-            if ((hide_nohits) and tax == 'no-hit') or length < min_length:
-                summary_dict[tax]['count_hidden'] = summary_dict[tax].get('count_hidden', 0) + 1
-                summary_dict[tax]['span_hidden'] = summary_dict[tax].get('span_hidden', 0) + length
-            else:
-                data_list.append([(name), (length), (gc), (tax)])
-                for covLib in self.covLibs:
-                    cov = float(blob['covs'][covLib])
-                    if cov < 0.1:
-                        cov = 0.1
-                    cov_dict[covLib].append(cov)
-                summary_dict[tax]['count_visible'] = summary_dict[tax].get('count_visible', 0) + 1
-                summary_dict[tax]['span_visible'] = summary_dict[tax].get('span_visible', 0) + int(length)
-            summary_dict[tax]['count_total'] = summary_dict[tax].get('count_total', 0) + 1
-            summary_dict[tax]['span_total'] = summary_dict[tax].get('span_total', 0) + int(length)
-        data_array = array(data_list)
-        cov_arrays = {covLib: array(cov) for covLib, cov in cov_dict.items()}
-        return data_array, cov_arrays, summary_dict
+            name, gc, length, group = blob['name'], blob['gc'], blob['length'], ''
+            
+            if (c_index): # annotation with c_index instead of taxonomic group 
+                group = str(blob['taxonomy'][taxrule][rank]['c_index'])
+            else: # annotation with taxonomic group
+                group = str(blob['taxonomy'][taxrule][rank]['tax'])
+            
+            if not group in data_dict: 
+                data_dict[group] = {
+                                    'name' : [], 
+                                    'length' : [], 
+                                    'gc' : [], 
+                                    'covs' : {covLib : [] for covLib in cov_libs}, 
+                                    'reads_mapped' : {covLib : 0 for covLib in cov_libs},
+                                    'count' : 0,
+                                    'count_hidden' : 0,
+                                    'count_visible' : 0,
+                                    'span': 0, 
+                                    'span_hidden' : 0, 
+                                    'span_visible' : 0,
+                                    }
+                if len(cov_libs) > 1:
+                    data_dict[group]['covs']['sum'] = []
+                    data_dict[group]['reads_mapped']['sum'] = 0
+
+            if ((hide_nohits) and group == 'no-hit') or length < min_length: # hidden
+                data_dict[group]['count_hidden'] = data_dict[group].get('count_hidden', 0) + 1
+                data_dict[group]['span_hidden'] = data_dict[group].get('span_hidden', 0) + int(length)
+            else: # visible
+                data_dict[group]['count_visible'] = data_dict[group].get('count_visible', 0) + 1
+                data_dict[group]['span_visible'] = data_dict[group].get('span_visible', 0) + int(length)
+
+            data_dict[group]['name'].append(name)
+            data_dict[group]['length'].append(length)
+            data_dict[group]['gc'].append(gc)
+
+            cov_sum = 0.0
+            reads_mapped_sum = 0
+            for cov_lib in sorted(cov_libs):
+                cov = float(blob['covs'][cov_lib]) 
+                cov_sum += cov
+                cov = cov if cov > 0.02 else 0.02
+                if cov > max_cov:
+                    max_cov = cov
+                data_dict[group]['covs'][cov_lib].append(cov)
+                if cov_lib in blob['read_cov']:
+                    reads_mapped = blob['read_cov'][cov_lib]
+                    reads_mapped_sum += reads_mapped
+                    data_dict[group]['reads_mapped'][cov_lib] += reads_mapped  
+            
+            if len(cov_libs) > 1:
+                cov_sum = cov_sum if cov_sum > 0.02 else 0.02
+                data_dict[group]['covs']['sum'].append(cov_sum)
+                if cov > max_cov:
+                    max_cov = cov
+                if (reads_mapped_sum):
+                    data_dict[group]['reads_mapped']['sum'] += reads_mapped_sum
+
+            data_dict[group]['count'] = data_dict[group].get('count', 0) + 1
+            data_dict[group]['span'] = data_dict[group].get('span', 0) + int(length)
+
+        return data_dict, max_cov, cov_libs, cov_libs_reads_total
 
     def addCovLib(self, covLib):
         self.covLibs[covLib.name] = covLib
@@ -166,8 +196,7 @@ def parseFasta(self, fasta_f, fasta_type):
         self.assembly_f = abspath(fasta_f)
         if (fasta_type):
             # Set up CovLibObj for coverage in assembly header
-            cov_lib = CovLibObj(fasta_type, fasta_type, fasta_f)
-            self.covLibs[covLib.name] = covLib
+            self.covLibs[fasta_type] = CovLibObj(fasta_type, fasta_type, fasta_f)
 
         for name, seq in BtIO.readFasta(fasta_f):
             blObj = BlObj(name, seq)
@@ -178,7 +207,7 @@ def parseFasta(self, fasta_f, fasta_type):
 
                 if (fasta_type):
                     cov = BtIO.parseCovFromHeader(fasta_type, blObj.name)
-                    covLib.cov_sum += cov
+                    self.covLibs[fasta_type].cov_sum += cov
                     blObj.addCov(fasta_type, cov)
 
                 self.order_of_blobs.append(blObj.name)
@@ -196,32 +225,45 @@ def parseCovs(self, covLibObjs):
             if covLib.fmt == 'bam' or covLib.fmt == 'sam':
                 base_cov_dict = {}
                 if covLib.fmt == 'bam':
-                    base_cov_dict, covLib.total_reads, covLib.mapped_reads, covLib.read_cov_dict = BtIO.readBam(covLib.f, set(self.dict_of_blobs))
+                    base_cov_dict, covLib.reads_total, covLib.reads_mapped, read_cov_dict = BtIO.readBam(covLib.f, set(self.dict_of_blobs))
                 else:
-                    base_cov_dict, covLib.total_reads, covLib.mapped_reads, covLib.read_cov_dict = BtIO.readSam(covLib.f, set(self.dict_of_blobs))    
+                    base_cov_dict, covLib.reads_total, covLib.reads_mapped, read_cov_dict = BtIO.readSam(covLib.f, set(self.dict_of_blobs))    
+                if covLib.reads_total == 0:
+                    print BtLog.warn_d['4'] % covLib.f
                 for name, base_cov in base_cov_dict.items():
                     cov = base_cov / self.dict_of_blobs[name].agct_count
                     covLib.cov_sum += cov
                     self.dict_of_blobs[name].addCov(covLib.name, cov)
+                    self.dict_of_blobs[name].read_cov = {covLib.name : read_cov_dict[name]}
             elif covLib.fmt == 'cas':
-                for name, cov in BtIO.readCas(covLib.f, self.order_of_blobs):
+                cov_dict, covLib.reads_total, covLib.reads_mapped, read_cov_dict = BtIO.readCas(covLib.f, self.order_of_blobs)
+                if covLib.reads_total == 0:
+                    print BtLog.warn_d['4'] % covLib.f
+                for name, cov in cov_dict.items():
                     covLib.cov_sum += cov
                     self.dict_of_blobs[name].addCov(covLib.name, cov)
+                    self.dict_of_blobs[name].read_cov = {covLib.name : read_cov_dict[name]}
             elif covLib.fmt == 'cov':
-                for name, cov in BtIO.readCov(covLib.f, set(self.dict_of_blobs)):
-                    covLib.cov_sum += cov
-                    self.dict_of_blobs[name].addCov(covLib.name, cov)
+                cov_dict = BtIO.readCov(covLib.f, set(self.dict_of_blobs))
+                if not len(cov_dict) == self.seqs:
+                    print BtLog.warn_d['4'] % covLib.f
+                covLib.cov_sum += cov
+                self.dict_of_blobs[name].addCov(covLib.name, cov)
             else:
                 pass        
             covLib.mean_cov = covLib.cov_sum/self.seqs
             self.covLibs[covLib.name] = covLib
 
+
     def parseHits(self, hitLibs):
         for hitLib in hitLibs:
             self.hitLibs[hitLib.name] = hitLib
             print BtLog.status_d['1'] % (hitLib.name, hitLib.f)
             # only accepts format 'seqID\ttaxID\tscore'
             for hitDict in BtIO.readTax(hitLib.f, set(self.dict_of_blobs)):
+                if ";" in hitDict['taxId']:
+                    hitDict['taxId'] = hitDict['taxId'].split(";")[0]
+                    print BtLog.warn['5'] % (hitDict['name'], hitLib)
                 self.set_of_taxIds.add(hitDict['taxId'])
                 self.dict_of_blobs[hitDict['name']].addHits(hitLib.name, hitDict)
 
@@ -246,8 +288,8 @@ def counts(self):
             'Ns'       : self.n_count,
             'AvgCov'   : {lib : round(covlibObj.cov_sum/self.seqs, 2) for lib, covlibObj in self.covLibs.items()},
             'GC'       : round(sum([blObj.gc for blObj in self.dict_of_blobs.values()])/self.seqs, 2),
-            'MappedReads' : {lib : (covlibObj.mapped_reads) for lib, covlibObj in self.covLibs.items()},
-            'TotalReads' : {lib : (covlibObj.total_reads) for lib, covlibObj in self.covLibs.items()}
+            'MappedReads' : {lib : (covlibObj.reads_mapped) for lib, covlibObj in self.covLibs.items()},
+            'TotalReads' : {lib : (covlibObj.reads_total) for lib, covlibObj in self.covLibs.items()}
         }
         print count_dict
 
@@ -263,6 +305,7 @@ def __init__(self, name, seq):
         self.agct_count = self.length - self.n_count
         self.gc = round(self.calculateGC(seq), 4)
         self.covs = {}
+        self.read_cov = {}
         self.hits = {}
         self.taxonomy = {}
 
@@ -284,9 +327,8 @@ def __init__(self, name, fmt, f):
         self.fmt = fmt
         self.f = abspath(f)
         self.cov_sum = 0
-        self.total_reads = 0
-        self.mapped_reads = 0
-        self.read_cov_dict = {} 
+        self.reads_total = 0
+        self.reads_mapped = 0
         self.mean_cov = 0.0
 
 class hitLibObj():