From bc5abd7e4d006e0fddec135986a92e4148bc2c68 Mon Sep 17 00:00:00 2001
From: Christopher Tomkins-Tinch
Date: Tue, 3 Dec 2019 00:47:56 -0500
Subject: [PATCH 01/11] initial commit of metagenomics.py::krakenuniq_report_filter

---
 metagenomics.py | 578 ++++++++++++++++++++++++++++++++++++------------
 1 file changed, 438 insertions(+), 140 deletions(-)

diff --git a/metagenomics.py b/metagenomics.py
index 081a5a9fd..39e97eefe 100755
--- a/metagenomics.py
+++ b/metagenomics.py
@@ -14,6 +14,7 @@
 import itertools
 import logging
 import os.path
+from os import linesep
 from os.path import join
 import operator
 import queue
@@ -36,7 +37,6 @@
 import tools.krona
 import tools.picard
 import tools.samtools
-from util.file import open_or_gzopen
 
 __commands__ = []
 
@@ -109,7 +109,7 @@ def __init__(
     def load_gi_single_dmp(self, dmp_path):
         '''Load a gi->taxid dmp file from NCBI taxonomy.'''
         gi_array = {}
-        with open_or_gzopen(dmp_path) as f:
+        with util.file.open_or_gzopen(dmp_path) as f:
             for i, line in enumerate(f):
                 gi, taxid = line.strip().split('\t')
                 gi = int(gi)
@@ -142,7 +142,7 @@ def load_nodes(self, nodes_db):
         '''Load ranks and parents arrays from NCBI taxonomy.'''
         ranks = {}
         parents = {}
-        with open_or_gzopen(nodes_db) as f:
+        with util.file.open_or_gzopen(nodes_db) as f:
             for line in f:
                 parts = line.strip().split('|')
                 taxid = int(parts[0])
@@ -589,7 +589,7 @@ def filter_file(path, sep='\t', taxid_column=0, gi_column=None, a2t=False, heade
             output_path = os.path.join(outputDb, path)
             input_path = maybe_compressed(input_path)
 
-            with open_or_gzopen(input_path, 'rt') as f, \
+            with util.file.open_or_gzopen(input_path, 'rt') as f, \
                 open_or_gzopen(output_path, 'wt') as out_f:
                 if header:
                     out_f.write(f.readline()) # Cannot use next(f) for python2
@@ -1047,7 +1047,7 @@ def filter_bam_to_taxa(in_bam, read_IDs_to_tax_IDs, out_bam,
 
     # perform the actual filtering to return a list of read IDs, written to a temp file
     with util.file.tempfname(".txt.gz") as temp_read_list:
-        with open_or_gzopen(temp_read_list, "wt") as read_IDs_file:
+        with util.file.open_or_gzopen(temp_read_list, "wt") as read_IDs_file:
             read_ids_written = 0
             for row in util.file.read_tabfile(read_IDs_to_tax_IDs):
                 assert tax_id_col
+        @property
+        def has_children(self):
+            return len(self.children)>0
+
+        @property
+        def lacks_children(self):
+            return len(self.children)==0
+
+        @property
+        def name(self):
+            return self.data["sci_name"]
+
+    # load the KrakenUniq summary file
+    # into a tree structure
+    # using the taxName indentation (space prefix) to indicate hierarchy
+    # (rather than the 'rank' column since many rows have 'no rank')
+    # This assumes the hierarchy is described linearly
+    # within the file such that indentation changes
+    # indicate changes of hierarchy
+    tax_tree_top_nodes = []
+    previous_node = None
+    top_nodes = []
+    for row,level in _kraken_report_reader(summary_file_in):
+        # create a new node object for this row
+        current_node = TaxNode(row,level)
+
+        # if this is a top-level node ('root' or 'unclassified')
+        if current_node.level==0:
+            # set previous_node to None since
+            # we usually have at least two top-level nodes
+            previous_node=None
+
+        # if we do not have a previous node/line to consider
+        if previous_node==None:
+            # assume this line is a top-level node
+            top_nodes.append(current_node)
+        # otherwise, if the node is subordinate in
+        # level/indentation to the previous node
+        elif current_node.level>previous_node.level:
+            # make it a child of the previous node
+            previous_node.add_child(current_node)
+            # (and set the previous node as its parent)
+            current_node.add_parent(previous_node)
+        else:
+            # if this node is not subordinate in
+            # level/indentation to the previous node,
+            # track back through prior nodes
+            # until we find one that is at a higher level
+            parent_node = previous_node
+            while(current_node.level<=parent_node.level):
+                if parent_node.lacks_parent:
+                    # if we reach a node without a parent
+                    # break out (should not be reached)
+                    break
+                parent_node = parent_node.parent
+            # when we have found a superior prior node
+            # add this node as its child
+            parent_node.add_child(current_node)
+            # (and set it as this node's parent)
+            current_node.add_parent(parent_node)
+        # set this node as the previous node
+        previous_node = current_node
+
+    def remove_children(node):
+        """
+        remove child nodes from the instance
+        attribute (list) of the specified node
+        """
+        if node != None:
+            if node.has_children:
+                for child in node.children:
+                    remove_children(child)
+                node.children = []
+
+    def delete_children(node):
+        """
+        delete a node outright along with its children
+        """
+        if node != None:
+            if node.has_children:
+                for child in node.children:
+                    delete_children(child)
+                    del child
+            else:
+                del node
+
+    def dfs_traversal(node):
+        """
+        yield nodes in preorder form
+        """
+        yield node
+        if node.has_children:
+            for child in node.children:
+                for n in dfs_traversal(child):
+                    yield n
+
+    def dfs_traversal_print(node):
+        """
+        print the tree in preorder form
+        """
+        for n in dfs_traversal(node):
+            print(n)
+
+    def dfs_traversal_filtered(node, field_to_filter, keep_threshold):
+        """
+        yield nodes from the tree where the field is above the threshold value.
+        NB: This does not take into account rows where the sum includes
+        the values and sums from subordinate nodes.
+
+        subtract_low_values_with_upward_propagation(node, field_to_filter, keep_threshold)
+        should be used to account for sums that include
+        contributions from subordinate nodes
+        """
+        for n in dfs_traversal(node):
+            if (int(n.data[field_to_filter])>=keep_threshold):
+                yield n
+
+    def dfs_traversal_print_filtered(node, field_to_filter, keep_threshold):
+        """
+        Print nodes from the tree where the field is above the threshold value.
+        NB: This does not take into account rows where the sum includes
+        the values and sums from subordinate nodes.
+
+        subtract_low_values_with_upward_propagation(node, field_to_filter, keep_threshold)
+        should be used to account for sums that include
+        contributions from subordinate nodes
+        """
+        for n in dfs_traversal(node):
+            if (int(n.data[field_to_filter])>=keep_threshold):
+                print(n)
+
+    def dfs_traversal_prune(node, field_to_filter, keep_threshold):
+        """
+        Remove nodes from the tree where the field is below the threshold value.
+        NB: This does not take into account rows where the sum includes
+        the values and sums from subordinate rows.
+
+        subtract_low_values_with_upward_propagation(node, field_to_filter, keep_threshold)
+        should be used to account for sums that include
+        contributions from subordinate nodes
+        """
+        for child in node.children:
+            dfs_traversal_prune(child, field_to_filter, keep_threshold)
+
+        for n in dfs_traversal(node):
+            if int(n.data[field_to_filter])
-                if int(row["num_reads"])>=count_threshold:
-                    sample_summary[currently_being_processed][row["sci_name"]] = Abundance(float(row["pct_of_reads"]), int(row["num_reads"]),row.get("kmers",None),row.get("dup",None),row.get("cov",None))
+                if indent_of_line <= indent_of_selection:
+                    should_process = False
+                    indent_of_selection = -1
+
+                if indent_of_selection == -1:
+                    if row["sci_name"].lower() in tax_headings_copy:
+                        tax_headings_copy.remove(row["sci_name"].lower())
+
+                        should_process = True
+                        indent_of_selection = indent_of_line
+                        currently_being_processed = row["sci_name"]
+                        sample_summary[currently_being_processed] = collections.OrderedDict()
+                        if row["rank"] == rank_code(taxlevel_focus) or row["rank"].lower().replace(" ","") == taxlevel_focus.lower().replace(" ",""):
+                            same_level = True
+                        if row["rank"] in ("-","no rank"):
+                            log.warning("Non-taxonomic parent level selected")
+
+                if should_process:
+                    # skip "-" rank levels since they do not occur at the sample level
+                    # otherwise include the taxon row if the rank matches the desired level of focus
+                    if (row["rank"] not in ("-","no rank") and (rank_code(taxlevel_focus) == row["rank"] or row["rank"].lower().replace(" ","") == taxlevel_focus.lower().replace(" ","")) ):
+                        if int(row["num_reads"])>=count_threshold:
+                            sample_summary[currently_being_processed][row["sci_name"]] = Abundance(float(row["pct_of_reads"]), int(row["num_reads"]),row.get("kmers",None),row.get("dup",None),row.get("cov",None))
 
         for k,taxa in sample_summary.items():
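Aside: the indentation walk in patch 01 is easier to follow outside of diff form. Below is a minimal, self-contained sketch of the same idea -- build a tree from (name, indent level, value) rows, then traverse it depth-first. All names here (Node, build_tree, dfs) are illustrative and are not the identifiers the patch adds.

# Minimal sketch of indentation-based tree building plus preorder traversal,
# mirroring the approach of patch 01 (illustrative names, not the patch's code).
class Node:
    def __init__(self, name, level, value):
        self.name, self.level, self.value = name, level, value
        self.parent, self.children = None, []

def build_tree(rows):
    """rows: iterable of (name, indent_level, value) tuples; returns top-level nodes."""
    tops, prev = [], None
    for name, level, value in rows:
        node = Node(name, level, value)
        if level == 0:
            prev = None               # 'root' and 'unclassified' are both top-level
        if prev is None:
            tops.append(node)
        elif level > prev.level:      # deeper indent: child of the previous row
            node.parent = prev
            prev.children.append(node)
        else:                         # walk back up to the nearest shallower row
            parent = prev
            while parent.parent is not None and level <= parent.level:
                parent = parent.parent
            node.parent = parent
            parent.children.append(node)
        prev = node
    return tops

def dfs(node):                        # preorder traversal, as in dfs_traversal()
    yield node
    for child in node.children:
        yield from dfs(child)

tops = build_tree([("root", 0, 100), ("cellular organisms", 1, 90), ("Viruses", 1, 10)])
print([n.name for n in dfs(tops[0])])  # ['root', 'cellular organisms', 'Viruses']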
From 63fc075654e698b0bdbf923f57b87683ee994360 Mon Sep 17 00:00:00 2001
From: Christopher Tomkins-Tinch
Date: Tue, 3 Dec 2019 09:45:05 -0500
Subject: [PATCH 02/11] update read counts and percentages in metagenomics.py::krakenuniq_report_filter

update read counts and percentages in
metagenomics.py::krakenuniq_report_filter to reflect nodes removed based on
filtering

---
 metagenomics.py | 44 ++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 40 insertions(+), 4 deletions(-)

diff --git a/metagenomics.py b/metagenomics.py
index 39e97eefe..f93d92834 100755
--- a/metagenomics.py
+++ b/metagenomics.py
@@ -1091,6 +1091,18 @@ def parser_krakenuniq_report_filter(parser=argparse.ArgumentParser()):
         help='The field to filter on (default: %(default)s).',
         default="uniq_kmers"
     )
+    parser.add_argument('--fieldToAdjust',
+        choices=[
+            "num_reads",
+            "uniq_kmers",
+            #"kmer_dups",
+            #"reads_exc_children",
+        ],
+        nargs="+",
+        dest="fields_to_adjust",
+        help='The field to adjust along with the --fieldToFilterOn (default: %(default)s).',
+        default=["num_reads"]
+    )
     parser.add_argument('--keepAboveN',
         type=int,
         dest="keep_threshold",
@@ -1103,7 +1115,7 @@
     util.cmd.attach_main(parser, krakenuniq_report_filter, split_args=True)
     return parser
 
-def krakenuniq_report_filter(summary_file_in, summary_file_out, field_to_filter, keep_threshold):
+def krakenuniq_report_filter(summary_file_in, summary_file_out, field_to_filter, keep_threshold, fields_to_adjust):
     """
         Filter a krakenuniq report by field to include rows above some threshold,
         where contributions to the value from subordinate levels
@@ -1296,7 +1308,7 @@ def dfs_traversal_prune(node, field_to_filter, keep_threshold):
         if int(n.data[field_to_filter])

From: Christopher Tomkins-Tinch
Date: Tue, 3 Dec 2019 10:25:23 -0500
Subject: [PATCH 03/11] parser_krakenuniq_report_filter help typo fix

---
 metagenomics.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/metagenomics.py b/metagenomics.py
index f93d92834..8d4fdc3d0 100755
--- a/metagenomics.py
+++ b/metagenomics.py
@@ -1078,8 +1078,8 @@ def filter_bam_to_taxa(in_bam, read_IDs_to_tax_IDs, out_bam,
 __commands__.append(('filter_bam_to_taxa', parser_filter_bam_to_taxa))
 
 def parser_krakenuniq_report_filter(parser=argparse.ArgumentParser()):
-    parser.add_argument('summary_file_in', help='Input KrakenUNiq-format summary text file with tab-delimited fields and indented taxonomic levels.')
-    parser.add_argument('summary_file_out', help='Output KrakenUNiq-format summary text file with tab-delimited fields and indented taxonomic levels.')
+    parser.add_argument('summary_file_in', help='Input KrakenUniq-format summary text file with tab-delimited fields and indented taxonomic levels.')
+    parser.add_argument('summary_file_out', help='Output KrakenUniq-format summary text file with tab-delimited fields and indented taxonomic levels.')
     parser.add_argument('--fieldToFilterOn',
         choices=[
             "num_reads",
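Patches 02 and 04 revolve around subtract_low_values_with_upward_propagation(), whose full body is not visible in these hunks. The sketch below shows the propagation idea the name describes -- when a low-value subtree is dropped, its contribution is subtracted from every ancestor so that the report's cumulative columns stay consistent. It reuses the illustrative Node/build_tree helpers from the sketch after patch 01 and is an assumption about the approach, not the patch's code.

# Drop child subtrees whose cumulative value falls below a threshold,
# subtracting each dropped subtree's value from all of its ancestors.
def subtract_and_prune(node, threshold):
    for child in list(node.children):
        subtract_and_prune(child, threshold)
        if child.value < threshold:
            node.children.remove(child)
            ancestor = node
            while ancestor is not None:   # propagate the subtraction upward
                ancestor.value -= child.value
                ancestor = ancestor.parent

root = build_tree([("root", 0, 100), ("kept", 1, 98), ("noise", 1, 2)])[0]
subtract_and_prune(root, threshold=5)
print(root.value, [c.name for c in root.children])  # 98 ['kept']

Whether a parent is re-tested against the threshold after its children's contributions have been subtracted is exactly the distinction the later "trimmed_at_first_pass" versus "trimmed_after_resumming" test fixtures are named for.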
""" - total_number_of_reads = float(node.data["num_reads"])/float(node.data["pct_of_reads"]) - if node.has_children: for child in node.children: subtract_low_values_with_upward_propagation(child, field_to_filter, keep_threshold, fields_to_adjust) From 82dac27437153fe3b2bdb92e4155a350b2dfac18 Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Tue, 3 Dec 2019 11:35:37 -0500 Subject: [PATCH 05/11] consistent util.fil.open_or_gzopen calls --- metagenomics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/metagenomics.py b/metagenomics.py index cffa3c64f..6313bb869 100755 --- a/metagenomics.py +++ b/metagenomics.py @@ -125,7 +125,7 @@ def load_names(self, names_db, scientific_only=True): names = {} else: names = collections.defaultdict(list) - for line in open_or_gzopen(names_db): + for line in util.file.open_or_gzopen(names_db): parts = line.strip().split('|') taxid = int(parts[0]) name = parts[1].strip() @@ -590,7 +590,7 @@ def filter_file(path, sep='\t', taxid_column=0, gi_column=None, a2t=False, heade input_path = maybe_compressed(input_path) with util.file.open_or_gzopen(input_path, 'rt') as f, \ - open_or_gzopen(output_path, 'wt') as out_f: + util.file.open_or_gzopen(output_path, 'wt') as out_f: if header: out_f.write(f.readline()) # Cannot use next(f) for python2 for line in f: From f153b0ebf77d1c7a3b0f0792a0b1c33716ac8fcd Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Fri, 6 Dec 2019 21:35:13 -0500 Subject: [PATCH 06/11] add new test type: assertApproxEqualValuesInDelimitedFiles add new test type: assertApproxEqualValuesInDelimitedFiles(self, file_one, file_two, dialect="tsv", numeric_rel_tol=1e-5, header_lines_to_skip=0 --- test/__init__.py | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/test/__init__.py b/test/__init__.py index 5baa40c86..1a12ebf81 100644 --- a/test/__init__.py +++ b/test/__init__.py @@ -10,6 +10,9 @@ import hashlib import logging import copy +import math +import csv +import itertools # third-party import Bio.SeqIO @@ -111,6 +114,47 @@ def inputs(self, *fnames): '''Return the full filenames for files in the test input directory for this test class''' return [self.input(fname) for fname in fnames] + def assertApproxEqualValuesInDelimitedFiles(self, file_one, file_two, dialect="tsv", numeric_rel_tol=1e-5, header_lines_to_skip=0): + def _is_number(s): + try: + complex(s) # for int, long, float and complex + except ValueError: + return False + return True + + csv.register_dialect('tsv', quoting=csv.QUOTE_MINIMAL, delimiter="\t") + csv.register_dialect('csv', quoting=csv.QUOTE_MINIMAL, delimiter=",") + + with util.file.open_or_gzopen(file_one, 'rU') as inf1, util.file.open_or_gzopen(file_two, 'rU') as inf2: + report_type=None + for line_num, (line1,line2) in enumerate(itertools.zip_longest(inf1,inf2)): + self.assertIsNotNone(line1, msg="%s appears to be shorter than %s" % (inf1, inf2)) + self.assertIsNotNone(line2, msg="%s appears to be shorter than %s" % (inf2, inf1)) + + # continue to this next pair of lines until we have + # skipped past the header + if line_num < header_lines_to_skip: + continue + else: + inf1_row = next(csv.reader([line1.strip().rstrip('\n')], dialect=dialect)) + inf2_row = next(csv.reader([line2.strip().rstrip('\n')], dialect=dialect)) + + # assume the rows have the same number of elements + self.assertTrue(len(inf1_row) == len(inf2_row), msg="Files have lines of different length on line %s %s %s" % (line_num, inf1_row, inf2_row)) + + for inf1_row_item, 
+                    for inf1_row_item, inf2_row_item in zip(inf1_row,inf2_row):
+                        # we assume that the item from the same position is a number
+                        # or not a number in both files
+                        self.assertTrue(_is_number(inf1_row_item)==_is_number(inf2_row_item))
+
+                        # if we're dealing with numbers, check that they're approximately equal
+                        if _is_number(inf1_row_item):
+                            assert float(inf2_row_item) == pytest.approx(float(inf1_row_item), rel=numeric_rel_tol)
+                        else:
+                            # otherwise we're probably dealing with a string
+                            self.assertEqual(inf2_row_item, inf1_row_item)
+
 
     def assertEqualSamHeaders(self, tested_samfile, expected_samfile, other_allowed_values=None):
         '''
         other_allowed_values is a dict that maps a key to a list of other accepted values
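The comparison loop above leans on pytest.approx, which also works in a plain assert outside a test method. A hypothetical standalone miniature of the same field-by-field logic (rows_approx_equal is not part of the test suite):

import pytest

def rows_approx_equal(row1, row2, rel=1e-5):
    def is_number(s):
        try:
            complex(s)   # accepts int, float, and complex literals
            return True
        except ValueError:
            return False
    if len(row1) != len(row2):
        return False
    for a, b in zip(row1, row2):
        if is_number(a) and is_number(b):
            if float(b) != pytest.approx(float(a), rel=rel):
                return False
        elif a != b:     # non-numeric fields must match exactly
            return False
    return True

assert rows_approx_equal(["71.38", "15692", "Chordata"],
                         ["71.380001", "15692", "Chordata"], rel=1e-3)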
From 1f3377e69a02c1e3d24050b16caf6955fc69f913 Mon Sep 17 00:00:00 2001
From: Christopher Tomkins-Tinch
Date: Fri, 6 Dec 2019 21:35:38 -0500
Subject: [PATCH 07/11] pad each level with single space since we aren't collapsing doubles

---
 metagenomics.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/metagenomics.py b/metagenomics.py
index f7422fadd..308a078da 100755
--- a/metagenomics.py
+++ b/metagenomics.py
@@ -1134,7 +1134,7 @@ def __init__(self, row, level):
         def __str__(self):
             # pad the name with the original prefix spacing
             # when rendering as a string
-            sci_name = "  "*self.level+self.data["sci_name"]
+            sci_name = " "*self.level+self.data["sci_name"]
             return "\t".join([str(value) for key,value in self.data.items() if key!="sci_name"]+[sci_name])
 
         def __repr__(self):

From f1bbad21a4d11a4dbc5c30eeac62d5a6430ba67b Mon Sep 17 00:00:00 2001
From: Christopher Tomkins-Tinch
Date: Fri, 6 Dec 2019 21:37:13 -0500
Subject: [PATCH 08/11] add test class TestKrakenUniqSummaryFilter

add test class TestKrakenUniqSummaryFilter, with first test case:
test_unchanged_report

---
 .../expected/should_be_unchanged.txt          | 37 +++++++++++++++++++
 ...ve_higher_node_trimmed_after_resumming.txt |  0
 ...have_higher_node_trimmed_at_first_pass.txt |  0
 .../expected/should_have_leaves_trimmed.txt   |  0
 .../input/should_be_unchanged.txt             | 37 +++++++++++++++++++
 ...ve_higher_node_trimmed_after_resumming.txt |  0
 ...have_higher_node_trimmed_at_first_pass.txt |  0
 .../input/should_have_leaves_trimmed.txt      |  0
 test/unit/test_metagenomics.py                | 18 +++++++++
 9 files changed, 92 insertions(+)
 create mode 100644 test/input/TestKrakenUniqSummaryFilter/expected/should_be_unchanged.txt
 create mode 100644 test/input/TestKrakenUniqSummaryFilter/expected/should_have_higher_node_trimmed_after_resumming.txt
 create mode 100644 test/input/TestKrakenUniqSummaryFilter/expected/should_have_higher_node_trimmed_at_first_pass.txt
 create mode 100644 test/input/TestKrakenUniqSummaryFilter/expected/should_have_leaves_trimmed.txt
 create mode 100644 test/input/TestKrakenUniqSummaryFilter/input/should_be_unchanged.txt
 create mode 100644 test/input/TestKrakenUniqSummaryFilter/input/should_have_higher_node_trimmed_after_resumming.txt
 create mode 100644 test/input/TestKrakenUniqSummaryFilter/input/should_have_higher_node_trimmed_at_first_pass.txt
 create mode 100644 test/input/TestKrakenUniqSummaryFilter/input/should_have_leaves_trimmed.txt

diff --git a/test/input/TestKrakenUniqSummaryFilter/expected/should_be_unchanged.txt b/test/input/TestKrakenUniqSummaryFilter/expected/should_be_unchanged.txt
new file mode 100644
index 000000000..a7868efec
--- /dev/null
+++ b/test/input/TestKrakenUniqSummaryFilter/expected/should_be_unchanged.txt
@@ -0,0 +1,37 @@
+# KrakenUniq v0.5.7 DATE:2019-11-21T16:03:15Z DB:/tmp/tmp.ohQ3jTMIQh/krakenuniq DB_SIZE:198130386856 WD:/home/dnanexus
+# CL:perl /opt/miniconda/envs/viral-ngs-env/bin/krakenuniq --db /tmp/tmp.ohQ3jTMIQh/krakenuniq --threads 32 --fastq-input --gzip-compressed --preload --report-file E023.l16.dedup.krakenuniq-summary_report.txt --paired --output E023.l16.dedup.krakenuniq-reads.txt.gz /tmp/tmp-metagenomics-krakenuniq-jfekrm6y/tmp0eg_wsp4.1.fastq.gz /tmp/tmp-metagenomics-krakenuniq-jfekrm6y/tmpx686dhd3.2.fastq.gz
+
+% reads taxReads kmers dup cov taxID rank taxName
+8.134 1788 1788 1193206 1.69 NA 0 no rank unclassified
+91.87 20195 4 356763 3 NA 1 no rank root
+71.38 15692 709 346092 2.29 NA 131567 no rank cellular organisms
+31.42 6906 0 153473 1.68 NA 2759 superkingdom Eukaryota
+31.42 6906 0 153473 1.68 NA 33154 no rank Opisthokonta
+31.42 6906 0 153473 1.68 NA 33208 kingdom Metazoa
+31.42 6906 0 153473 1.68 NA 6072 no rank Eumetazoa
+31.42 6906 0 153473 1.68 NA 33213 no rank Bilateria
+31.42 6906 0 153473 1.68 NA 33511 no rank Deuterostomia
+31.42 6906 0 153473 1.68 NA 7711 phylum Chordata
+31.42 6906 0 153473 1.68 NA 89593 subphylum Craniata
+31.42 6906 0 153473 1.68 NA 7742 no rank Vertebrata
+31.42 6906 0 153473 1.68 NA 7776 no rank Gnathostomata
+31.42 6906 0 153473 1.68 NA 117570 no rank Teleostomi
+31.42 6906 0 153473 1.68 NA 117571 no rank Euteleostomi
+31.42 6906 0 153473 1.68 NA 8287 superclass Sarcopterygii
+31.42 6906 0 153473 1.68 NA 1338369 no rank Dipnotetrapodomorpha
+31.42 6906 0 153473 1.68 NA 32523 no rank Tetrapoda
+31.42 6906 0 153473 1.68 NA 32524 no rank Amniota
+31.42 6906 0 153473 1.68 NA 40674 class Mammalia
+31.42 6906 0 153473 1.68 NA 32525 no rank Theria
+31.42 6906 0 153473 1.68 NA 9347 no rank Eutheria
+31.42 6906 0 153473 1.68 NA 1437010 no rank Boreoeutheria
+31.42 6906 0 153473 1.68 NA 314146 superorder Euarchontoglires
+31.42 6906 0 153473 1.68 NA 9443 order Primates
+31.42 6906 0 153473 1.68 NA 376913 suborder Haplorrhini
+31.42 6906 0 153473 1.68 NA 314293 infraorder Simiiformes
+31.42 6906 0 153473 1.68 NA 9526 parvorder Catarrhini
+31.42 6906 0 153473 1.68 NA 314295 superfamily Hominoidea
+31.42 6906 0 153473 1.68 NA 9604 family Hominidae
+31.42 6906 0 153473 1.68 NA 207598 subfamily Homininae
+31.42 6906 0 153473 1.68 NA 9605 genus Homo
+31.42 6906 6906 153473 1.68 NA 9606 species Homo sapiens
diff --git a/test/input/TestKrakenUniqSummaryFilter/expected/should_have_higher_node_trimmed_after_resumming.txt b/test/input/TestKrakenUniqSummaryFilter/expected/should_have_higher_node_trimmed_after_resumming.txt
new file mode 100644
index 000000000..e69de29bb
diff --git a/test/input/TestKrakenUniqSummaryFilter/expected/should_have_higher_node_trimmed_at_first_pass.txt b/test/input/TestKrakenUniqSummaryFilter/expected/should_have_higher_node_trimmed_at_first_pass.txt
new file mode 100644
index 000000000..e69de29bb
diff --git a/test/input/TestKrakenUniqSummaryFilter/expected/should_have_leaves_trimmed.txt b/test/input/TestKrakenUniqSummaryFilter/expected/should_have_leaves_trimmed.txt
new file mode 100644
index 000000000..e69de29bb
diff --git a/test/input/TestKrakenUniqSummaryFilter/input/should_be_unchanged.txt b/test/input/TestKrakenUniqSummaryFilter/input/should_be_unchanged.txt
new file mode 100644
index 000000000..a7868efec
--- /dev/null
+++ b/test/input/TestKrakenUniqSummaryFilter/input/should_be_unchanged.txt
@@ -0,0 +1,37 @@
+# KrakenUniq v0.5.7 DATE:2019-11-21T16:03:15Z DB:/tmp/tmp.ohQ3jTMIQh/krakenuniq DB_SIZE:198130386856 WD:/home/dnanexus
+# CL:perl /opt/miniconda/envs/viral-ngs-env/bin/krakenuniq --db /tmp/tmp.ohQ3jTMIQh/krakenuniq --threads 32 --fastq-input --gzip-compressed --preload --report-file E023.l16.dedup.krakenuniq-summary_report.txt --paired --output E023.l16.dedup.krakenuniq-reads.txt.gz /tmp/tmp-metagenomics-krakenuniq-jfekrm6y/tmp0eg_wsp4.1.fastq.gz /tmp/tmp-metagenomics-krakenuniq-jfekrm6y/tmpx686dhd3.2.fastq.gz
+
+% reads taxReads kmers dup cov taxID rank taxName
+8.134 1788 1788 1193206 1.69 NA 0 no rank unclassified
+91.87 20195 4 356763 3 NA 1 no rank root
+71.38 15692 709 346092 2.29 NA 131567 no rank cellular organisms
+31.42 6906 0 153473 1.68 NA 2759 superkingdom Eukaryota
+31.42 6906 0 153473 1.68 NA 33154 no rank Opisthokonta
+31.42 6906 0 153473 1.68 NA 33208 kingdom Metazoa
+31.42 6906 0 153473 1.68 NA 6072 no rank Eumetazoa
+31.42 6906 0 153473 1.68 NA 33213 no rank Bilateria
+31.42 6906 0 153473 1.68 NA 33511 no rank Deuterostomia
+31.42 6906 0 153473 1.68 NA 7711 phylum Chordata
+31.42 6906 0 153473 1.68 NA 89593 subphylum Craniata
+31.42 6906 0 153473 1.68 NA 7742 no rank Vertebrata
+31.42 6906 0 153473 1.68 NA 7776 no rank Gnathostomata
+31.42 6906 0 153473 1.68 NA 117570 no rank Teleostomi
+31.42 6906 0 153473 1.68 NA 117571 no rank Euteleostomi
+31.42 6906 0 153473 1.68 NA 8287 superclass Sarcopterygii
+31.42 6906 0 153473 1.68 NA 1338369 no rank Dipnotetrapodomorpha
+31.42 6906 0 153473 1.68 NA 32523 no rank Tetrapoda
+31.42 6906 0 153473 1.68 NA 32524 no rank Amniota
+31.42 6906 0 153473 1.68 NA 40674 class Mammalia
+31.42 6906 0 153473 1.68 NA 32525 no rank Theria
+31.42 6906 0 153473 1.68 NA 9347 no rank Eutheria
+31.42 6906 0 153473 1.68 NA 1437010 no rank Boreoeutheria
+31.42 6906 0 153473 1.68 NA 314146 superorder Euarchontoglires
+31.42 6906 0 153473 1.68 NA 9443 order Primates
+31.42 6906 0 153473 1.68 NA 376913 suborder Haplorrhini
+31.42 6906 0 153473 1.68 NA 314293 infraorder Simiiformes
+31.42 6906 0 153473 1.68 NA 9526 parvorder Catarrhini
+31.42 6906 0 153473 1.68 NA 314295 superfamily Hominoidea
+31.42 6906 0 153473 1.68 NA 9604 family Hominidae
+31.42 6906 0 153473 1.68 NA 207598 subfamily Homininae
+31.42 6906 0 153473 1.68 NA 9605 genus Homo
+31.42 6906 6906 153473 1.68 NA 9606 species Homo sapiens
diff --git a/test/input/TestKrakenUniqSummaryFilter/input/should_have_higher_node_trimmed_after_resumming.txt b/test/input/TestKrakenUniqSummaryFilter/input/should_have_higher_node_trimmed_after_resumming.txt
new file mode 100644
index 000000000..e69de29bb
diff --git a/test/input/TestKrakenUniqSummaryFilter/input/should_have_higher_node_trimmed_at_first_pass.txt b/test/input/TestKrakenUniqSummaryFilter/input/should_have_higher_node_trimmed_at_first_pass.txt
new file mode 100644
index 000000000..e69de29bb
diff --git a/test/input/TestKrakenUniqSummaryFilter/input/should_have_leaves_trimmed.txt b/test/input/TestKrakenUniqSummaryFilter/input/should_have_leaves_trimmed.txt
new file mode 100644
index 000000000..e69de29bb
diff --git a/test/unit/test_metagenomics.py b/test/unit/test_metagenomics.py
index 63bc18237..a7f724ccf 100644
--- a/test/unit/test_metagenomics.py
+++ b/test/unit/test_metagenomics.py
@@ -29,6 +29,24 @@ def test_help_parser_for_each_command(self):
             parser = parser_fun(argparse.ArgumentParser())
             helpstring = parser.format_help()
 
+class TestKrakenUniqSummaryFilter(TestCaseWithTmp):
+
+    def test_unchanged_report(self):
+        test_files_dir = util.file.get_test_input_path(self)
+        in_report = os.path.join(test_files_dir, 'input', 'should_be_unchanged.txt')
+        out_report = util.file.mkstempfname('.txt')
+
+        parser = metagenomics.parser_krakenuniq_report_filter(argparse.ArgumentParser())
+        args = parser.parse_args([in_report, out_report])
+        args.func_main(args)
+
+        # Check that results match (roughly) with what is expected
+        expected_out = os.path.join(test_files_dir, 'expected', 'should_be_unchanged.txt')
+        self.assertApproxEqualValuesInDelimitedFiles(out_report,
+                                                     expected_out,
+                                                     dialect="tsv",
+                                                     numeric_rel_tol=1e-3,
+                                                     header_lines_to_skip=3)
 
 class TestKronaCalls(TestCaseWithTmp):
 
     def setUp(self):
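The test drives the new subcommand through its argparse parser rather than through a subprocess, which keeps it fast and importable; args.func_main is attached by util.cmd.attach_main. The same entry point accepts the optional flags added in patch 02 -- a hypothetical invocation sketch (the file names are placeholders):

# Hypothetical driver mirroring the test's dispatch path.
import argparse
import metagenomics

parser = metagenomics.parser_krakenuniq_report_filter(argparse.ArgumentParser())
args = parser.parse_args([
    "report_in.txt",                    # summary_file_in (placeholder path)
    "report_out.txt",                   # summary_file_out (placeholder path)
    "--fieldToFilterOn", "uniq_kmers",  # also the default filter field
    "--keepAboveN", "100",              # keep rows whose unique k-mer count passes the threshold
])
args.func_main(args)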
From 37424388cfb0aa19d38361d8e47d7d7424c69186 Mon Sep 17 00:00:00 2001
From: Christopher Tomkins-Tinch
Date: Fri, 6 Dec 2019 21:55:16 -0500
Subject: [PATCH 09/11] add test TestKrakenUniqSummaryFilter::test_leaves_trimmed

---
 .../expected/should_have_leaves_trimmed.txt | 13 +++++++++++
 .../input/should_have_leaves_trimmed.txt    | 19 ++++++++++++++++
 test/unit/test_metagenomics.py              | 22 +++++++++++++++----
 3 files changed, 50 insertions(+), 4 deletions(-)

diff --git a/test/input/TestKrakenUniqSummaryFilter/expected/should_have_leaves_trimmed.txt b/test/input/TestKrakenUniqSummaryFilter/expected/should_have_leaves_trimmed.txt
index e69de29bb..69a67aac9 100644
--- a/test/input/TestKrakenUniqSummaryFilter/expected/should_have_leaves_trimmed.txt
+++ b/test/input/TestKrakenUniqSummaryFilter/expected/should_have_leaves_trimmed.txt
@@ -0,0 +1,13 @@
+# KrakenUniq v0.5.7 DATE:2019-11-21T16:03:15Z DB:/tmp/tmp.ohQ3jTMIQh/krakenuniq DB_SIZE:198130386856 WD:/home/dnanexus
+# CL:perl /opt/miniconda/envs/viral-ngs-env/bin/krakenuniq --db /tmp/tmp.ohQ3jTMIQh/krakenuniq --threads 32 --fastq-input --gzip-compressed --preload --report-file E023.l16.dedup.krakenuniq-summary_report.txt --paired --output E023.l16.dedup.krakenuniq-reads.txt.gz /tmp/tmp-metagenomics-krakenuniq-jfekrm6y/tmp0eg_wsp4.1.fastq.gz /tmp/tmp-metagenomics-krakenuniq-jfekrm6y/tmpx686dhd3.2.fastq.gz
+
+% reads taxReads kmers dup cov taxID rank taxName
+27.810893206534104 6112 6112 1193206 1.69 NA 0 no rank unclassified
+72.1891067934659 15865 11368 349850 3 NA 1 no rank root
+20.46230149701961 4497 0 7949 31.5 NA 10239 superkingdom Viruses
+20.46230149701961 4497 0 7930 32 NA 439488 no rank ssRNA viruses
+20.46230149701961 4497 0 7930 32 NA 35301 no rank ssRNA negative-strand viruses
+20.46230149701961 4497 0 7930 32 NA 11157 order Mononegavirales
+20.46230149701961 4497 0 7930 32 NA 11266 family Filoviridae
+20.46230149701961 4497 0 7930 32 NA 186536 genus Ebolavirus
+20.46230149701961 4497 4497 7916 32 NA 186538 species Zaire ebolavirus
diff --git a/test/input/TestKrakenUniqSummaryFilter/input/should_have_leaves_trimmed.txt b/test/input/TestKrakenUniqSummaryFilter/input/should_have_leaves_trimmed.txt
index e69de29bb..4981a88bc 100644
--- a/test/input/TestKrakenUniqSummaryFilter/input/should_have_leaves_trimmed.txt
+++ b/test/input/TestKrakenUniqSummaryFilter/input/should_have_leaves_trimmed.txt
@@ -0,0 +1,19 @@
+# KrakenUniq v0.5.7 DATE:2019-11-21T16:03:15Z DB:/tmp/tmp.ohQ3jTMIQh/krakenuniq DB_SIZE:198130386856 WD:/home/dnanexus
+# CL:perl /opt/miniconda/envs/viral-ngs-env/bin/krakenuniq --db /tmp/tmp.ohQ3jTMIQh/krakenuniq --threads 32 --fastq-input --gzip-compressed --preload --report-file E023.l16.dedup.krakenuniq-summary_report.txt --paired --output E023.l16.dedup.krakenuniq-reads.txt.gz /tmp/tmp-metagenomics-krakenuniq-jfekrm6y/tmp0eg_wsp4.1.fastq.gz /tmp/tmp-metagenomics-krakenuniq-jfekrm6y/tmpx686dhd3.2.fastq.gz
+
+% reads taxReads kmers dup cov taxID rank taxName
+27.81 6112 6112 1193206 1.69 NA 0 no rank unclassified
+72.19 15867 11368 349960 3 NA 1 no rank root
+20.47 4499 0 8059 31.5 NA 10239 superkingdom Viruses
+20.46 4497 0 7930 32 NA 439488 no rank ssRNA viruses
+20.46 4497 0 7930 32 NA 35301 no rank ssRNA negative-strand viruses
+20.46 4497 0 7930 32 NA 11157 order Mononegavirales
+20.46 4497 0 7930 32 NA 11266 family Filoviridae
+20.46 4497 0 7930 32 NA 186536 genus Ebolavirus
+20.46 4497 4497 7916 32 NA 186538 species Zaire ebolavirus
+0.009098 2 0 110 1.71 NA 35237 no rank dsDNA viruses, no RNA stage
+0.009098 2 0 100 1.6 NA 28883 order Caudovirales
+0.009098 2 0 93 1.65 NA 10699 family Siphoviridae
+0.009098 2 0 11 1.73 NA 1982251 genus Pa6virus
+0.009098 2 0 8 1.62 NA 1982272 species Propionibacterium virus PAS50
+0.009098 2 2 8 1.62 NA 504553 no rank Propionibacterium phage PAS50
diff --git a/test/unit/test_metagenomics.py b/test/unit/test_metagenomics.py
index a7f724ccf..28c9b4850 100644
--- a/test/unit/test_metagenomics.py
+++ b/test/unit/test_metagenomics.py
@@ -30,10 +30,10 @@ def test_help_parser_for_each_command(self):
             helpstring = parser.format_help()
 
 class TestKrakenUniqSummaryFilter(TestCaseWithTmp):
-
-    def test_unchanged_report(self):
+
+    def _test_report(self, report_basename):
         test_files_dir = util.file.get_test_input_path(self)
-        in_report = os.path.join(test_files_dir, 'input', 'should_be_unchanged.txt')
+        in_report = os.path.join(test_files_dir, 'input', report_basename)
         out_report = util.file.mkstempfname('.txt')
 
         parser = metagenomics.parser_krakenuniq_report_filter(argparse.ArgumentParser())
@@ -41,13 +41,27 @@ def test_unchanged_report(self):
         args.func_main(args)
 
         # Check that results match (roughly) with what is expected
-        expected_out = os.path.join(test_files_dir, 'expected', 'should_be_unchanged.txt')
+        expected_out = os.path.join(test_files_dir, 'expected', report_basename)
         self.assertApproxEqualValuesInDelimitedFiles(out_report,
                                                      expected_out,
                                                      dialect="tsv",
                                                      numeric_rel_tol=1e-3,
                                                      header_lines_to_skip=3)
 
+    def test_unchanged_report(self):
+        self._test_report("should_be_unchanged.txt")
+
+    def test_leaves_trimmed(self):
+        self._test_report("should_have_leaves_trimmed.txt")
+
+    #def test_higher_node_trimmed_after_resumming(self):
+    #    self._test_report("should_have_higher_node_trimmed_after_resumming.txt")
+
+    #def test_higher_node_trimmed_at_first_pass(self):
+    #    self._test_report("should_have_higher_node_trimmed_at_first_pass.txt")
+
+
 
 class TestKronaCalls(TestCaseWithTmp):
 
     def setUp(self):
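The numbers in the expected fixture follow directly from the input fixture: the two-read "dsDNA viruses, no RNA stage" phage branch (2 reads and 110 k-mers at its top) falls below the filter, its reads and k-mers are subtracted from every ancestor, and percentages are recomputed over the new total. A quick check of that arithmetic:

# Recompute the expected root/Viruses rows from the input fixture values.
unclassified_reads = 6112
root_reads, root_kmers = 15867, 349960
viruses_reads, viruses_kmers = 4499, 8059
removed_reads, removed_kmers = 2, 110        # top of the pruned phage branch

root_reads -= removed_reads                  # 15865
root_kmers -= removed_kmers                  # 349850
viruses_reads -= removed_reads               # 4497
viruses_kmers -= removed_kmers               # 7949

total = unclassified_reads + root_reads      # 21977 classified + unclassified fragments
print(100.0 * root_reads / total)            # ~72.1891067934659, as in the fixture
print(100.0 * viruses_reads / total)         # ~20.46230149701961
print(100.0 * unclassified_reads / total)    # ~27.810893206534104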
From e94a745b24bca70cd3456253bf7b48cf69289250 Mon Sep 17 00:00:00 2001
From: Christopher Tomkins-Tinch
Date: Mon, 9 Dec 2019 10:12:56 -0500
Subject: [PATCH 10/11] extend assertApproxEqualValuesInDelimitedFiles

extend assertApproxEqualValuesInDelimitedFiles() test to use dicts in the
case where a header line of field names is available so value comparisons
can be made independent of column order. This is only used if
use_first_processed_line_for_fieldnames=True.

is_number() moved to util.misc; zip_dicts() added to util.misc

---
 test/__init__.py               | 52 +++++++++++++++++++++++++---------
 test/unit/test_metagenomics.py |  3 +-
 util/misc.py                   | 17 +++++++++++
 3 files changed, 58 insertions(+), 14 deletions(-)

diff --git a/test/__init__.py b/test/__init__.py
index 1a12ebf81..1a398f037 100644
--- a/test/__init__.py
+++ b/test/__init__.py
@@ -21,7 +21,7 @@
 
 # intra-project
 import util.file
-from util.misc import available_cpu_count
+from util.misc import available_cpu_count, zip_dicts, is_number
 from tools.samtools import SamtoolsTool
 
 logging.getLogger('botocore').setLevel(logging.WARNING)
@@ -114,13 +114,19 @@ def inputs(self, *fnames):
         '''Return the full filenames for files in the test input directory for this test class'''
         return [self.input(fname) for fname in fnames]
 
-    def assertApproxEqualValuesInDelimitedFiles(self, file_one, file_two, dialect="tsv", numeric_rel_tol=1e-5, header_lines_to_skip=0):
-        def _is_number(s):
-            try:
-                complex(s) # for int, long, float and complex
-            except ValueError:
-                return False
-            return True
+    def assertApproxEqualValuesInDelimitedFiles(self, file_one, file_two, dialect="tsv", numeric_rel_tol=1e-5, header_lines_to_skip=0, use_first_processed_line_for_fieldnames=False):
+        '''
+        This test checks whether two delimited (tsv, csv, or custom dialect) files
+        are approximately the same, by comparing the values present at each line
+        with a configurable tolerance for comparison of numeric values, and exact
+        comparison of string values. A specified number of header lines can be skipped
+        before the comparison begins, to account for lines written by some tools
+        for metadata, invocation params, etc.
+        If use_first_processed_line_for_fieldnames is specified, the columns can be
+        in any order since the rows are parsed as dicts; otherwise the columns are
+        assumed to be in the same order in both of the files.
+        '''
+        header_fieldnames=None
 
         csv.register_dialect('tsv', quoting=csv.QUOTE_MINIMAL, delimiter="\t")
         csv.register_dialect('csv', quoting=csv.QUOTE_MINIMAL, delimiter=",")
@@ -136,19 +142,39 @@ def _is_number(s):
-                    inf1_row = next(csv.reader([line1.strip().rstrip('\n')], dialect=dialect))
-                    inf2_row = next(csv.reader([line2.strip().rstrip('\n')], dialect=dialect))
+                    if header_fieldnames is None and use_first_processed_line_for_fieldnames:
+                        inf1_row = next(csv.reader([line1.strip().rstrip('\n')], dialect=dialect))
+                        inf2_row = next(csv.reader([line2.strip().rstrip('\n')], dialect=dialect))
+                        self.assertEqual(inf1_row,inf2_row, msg="header lines are not the same")
+                        header_fieldnames=inf1_row
+                        continue
+
+                    # if the header names are defined
+                    if header_fieldnames is not None:
+                        inf1_row = next(csv.DictReader([line1.strip().rstrip('\n')], dialect=dialect, fieldnames=header_fieldnames))
+                        inf2_row = next(csv.DictReader([line2.strip().rstrip('\n')], dialect=dialect, fieldnames=header_fieldnames))
+                    else:
+                        inf1_row = next(csv.reader([line1.strip().rstrip('\n')], dialect=dialect))
+                        inf2_row = next(csv.reader([line2.strip().rstrip('\n')], dialect=dialect))
 
                     # assume the rows have the same number of elements
                     self.assertTrue(len(inf1_row) == len(inf2_row), msg="Files have lines of different length on line %s %s %s" % (line_num, inf1_row, inf2_row))
 
-                    for inf1_row_item, inf2_row_item in zip(inf1_row,inf2_row):
+                    if header_fieldnames is not None:
+                        def _dict_values_only(dict1_in, dict2_in):
+                            for key, values in zip_dicts(dict1_in,dict2_in):
+                                yield values
+                        items_to_compare=_dict_values_only(inf1_row,inf2_row)
+                    else:
+                        items_to_compare=zip(inf1_row,inf2_row)
+
+                    for inf1_row_item, inf2_row_item in items_to_compare:
                         # we assume that the item from the same position is a number
                         # or not a number in both files
-                        self.assertTrue(_is_number(inf1_row_item)==_is_number(inf2_row_item))
+                        self.assertTrue(is_number(inf1_row_item)==is_number(inf2_row_item))
 
                         # if we're dealing with numbers, check that they're approximately equal
-                        if _is_number(inf1_row_item):
+                        if is_number(inf1_row_item):
                             assert float(inf2_row_item) == pytest.approx(float(inf1_row_item), rel=numeric_rel_tol)
                         else:
                             # otherwise we're probably dealing with a string
diff --git a/test/unit/test_metagenomics.py b/test/unit/test_metagenomics.py
index 28c9b4850..922620432 100644
--- a/test/unit/test_metagenomics.py
+++ b/test/unit/test_metagenomics.py
@@ -46,7 +46,8 @@ def _test_report(self, report_basename):
                                                      expected_out,
                                                      dialect="tsv",
                                                      numeric_rel_tol=1e-3,
-                                                     header_lines_to_skip=3)
+                                                     header_lines_to_skip=3,
+                                                     use_first_processed_line_for_fieldnames=True)
 
     def test_unchanged_report(self):
         self._test_report("should_be_unchanged.txt")
diff --git a/util/misc.py b/util/misc.py
index ce8a8043a..fec0f8b2f 100644
--- a/util/misc.py
+++ b/util/misc.py
@@ -50,6 +50,16 @@ def unique(items):
             seen.add(i)
         yield i
 
+def is_number(s):
+    """
+    If the string can be parsed as an
+    int, long, float or complex number, return True
+    """
+    try:
+        complex(s)
+    except ValueError:
+        return False
+    return True
 
 def histogram(items):
     ''' I count the number of times I see stuff and return a dict of counts.
     '''
@@ -614,6 +624,13 @@ def as_type(val, types):
             pass
     raise TypeError('Could not convert {} to any of {}: {}'.format(val, types, errs))
 
+def zip_dicts(*dicts):
+    keys_sets = map(set, dicts)
+    common_keys = functools.reduce(set.intersection, keys_sets)
+    for key in common_keys:
+        #yield (key,) + tuple(map(operator.itemgetter(key), dicts))
+        yield key, tuple(map(operator.itemgetter(key), dicts))
+
 def subdict(d, keys):
     """Return a newly allocated shallow copy of a mapping `d` restricted to keys in `keys`."""
     d = dict(d)

From bb7fb489c005a15b115d6bbed4d7befeeedd29f6 Mon Sep 17 00:00:00 2001
From: Christopher Tomkins-Tinch
Date: Tue, 10 Dec 2019 12:54:42 -0500
Subject: [PATCH 11/11] regex raw string

---
 metagenomics.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/metagenomics.py b/metagenomics.py
index 308a078da..b42503753 100755
--- a/metagenomics.py
+++ b/metagenomics.py
@@ -962,7 +962,7 @@ def fasta_library_accessions(library):
         for seqr in SeqIO.parse(filepath, 'fasta'):
             name = seqr.name
             # Search for accession
-            mo = re.search('([A-Z]+_?\d+\.\d+)', name)
+            mo = re.search(r'([A-Z]+_?\d+\.\d+)', name)
             if mo:
                 accession = mo.group(1)
                 library_accessions.add(accession)
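A closing note on the util.misc helpers from patch 10: zip_dicts() yields, for each key common to all of its input dicts, the key plus a tuple of the corresponding values -- which is what lets the assertion compare columns by field name rather than by position. A small illustrative use (row_a and row_b are made-up rows, not fixture data):

import functools
import operator

def zip_dicts(*dicts):
    # for every key present in each dict, yield (key, (value_from_each_dict, ...))
    common_keys = functools.reduce(set.intersection, map(set, dicts))
    for key in common_keys:
        yield key, tuple(map(operator.itemgetter(key), dicts))

row_a = {"num_reads": "15865", "rank": "no rank"}
row_b = {"rank": "no rank", "num_reads": "15865.2"}   # same fields, different order
for key, (v1, v2) in zip_dicts(row_a, row_b):
    print(key, v1, v2)   # key order is unspecified since common_keys is a set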