broadinstitute · tomkinsc · Dec 3, 2019 · Dec 3, 2019 · Dec 3, 2019 · Dec 3, 2019
diff --git a/metagenomics.py b/metagenomics.py
diff --git a/test/__init__.py b/test/__init__.py
@@ -10,6 +10,9 @@
 import hashlib
 import logging
 import copy
+import math
+import csv
+import itertools
 
 # third-party
 import Bio.SeqIO
@@ -18,7 +21,7 @@
 
 # intra-project
 import util.file
-from util.misc import available_cpu_count
+from util.misc import available_cpu_count, zip_dicts, is_number
 from tools.samtools import SamtoolsTool
 
 logging.getLogger('botocore').setLevel(logging.WARNING)
@@ -111,6 +114,73 @@ def inputs(self, *fnames):
         '''Return the full filenames for files in the test input directory for this test class'''
         return [self.input(fname) for fname in fnames]
 
+    def assertApproxEqualValuesInDelimitedFiles(self, file_one, file_two, dialect="tsv", numeric_rel_tol=1e-5, header_lines_to_skip=0, use_first_processed_line_for_fieldnames=False):
+        '''
+            This test checks whether two delimited (tsv, csv, or custom dialect) files
+            are approximately the same, by comparing the values present at each line
+            with a configurable tolerance for comparison of numeric values, and exact
+            comparison of string values. A specified number of header lines can be skipped 
+            before the comparison begins to account for lines written by some tools with
+            for metadata, invokation params, etc.
+            If use_first_processed_line_for_fieldnames is specified, the columns can be
+            in any order since the rows are parsed as dicts, otherwise the columns are
+            assumed to be in the same order in both of the files.
+        '''
+        header_fieldnames=None
+
+        csv.register_dialect('tsv', quoting=csv.QUOTE_MINIMAL, delimiter="\t")
+        csv.register_dialect('csv', quoting=csv.QUOTE_MINIMAL, delimiter=",")
+
+        with util.file.open_or_gzopen(file_one, 'rU') as inf1, util.file.open_or_gzopen(file_two, 'rU') as inf2:
+            report_type=None
+            for line_num, (line1,line2) in enumerate(itertools.zip_longest(inf1,inf2)):
+                self.assertIsNotNone(line1, msg="%s appears to be shorter than %s" % (inf1, inf2))
+                self.assertIsNotNone(line2, msg="%s appears to be shorter than %s" % (inf2, inf1))
+
+                # continue to this next pair of lines until we have 
+                # skipped past the header
+                if line_num < header_lines_to_skip:
+                    continue
+                else:
+                    if header_fieldnames is None and use_first_processed_line_for_fieldnames:
+                        inf1_row = next(csv.reader([line1.strip().rstrip('\n')], dialect=dialect))
+                        inf2_row = next(csv.reader([line1.strip().rstrip('\n')], dialect=dialect))
+                        self.assertEqual(inf1_row,inf2_row, msg="header lines are not the same")
+                        header_fieldnames=inf1_row
+                        continue
+
+                    # if the header names are defined
+                    if header_fieldnames is not None:
+                        inf1_row = next(csv.DictReader([line1.strip().rstrip('\n')], dialect=dialect, fieldnames=header_fieldnames))
+                        inf2_row = next(csv.DictReader([line2.strip().rstrip('\n')], dialect=dialect, fieldnames=header_fieldnames))
+                    else:
+                        inf1_row = next(csv.reader([line1.strip().rstrip('\n')], dialect=dialect))
+                        inf2_row = next(csv.reader([line2.strip().rstrip('\n')], dialect=dialect))
+
+                    # assume the rows have the same number of elements
+                    self.assertTrue(len(inf1_row) == len(inf2_row), msg="Files have lines of different length on line %s %s %s" % (line_num, inf1_row, inf2_row))
+
+                    if header_fieldnames is not None:
+                        def _dict_values_only(dict1_in, dict2_in):
+                            for key, values in zip_dicts(dict1_in,dict2_in):
+                                yield values
+                        items_to_compare=_dict_values_only(inf1_row,inf2_row)
+                    else:
+                        items_to_compare=zip(inf1_row,inf2_row)
+
+                    for inf1_row_item, inf2_row_item in items_to_compare:
+                        # we assume at the item from the same position is a number 
+                        # or not a number in both files
+                        self.assertTrue(is_number(inf1_row_item)==is_number(inf2_row_item))
+
+                        # if we're dealing with numbers, check that they're approximately equal
+                        if is_number(inf1_row_item):
+                            assert float(inf2_row_item) == pytest.approx(float(inf1_row_item), rel=numeric_rel_tol)
+                        else:
+                            # otherwise we're probably dealing with a string
+                            self.assertEqual(inf2_row_item, inf1_row_item)
+
+
     def assertEqualSamHeaders(self, tested_samfile, expected_samfile, other_allowed_values=None):
         '''
             other_allowed_values is a dict that maps a key to a list of other accepted values

diff --git a/test/input/TestKrakenUniqSummaryFilter/expected/should_be_unchanged.txt b/test/input/TestKrakenUniqSummaryFilter/expected/should_be_unchanged.txt
@@ -0,0 +1,37 @@
+# KrakenUniq v0.5.7 DATE:2019-11-21T16:03:15Z DB:/tmp/tmp.ohQ3jTMIQh/krakenuniq DB_SIZE:198130386856 WD:/home/dnanexus
+# CL:perl /opt/miniconda/envs/viral-ngs-env/bin/krakenuniq --db /tmp/tmp.ohQ3jTMIQh/krakenuniq --threads 32 --fastq-input --gzip-compressed --preload --report-file E023.l16.dedup.krakenuniq-summary_report.txt --paired --output E023.l16.dedup.krakenuniq-reads.txt.gz /tmp/tmp-metagenomics-krakenuniq-jfekrm6y/tmp0eg_wsp4.1.fastq.gz /tmp/tmp-metagenomics-krakenuniq-jfekrm6y/tmpx686dhd3.2.fastq.gz
+
+%	reads	taxReads	kmers	dup	cov	taxID	rank	taxName
+8.134	1788	1788	1193206	1.69	NA	0	no rank	unclassified
+91.87	20195	4	356763	3	NA	1	no rank	root
+71.38	15692	709	346092	2.29	NA	131567	no rank	  cellular organisms
+31.42	6906	0	153473	1.68	NA	2759	superkingdom	    Eukaryota
+31.42	6906	0	153473	1.68	NA	33154	no rank	      Opisthokonta
+31.42	6906	0	153473	1.68	NA	33208	kingdom	        Metazoa
+31.42	6906	0	153473	1.68	NA	6072	no rank	          Eumetazoa
+31.42	6906	0	153473	1.68	NA	33213	no rank	            Bilateria
+31.42	6906	0	153473	1.68	NA	33511	no rank	              Deuterostomia
+31.42	6906	0	153473	1.68	NA	7711	phylum	                Chordata
+31.42	6906	0	153473	1.68	NA	89593	subphylum	                  Craniata
+31.42	6906	0	153473	1.68	NA	7742	no rank	                    Vertebrata
+31.42	6906	0	153473	1.68	NA	7776	no rank	                      Gnathostomata
+31.42	6906	0	153473	1.68	NA	117570	no rank	                        Teleostomi
+31.42	6906	0	153473	1.68	NA	117571	no rank	                          Euteleostomi
+31.42	6906	0	153473	1.68	NA	8287	superclass	                            Sarcopterygii
+31.42	6906	0	153473	1.68	NA	1338369	no rank	                              Dipnotetrapodomorpha
+31.42	6906	0	153473	1.68	NA	32523	no rank	                                Tetrapoda
+31.42	6906	0	153473	1.68	NA	32524	no rank	                                  Amniota
+31.42	6906	0	153473	1.68	NA	40674	class	                                    Mammalia
+31.42	6906	0	153473	1.68	NA	32525	no rank	                                      Theria
+31.42	6906	0	153473	1.68	NA	9347	no rank	                                        Eutheria
+31.42	6906	0	153473	1.68	NA	1437010	no rank	                                          Boreoeutheria
+31.42	6906	0	153473	1.68	NA	314146	superorder	                                            Euarchontoglires
+31.42	6906	0	153473	1.68	NA	9443	order	                                              Primates
+31.42	6906	0	153473	1.68	NA	376913	suborder	                                                Haplorrhini
+31.42	6906	0	153473	1.68	NA	314293	infraorder	                                                  Simiiformes
+31.42	6906	0	153473	1.68	NA	9526	parvorder	                                                    Catarrhini
+31.42	6906	0	153473	1.68	NA	314295	superfamily	                                                      Hominoidea
+31.42	6906	0	153473	1.68	NA	9604	family	                                                        Hominidae
+31.42	6906	0	153473	1.68	NA	207598	subfamily	                                                          Homininae
+31.42	6906	0	153473	1.68	NA	9605	genus	                                                            Homo
+31.42	6906	6906	153473	1.68	NA	9606	species	                                                              Homo sapiens
diff --git a/.../TestKrakenUniqSummaryFilter/expected/should_have_higher_node_trimmed_after_resumming.txt b/.../TestKrakenUniqSummaryFilter/expected/should_have_higher_node_trimmed_after_resumming.txt
diff --git a/...ut/TestKrakenUniqSummaryFilter/expected/should_have_higher_node_trimmed_at_first_pass.txt b/...ut/TestKrakenUniqSummaryFilter/expected/should_have_higher_node_trimmed_at_first_pass.txt
diff --git a/test/input/TestKrakenUniqSummaryFilter/expected/should_have_leaves_trimmed.txt b/test/input/TestKrakenUniqSummaryFilter/expected/should_have_leaves_trimmed.txt
@@ -0,0 +1,13 @@
+# KrakenUniq v0.5.7 DATE:2019-11-21T16:03:15Z DB:/tmp/tmp.ohQ3jTMIQh/krakenuniq DB_SIZE:198130386856 WD:/home/dnanexus
+# CL:perl /opt/miniconda/envs/viral-ngs-env/bin/krakenuniq --db /tmp/tmp.ohQ3jTMIQh/krakenuniq --threads 32 --fastq-input --gzip-compressed --preload --report-file E023.l16.dedup.krakenuniq-summary_report.txt --paired --output E023.l16.dedup.krakenuniq-reads.txt.gz /tmp/tmp-metagenomics-krakenuniq-jfekrm6y/tmp0eg_wsp4.1.fastq.gz /tmp/tmp-metagenomics-krakenuniq-jfekrm6y/tmpx686dhd3.2.fastq.gz
+
+%	reads	taxReads	kmers	dup	cov	taxID	rank	taxName
+27.810893206534104	6112	6112	1193206	1.69	NA	0	no rank	unclassified
+72.1891067934659	15865	11368	349850	3	NA	1	no rank	root
+20.46230149701961	4497	0	7949	31.5	NA	10239	superkingdom	  Viruses
+20.46230149701961	4497	0	7930	32	NA	439488	no rank	    ssRNA viruses
+20.46230149701961	4497	0	7930	32	NA	35301	no rank	      ssRNA negative-strand viruses
+20.46230149701961	4497	0	7930	32	NA	11157	order	        Mononegavirales
+20.46230149701961	4497	0	7930	32	NA	11266	family	          Filoviridae
+20.46230149701961	4497	0	7930	32	NA	186536	genus	            Ebolavirus
+20.46230149701961	4497	4497	7916	32	NA	186538	species	              Zaire ebolavirus
diff --git a/test/input/TestKrakenUniqSummaryFilter/input/should_be_unchanged.txt b/test/input/TestKrakenUniqSummaryFilter/input/should_be_unchanged.txt
@@ -0,0 +1,37 @@
+# KrakenUniq v0.5.7 DATE:2019-11-21T16:03:15Z DB:/tmp/tmp.ohQ3jTMIQh/krakenuniq DB_SIZE:198130386856 WD:/home/dnanexus
+# CL:perl /opt/miniconda/envs/viral-ngs-env/bin/krakenuniq --db /tmp/tmp.ohQ3jTMIQh/krakenuniq --threads 32 --fastq-input --gzip-compressed --preload --report-file E023.l16.dedup.krakenuniq-summary_report.txt --paired --output E023.l16.dedup.krakenuniq-reads.txt.gz /tmp/tmp-metagenomics-krakenuniq-jfekrm6y/tmp0eg_wsp4.1.fastq.gz /tmp/tmp-metagenomics-krakenuniq-jfekrm6y/tmpx686dhd3.2.fastq.gz
+
+%	reads	taxReads	kmers	dup	cov	taxID	rank	taxName
+8.134	1788	1788	1193206	1.69	NA	0	no rank	unclassified
+91.87	20195	4	356763	3	NA	1	no rank	root
+71.38	15692	709	346092	2.29	NA	131567	no rank	  cellular organisms
+31.42	6906	0	153473	1.68	NA	2759	superkingdom	    Eukaryota
+31.42	6906	0	153473	1.68	NA	33154	no rank	      Opisthokonta
+31.42	6906	0	153473	1.68	NA	33208	kingdom	        Metazoa
+31.42	6906	0	153473	1.68	NA	6072	no rank	          Eumetazoa
+31.42	6906	0	153473	1.68	NA	33213	no rank	            Bilateria
+31.42	6906	0	153473	1.68	NA	33511	no rank	              Deuterostomia
+31.42	6906	0	153473	1.68	NA	7711	phylum	                Chordata
+31.42	6906	0	153473	1.68	NA	89593	subphylum	                  Craniata
+31.42	6906	0	153473	1.68	NA	7742	no rank	                    Vertebrata
+31.42	6906	0	153473	1.68	NA	7776	no rank	                      Gnathostomata
+31.42	6906	0	153473	1.68	NA	117570	no rank	                        Teleostomi
+31.42	6906	0	153473	1.68	NA	117571	no rank	                          Euteleostomi
+31.42	6906	0	153473	1.68	NA	8287	superclass	                            Sarcopterygii
+31.42	6906	0	153473	1.68	NA	1338369	no rank	                              Dipnotetrapodomorpha
+31.42	6906	0	153473	1.68	NA	32523	no rank	                                Tetrapoda
+31.42	6906	0	153473	1.68	NA	32524	no rank	                                  Amniota
+31.42	6906	0	153473	1.68	NA	40674	class	                                    Mammalia
+31.42	6906	0	153473	1.68	NA	32525	no rank	                                      Theria
+31.42	6906	0	153473	1.68	NA	9347	no rank	                                        Eutheria
+31.42	6906	0	153473	1.68	NA	1437010	no rank	                                          Boreoeutheria
+31.42	6906	0	153473	1.68	NA	314146	superorder	                                            Euarchontoglires
+31.42	6906	0	153473	1.68	NA	9443	order	                                              Primates
+31.42	6906	0	153473	1.68	NA	376913	suborder	                                                Haplorrhini
+31.42	6906	0	153473	1.68	NA	314293	infraorder	                                                  Simiiformes
+31.42	6906	0	153473	1.68	NA	9526	parvorder	                                                    Catarrhini
+31.42	6906	0	153473	1.68	NA	314295	superfamily	                                                      Hominoidea
+31.42	6906	0	153473	1.68	NA	9604	family	                                                        Hominidae
+31.42	6906	0	153473	1.68	NA	207598	subfamily	                                                          Homininae
+31.42	6906	0	153473	1.68	NA	9605	genus	                                                            Homo
+31.42	6906	6906	153473	1.68	NA	9606	species	                                                              Homo sapiens
diff --git a/...put/TestKrakenUniqSummaryFilter/input/should_have_higher_node_trimmed_after_resumming.txt b/...put/TestKrakenUniqSummaryFilter/input/should_have_higher_node_trimmed_after_resumming.txt
diff --git a/...input/TestKrakenUniqSummaryFilter/input/should_have_higher_node_trimmed_at_first_pass.txt b/...input/TestKrakenUniqSummaryFilter/input/should_have_higher_node_trimmed_at_first_pass.txt
diff --git a/test/input/TestKrakenUniqSummaryFilter/input/should_have_leaves_trimmed.txt b/test/input/TestKrakenUniqSummaryFilter/input/should_have_leaves_trimmed.txt
@@ -0,0 +1,19 @@
+# KrakenUniq v0.5.7 DATE:2019-11-21T16:03:15Z DB:/tmp/tmp.ohQ3jTMIQh/krakenuniq DB_SIZE:198130386856 WD:/home/dnanexus
+# CL:perl /opt/miniconda/envs/viral-ngs-env/bin/krakenuniq --db /tmp/tmp.ohQ3jTMIQh/krakenuniq --threads 32 --fastq-input --gzip-compressed --preload --report-file E023.l16.dedup.krakenuniq-summary_report.txt --paired --output E023.l16.dedup.krakenuniq-reads.txt.gz /tmp/tmp-metagenomics-krakenuniq-jfekrm6y/tmp0eg_wsp4.1.fastq.gz /tmp/tmp-metagenomics-krakenuniq-jfekrm6y/tmpx686dhd3.2.fastq.gz
+
+%	reads	taxReads	kmers	dup	cov	taxID	rank	taxName
+27.81	6112	6112	1193206	1.69	NA	0	no rank	unclassified
+72.19	15867	11368	349960	3	NA	1	no rank	root
+20.47	4499	0	8059	31.5	NA	10239	superkingdom	  Viruses
+20.46	4497	0	7930	32	NA	439488	no rank	    ssRNA viruses
+20.46	4497	0	7930	32	NA	35301	no rank	      ssRNA negative-strand viruses
+20.46	4497	0	7930	32	NA	11157	order	        Mononegavirales
+20.46	4497	0	7930	32	NA	11266	family	          Filoviridae
+20.46	4497	0	7930	32	NA	186536	genus	            Ebolavirus
+20.46	4497	4497	7916	32	NA	186538	species	              Zaire ebolavirus
+0.009098	2	0	110	1.71	NA	35237	no rank	    dsDNA viruses, no RNA stage
+0.009098	2	0	100	1.6	NA	28883	order	      Caudovirales
+0.009098	2	0	93	1.65	NA	10699	family	        Siphoviridae
+0.009098	2	0	11	1.73	NA	1982251	genus	          Pa6virus
+0.009098	2	0	8	1.62	NA	1982272	species	            Propionibacterium virus PAS50
+0.009098	2	2	8	1.62	NA	504553	no rank	              Propionibacterium phage PAS50
diff --git a/test/unit/test_metagenomics.py b/test/unit/test_metagenomics.py
@@ -29,6 +29,39 @@ def test_help_parser_for_each_command(self):
             parser = parser_fun(argparse.ArgumentParser())
             helpstring = parser.format_help()
 
+class TestKrakenUniqSummaryFilter(TestCaseWithTmp):
+
+    def _test_report(self, report_basename):
+        test_files_dir = util.file.get_test_input_path(self)
+        in_report = os.path.join(test_files_dir, 'input', report_basename)
+        out_report = util.file.mkstempfname('.txt')
+
+        parser = metagenomics.parser_krakenuniq_report_filter(argparse.ArgumentParser())
+        args = parser.parse_args([in_report, out_report])
+        args.func_main(args)
+
+        # Check that results match (roughly) with what is expected
+        expected_out = os.path.join(test_files_dir, 'expected', report_basename)
+        self.assertApproxEqualValuesInDelimitedFiles(out_report, 
+                                                     expected_out, 
+                                                     dialect="tsv", 
+                                                     numeric_rel_tol=1e-3, 
+                                                     header_lines_to_skip=3,
+                                                     use_first_processed_line_for_fieldnames=True)
+
+    def test_unchanged_report(self):
+        self._test_report("should_be_unchanged.txt")
+
+    def test_leaves_trimmed(self):
+        self._test_report("should_have_leaves_trimmed.txt")
+
+    #def test_higher_node_trimmed_after_resumming(self):
+    #    self._test_report("should_have_higher_node_trimmed_after_resumming.txt")
+
+    #def test_higher_node_trimmed_at_first_pass(self):
+    #    self._test_report("should_have_higher_node_trimmed_at_first_pass.txt")
+
+
 
 class TestKronaCalls(TestCaseWithTmp):
 

diff --git a/util/misc.py b/util/misc.py
@@ -50,6 +50,16 @@ def unique(items):
             seen.add(i)
             yield i
 
+def is_number(s):
+    """
+        If the string can be parsed as an
+        int, long, float or complex number, return True
+    """
+    try:
+        complex(s)
+    except ValueError:
+        return False
+    return True
 
 def histogram(items):
     ''' I count the number of times I see stuff and return a dict of counts. '''
@@ -614,6 +624,13 @@ def as_type(val, types):
             pass
     raise TypeError('Could not convert {} to any of {}: {}'.format(val, types, errs))
 
+def zip_dicts(*dicts):
+    keys_sets = map(set, dicts)
+    common_keys = functools.reduce(set.intersection, keys_sets)
+    for key in common_keys:
+        #yield (key,) + tuple(map(operator.itemgetter(key), dicts))
+        yield key, tuple(map(operator.itemgetter(key), dicts))
+
 def subdict(d, keys):
     """Return a newly allocated shallow copy of a mapping `d` restricted to keys in `keys`."""
     d = dict(d)