Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
618 changes: 475 additions & 143 deletions metagenomics.py

Large diffs are not rendered by default.

72 changes: 71 additions & 1 deletion test/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@
import hashlib
import logging
import copy
import math
import csv
import itertools

# third-party
import Bio.SeqIO
Expand All @@ -18,7 +21,7 @@

# intra-project
import util.file
from util.misc import available_cpu_count
from util.misc import available_cpu_count, zip_dicts, is_number
from tools.samtools import SamtoolsTool

logging.getLogger('botocore').setLevel(logging.WARNING)
Expand Down Expand Up @@ -111,6 +114,73 @@ def inputs(self, *fnames):
'''Return the full filenames for files in the test input directory for this test class'''
return [self.input(fname) for fname in fnames]

def assertApproxEqualValuesInDelimitedFiles(self, file_one, file_two, dialect="tsv", numeric_rel_tol=1e-5, header_lines_to_skip=0, use_first_processed_line_for_fieldnames=False):
'''
This test checks whether two delimited (tsv, csv, or custom dialect) files
are approximately the same, by comparing the values present at each line
with a configurable tolerance for comparison of numeric values, and exact
comparison of string values. A specified number of header lines can be skipped
before the comparison begins to account for lines written by some tools with
for metadata, invokation params, etc.
If use_first_processed_line_for_fieldnames is specified, the columns can be
in any order since the rows are parsed as dicts, otherwise the columns are
assumed to be in the same order in both of the files.
'''
header_fieldnames=None

csv.register_dialect('tsv', quoting=csv.QUOTE_MINIMAL, delimiter="\t")
csv.register_dialect('csv', quoting=csv.QUOTE_MINIMAL, delimiter=",")

with util.file.open_or_gzopen(file_one, 'rU') as inf1, util.file.open_or_gzopen(file_two, 'rU') as inf2:
report_type=None
for line_num, (line1,line2) in enumerate(itertools.zip_longest(inf1,inf2)):
self.assertIsNotNone(line1, msg="%s appears to be shorter than %s" % (inf1, inf2))
self.assertIsNotNone(line2, msg="%s appears to be shorter than %s" % (inf2, inf1))

# continue to this next pair of lines until we have
# skipped past the header
if line_num < header_lines_to_skip:
continue
else:
if header_fieldnames is None and use_first_processed_line_for_fieldnames:
inf1_row = next(csv.reader([line1.strip().rstrip('\n')], dialect=dialect))
inf2_row = next(csv.reader([line1.strip().rstrip('\n')], dialect=dialect))
self.assertEqual(inf1_row,inf2_row, msg="header lines are not the same")
header_fieldnames=inf1_row
continue

# if the header names are defined
if header_fieldnames is not None:
inf1_row = next(csv.DictReader([line1.strip().rstrip('\n')], dialect=dialect, fieldnames=header_fieldnames))
inf2_row = next(csv.DictReader([line2.strip().rstrip('\n')], dialect=dialect, fieldnames=header_fieldnames))
else:
inf1_row = next(csv.reader([line1.strip().rstrip('\n')], dialect=dialect))
inf2_row = next(csv.reader([line2.strip().rstrip('\n')], dialect=dialect))

# assume the rows have the same number of elements
self.assertTrue(len(inf1_row) == len(inf2_row), msg="Files have lines of different length on line %s %s %s" % (line_num, inf1_row, inf2_row))

if header_fieldnames is not None:
def _dict_values_only(dict1_in, dict2_in):
for key, values in zip_dicts(dict1_in,dict2_in):
yield values
items_to_compare=_dict_values_only(inf1_row,inf2_row)
else:
items_to_compare=zip(inf1_row,inf2_row)

for inf1_row_item, inf2_row_item in items_to_compare:
# we assume at the item from the same position is a number
# or not a number in both files
self.assertTrue(is_number(inf1_row_item)==is_number(inf2_row_item))

# if we're dealing with numbers, check that they're approximately equal
if is_number(inf1_row_item):
assert float(inf2_row_item) == pytest.approx(float(inf1_row_item), rel=numeric_rel_tol)
else:
# otherwise we're probably dealing with a string
self.assertEqual(inf2_row_item, inf1_row_item)


def assertEqualSamHeaders(self, tested_samfile, expected_samfile, other_allowed_values=None):
'''
other_allowed_values is a dict that maps a key to a list of other accepted values
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# KrakenUniq v0.5.7 DATE:2019-11-21T16:03:15Z DB:/tmp/tmp.ohQ3jTMIQh/krakenuniq DB_SIZE:198130386856 WD:/home/dnanexus
# CL:perl /opt/miniconda/envs/viral-ngs-env/bin/krakenuniq --db /tmp/tmp.ohQ3jTMIQh/krakenuniq --threads 32 --fastq-input --gzip-compressed --preload --report-file E023.l16.dedup.krakenuniq-summary_report.txt --paired --output E023.l16.dedup.krakenuniq-reads.txt.gz /tmp/tmp-metagenomics-krakenuniq-jfekrm6y/tmp0eg_wsp4.1.fastq.gz /tmp/tmp-metagenomics-krakenuniq-jfekrm6y/tmpx686dhd3.2.fastq.gz

% reads taxReads kmers dup cov taxID rank taxName
8.134 1788 1788 1193206 1.69 NA 0 no rank unclassified
91.87 20195 4 356763 3 NA 1 no rank root
71.38 15692 709 346092 2.29 NA 131567 no rank cellular organisms
31.42 6906 0 153473 1.68 NA 2759 superkingdom Eukaryota
31.42 6906 0 153473 1.68 NA 33154 no rank Opisthokonta
31.42 6906 0 153473 1.68 NA 33208 kingdom Metazoa
31.42 6906 0 153473 1.68 NA 6072 no rank Eumetazoa
31.42 6906 0 153473 1.68 NA 33213 no rank Bilateria
31.42 6906 0 153473 1.68 NA 33511 no rank Deuterostomia
31.42 6906 0 153473 1.68 NA 7711 phylum Chordata
31.42 6906 0 153473 1.68 NA 89593 subphylum Craniata
31.42 6906 0 153473 1.68 NA 7742 no rank Vertebrata
31.42 6906 0 153473 1.68 NA 7776 no rank Gnathostomata
31.42 6906 0 153473 1.68 NA 117570 no rank Teleostomi
31.42 6906 0 153473 1.68 NA 117571 no rank Euteleostomi
31.42 6906 0 153473 1.68 NA 8287 superclass Sarcopterygii
31.42 6906 0 153473 1.68 NA 1338369 no rank Dipnotetrapodomorpha
31.42 6906 0 153473 1.68 NA 32523 no rank Tetrapoda
31.42 6906 0 153473 1.68 NA 32524 no rank Amniota
31.42 6906 0 153473 1.68 NA 40674 class Mammalia
31.42 6906 0 153473 1.68 NA 32525 no rank Theria
31.42 6906 0 153473 1.68 NA 9347 no rank Eutheria
31.42 6906 0 153473 1.68 NA 1437010 no rank Boreoeutheria
31.42 6906 0 153473 1.68 NA 314146 superorder Euarchontoglires
31.42 6906 0 153473 1.68 NA 9443 order Primates
31.42 6906 0 153473 1.68 NA 376913 suborder Haplorrhini
31.42 6906 0 153473 1.68 NA 314293 infraorder Simiiformes
31.42 6906 0 153473 1.68 NA 9526 parvorder Catarrhini
31.42 6906 0 153473 1.68 NA 314295 superfamily Hominoidea
31.42 6906 0 153473 1.68 NA 9604 family Hominidae
31.42 6906 0 153473 1.68 NA 207598 subfamily Homininae
31.42 6906 0 153473 1.68 NA 9605 genus Homo
31.42 6906 6906 153473 1.68 NA 9606 species Homo sapiens
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# KrakenUniq v0.5.7 DATE:2019-11-21T16:03:15Z DB:/tmp/tmp.ohQ3jTMIQh/krakenuniq DB_SIZE:198130386856 WD:/home/dnanexus
# CL:perl /opt/miniconda/envs/viral-ngs-env/bin/krakenuniq --db /tmp/tmp.ohQ3jTMIQh/krakenuniq --threads 32 --fastq-input --gzip-compressed --preload --report-file E023.l16.dedup.krakenuniq-summary_report.txt --paired --output E023.l16.dedup.krakenuniq-reads.txt.gz /tmp/tmp-metagenomics-krakenuniq-jfekrm6y/tmp0eg_wsp4.1.fastq.gz /tmp/tmp-metagenomics-krakenuniq-jfekrm6y/tmpx686dhd3.2.fastq.gz

% reads taxReads kmers dup cov taxID rank taxName
27.810893206534104 6112 6112 1193206 1.69 NA 0 no rank unclassified
72.1891067934659 15865 11368 349850 3 NA 1 no rank root
20.46230149701961 4497 0 7949 31.5 NA 10239 superkingdom Viruses
20.46230149701961 4497 0 7930 32 NA 439488 no rank ssRNA viruses
20.46230149701961 4497 0 7930 32 NA 35301 no rank ssRNA negative-strand viruses
20.46230149701961 4497 0 7930 32 NA 11157 order Mononegavirales
20.46230149701961 4497 0 7930 32 NA 11266 family Filoviridae
20.46230149701961 4497 0 7930 32 NA 186536 genus Ebolavirus
20.46230149701961 4497 4497 7916 32 NA 186538 species Zaire ebolavirus
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# KrakenUniq v0.5.7 DATE:2019-11-21T16:03:15Z DB:/tmp/tmp.ohQ3jTMIQh/krakenuniq DB_SIZE:198130386856 WD:/home/dnanexus
# CL:perl /opt/miniconda/envs/viral-ngs-env/bin/krakenuniq --db /tmp/tmp.ohQ3jTMIQh/krakenuniq --threads 32 --fastq-input --gzip-compressed --preload --report-file E023.l16.dedup.krakenuniq-summary_report.txt --paired --output E023.l16.dedup.krakenuniq-reads.txt.gz /tmp/tmp-metagenomics-krakenuniq-jfekrm6y/tmp0eg_wsp4.1.fastq.gz /tmp/tmp-metagenomics-krakenuniq-jfekrm6y/tmpx686dhd3.2.fastq.gz

% reads taxReads kmers dup cov taxID rank taxName
8.134 1788 1788 1193206 1.69 NA 0 no rank unclassified
91.87 20195 4 356763 3 NA 1 no rank root
71.38 15692 709 346092 2.29 NA 131567 no rank cellular organisms
31.42 6906 0 153473 1.68 NA 2759 superkingdom Eukaryota
31.42 6906 0 153473 1.68 NA 33154 no rank Opisthokonta
31.42 6906 0 153473 1.68 NA 33208 kingdom Metazoa
31.42 6906 0 153473 1.68 NA 6072 no rank Eumetazoa
31.42 6906 0 153473 1.68 NA 33213 no rank Bilateria
31.42 6906 0 153473 1.68 NA 33511 no rank Deuterostomia
31.42 6906 0 153473 1.68 NA 7711 phylum Chordata
31.42 6906 0 153473 1.68 NA 89593 subphylum Craniata
31.42 6906 0 153473 1.68 NA 7742 no rank Vertebrata
31.42 6906 0 153473 1.68 NA 7776 no rank Gnathostomata
31.42 6906 0 153473 1.68 NA 117570 no rank Teleostomi
31.42 6906 0 153473 1.68 NA 117571 no rank Euteleostomi
31.42 6906 0 153473 1.68 NA 8287 superclass Sarcopterygii
31.42 6906 0 153473 1.68 NA 1338369 no rank Dipnotetrapodomorpha
31.42 6906 0 153473 1.68 NA 32523 no rank Tetrapoda
31.42 6906 0 153473 1.68 NA 32524 no rank Amniota
31.42 6906 0 153473 1.68 NA 40674 class Mammalia
31.42 6906 0 153473 1.68 NA 32525 no rank Theria
31.42 6906 0 153473 1.68 NA 9347 no rank Eutheria
31.42 6906 0 153473 1.68 NA 1437010 no rank Boreoeutheria
31.42 6906 0 153473 1.68 NA 314146 superorder Euarchontoglires
31.42 6906 0 153473 1.68 NA 9443 order Primates
31.42 6906 0 153473 1.68 NA 376913 suborder Haplorrhini
31.42 6906 0 153473 1.68 NA 314293 infraorder Simiiformes
31.42 6906 0 153473 1.68 NA 9526 parvorder Catarrhini
31.42 6906 0 153473 1.68 NA 314295 superfamily Hominoidea
31.42 6906 0 153473 1.68 NA 9604 family Hominidae
31.42 6906 0 153473 1.68 NA 207598 subfamily Homininae
31.42 6906 0 153473 1.68 NA 9605 genus Homo
31.42 6906 6906 153473 1.68 NA 9606 species Homo sapiens
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# KrakenUniq v0.5.7 DATE:2019-11-21T16:03:15Z DB:/tmp/tmp.ohQ3jTMIQh/krakenuniq DB_SIZE:198130386856 WD:/home/dnanexus
# CL:perl /opt/miniconda/envs/viral-ngs-env/bin/krakenuniq --db /tmp/tmp.ohQ3jTMIQh/krakenuniq --threads 32 --fastq-input --gzip-compressed --preload --report-file E023.l16.dedup.krakenuniq-summary_report.txt --paired --output E023.l16.dedup.krakenuniq-reads.txt.gz /tmp/tmp-metagenomics-krakenuniq-jfekrm6y/tmp0eg_wsp4.1.fastq.gz /tmp/tmp-metagenomics-krakenuniq-jfekrm6y/tmpx686dhd3.2.fastq.gz

% reads taxReads kmers dup cov taxID rank taxName
27.81 6112 6112 1193206 1.69 NA 0 no rank unclassified
72.19 15867 11368 349960 3 NA 1 no rank root
20.47 4499 0 8059 31.5 NA 10239 superkingdom Viruses
20.46 4497 0 7930 32 NA 439488 no rank ssRNA viruses
20.46 4497 0 7930 32 NA 35301 no rank ssRNA negative-strand viruses
20.46 4497 0 7930 32 NA 11157 order Mononegavirales
20.46 4497 0 7930 32 NA 11266 family Filoviridae
20.46 4497 0 7930 32 NA 186536 genus Ebolavirus
20.46 4497 4497 7916 32 NA 186538 species Zaire ebolavirus
0.009098 2 0 110 1.71 NA 35237 no rank dsDNA viruses, no RNA stage
0.009098 2 0 100 1.6 NA 28883 order Caudovirales
0.009098 2 0 93 1.65 NA 10699 family Siphoviridae
0.009098 2 0 11 1.73 NA 1982251 genus Pa6virus
0.009098 2 0 8 1.62 NA 1982272 species Propionibacterium virus PAS50
0.009098 2 2 8 1.62 NA 504553 no rank Propionibacterium phage PAS50
33 changes: 33 additions & 0 deletions test/unit/test_metagenomics.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,39 @@ def test_help_parser_for_each_command(self):
parser = parser_fun(argparse.ArgumentParser())
helpstring = parser.format_help()

class TestKrakenUniqSummaryFilter(TestCaseWithTmp):

def _test_report(self, report_basename):
test_files_dir = util.file.get_test_input_path(self)
in_report = os.path.join(test_files_dir, 'input', report_basename)
out_report = util.file.mkstempfname('.txt')

parser = metagenomics.parser_krakenuniq_report_filter(argparse.ArgumentParser())
args = parser.parse_args([in_report, out_report])
args.func_main(args)

# Check that results match (roughly) with what is expected
expected_out = os.path.join(test_files_dir, 'expected', report_basename)
self.assertApproxEqualValuesInDelimitedFiles(out_report,
expected_out,
dialect="tsv",
numeric_rel_tol=1e-3,
header_lines_to_skip=3,
use_first_processed_line_for_fieldnames=True)

def test_unchanged_report(self):
self._test_report("should_be_unchanged.txt")

def test_leaves_trimmed(self):
self._test_report("should_have_leaves_trimmed.txt")

#def test_higher_node_trimmed_after_resumming(self):
# self._test_report("should_have_higher_node_trimmed_after_resumming.txt")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

anything wrong with these tests at the moment?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nothing wrong--they're just not implemented yet (have to make the synthetic input/output).


#def test_higher_node_trimmed_at_first_pass(self):
# self._test_report("should_have_higher_node_trimmed_at_first_pass.txt")



class TestKronaCalls(TestCaseWithTmp):

Expand Down
17 changes: 17 additions & 0 deletions util/misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,16 @@ def unique(items):
seen.add(i)
yield i

def is_number(s):
"""
If the string can be parsed as an
int, long, float or complex number, return True
"""
try:
complex(s)
except ValueError:
return False
return True

def histogram(items):
''' I count the number of times I see stuff and return a dict of counts. '''
Expand Down Expand Up @@ -614,6 +624,13 @@ def as_type(val, types):
pass
raise TypeError('Could not convert {} to any of {}: {}'.format(val, types, errs))

def zip_dicts(*dicts):
keys_sets = map(set, dicts)
common_keys = functools.reduce(set.intersection, keys_sets)
for key in common_keys:
#yield (key,) + tuple(map(operator.itemgetter(key), dicts))
yield key, tuple(map(operator.itemgetter(key), dicts))

def subdict(d, keys):
"""Return a newly allocated shallow copy of a mapping `d` restricted to keys in `keys`."""
d = dict(d)
Expand Down