Skip to content
Open
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
616 changes: 474 additions & 142 deletions metagenomics.py

Large diffs are not rendered by default.

44 changes: 44 additions & 0 deletions test/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@
import hashlib
import logging
import copy
import math
import csv
import itertools

# third-party
import Bio.SeqIO
Expand Down Expand Up @@ -111,6 +114,47 @@ def inputs(self, *fnames):
'''Return the full filenames for files in the test input directory for this test class'''
return [self.input(fname) for fname in fnames]

def assertApproxEqualValuesInDelimitedFiles(self, file_one, file_two, dialect="tsv", numeric_rel_tol=1e-5, header_lines_to_skip=0):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oh man -- this test function will definitely come in handy

def _is_number(s):
try:
complex(s) # for int, long, float and complex
except ValueError:
return False
return True

csv.register_dialect('tsv', quoting=csv.QUOTE_MINIMAL, delimiter="\t")
csv.register_dialect('csv', quoting=csv.QUOTE_MINIMAL, delimiter=",")

with util.file.open_or_gzopen(file_one, 'rU') as inf1, util.file.open_or_gzopen(file_two, 'rU') as inf2:
report_type=None
for line_num, (line1,line2) in enumerate(itertools.zip_longest(inf1,inf2)):
self.assertIsNotNone(line1, msg="%s appears to be shorter than %s" % (inf1, inf2))
self.assertIsNotNone(line2, msg="%s appears to be shorter than %s" % (inf2, inf1))

# continue to this next pair of lines until we have
# skipped past the header
if line_num < header_lines_to_skip:
continue
else:
inf1_row = next(csv.reader([line1.strip().rstrip('\n')], dialect=dialect))
inf2_row = next(csv.reader([line2.strip().rstrip('\n')], dialect=dialect))

# assume the rows have the same number of elements
self.assertTrue(len(inf1_row) == len(inf2_row), msg="Files have lines of different length on line %s %s %s" % (line_num, inf1_row, inf2_row))

for inf1_row_item, inf2_row_item in zip(inf1_row,inf2_row):
# we assume at the item from the same position is a number
# or not a number in both files
self.assertTrue(_is_number(inf1_row_item)==_is_number(inf2_row_item))

# if we're dealing with numbers, check that they're approximately equal
if _is_number(inf1_row_item):
assert float(inf2_row_item) == pytest.approx(float(inf1_row_item), rel=numeric_rel_tol)
else:
# otherwise we're probably dealing with a string
self.assertEqual(inf2_row_item, inf1_row_item)


def assertEqualSamHeaders(self, tested_samfile, expected_samfile, other_allowed_values=None):
'''
other_allowed_values is a dict that maps a key to a list of other accepted values
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# KrakenUniq v0.5.7 DATE:2019-11-21T16:03:15Z DB:/tmp/tmp.ohQ3jTMIQh/krakenuniq DB_SIZE:198130386856 WD:/home/dnanexus
# CL:perl /opt/miniconda/envs/viral-ngs-env/bin/krakenuniq --db /tmp/tmp.ohQ3jTMIQh/krakenuniq --threads 32 --fastq-input --gzip-compressed --preload --report-file E023.l16.dedup.krakenuniq-summary_report.txt --paired --output E023.l16.dedup.krakenuniq-reads.txt.gz /tmp/tmp-metagenomics-krakenuniq-jfekrm6y/tmp0eg_wsp4.1.fastq.gz /tmp/tmp-metagenomics-krakenuniq-jfekrm6y/tmpx686dhd3.2.fastq.gz

% reads taxReads kmers dup cov taxID rank taxName
8.134 1788 1788 1193206 1.69 NA 0 no rank unclassified
91.87 20195 4 356763 3 NA 1 no rank root
71.38 15692 709 346092 2.29 NA 131567 no rank cellular organisms
31.42 6906 0 153473 1.68 NA 2759 superkingdom Eukaryota
31.42 6906 0 153473 1.68 NA 33154 no rank Opisthokonta
31.42 6906 0 153473 1.68 NA 33208 kingdom Metazoa
31.42 6906 0 153473 1.68 NA 6072 no rank Eumetazoa
31.42 6906 0 153473 1.68 NA 33213 no rank Bilateria
31.42 6906 0 153473 1.68 NA 33511 no rank Deuterostomia
31.42 6906 0 153473 1.68 NA 7711 phylum Chordata
31.42 6906 0 153473 1.68 NA 89593 subphylum Craniata
31.42 6906 0 153473 1.68 NA 7742 no rank Vertebrata
31.42 6906 0 153473 1.68 NA 7776 no rank Gnathostomata
31.42 6906 0 153473 1.68 NA 117570 no rank Teleostomi
31.42 6906 0 153473 1.68 NA 117571 no rank Euteleostomi
31.42 6906 0 153473 1.68 NA 8287 superclass Sarcopterygii
31.42 6906 0 153473 1.68 NA 1338369 no rank Dipnotetrapodomorpha
31.42 6906 0 153473 1.68 NA 32523 no rank Tetrapoda
31.42 6906 0 153473 1.68 NA 32524 no rank Amniota
31.42 6906 0 153473 1.68 NA 40674 class Mammalia
31.42 6906 0 153473 1.68 NA 32525 no rank Theria
31.42 6906 0 153473 1.68 NA 9347 no rank Eutheria
31.42 6906 0 153473 1.68 NA 1437010 no rank Boreoeutheria
31.42 6906 0 153473 1.68 NA 314146 superorder Euarchontoglires
31.42 6906 0 153473 1.68 NA 9443 order Primates
31.42 6906 0 153473 1.68 NA 376913 suborder Haplorrhini
31.42 6906 0 153473 1.68 NA 314293 infraorder Simiiformes
31.42 6906 0 153473 1.68 NA 9526 parvorder Catarrhini
31.42 6906 0 153473 1.68 NA 314295 superfamily Hominoidea
31.42 6906 0 153473 1.68 NA 9604 family Hominidae
31.42 6906 0 153473 1.68 NA 207598 subfamily Homininae
31.42 6906 0 153473 1.68 NA 9605 genus Homo
31.42 6906 6906 153473 1.68 NA 9606 species Homo sapiens
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# KrakenUniq v0.5.7 DATE:2019-11-21T16:03:15Z DB:/tmp/tmp.ohQ3jTMIQh/krakenuniq DB_SIZE:198130386856 WD:/home/dnanexus
# CL:perl /opt/miniconda/envs/viral-ngs-env/bin/krakenuniq --db /tmp/tmp.ohQ3jTMIQh/krakenuniq --threads 32 --fastq-input --gzip-compressed --preload --report-file E023.l16.dedup.krakenuniq-summary_report.txt --paired --output E023.l16.dedup.krakenuniq-reads.txt.gz /tmp/tmp-metagenomics-krakenuniq-jfekrm6y/tmp0eg_wsp4.1.fastq.gz /tmp/tmp-metagenomics-krakenuniq-jfekrm6y/tmpx686dhd3.2.fastq.gz

% reads taxReads kmers dup cov taxID rank taxName
27.810893206534104 6112 6112 1193206 1.69 NA 0 no rank unclassified
72.1891067934659 15865 11368 349850 3 NA 1 no rank root
20.46230149701961 4497 0 7949 31.5 NA 10239 superkingdom Viruses
20.46230149701961 4497 0 7930 32 NA 439488 no rank ssRNA viruses
20.46230149701961 4497 0 7930 32 NA 35301 no rank ssRNA negative-strand viruses
20.46230149701961 4497 0 7930 32 NA 11157 order Mononegavirales
20.46230149701961 4497 0 7930 32 NA 11266 family Filoviridae
20.46230149701961 4497 0 7930 32 NA 186536 genus Ebolavirus
20.46230149701961 4497 4497 7916 32 NA 186538 species Zaire ebolavirus
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# KrakenUniq v0.5.7 DATE:2019-11-21T16:03:15Z DB:/tmp/tmp.ohQ3jTMIQh/krakenuniq DB_SIZE:198130386856 WD:/home/dnanexus
# CL:perl /opt/miniconda/envs/viral-ngs-env/bin/krakenuniq --db /tmp/tmp.ohQ3jTMIQh/krakenuniq --threads 32 --fastq-input --gzip-compressed --preload --report-file E023.l16.dedup.krakenuniq-summary_report.txt --paired --output E023.l16.dedup.krakenuniq-reads.txt.gz /tmp/tmp-metagenomics-krakenuniq-jfekrm6y/tmp0eg_wsp4.1.fastq.gz /tmp/tmp-metagenomics-krakenuniq-jfekrm6y/tmpx686dhd3.2.fastq.gz

% reads taxReads kmers dup cov taxID rank taxName
8.134 1788 1788 1193206 1.69 NA 0 no rank unclassified
91.87 20195 4 356763 3 NA 1 no rank root
71.38 15692 709 346092 2.29 NA 131567 no rank cellular organisms
31.42 6906 0 153473 1.68 NA 2759 superkingdom Eukaryota
31.42 6906 0 153473 1.68 NA 33154 no rank Opisthokonta
31.42 6906 0 153473 1.68 NA 33208 kingdom Metazoa
31.42 6906 0 153473 1.68 NA 6072 no rank Eumetazoa
31.42 6906 0 153473 1.68 NA 33213 no rank Bilateria
31.42 6906 0 153473 1.68 NA 33511 no rank Deuterostomia
31.42 6906 0 153473 1.68 NA 7711 phylum Chordata
31.42 6906 0 153473 1.68 NA 89593 subphylum Craniata
31.42 6906 0 153473 1.68 NA 7742 no rank Vertebrata
31.42 6906 0 153473 1.68 NA 7776 no rank Gnathostomata
31.42 6906 0 153473 1.68 NA 117570 no rank Teleostomi
31.42 6906 0 153473 1.68 NA 117571 no rank Euteleostomi
31.42 6906 0 153473 1.68 NA 8287 superclass Sarcopterygii
31.42 6906 0 153473 1.68 NA 1338369 no rank Dipnotetrapodomorpha
31.42 6906 0 153473 1.68 NA 32523 no rank Tetrapoda
31.42 6906 0 153473 1.68 NA 32524 no rank Amniota
31.42 6906 0 153473 1.68 NA 40674 class Mammalia
31.42 6906 0 153473 1.68 NA 32525 no rank Theria
31.42 6906 0 153473 1.68 NA 9347 no rank Eutheria
31.42 6906 0 153473 1.68 NA 1437010 no rank Boreoeutheria
31.42 6906 0 153473 1.68 NA 314146 superorder Euarchontoglires
31.42 6906 0 153473 1.68 NA 9443 order Primates
31.42 6906 0 153473 1.68 NA 376913 suborder Haplorrhini
31.42 6906 0 153473 1.68 NA 314293 infraorder Simiiformes
31.42 6906 0 153473 1.68 NA 9526 parvorder Catarrhini
31.42 6906 0 153473 1.68 NA 314295 superfamily Hominoidea
31.42 6906 0 153473 1.68 NA 9604 family Hominidae
31.42 6906 0 153473 1.68 NA 207598 subfamily Homininae
31.42 6906 0 153473 1.68 NA 9605 genus Homo
31.42 6906 6906 153473 1.68 NA 9606 species Homo sapiens
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# KrakenUniq v0.5.7 DATE:2019-11-21T16:03:15Z DB:/tmp/tmp.ohQ3jTMIQh/krakenuniq DB_SIZE:198130386856 WD:/home/dnanexus
# CL:perl /opt/miniconda/envs/viral-ngs-env/bin/krakenuniq --db /tmp/tmp.ohQ3jTMIQh/krakenuniq --threads 32 --fastq-input --gzip-compressed --preload --report-file E023.l16.dedup.krakenuniq-summary_report.txt --paired --output E023.l16.dedup.krakenuniq-reads.txt.gz /tmp/tmp-metagenomics-krakenuniq-jfekrm6y/tmp0eg_wsp4.1.fastq.gz /tmp/tmp-metagenomics-krakenuniq-jfekrm6y/tmpx686dhd3.2.fastq.gz

% reads taxReads kmers dup cov taxID rank taxName
27.81 6112 6112 1193206 1.69 NA 0 no rank unclassified
72.19 15867 11368 349960 3 NA 1 no rank root
20.47 4499 0 8059 31.5 NA 10239 superkingdom Viruses
20.46 4497 0 7930 32 NA 439488 no rank ssRNA viruses
20.46 4497 0 7930 32 NA 35301 no rank ssRNA negative-strand viruses
20.46 4497 0 7930 32 NA 11157 order Mononegavirales
20.46 4497 0 7930 32 NA 11266 family Filoviridae
20.46 4497 0 7930 32 NA 186536 genus Ebolavirus
20.46 4497 4497 7916 32 NA 186538 species Zaire ebolavirus
0.009098 2 0 110 1.71 NA 35237 no rank dsDNA viruses, no RNA stage
0.009098 2 0 100 1.6 NA 28883 order Caudovirales
0.009098 2 0 93 1.65 NA 10699 family Siphoviridae
0.009098 2 0 11 1.73 NA 1982251 genus Pa6virus
0.009098 2 0 8 1.62 NA 1982272 species Propionibacterium virus PAS50
0.009098 2 2 8 1.62 NA 504553 no rank Propionibacterium phage PAS50
32 changes: 32 additions & 0 deletions test/unit/test_metagenomics.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,38 @@ def test_help_parser_for_each_command(self):
parser = parser_fun(argparse.ArgumentParser())
helpstring = parser.format_help()

class TestKrakenUniqSummaryFilter(TestCaseWithTmp):

def _test_report(self, report_basename):
test_files_dir = util.file.get_test_input_path(self)
in_report = os.path.join(test_files_dir, 'input', report_basename)
out_report = util.file.mkstempfname('.txt')

parser = metagenomics.parser_krakenuniq_report_filter(argparse.ArgumentParser())
args = parser.parse_args([in_report, out_report])
args.func_main(args)

# Check that results match (roughly) with what is expected
expected_out = os.path.join(test_files_dir, 'expected', report_basename)
self.assertApproxEqualValuesInDelimitedFiles(out_report,
expected_out,
dialect="tsv",
numeric_rel_tol=1e-3,
header_lines_to_skip=3)

def test_unchanged_report(self):
self._test_report("should_be_unchanged.txt")

def test_leaves_trimmed(self):
self._test_report("should_have_leaves_trimmed.txt")

#def test_higher_node_trimmed_after_resumming(self):
# self._test_report("should_have_higher_node_trimmed_after_resumming.txt")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

anything wrong with these tests at the moment?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nothing wrong--they're just not implemented yet (have to make the synthetic input/output).


#def test_higher_node_trimmed_at_first_pass(self):
# self._test_report("should_have_higher_node_trimmed_at_first_pass.txt")



class TestKronaCalls(TestCaseWithTmp):

Expand Down