From bc5abd7e4d006e0fddec135986a92e4148bc2c68 Mon Sep 17 00:00:00 2001
From: Christopher Tomkins-Tinch
Date: Tue, 3 Dec 2019 00:47:56 -0500
Subject: [PATCH 01/11] initial commit of metagenomics.py::krakenuniq_report_filter

---
 metagenomics.py | 578 ++++++++++++++++++++++++++++++++++++------------
 1 file changed, 438 insertions(+), 140 deletions(-)

diff --git a/metagenomics.py b/metagenomics.py
index 081a5a9fd..39e97eefe 100755
--- a/metagenomics.py
+++ b/metagenomics.py
@@ -14,6 +14,7 @@
 import itertools
 import logging
 import os.path
+from os import linesep
 from os.path import join
 import operator
 import queue
@@ -36,7 +37,6 @@
 import tools.krona
 import tools.picard
 import tools.samtools
-from util.file import open_or_gzopen
 
 __commands__ = []
 
@@ -109,7 +109,7 @@ def __init__(
     def load_gi_single_dmp(self, dmp_path):
         '''Load a gi->taxid dmp file from NCBI taxonomy.'''
         gi_array = {}
-        with open_or_gzopen(dmp_path) as f:
+        with util.file.open_or_gzopen(dmp_path) as f:
             for i, line in enumerate(f):
                 gi, taxid = line.strip().split('\t')
                 gi = int(gi)
@@ -142,7 +142,7 @@ def load_nodes(self, nodes_db):
         '''Load ranks and parents arrays from NCBI taxonomy.'''
         ranks = {}
         parents = {}
-        with open_or_gzopen(nodes_db) as f:
+        with util.file.open_or_gzopen(nodes_db) as f:
             for line in f:
                 parts = line.strip().split('|')
                 taxid = int(parts[0])
@@ -589,7 +589,7 @@ def filter_file(path, sep='\t', taxid_column=0, gi_column=None, a2t=False, heade
             output_path = os.path.join(outputDb, path)
             input_path = maybe_compressed(input_path)
 
-            with open_or_gzopen(input_path, 'rt') as f, \
+            with util.file.open_or_gzopen(input_path, 'rt') as f, \
                 open_or_gzopen(output_path, 'wt') as out_f:
                 if header:
                     out_f.write(f.readline()) # Cannot use next(f) for python2
@@ -1047,7 +1047,7 @@ def filter_bam_to_taxa(in_bam, read_IDs_to_tax_IDs, out_bam,
 
     # perform the actual filtering to return a list of read IDs, written to a temp file
     with util.file.tempfname(".txt.gz") as temp_read_list:
-        with open_or_gzopen(temp_read_list, "wt") as read_IDs_file:
+        with util.file.open_or_gzopen(temp_read_list, "wt") as read_IDs_file:
             read_ids_written = 0
             for row in util.file.read_tabfile(read_IDs_to_tax_IDs):
                 assert tax_id_col
+        @property
+        def has_children(self):
+            return len(self.children)>0
+
+        @property
+        def lacks_children(self):
+            return len(self.children)==0
+
+        @property
+        def name(self):
+            return self.data["sci_name"]
+
+    # load the KrakenUniq summary file
+    # into a tree structure
+    # using the taxName indentation (space prefix) to indicate hierarchy
+    # (rather than the 'rank' column since many rows have 'no rank')
+    # This assumes the hierarchy is described linearly
+    # within the file such that indentation changes
+    # indicate changes of hierarchy
+    tax_tree_top_nodes = []
+    previous_node = None
+    top_nodes = []
+    for row,level in _kraken_report_reader(summary_file_in):
+        # create a new node object for this row
+        current_node = TaxNode(row,level)
+
+        # if this is a top-level node ('root' or 'unclassified')
+        if current_node.level==0:
+            # set previous_node to None since
+            # we usually have at least two top-level nodes
+            previous_node=None
+
+        # if we do not have a previous node/line to consider
+        if previous_node==None:
+            # assume this line is a top-level node
+            top_nodes.append(current_node)
+        # otherwise, if the node is subordinate in
+        # level/indentation to the previous node
+        elif current_node.level>previous_node.level:
+            # make it a child of the previous node
+            previous_node.add_child(current_node)
+            # (and set the previous node as its parent)
+            current_node.add_parent(previous_node)
+        else:
+            # if this node is not subordinate in
+            # level/indentation to the previous node,
+            # track back through prior nodes
+            # until we find one that is at a higher level
+            parent_node = previous_node
+            while(current_node.level<=parent_node.level):
+                if parent_node.lacks_parent:
+                    # if we reach a node without a parent
+                    # break out (should not be reached)
+                    break
+                parent_node = parent_node.parent
+            # when we have found a superior prior node
+            # add this node as its child
+            parent_node.add_child(current_node)
+            # (and set it as this node's parent)
+            current_node.add_parent(parent_node)
+        # set this node as the previous node
+        previous_node = current_node
+
+    def remove_children(node):
+        """
+        remove child nodes from the instance
+        attribute (list) of the specified node
+        """
+        if node != None:
+            if node.has_children:
+                for child in node.children:
+                    remove_children(child)
+                node.children = []
+
+    def delete_children(node):
+        """
+        delete a node outright along with its children
+        """
+        if node != None:
+            if node.has_children:
+                for child in node.children:
+                    delete_children(child)
+                    del child
+            else:
+                del node
+
+    def dfs_traversal(node):
+        """
+        yield nodes in preorder form
+        """
+        yield node
+        if node.has_children:
+            for child in node.children:
+                for n in dfs_traversal(child):
+                    yield n
+
+    def dfs_traversal_print(node):
+        """
+        print the tree in preorder form
+        """
+        for n in dfs_traversal(node):
+            print(n)
+
+    def dfs_traversal_filtered(node, field_to_filter, keep_threshold):
+        """
+        yield nodes from the tree where the field is above the threshold value.
+        NB: This does not take into account rows where the sum includes
+        the values and sums from subordinate nodes.
+
+        subtract_low_values_with_upward_propagation(node, field_to_filter, keep_threshold)
+        should be used to account for sums that include
+        contributions from subordinate nodes
+        """
+        for n in dfs_traversal(node):
+            if (int(n.data[field_to_filter])>=keep_threshold):
+                yield n
+
+    def dfs_traversal_print_filtered(node, field_to_filter, keep_threshold):
+        """
+        Print nodes from the tree where the field is above the threshold value.
+        NB: This does not take into account rows where the sum includes
+        the values and sums from subordinate nodes.
+
+        subtract_low_values_with_upward_propagation(node, field_to_filter, keep_threshold)
+        should be used to account for sums that include
+        contributions from subordinate nodes
+        """
+        for n in dfs_traversal(node):
+            if (int(n.data[field_to_filter])>=keep_threshold):
+                print(n)
+
+    def dfs_traversal_prune(node, field_to_filter, keep_threshold):
+        """
+        Remove nodes from the tree where the field is below the threshold value.
+        NB: This does not take into account rows where the sum includes
+        the values and sums from subordinate rows.
+
+        subtract_low_values_with_upward_propagation(node, field_to_filter, keep_threshold)
+        should be used to account for sums that include
+        contributions from subordinate nodes
+        """
+        for child in node.children:
+            dfs_traversal_prune(child, field_to_filter, keep_threshold)
+
+        for n in dfs_traversal(node):
+            if int(n.data[field_to_filter])
-                if int(row["num_reads"])>=count_threshold:
-                    sample_summary[currently_being_processed][row["sci_name"]] = Abundance(float(row["pct_of_reads"]), int(row["num_reads"]),row.get("kmers",None),row.get("dup",None),row.get("cov",None))
+                if indent_of_line <= indent_of_selection:
+                    should_process = False
+                    indent_of_selection = -1
+
+                if indent_of_selection == -1:
+                    if row["sci_name"].lower() in tax_headings_copy:
+                        tax_headings_copy.remove(row["sci_name"].lower())
+
+                        should_process = True
+                        indent_of_selection = indent_of_line
+                        currently_being_processed = row["sci_name"]
+                        sample_summary[currently_being_processed] = collections.OrderedDict()
+                        if row["rank"] == rank_code(taxlevel_focus) or row["rank"].lower().replace(" ","") == taxlevel_focus.lower().replace(" ",""):
+                            same_level = True
+                        if row["rank"] in ("-","no rank"):
+                            log.warning("Non-taxonomic parent level selected")
+
+                if should_process:
+                    # skip "-" rank levels since they do not occur at the sample level
+                    # otherwise include the taxon row if the rank matches the desired level of focus
+                    if (row["rank"] not in ("-","no rank") and (rank_code(taxlevel_focus) == row["rank"] or row["rank"].lower().replace(" ","") == taxlevel_focus.lower().replace(" ","")) ):
+                        if int(row["num_reads"])>=count_threshold:
+                            sample_summary[currently_being_processed][row["sci_name"]] = Abundance(float(row["pct_of_reads"]), int(row["num_reads"]),row.get("kmers",None),row.get("dup",None),row.get("cov",None))
 
         for k,taxa in sample_summary.items():
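Aside: the indentation walk in patch 01 is easier to follow outside of diff form. Below is a minimal, self-contained sketch of the same idea -- build a tree from (name, indent level, value) rows, then traverse it depth-first. All names here (Node, build_tree, dfs) are illustrative and are not the identifiers the patch adds.

# Minimal sketch of indentation-based tree building plus preorder traversal,
# mirroring the approach of patch 01 (illustrative names, not the patch's code).
class Node:
    def __init__(self, name, level, value):
        self.name, self.level, self.value = name, level, value
        self.parent, self.children = None, []

def build_tree(rows):
    """rows: iterable of (name, indent_level, value) tuples; returns top-level nodes."""
    tops, prev = [], None
    for name, level, value in rows:
        node = Node(name, level, value)
        if level == 0:
            prev = None               # 'root' and 'unclassified' are both top-level
        if prev is None:
            tops.append(node)
        elif level > prev.level:      # deeper indent: child of the previous row
            node.parent = prev
            prev.children.append(node)
        else:                         # walk back up to the nearest shallower row
            parent = prev
            while parent.parent is not None and level <= parent.level:
                parent = parent.parent
            node.parent = parent
            parent.children.append(node)
        prev = node
    return tops

def dfs(node):                        # preorder traversal, as in dfs_traversal()
    yield node
    for child in node.children:
        yield from dfs(child)

tops = build_tree([("root", 0, 100), ("cellular organisms", 1, 90), ("Viruses", 1, 10)])
print([n.name for n in dfs(tops[0])])  # ['root', 'cellular organisms', 'Viruses']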
From 63fc075654e698b0bdbf923f57b87683ee994360 Mon Sep 17 00:00:00 2001
From: Christopher Tomkins-Tinch
Date: Tue, 3 Dec 2019 09:45:05 -0500
Subject: [PATCH 02/11] update read counts and percentages in metagenomics.py::krakenuniq_report_filter

update read counts and percentages in
metagenomics.py::krakenuniq_report_filter to reflect nodes removed based on
filtering

---
 metagenomics.py | 44 ++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 40 insertions(+), 4 deletions(-)

diff --git a/metagenomics.py b/metagenomics.py
index 39e97eefe..f93d92834 100755
--- a/metagenomics.py
+++ b/metagenomics.py
@@ -1091,6 +1091,18 @@ def parser_krakenuniq_report_filter(parser=argparse.ArgumentParser()):
         help='The field to filter on (default: %(default)s).',
         default="uniq_kmers"
     )
+    parser.add_argument('--fieldToAdjust',
+        choices=[
+            "num_reads",
+            "uniq_kmers",
+            #"kmer_dups",
+            #"reads_exc_children",
+        ],
+        nargs="+",
+        dest="fields_to_adjust",
+        help='The field to adjust along with the --fieldToFilterOn (default: %(default)s).',
+        default=["num_reads"]
+    )
     parser.add_argument('--keepAboveN',
         type=int,
         dest="keep_threshold",
@@ -1103,7 +1115,7 @@
     util.cmd.attach_main(parser, krakenuniq_report_filter, split_args=True)
     return parser
 
-def krakenuniq_report_filter(summary_file_in, summary_file_out, field_to_filter, keep_threshold):
+def krakenuniq_report_filter(summary_file_in, summary_file_out, field_to_filter, keep_threshold, fields_to_adjust):
     """
         Filter a krakenuniq report by field to include rows above some threshold,
         where contributions to the value from subordinate levels
@@ -1296,7 +1308,7 @@ def dfs_traversal_prune(node, field_to_filter, keep_threshold):
         if int(n.data[field_to_filter])

From: Christopher Tomkins-Tinch
Date: Tue, 3 Dec 2019 10:25:23 -0500
Subject: [PATCH 03/11] parser_krakenuniq_report_filter help typo fix

---
 metagenomics.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/metagenomics.py b/metagenomics.py
index f93d92834..8d4fdc3d0 100755
--- a/metagenomics.py
+++ b/metagenomics.py
@@ -1078,8 +1078,8 @@ def filter_bam_to_taxa(in_bam, read_IDs_to_tax_IDs, out_bam,
 __commands__.append(('filter_bam_to_taxa', parser_filter_bam_to_taxa))
 
 def parser_krakenuniq_report_filter(parser=argparse.ArgumentParser()):
-    parser.add_argument('summary_file_in', help='Input KrakenUNiq-format summary text file with tab-delimited fields and indented taxonomic levels.')
-    parser.add_argument('summary_file_out', help='Output KrakenUNiq-format summary text file with tab-delimited fields and indented taxonomic levels.')
+    parser.add_argument('summary_file_in', help='Input KrakenUniq-format summary text file with tab-delimited fields and indented taxonomic levels.')
+    parser.add_argument('summary_file_out', help='Output KrakenUniq-format summary text file with tab-delimited fields and indented taxonomic levels.')
     parser.add_argument('--fieldToFilterOn',
         choices=[
             "num_reads",
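Patches 02 and 04 revolve around subtract_low_values_with_upward_propagation(), whose full body is not visible in these hunks. The sketch below shows the propagation idea the name describes -- when a low-value subtree is dropped, its contribution is subtracted from every ancestor so that the report's cumulative columns stay consistent. It reuses the illustrative Node/build_tree helpers from the sketch after patch 01 and is an assumption about the approach, not the patch's code.

# Drop child subtrees whose cumulative value falls below a threshold,
# subtracting each dropped subtree's value from all of its ancestors.
def subtract_and_prune(node, threshold):
    for child in list(node.children):
        subtract_and_prune(child, threshold)
        if child.value < threshold:
            node.children.remove(child)
            ancestor = node
            while ancestor is not None:   # propagate the subtraction upward
                ancestor.value -= child.value
                ancestor = ancestor.parent

root = build_tree([("root", 0, 100), ("kept", 1, 98), ("noise", 1, 2)])[0]
subtract_and_prune(root, threshold=5)
print(root.value, [c.name for c in root.children])  # 98 ['kept']

Whether a parent is re-tested against the threshold after its children's contributions have been subtracted is exactly the distinction the later "trimmed_at_first_pass" versus "trimmed_after_resumming" test fixtures are named for.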
""" - total_number_of_reads = float(node.data["num_reads"])/float(node.data["pct_of_reads"]) - if node.has_children: for child in node.children: subtract_low_values_with_upward_propagation(child, field_to_filter, keep_threshold, fields_to_adjust) From 82dac27437153fe3b2bdb92e4155a350b2dfac18 Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Tue, 3 Dec 2019 11:35:37 -0500 Subject: [PATCH 05/11] consistent util.fil.open_or_gzopen calls --- metagenomics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/metagenomics.py b/metagenomics.py index cffa3c64f..6313bb869 100755 --- a/metagenomics.py +++ b/metagenomics.py @@ -125,7 +125,7 @@ def load_names(self, names_db, scientific_only=True): names = {} else: names = collections.defaultdict(list) - for line in open_or_gzopen(names_db): + for line in util.file.open_or_gzopen(names_db): parts = line.strip().split('|') taxid = int(parts[0]) name = parts[1].strip() @@ -590,7 +590,7 @@ def filter_file(path, sep='\t', taxid_column=0, gi_column=None, a2t=False, heade input_path = maybe_compressed(input_path) with util.file.open_or_gzopen(input_path, 'rt') as f, \ - open_or_gzopen(output_path, 'wt') as out_f: + util.file.open_or_gzopen(output_path, 'wt') as out_f: if header: out_f.write(f.readline()) # Cannot use next(f) for python2 for line in f: From f153b0ebf77d1c7a3b0f0792a0b1c33716ac8fcd Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Fri, 6 Dec 2019 21:35:13 -0500 Subject: [PATCH 06/11] add new test type: assertApproxEqualValuesInDelimitedFiles add new test type: assertApproxEqualValuesInDelimitedFiles(self, file_one, file_two, dialect="tsv", numeric_rel_tol=1e-5, header_lines_to_skip=0 --- test/__init__.py | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/test/__init__.py b/test/__init__.py index 5baa40c86..1a12ebf81 100644 --- a/test/__init__.py +++ b/test/__init__.py @@ -10,6 +10,9 @@ import hashlib import logging import copy +import math +import csv +import itertools # third-party import Bio.SeqIO @@ -111,6 +114,47 @@ def inputs(self, *fnames): '''Return the full filenames for files in the test input directory for this test class''' return [self.input(fname) for fname in fnames] + def assertApproxEqualValuesInDelimitedFiles(self, file_one, file_two, dialect="tsv", numeric_rel_tol=1e-5, header_lines_to_skip=0): + def _is_number(s): + try: + complex(s) # for int, long, float and complex + except ValueError: + return False + return True + + csv.register_dialect('tsv', quoting=csv.QUOTE_MINIMAL, delimiter="\t") + csv.register_dialect('csv', quoting=csv.QUOTE_MINIMAL, delimiter=",") + + with util.file.open_or_gzopen(file_one, 'rU') as inf1, util.file.open_or_gzopen(file_two, 'rU') as inf2: + report_type=None + for line_num, (line1,line2) in enumerate(itertools.zip_longest(inf1,inf2)): + self.assertIsNotNone(line1, msg="%s appears to be shorter than %s" % (inf1, inf2)) + self.assertIsNotNone(line2, msg="%s appears to be shorter than %s" % (inf2, inf1)) + + # continue to this next pair of lines until we have + # skipped past the header + if line_num < header_lines_to_skip: + continue + else: + inf1_row = next(csv.reader([line1.strip().rstrip('\n')], dialect=dialect)) + inf2_row = next(csv.reader([line2.strip().rstrip('\n')], dialect=dialect)) + + # assume the rows have the same number of elements + self.assertTrue(len(inf1_row) == len(inf2_row), msg="Files have lines of different length on line %s %s %s" % (line_num, inf1_row, inf2_row)) + + for inf1_row_item, 
+                    for inf1_row_item, inf2_row_item in zip(inf1_row,inf2_row):
+                        # we assume that the item from the same position is a number
+                        # or not a number in both files
+                        self.assertTrue(_is_number(inf1_row_item)==_is_number(inf2_row_item))
+
+                        # if we're dealing with numbers, check that they're approximately equal
+                        if _is_number(inf1_row_item):
+                            assert float(inf2_row_item) == pytest.approx(float(inf1_row_item), rel=numeric_rel_tol)
+                        else:
+                            # otherwise we're probably dealing with a string
+                            self.assertEqual(inf2_row_item, inf1_row_item)
+
 
     def assertEqualSamHeaders(self, tested_samfile, expected_samfile, other_allowed_values=None):
         '''
         other_allowed_values is a dict that maps a key to a list of other accepted values
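The comparison loop above leans on pytest.approx, which also works in a plain assert outside a test method. A hypothetical standalone miniature of the same field-by-field logic (rows_approx_equal is not part of the test suite):

import pytest

def rows_approx_equal(row1, row2, rel=1e-5):
    def is_number(s):
        try:
            complex(s)   # accepts int, float, and complex literals
            return True
        except ValueError:
            return False
    if len(row1) != len(row2):
        return False
    for a, b in zip(row1, row2):
        if is_number(a) and is_number(b):
            if float(b) != pytest.approx(float(a), rel=rel):
                return False
        elif a != b:     # non-numeric fields must match exactly
            return False
    return True

assert rows_approx_equal(["71.38", "15692", "Chordata"],
                         ["71.380001", "15692", "Chordata"], rel=1e-3)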
From 1f3377e69a02c1e3d24050b16caf6955fc69f913 Mon Sep 17 00:00:00 2001
From: Christopher Tomkins-Tinch
Date: Fri, 6 Dec 2019 21:35:38 -0500
Subject: [PATCH 07/11] pad each level with single space since we aren't collapsing doubles

---
 metagenomics.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/metagenomics.py b/metagenomics.py
index f7422fadd..308a078da 100755
--- a/metagenomics.py
+++ b/metagenomics.py
@@ -1134,7 +1134,7 @@ def __init__(self, row, level):
         def __str__(self):
             # pad the name with the original prefix spacing
             # when rendering as a string
-            sci_name = "  "*self.level+self.data["sci_name"]
+            sci_name = " "*self.level+self.data["sci_name"]
             return "\t".join([str(value) for key,value in self.data.items() if key!="sci_name"]+[sci_name])
 
         def __repr__(self):

From f1bbad21a4d11a4dbc5c30eeac62d5a6430ba67b Mon Sep 17 00:00:00 2001
From: Christopher Tomkins-Tinch
Date: Fri, 6 Dec 2019 21:37:13 -0500
Subject: [PATCH 08/11] add test class TestKrakenUniqSummaryFilter

add test class TestKrakenUniqSummaryFilter, with first test case:
test_unchanged_report

---
 .../expected/should_be_unchanged.txt          | 37 +++++++++++++++++++
 ...ve_higher_node_trimmed_after_resumming.txt |  0
 ...have_higher_node_trimmed_at_first_pass.txt |  0
 .../expected/should_have_leaves_trimmed.txt   |  0
 .../input/should_be_unchanged.txt             | 37 +++++++++++++++++++
 ...ve_higher_node_trimmed_after_resumming.txt |  0
 ...have_higher_node_trimmed_at_first_pass.txt |  0
 .../input/should_have_leaves_trimmed.txt      |  0
 test/unit/test_metagenomics.py                | 18 +++++++++
 9 files changed, 92 insertions(+)
 create mode 100644 test/input/TestKrakenUniqSummaryFilter/expected/should_be_unchanged.txt
 create mode 100644 test/input/TestKrakenUniqSummaryFilter/expected/should_have_higher_node_trimmed_after_resumming.txt
 create mode 100644 test/input/TestKrakenUniqSummaryFilter/expected/should_have_higher_node_trimmed_at_first_pass.txt
 create mode 100644 test/input/TestKrakenUniqSummaryFilter/expected/should_have_leaves_trimmed.txt
 create mode 100644 test/input/TestKrakenUniqSummaryFilter/input/should_be_unchanged.txt
 create mode 100644 test/input/TestKrakenUniqSummaryFilter/input/should_have_higher_node_trimmed_after_resumming.txt
 create mode 100644 test/input/TestKrakenUniqSummaryFilter/input/should_have_higher_node_trimmed_at_first_pass.txt
 create mode 100644 test/input/TestKrakenUniqSummaryFilter/input/should_have_leaves_trimmed.txt

diff --git a/test/input/TestKrakenUniqSummaryFilter/expected/should_be_unchanged.txt b/test/input/TestKrakenUniqSummaryFilter/expected/should_be_unchanged.txt
new file mode 100644
index 000000000..a7868efec
--- /dev/null
+++ b/test/input/TestKrakenUniqSummaryFilter/expected/should_be_unchanged.txt
@@ -0,0 +1,37 @@
+# KrakenUniq v0.5.7 DATE:2019-11-21T16:03:15Z DB:/tmp/tmp.ohQ3jTMIQh/krakenuniq DB_SIZE:198130386856 WD:/home/dnanexus
+# CL:perl /opt/miniconda/envs/viral-ngs-env/bin/krakenuniq --db /tmp/tmp.ohQ3jTMIQh/krakenuniq --threads 32 --fastq-input --gzip-compressed --preload --report-file E023.l16.dedup.krakenuniq-summary_report.txt --paired --output E023.l16.dedup.krakenuniq-reads.txt.gz /tmp/tmp-metagenomics-krakenuniq-jfekrm6y/tmp0eg_wsp4.1.fastq.gz /tmp/tmp-metagenomics-krakenuniq-jfekrm6y/tmpx686dhd3.2.fastq.gz
+
+% reads taxReads kmers dup cov taxID rank taxName
+8.134 1788 1788 1193206 1.69 NA 0 no rank unclassified
+91.87 20195 4 356763 3 NA 1 no rank root
+71.38 15692 709 346092 2.29 NA 131567 no rank cellular organisms
+31.42 6906 0 153473 1.68 NA 2759 superkingdom Eukaryota
+31.42 6906 0 153473 1.68 NA 33154 no rank Opisthokonta
+31.42 6906 0 153473 1.68 NA 33208 kingdom Metazoa
+31.42 6906 0 153473 1.68 NA 6072 no rank Eumetazoa
+31.42 6906 0 153473 1.68 NA 33213 no rank Bilateria
+31.42 6906 0 153473 1.68 NA 33511 no rank Deuterostomia
+31.42 6906 0 153473 1.68 NA 7711 phylum Chordata
+31.42 6906 0 153473 1.68 NA 89593 subphylum Craniata
+31.42 6906 0 153473 1.68 NA 7742 no rank Vertebrata
+31.42 6906 0 153473 1.68 NA 7776 no rank Gnathostomata
+31.42 6906 0 153473 1.68 NA 117570 no rank Teleostomi
+31.42 6906 0 153473 1.68 NA 117571 no rank Euteleostomi
+31.42 6906 0 153473 1.68 NA 8287 superclass Sarcopterygii
+31.42 6906 0 153473 1.68 NA 1338369 no rank Dipnotetrapodomorpha
+31.42 6906 0 153473 1.68 NA 32523 no rank Tetrapoda
+31.42 6906 0 153473 1.68 NA 32524 no rank Amniota
+31.42 6906 0 153473 1.68 NA 40674 class Mammalia
+31.42 6906 0 153473 1.68 NA 32525 no rank Theria
+31.42 6906 0 153473 1.68 NA 9347 no rank Eutheria
+31.42 6906 0 153473 1.68 NA 1437010 no rank Boreoeutheria
+31.42 6906 0 153473 1.68 NA 314146 superorder Euarchontoglires
+31.42 6906 0 153473 1.68 NA 9443 order Primates
+31.42 6906 0 153473 1.68 NA 376913 suborder Haplorrhini
+31.42 6906 0 153473 1.68 NA 314293 infraorder Simiiformes
+31.42 6906 0 153473 1.68 NA 9526 parvorder Catarrhini
+31.42 6906 0 153473 1.68 NA 314295 superfamily Hominoidea
+31.42 6906 0 153473 1.68 NA 9604 family Hominidae
+31.42 6906 0 153473 1.68 NA 207598 subfamily Homininae
+31.42 6906 0 153473 1.68 NA 9605 genus Homo
+31.42 6906 6906 153473 1.68 NA 9606 species Homo sapiens
diff --git a/test/input/TestKrakenUniqSummaryFilter/expected/should_have_higher_node_trimmed_after_resumming.txt b/test/input/TestKrakenUniqSummaryFilter/expected/should_have_higher_node_trimmed_after_resumming.txt
new file mode 100644
index 000000000..e69de29bb
diff --git a/test/input/TestKrakenUniqSummaryFilter/expected/should_have_higher_node_trimmed_at_first_pass.txt b/test/input/TestKrakenUniqSummaryFilter/expected/should_have_higher_node_trimmed_at_first_pass.txt
new file mode 100644
index 000000000..e69de29bb
diff --git a/test/input/TestKrakenUniqSummaryFilter/expected/should_have_leaves_trimmed.txt b/test/input/TestKrakenUniqSummaryFilter/expected/should_have_leaves_trimmed.txt
new file mode 100644
index 000000000..e69de29bb
diff --git a/test/input/TestKrakenUniqSummaryFilter/input/should_be_unchanged.txt b/test/input/TestKrakenUniqSummaryFilter/input/should_be_unchanged.txt
new file mode 100644
index 000000000..a7868efec
--- /dev/null
+++ b/test/input/TestKrakenUniqSummaryFilter/input/should_be_unchanged.txt
@@ -0,0 +1,37 @@
+# KrakenUniq v0.5.7 DATE:2019-11-21T16:03:15Z DB:/tmp/tmp.ohQ3jTMIQh/krakenuniq DB_SIZE:198130386856 WD:/home/dnanexus
+# CL:perl /opt/miniconda/envs/viral-ngs-env/bin/krakenuniq --db /tmp/tmp.ohQ3jTMIQh/krakenuniq --threads 32 --fastq-input --gzip-compressed --preload --report-file E023.l16.dedup.krakenuniq-summary_report.txt --paired --output E023.l16.dedup.krakenuniq-reads.txt.gz /tmp/tmp-metagenomics-krakenuniq-jfekrm6y/tmp0eg_wsp4.1.fastq.gz /tmp/tmp-metagenomics-krakenuniq-jfekrm6y/tmpx686dhd3.2.fastq.gz
+
+% reads taxReads kmers dup cov taxID rank taxName
+8.134 1788 1788 1193206 1.69 NA 0 no rank unclassified
+91.87 20195 4 356763 3 NA 1 no rank root
+71.38 15692 709 346092 2.29 NA 131567 no rank cellular organisms
+31.42 6906 0 153473 1.68 NA 2759 superkingdom Eukaryota
+31.42 6906 0 153473 1.68 NA 33154 no rank Opisthokonta
+31.42 6906 0 153473 1.68 NA 33208 kingdom Metazoa
+31.42 6906 0 153473 1.68 NA 6072 no rank Eumetazoa
+31.42 6906 0 153473 1.68 NA 33213 no rank Bilateria
+31.42 6906 0 153473 1.68 NA 33511 no rank Deuterostomia
+31.42 6906 0 153473 1.68 NA 7711 phylum Chordata
+31.42 6906 0 153473 1.68 NA 89593 subphylum Craniata
+31.42 6906 0 153473 1.68 NA 7742 no rank Vertebrata
+31.42 6906 0 153473 1.68 NA 7776 no rank Gnathostomata
+31.42 6906 0 153473 1.68 NA 117570 no rank Teleostomi
+31.42 6906 0 153473 1.68 NA 117571 no rank Euteleostomi
+31.42 6906 0 153473 1.68 NA 8287 superclass Sarcopterygii
+31.42 6906 0 153473 1.68 NA 1338369 no rank Dipnotetrapodomorpha
+31.42 6906 0 153473 1.68 NA 32523 no rank Tetrapoda
+31.42 6906 0 153473 1.68 NA 32524 no rank Amniota
+31.42 6906 0 153473 1.68 NA 40674 class Mammalia
+31.42 6906 0 153473 1.68 NA 32525 no rank Theria
+31.42 6906 0 153473 1.68 NA 9347 no rank Eutheria
+31.42 6906 0 153473 1.68 NA 1437010 no rank Boreoeutheria
+31.42 6906 0 153473 1.68 NA 314146 superorder Euarchontoglires
+31.42 6906 0 153473 1.68 NA 9443 order Primates
+31.42 6906 0 153473 1.68 NA 376913 suborder Haplorrhini
+31.42 6906 0 153473 1.68 NA 314293 infraorder Simiiformes
+31.42 6906 0 153473 1.68 NA 9526 parvorder Catarrhini
+31.42 6906 0 153473 1.68 NA 314295 superfamily Hominoidea
+31.42 6906 0 153473 1.68 NA 9604 family Hominidae
+31.42 6906 0 153473 1.68 NA 207598 subfamily Homininae
+31.42 6906 0 153473 1.68 NA 9605 genus Homo
+31.42 6906 6906 153473 1.68 NA 9606 species Homo sapiens
diff --git a/test/input/TestKrakenUniqSummaryFilter/input/should_have_higher_node_trimmed_after_resumming.txt b/test/input/TestKrakenUniqSummaryFilter/input/should_have_higher_node_trimmed_after_resumming.txt
new file mode 100644
index 000000000..e69de29bb
diff --git a/test/input/TestKrakenUniqSummaryFilter/input/should_have_higher_node_trimmed_at_first_pass.txt b/test/input/TestKrakenUniqSummaryFilter/input/should_have_higher_node_trimmed_at_first_pass.txt
new file mode 100644
index 000000000..e69de29bb
diff --git a/test/input/TestKrakenUniqSummaryFilter/input/should_have_leaves_trimmed.txt b/test/input/TestKrakenUniqSummaryFilter/input/should_have_leaves_trimmed.txt
new file mode 100644
index 000000000..e69de29bb
diff --git a/test/unit/test_metagenomics.py b/test/unit/test_metagenomics.py
index 63bc18237..a7f724ccf 100644
--- a/test/unit/test_metagenomics.py
+++ b/test/unit/test_metagenomics.py
@@ -29,6 +29,24 @@ def test_help_parser_for_each_command(self):
             parser = parser_fun(argparse.ArgumentParser())
             helpstring = parser.format_help()
 
+class TestKrakenUniqSummaryFilter(TestCaseWithTmp):
+
+    def test_unchanged_report(self):
+        test_files_dir = util.file.get_test_input_path(self)
+        in_report = os.path.join(test_files_dir, 'input', 'should_be_unchanged.txt')
+        out_report = util.file.mkstempfname('.txt')
+
+        parser = metagenomics.parser_krakenuniq_report_filter(argparse.ArgumentParser())
+        args = parser.parse_args([in_report, out_report])
+        args.func_main(args)
+
+        # Check that results match (roughly) with what is expected
+        expected_out = os.path.join(test_files_dir, 'expected', 'should_be_unchanged.txt')
+        self.assertApproxEqualValuesInDelimitedFiles(out_report,
+                                                     expected_out,
+                                                     dialect="tsv",
+                                                     numeric_rel_tol=1e-3,
+                                                     header_lines_to_skip=3)
 
 class TestKronaCalls(TestCaseWithTmp):
 
     def setUp(self):
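The test drives the new subcommand through its argparse parser rather than through a subprocess, which keeps it fast and importable; args.func_main is attached by util.cmd.attach_main. The same entry point accepts the optional flags added in patch 02 -- a hypothetical invocation sketch (the file names are placeholders):

# Hypothetical driver mirroring the test's dispatch path.
import argparse
import metagenomics

parser = metagenomics.parser_krakenuniq_report_filter(argparse.ArgumentParser())
args = parser.parse_args([
    "report_in.txt",                    # summary_file_in (placeholder path)
    "report_out.txt",                   # summary_file_out (placeholder path)
    "--fieldToFilterOn", "uniq_kmers",  # also the default filter field
    "--keepAboveN", "100",              # keep rows whose unique k-mer count passes the threshold
])
args.func_main(args)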
From 37424388cfb0aa19d38361d8e47d7d7424c69186 Mon Sep 17 00:00:00 2001
From: Christopher Tomkins-Tinch
Date: Fri, 6 Dec 2019 21:55:16 -0500
Subject: [PATCH 09/11] add test TestKrakenUniqSummaryFilter::test_leaves_trimmed

---
 .../expected/should_have_leaves_trimmed.txt | 13 +++++++++++
 .../input/should_have_leaves_trimmed.txt    | 19 ++++++++++++++++
 test/unit/test_metagenomics.py              | 22 +++++++++++++++----
 3 files changed, 50 insertions(+), 4 deletions(-)

diff --git a/test/input/TestKrakenUniqSummaryFilter/expected/should_have_leaves_trimmed.txt b/test/input/TestKrakenUniqSummaryFilter/expected/should_have_leaves_trimmed.txt
index e69de29bb..69a67aac9 100644
--- a/test/input/TestKrakenUniqSummaryFilter/expected/should_have_leaves_trimmed.txt
+++ b/test/input/TestKrakenUniqSummaryFilter/expected/should_have_leaves_trimmed.txt
@@ -0,0 +1,13 @@
+# KrakenUniq v0.5.7 DATE:2019-11-21T16:03:15Z DB:/tmp/tmp.ohQ3jTMIQh/krakenuniq DB_SIZE:198130386856 WD:/home/dnanexus
+# CL:perl /opt/miniconda/envs/viral-ngs-env/bin/krakenuniq --db /tmp/tmp.ohQ3jTMIQh/krakenuniq --threads 32 --fastq-input --gzip-compressed --preload --report-file E023.l16.dedup.krakenuniq-summary_report.txt --paired --output E023.l16.dedup.krakenuniq-reads.txt.gz /tmp/tmp-metagenomics-krakenuniq-jfekrm6y/tmp0eg_wsp4.1.fastq.gz /tmp/tmp-metagenomics-krakenuniq-jfekrm6y/tmpx686dhd3.2.fastq.gz
+
+% reads taxReads kmers dup cov taxID rank taxName
+27.810893206534104 6112 6112 1193206 1.69 NA 0 no rank unclassified
+72.1891067934659 15865 11368 349850 3 NA 1 no rank root
+20.46230149701961 4497 0 7949 31.5 NA 10239 superkingdom Viruses
+20.46230149701961 4497 0 7930 32 NA 439488 no rank ssRNA viruses
+20.46230149701961 4497 0 7930 32 NA 35301 no rank ssRNA negative-strand viruses
+20.46230149701961 4497 0 7930 32 NA 11157 order Mononegavirales
+20.46230149701961 4497 0 7930 32 NA 11266 family Filoviridae
+20.46230149701961 4497 0 7930 32 NA 186536 genus Ebolavirus
+20.46230149701961 4497 4497 7916 32 NA 186538 species Zaire ebolavirus
diff --git a/test/input/TestKrakenUniqSummaryFilter/input/should_have_leaves_trimmed.txt b/test/input/TestKrakenUniqSummaryFilter/input/should_have_leaves_trimmed.txt
index e69de29bb..4981a88bc 100644
--- a/test/input/TestKrakenUniqSummaryFilter/input/should_have_leaves_trimmed.txt
+++ b/test/input/TestKrakenUniqSummaryFilter/input/should_have_leaves_trimmed.txt
@@ -0,0 +1,19 @@
+# KrakenUniq v0.5.7 DATE:2019-11-21T16:03:15Z DB:/tmp/tmp.ohQ3jTMIQh/krakenuniq DB_SIZE:198130386856 WD:/home/dnanexus
+# CL:perl /opt/miniconda/envs/viral-ngs-env/bin/krakenuniq --db /tmp/tmp.ohQ3jTMIQh/krakenuniq --threads 32 --fastq-input --gzip-compressed --preload --report-file E023.l16.dedup.krakenuniq-summary_report.txt --paired --output E023.l16.dedup.krakenuniq-reads.txt.gz /tmp/tmp-metagenomics-krakenuniq-jfekrm6y/tmp0eg_wsp4.1.fastq.gz /tmp/tmp-metagenomics-krakenuniq-jfekrm6y/tmpx686dhd3.2.fastq.gz
+
+% reads taxReads kmers dup cov taxID rank taxName
+27.81 6112 6112 1193206 1.69 NA 0 no rank unclassified
+72.19 15867 11368 349960 3 NA 1 no rank root
+20.47 4499 0 8059 31.5 NA 10239 superkingdom Viruses
+20.46 4497 0 7930 32 NA 439488 no rank ssRNA viruses
+20.46 4497 0 7930 32 NA 35301 no rank ssRNA negative-strand viruses
+20.46 4497 0 7930 32 NA 11157 order Mononegavirales
+20.46 4497 0 7930 32 NA 11266 family Filoviridae
+20.46 4497 0 7930 32 NA 186536 genus Ebolavirus
+20.46 4497 4497 7916 32 NA 186538 species Zaire ebolavirus
+0.009098 2 0 110 1.71 NA 35237 no rank dsDNA viruses, no RNA stage
+0.009098 2 0 100 1.6 NA 28883 order Caudovirales
+0.009098 2 0 93 1.65 NA 10699 family Siphoviridae
+0.009098 2 0 11 1.73 NA 1982251 genus Pa6virus
+0.009098 2 0 8 1.62 NA 1982272 species Propionibacterium virus PAS50
+0.009098 2 2 8 1.62 NA 504553 no rank Propionibacterium phage PAS50
diff --git a/test/unit/test_metagenomics.py b/test/unit/test_metagenomics.py
index a7f724ccf..28c9b4850 100644
--- a/test/unit/test_metagenomics.py
+++ b/test/unit/test_metagenomics.py
@@ -30,10 +30,10 @@ def test_help_parser_for_each_command(self):
             helpstring = parser.format_help()
 
 class TestKrakenUniqSummaryFilter(TestCaseWithTmp):
-
-    def test_unchanged_report(self):
+
+    def _test_report(self, report_basename):
         test_files_dir = util.file.get_test_input_path(self)
-        in_report = os.path.join(test_files_dir, 'input', 'should_be_unchanged.txt')
+        in_report = os.path.join(test_files_dir, 'input', report_basename)
         out_report = util.file.mkstempfname('.txt')
 
         parser = metagenomics.parser_krakenuniq_report_filter(argparse.ArgumentParser())
@@ -41,13 +41,27 @@ def test_unchanged_report(self):
         args.func_main(args)
 
         # Check that results match (roughly) with what is expected
-        expected_out = os.path.join(test_files_dir, 'expected', 'should_be_unchanged.txt')
+        expected_out = os.path.join(test_files_dir, 'expected', report_basename)
         self.assertApproxEqualValuesInDelimitedFiles(out_report,
                                                      expected_out,
                                                      dialect="tsv",
                                                      numeric_rel_tol=1e-3,
                                                      header_lines_to_skip=3)
 
+    def test_unchanged_report(self):
+        self._test_report("should_be_unchanged.txt")
+
+    def test_leaves_trimmed(self):
+        self._test_report("should_have_leaves_trimmed.txt")
+
+    #def test_higher_node_trimmed_after_resumming(self):
+    #    self._test_report("should_have_higher_node_trimmed_after_resumming.txt")
+
+    #def test_higher_node_trimmed_at_first_pass(self):
+    #    self._test_report("should_have_higher_node_trimmed_at_first_pass.txt")
+
+
 
 class TestKronaCalls(TestCaseWithTmp):
 
     def setUp(self):
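The numbers in the expected fixture follow directly from the input fixture: the two-read "dsDNA viruses, no RNA stage" phage branch (2 reads and 110 k-mers at its top) falls below the filter, its reads and k-mers are subtracted from every ancestor, and percentages are recomputed over the new total. A quick check of that arithmetic:

# Recompute the expected root/Viruses rows from the input fixture values.
unclassified_reads = 6112
root_reads, root_kmers = 15867, 349960
viruses_reads, viruses_kmers = 4499, 8059
removed_reads, removed_kmers = 2, 110        # top of the pruned phage branch

root_reads -= removed_reads                  # 15865
root_kmers -= removed_kmers                  # 349850
viruses_reads -= removed_reads               # 4497
viruses_kmers -= removed_kmers               # 7949

total = unclassified_reads + root_reads      # 21977 classified + unclassified fragments
print(100.0 * root_reads / total)            # ~72.1891067934659, as in the fixture
print(100.0 * viruses_reads / total)         # ~20.46230149701961
print(100.0 * unclassified_reads / total)    # ~27.810893206534104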
From e94a745b24bca70cd3456253bf7b48cf69289250 Mon Sep 17 00:00:00 2001
From: Christopher Tomkins-Tinch
Date: Mon, 9 Dec 2019 10:12:56 -0500
Subject: [PATCH 10/11] extend assertApproxEqualValuesInDelimitedFiles

extend assertApproxEqualValuesInDelimitedFiles() test to use dicts in the
case where a header line of field names is available so value comparisons
can be made independent of column order. This is only used if
use_first_processed_line_for_fieldnames=True.

is_number() moved to util.misc; zip_dicts() added to util.misc

---
 test/__init__.py               | 52 +++++++++++++++++++++++++---------
 test/unit/test_metagenomics.py |  3 +-
 util/misc.py                   | 17 +++++++++++
 3 files changed, 58 insertions(+), 14 deletions(-)

diff --git a/test/__init__.py b/test/__init__.py
index 1a12ebf81..1a398f037 100644
--- a/test/__init__.py
+++ b/test/__init__.py
@@ -21,7 +21,7 @@
 
 # intra-project
 import util.file
-from util.misc import available_cpu_count
+from util.misc import available_cpu_count, zip_dicts, is_number
 from tools.samtools import SamtoolsTool
 
 logging.getLogger('botocore').setLevel(logging.WARNING)
@@ -114,13 +114,19 @@ def inputs(self, *fnames):
         '''Return the full filenames for files in the test input directory for this test class'''
         return [self.input(fname) for fname in fnames]
 
-    def assertApproxEqualValuesInDelimitedFiles(self, file_one, file_two, dialect="tsv", numeric_rel_tol=1e-5, header_lines_to_skip=0):
-        def _is_number(s):
-            try:
-                complex(s) # for int, long, float and complex
-            except ValueError:
-                return False
-            return True
+    def assertApproxEqualValuesInDelimitedFiles(self, file_one, file_two, dialect="tsv", numeric_rel_tol=1e-5, header_lines_to_skip=0, use_first_processed_line_for_fieldnames=False):
+        '''
+        This test checks whether two delimited (tsv, csv, or custom dialect) files
+        are approximately the same, by comparing the values present at each line
+        with a configurable tolerance for comparison of numeric values, and exact
+        comparison of string values. A specified number of header lines can be skipped
+        before the comparison begins, to account for lines written by some tools
+        for metadata, invocation params, etc.
+        If use_first_processed_line_for_fieldnames is specified, the columns can be
+        in any order since the rows are parsed as dicts; otherwise the columns are
+        assumed to be in the same order in both of the files.
+        '''
+        header_fieldnames=None
 
         csv.register_dialect('tsv', quoting=csv.QUOTE_MINIMAL, delimiter="\t")
         csv.register_dialect('csv', quoting=csv.QUOTE_MINIMAL, delimiter=",")
@@ -136,19 +142,39 @@ def _is_number(s):
-                    inf1_row = next(csv.reader([line1.strip().rstrip('\n')], dialect=dialect))
-                    inf2_row = next(csv.reader([line2.strip().rstrip('\n')], dialect=dialect))
+                    if header_fieldnames is None and use_first_processed_line_for_fieldnames:
+                        inf1_row = next(csv.reader([line1.strip().rstrip('\n')], dialect=dialect))
+                        inf2_row = next(csv.reader([line2.strip().rstrip('\n')], dialect=dialect))
+                        self.assertEqual(inf1_row,inf2_row, msg="header lines are not the same")
+                        header_fieldnames=inf1_row
+                        continue
+
+                    # if the header names are defined
+                    if header_fieldnames is not None:
+                        inf1_row = next(csv.DictReader([line1.strip().rstrip('\n')], dialect=dialect, fieldnames=header_fieldnames))
+                        inf2_row = next(csv.DictReader([line2.strip().rstrip('\n')], dialect=dialect, fieldnames=header_fieldnames))
+                    else:
+                        inf1_row = next(csv.reader([line1.strip().rstrip('\n')], dialect=dialect))
+                        inf2_row = next(csv.reader([line2.strip().rstrip('\n')], dialect=dialect))
 
                     # assume the rows have the same number of elements
                     self.assertTrue(len(inf1_row) == len(inf2_row), msg="Files have lines of different length on line %s %s %s" % (line_num, inf1_row, inf2_row))
 
-                    for inf1_row_item, inf2_row_item in zip(inf1_row,inf2_row):
+                    if header_fieldnames is not None:
+                        def _dict_values_only(dict1_in, dict2_in):
+                            for key, values in zip_dicts(dict1_in,dict2_in):
+                                yield values
+                        items_to_compare=_dict_values_only(inf1_row,inf2_row)
+                    else:
+                        items_to_compare=zip(inf1_row,inf2_row)
+
+                    for inf1_row_item, inf2_row_item in items_to_compare:
                         # we assume that the item from the same position is a number
                         # or not a number in both files
-                        self.assertTrue(_is_number(inf1_row_item)==_is_number(inf2_row_item))
+                        self.assertTrue(is_number(inf1_row_item)==is_number(inf2_row_item))
 
                         # if we're dealing with numbers, check that they're approximately equal
-                        if _is_number(inf1_row_item):
+                        if is_number(inf1_row_item):
                             assert float(inf2_row_item) == pytest.approx(float(inf1_row_item), rel=numeric_rel_tol)
                         else:
                             # otherwise we're probably dealing with a string
diff --git a/test/unit/test_metagenomics.py b/test/unit/test_metagenomics.py
index 28c9b4850..922620432 100644
--- a/test/unit/test_metagenomics.py
+++ b/test/unit/test_metagenomics.py
@@ -46,7 +46,8 @@ def _test_report(self, report_basename):
                                                      expected_out,
                                                      dialect="tsv",
                                                      numeric_rel_tol=1e-3,
-                                                     header_lines_to_skip=3)
+                                                     header_lines_to_skip=3,
+                                                     use_first_processed_line_for_fieldnames=True)
 
     def test_unchanged_report(self):
         self._test_report("should_be_unchanged.txt")
diff --git a/util/misc.py b/util/misc.py
index ce8a8043a..fec0f8b2f 100644
--- a/util/misc.py
+++ b/util/misc.py
@@ -50,6 +50,16 @@ def unique(items):
             seen.add(i)
         yield i
 
+def is_number(s):
+    """
+    If the string can be parsed as an
+    int, long, float or complex number, return True
+    """
+    try:
+        complex(s)
+    except ValueError:
+        return False
+    return True
 
 def histogram(items):
     ''' I count the number of times I see stuff and return a dict of counts.
     '''
@@ -614,6 +624,13 @@ def as_type(val, types):
             pass
     raise TypeError('Could not convert {} to any of {}: {}'.format(val, types, errs))
 
+def zip_dicts(*dicts):
+    keys_sets = map(set, dicts)
+    common_keys = functools.reduce(set.intersection, keys_sets)
+    for key in common_keys:
+        #yield (key,) + tuple(map(operator.itemgetter(key), dicts))
+        yield key, tuple(map(operator.itemgetter(key), dicts))
+
 def subdict(d, keys):
     """Return a newly allocated shallow copy of a mapping `d` restricted to keys in `keys`."""
     d = dict(d)

From bb7fb489c005a15b115d6bbed4d7befeeedd29f6 Mon Sep 17 00:00:00 2001
From: Christopher Tomkins-Tinch
Date: Tue, 10 Dec 2019 12:54:42 -0500
Subject: [PATCH 11/11] regex raw string

---
 metagenomics.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/metagenomics.py b/metagenomics.py
index 308a078da..b42503753 100755
--- a/metagenomics.py
+++ b/metagenomics.py
@@ -962,7 +962,7 @@ def fasta_library_accessions(library):
         for seqr in SeqIO.parse(filepath, 'fasta'):
             name = seqr.name
             # Search for accession
-            mo = re.search('([A-Z]+_?\d+\.\d+)', name)
+            mo = re.search(r'([A-Z]+_?\d+\.\d+)', name)
             if mo:
                 accession = mo.group(1)
                 library_accessions.add(accession)
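A closing note on the util.misc helpers from patch 10: zip_dicts() yields, for each key common to all of its input dicts, the key plus a tuple of the corresponding values -- which is what lets the assertion compare columns by field name rather than by position. A small illustrative use (row_a and row_b are made-up rows, not fixture data):

import functools
import operator

def zip_dicts(*dicts):
    # for every key present in each dict, yield (key, (value_from_each_dict, ...))
    common_keys = functools.reduce(set.intersection, map(set, dicts))
    for key in common_keys:
        yield key, tuple(map(operator.itemgetter(key), dicts))

row_a = {"num_reads": "15865", "rank": "no rank"}
row_b = {"rank": "no rank", "num_reads": "15865.2"}   # same fields, different order
for key, (v1, v2) in zip_dicts(row_a, row_b):
    print(key, v1, v2)   # key order is unspecified since common_keys is a set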