11#!/usr/bin/env python3
22
33import argparse
4- import pandas as pd
5- import numpy as np
4+ import csv
5+ import json
6+ import sys
67
7- def main (args ):
8- cols = 'subject_taxids,lineage,name,rank' .split (',' )
9- #taxon_results = pd.read_csv('F1910235_taxon_results.txt',sep = '\t', names=cols)
10- taxon_results = pd .read_csv (args .taxonresult ,sep = '\t ' , names = cols )
118
12- blast_results = pd .read_csv (args .blastresult ,sep = ',' )
13- taxon_results = taxon_results .dropna ()
def parse_taxonkit_lineage(taxonkit_path):
    """
    Parse taxonkit lineage outputs

    The file is read as tab-delimited text with a header row. The columns
    ``query_taxid``, ``lineage``, ``lineage_taxids`` and ``lineage_ranks``
    are read; the last three are ';'-separated lists that are paired up
    position by position.

    :param taxonkit_path: path to taxonkit lineage output
    :type taxonkit_path: str
    :return: lineage records, by query taxid. dict of dicts. Every inner dict
             has the key 'query_taxid'. The keys 'species_taxid' and
             'species_name' are present only when the lineage has an entry
             ranked 'species'; 'genus_taxid' and 'genus_name' only when it
             has an entry ranked 'genus'.
    :rtype: dict[str, dict[str, str]]
    :raises KeyError: if one of the columns listed above is missing from the header
    """
    taxonkit_lineage_by_taxid = {}
    # newline='' lets the csv module do its own newline handling.
    with open(taxonkit_path, 'r', newline='') as f:
        # The delimiter must be exactly one character (a single tab).
        reader = csv.DictReader(f, delimiter='\t')
        for row in reader:
            query_taxid = row['query_taxid']
            names = row['lineage'].split(';')
            taxids = row['lineage_taxids'].split(';')
            ranks = row['lineage_ranks'].split(';')

            taxonkit_lineage_record = {'query_taxid': query_taxid}
            # Only the genus and species levels of the lineage are kept.
            # If a rank occurs more than once, the last occurrence wins.
            for rank, taxid, name in zip(ranks, taxids, names):
                if rank in ('species', 'genus'):
                    taxonkit_lineage_record[rank + '_taxid'] = taxid
                    taxonkit_lineage_record[rank + '_name'] = name

            # A later row with the same query taxid replaces an earlier one.
            taxonkit_lineage_by_taxid[query_taxid] = taxonkit_lineage_record

    return taxonkit_lineage_by_taxid
2144
22- ]
2345
24- choices_species = [None , taxon_results ['lineage' ].apply (lambda x : x .split (';' )[- 1 ]), taxon_results ['lineage' ].apply (lambda x : x .split (';' )[- 2 ])]
25- choices_genus = [taxon_results ['lineage' ].apply (lambda x : x .split (';' )[- 1 ]), taxon_results ['lineage' ].apply (lambda x : x .split (';' )[- 2 ]),taxon_results ['lineage' ].apply (lambda x : x .split (';' )[- 3 ])]
def parse_blast_results(blast_results_path):
    """
    Parse blast results

    The file is read as comma-delimited text whose first row is the header.

    :param blast_results_path: path to blast results
    :type blast_results_path: str
    :return: the header field names in file order, and the data rows as
             dicts keyed by those field names. An empty file gives
             ``([], [])``.
    :rtype: tuple[list[str], list[dict[str, str]]]
    """
    # newline='' lets the csv module do its own newline handling.
    with open(blast_results_path, 'r', newline='') as f:
        reader = csv.DictReader(f)
        blast_results = list(reader)
        # Take the header from the reader instead of splitting the first line
        # by hand, so the names always match the keys of the row dicts (also
        # for quoted header fields). fieldnames is None for an empty file.
        header_fieldnames = list(reader.fieldnames or [])

    return header_fieldnames, blast_results
2666
27- taxon_results [ 'species' ] = np . select ( conditions , choices_species , default = taxon_results [ 'lineage' ]. apply ( lambda x : x . split ( ';' )[ - 1 ]))
28- taxon_results [ 'genus' ] = np . select ( conditions , choices_genus , default = taxon_results [ 'lineage' ]. apply ( lambda x : x . split ( ';' )[ - 2 ]))
67+
def main(args):
    """
    Add genus and species names to blast results and write them to stdout as csv

    Each blast result is looked up in the taxonkit lineages by its
    ``subject_taxids`` value. Results without a matching lineage, or whose
    lineage has no genus/species entry, get an empty ``genus``/``species``
    field in the output.

    :param args: parsed command-line arguments; ``args.taxonresult`` and
                 ``args.blastresult`` are read
    :type args: argparse.Namespace
    """
    taxonkit_lineage_by_taxid = parse_taxonkit_lineage(args.taxonresult)

    blast_fieldnames, blast_results = parse_blast_results(args.blastresult)

    for blast_result in blast_results:
        # An empty record stands in for taxids without a parsed lineage, so
        # both fields fall back to None (written out as an empty field).
        lineage = taxonkit_lineage_by_taxid.get(blast_result['subject_taxids'], {})
        blast_result['species'] = lineage.get('species_name')
        blast_result['genus'] = lineage.get('genus_name')

    # Append the new columns after the input columns, without duplicating a
    # column the blast results already had.
    output_fieldnames = blast_fieldnames + [
        fieldname
        for fieldname in ('genus', 'species')
        if fieldname not in blast_fieldnames
    ]
    writer = csv.DictWriter(
        sys.stdout,
        fieldnames=output_fieldnames,
        delimiter=',',
        lineterminator='\n',
        quoting=csv.QUOTE_MINIMAL,
        extrasaction='ignore',
    )
    writer.writeheader()
    writer.writerows(blast_results)
91+
3492
35- filtered_merged .to_csv (args .outfile )
3693
# Command-line entry point: parse the two input paths and hand them to main(),
# which writes its result to stdout.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    # Both paths are opened unconditionally by main(), so require them here
    # to get a usage error instead of a failure inside open().
    parser.add_argument('-f', '--taxonresult', required=True, help='path to taxonkit lineage output')
    parser.add_argument('-b', '--blastresult', required=True, help='path to blast results (csv)')
    args = parser.parse_args()
    main(args)
0 commit comments