33`BLAST`_ reader for output generated with outfmt 7 (preferred), 6 or 10
44"""
55
6- from collections import namedtuple
7- from sugar .core .fts import Feature , FeatureList , Location
8- from sugar .core .meta import Meta
6+ from sugar ._io .tab .core import read_tabular
97from sugar ._io .util import _add_fmt_doc
108
119
12- _BHV = namedtuple ('_BlastHeaderValue' , 'name long_name type' )
13- _MHV = namedtuple ('_MMseqsHeaderValue' , 'name type blast_equivalent' )
14- _IHV = namedtuple ('_InfernalHeaderValue' , 'name type blast_equivalent' )
15- _HEADER = {
16- 'blast' : [
17- _BHV ('qseqid' , 'query id' , str ),
18- _BHV ('qgi' , 'query gi' , str ),
19- _BHV ('qacc' , 'query acc.' , str ),
20- _BHV ('qaccver' , 'query acc.ver' , str ),
21- _BHV ('qlen' , 'query length' , int ),
22- _BHV ('sseqid' , 'subject id' , str ),
23- _BHV ('sallseqid' , 'subject ids' , str ),
24- _BHV ('sgi' , 'subject gi' , str ),
25- _BHV ('sallgi' , 'subject gis' , str ),
26- _BHV ('sacc' , 'subject acc.' , str ),
27- _BHV ('saccver' , 'subject acc.ver' , str ),
28- _BHV ('sallacc' , 'subject accs.' , str ),
29- _BHV ('slen' , 'subject length' , int ),
30- _BHV ('qstart' , 'q. start' , int ),
31- _BHV ('qend' , 'q. end' , int ),
32- _BHV ('sstart' , 's. start' , int ),
33- _BHV ('send' , 's. end' , int ),
34- _BHV ('qseq' , 'query seq' , str ),
35- _BHV ('sseq' , 'subject seq' , str ),
36- _BHV ('evalue' , 'evalue' , float ),
37- _BHV ('bitscore' , 'bit score' , float ),
38- _BHV ('score' , 'score' , float ),
39- _BHV ('length' , 'alignment length' , int ),
40- _BHV ('pident' , '% identity' , float ),
41- _BHV ('nident' , 'identical' , int ),
42- _BHV ('mismatch' , 'mismatches' , int ),
43- _BHV ('positive' , 'positives' , int ),
44- _BHV ('gapopen' , 'gap opens' , int ),
45- _BHV ('gaps' , 'gaps' , int ),
46- _BHV ('ppos' , '% positives' , float ),
47- _BHV ('frames' , 'query/sbjct frames' , str ),
48- _BHV ('qframe' , 'query frame' , int ),
49- _BHV ('sframe' , 'sbjct frame' , int ),
50- _BHV ('btop' , 'BTOP' , str ),
51- _BHV ('staxid' , 'subject tax id' , str ),
52- _BHV ('ssciname' , 'subject sci name' , str ),
53- _BHV ('scomname' , 'subject com name' , str ),
54- _BHV ('sblastname' , 'subject blast name' , str ),
55- _BHV ('sskingdom' , 'subject super kingdom' , str ),
56- _BHV ('staxids' , 'subject tax ids' , str ),
57- _BHV ('sscinames' , 'subject sci names' , str ),
58- _BHV ('scomnames' , 'subject com names' , str ),
59- _BHV ('sskingdoms' , 'subject super kingdoms' , str ),
60- _BHV ('stitle' , 'subject title' , str ),
61- _BHV ('salltitles' , 'subject titles' , str ),
62- _BHV ('sstrand' , 'subject strand' , str ),
63- _BHV ('qcovs' , '% query coverage per subject' , float ),
64- _BHV ('qcovhsp' , '% query coverage per hsp' , float ),
65- _BHV ('qcovus' , '% query coverage per uniq subject' , float ),
66- ],
67- 'mmseqs' : [
68- _MHV ('query' , str , 'qseqid' ),
69- _MHV ('qlen' , int , 'qlen' ),
70- _MHV ('target' , str , 'sseqid' ),
71- _MHV ('tlen' , int , 'slen' ),
72- _MHV ('qstart' , int , 'qstart' ),
73- _MHV ('qend' , int , 'qend' ),
74- _MHV ('tstart' , int , 'sstart' ),
75- _MHV ('tend' , int , 'send' ),
76- _MHV ('qseq' , str , 'qseq' ),
77- _MHV ('tseq' , str , 'sseq' ),
78- _MHV ('evalue' , float , 'evalue' ),
79- _MHV ('bits' , float , 'bitscore' ),
80- _MHV ('alnlen' , int , 'length' ),
81- _MHV ('pident' , float , 'pident' ),
82- _MHV ('nident' , int , 'nident' ),
83- _MHV ('mismatch' , int , 'mismatch' ),
84- _MHV ('gapopen' , int , 'gapopen' ),
85- _MHV ('ppos' , float , 'ppos' ),
86- _MHV ('qframe' , str , 'qframe' ),
87- _MHV ('tframe' , str , 'sframe' ),
88- _MHV ('qcov' , float , 'qcovs' ),
89- _MHV ('fident' , float , None ),
90- _MHV ('raw' , str , None ),
91- _MHV ('cigar' , str , None ),
92- _MHV ('qheader' , str , None ),
93- _MHV ('theader' , str , None ),
94- _MHV ('qaln' , str , None ),
95- _MHV ('taln' , str , None ),
96- _MHV ('tcov' , float , None ),
97- _MHV ('qset' , str , None ),
98- _MHV ('qsetid' , str , None ),
99- _MHV ('tset' , str , None ),
100- _MHV ('tsetid' , str , None ),
101- _MHV ('taxid' , str , None ),
102- _MHV ('taxname' , str , None ),
103- _MHV ('taxlineage' , str , None ),
104- _MHV ('qorfstart' , int , None ),
105- _MHV ('qorfend' , int , None ),
106- _MHV ('torfstart' , int , None ),
107- _MHV ('torfend' , int , None ),
108- ],
109- 'infernal' : [
110- # 18 fields from tblout fmt 1
111- _IHV ('target' , str , 'sseqid' ),
112- _IHV ('target_acc' , str , None ),
113- _IHV ('query' , str , 'qseqid' ),
114- _IHV ('query_acc' , str , None ),
115- _IHV ('model' , str , None ),
116- _IHV ('mstart' , int , 'qstart' ),
117- _IHV ('mend' , int , 'qend' ),
118- _IHV ('sstart' , int , 'sstart' ),
119- _IHV ('send' , int , 'send' ),
120- _IHV ('sstrand' , str , 'sstrand' ),
121- _IHV ('trunc' , str , None ),
122- _IHV ('pass' , int , None ),
123- _IHV ('gc' , float , None ),
124- _IHV ('bias' , float , None ),
125- _IHV ('bitscore' , float , 'bitscore' ),
126- _IHV ('evalue' , float , 'evalue' ),
127- _IHV ('inc' , str , None ),
128- _IHV ('description' , str , None ),
129- # 11 additional field from tblout fmt 2
130- _IHV ('idx' , int , None ),
131- _IHV ('clan' , str , None ),
132- _IHV ('overlap' , str , None ),
133- _IHV ('anyidx' , float , None ),
134- _IHV ('anyfrct1' , float , None ),
135- _IHV ('anyfrct2' , float , None ),
136- _IHV ('winidx' , float , None ),
137- _IHV ('winfrct1' , float , None ),
138- _IHV ('winfrct2' , float , None ),
139- _IHV ('mlen' , int , 'qlen' ),
140- _IHV ('slen' , int , 'slen' ),
141- ],
142- }
143- _CONVERTH = {
144- 'blast' : {hv .name : hv .name for hv in _HEADER ['blast' ]},
145- 'mmseqs' : {hv .blast_equivalent : hv .name for hv in _HEADER ['mmseqs' ]},
146- 'infernal' : {hv .blast_equivalent : hv .name for hv in _HEADER ['infernal' ]},
147- }
148- _DEFAULT_OUTFMT = {
149- 'blast' : ( # outfmt 6
150- 'qseqid sseqid pident length mismatch gapopen '
151- 'qstart qend sstart send evalue bitscore'
152- ).split (),
153- 'mmseqs' : ( # fmtmode 0
154- 'query target fident alnlen mismatch gapopen '
155- 'qstart qend tstart tend evalue bits'
156- ).split (),
157- 'infernal_1' : ( # fmt 1
158- 'target target_acc query query_acc '
159- 'model mstart mend sstart send sstrand '
160- 'trunc pass gc bias bitscore evalue inc '
161- 'description'
162- ).split (),
163- 'infernal_2' : ( # fmt 2
164- 'idx target target_acc query query_acc clan '
165- 'model mstart mend sstart send sstrand '
166- 'trunc pass gc bias bitscore evalue inc '
167- 'overlap anyidx anyfrct1 anyfrct2 winidx winfrct1 winfrct2 '
168- 'mlen slen description'
169- ).split (),
170- # fmt 2, old? version found in the Infernal guide
171- # and here https://docs.rfam.org/en/latest/genome-annotation.html
172- 'infernal_2old' : (
173- 'idx target target_acc query query_acc clan '
174- 'model mstart mend sstart send sstrand '
175- 'trunc pass gc bias bitscore evalue inc '
176- 'overlap anyidx anyfrct1 anyfrct2 winidx winfrct1 winfrct2 '
177- 'description'
178- ).split (),
179- 'infernal_3' : ( # fmt 3
180- 'target target_acc query query_acc '
181- 'model mstart mend sstart send sstrand '
182- 'trunc pass gc bias bitscore evalue inc '
183- 'mlen slen description'
184- ).split (),
185- }
186- _MMSEQS_HEADER_NAMES = [hv .name for hv in _HEADER ['mmseqs' ]]
187-
188- copyattrs = [('bitscore' , 'score' ),
189- ('evalue' , 'evalue' ),
190- ('sseqid' , 'seqid' ),
191- ('qseqid' , 'name' ),
192- ]
193-
194-
195- def extract_seqs (fts , x = 'source' , change_id = False ):
196- """
197- Extract sequences from BLAST features and return `.BioBasket`
198-
199- :param x: One of ``'source'`` or ``'query'``
200- """
201- # TODO remove change_id parameter
202- from sugar import BioSeq , BioBasket
203- x = x [0 ]
204- seqs = []
205- for ft in fts :
206- seq = BioSeq (ft .meta ._blast [f'{ x } seq' ],
207- id = ft .meta ._blast [f'{ x } seqid' ],
208- meta = {'fts' : FeatureList ([ft ])})
209- if change_id :
210- evalue = ft .meta .evalue
211- from math import log10
212- if abs (evalue ) > 0 :
213- evalue = log10 (evalue )
214- start = min (ft .meta ._blast [f'{ x } start' ], ft .meta ._blast [f'{ x } end' ])
215- seq .id = f'{ seq .id } /{ start } { ft .loc .strand } /e{ evalue :.0f} '
216- seqs .append (seq )
217- return BioBasket (seqs )
218-
219-
220- def get_query_fts (fts ):
221- """
222- Convert BLAST source features to query features
223- """
224- fts = fts .copy ()
225- for ft in fts :
226- _b = ft .meta ._blast
227- if 'qseqid' in _b :
228- ft .meta .seqid = _b .qseqid
229- if 'sseqid' in _b :
230- ft .meta .name = _b .sseqid
231- start , stop = _b .qstart , _b .qend
232- if start > stop :
233- start , stop , strand = stop , start , '-'
234- elif start < stop :
235- strand = '+'
236- else :
237- strand = '.'
238- loc = Location (start - 1 , stop , strand )
239- ft .locs = [loc ]
240- return fts
241-
242-
24310def is_format_fts (f , outfmt = None , ** kw ):
24411 content = f .read (1000 )
24512 if content .startswith ('#' ) and 'BLAST' in content :
@@ -263,101 +30,4 @@ def read_fts(f, *, sep='\t', outfmt=None, ftype=None, comments=None):
26330 :param list comments: comment lines inside the file are stored in
26431 the comments list (optional)
26532 """
266- return _read_tabular (f , sep = sep , outfmt = outfmt , ftype = ftype , comments = comments , fmt = 'blast' )
267-
268-
269- def _headers_from_fmtstrings (fmtstrs , fmt = 'blast' , attr = 'name' ):
270- headers = []
271- for fmtstr in fmtstrs :
272- for h in _HEADER [fmt ]:
273- if getattr (h , attr ) == fmtstr :
274- headers .append (h )
275- break
276- else :
277- raise ValueError (f'Unknown header string: { fmtstr } ' )
278- return headers
279-
280-
281- def _read_tabular (f , * , sep = '\t ' , outfmt = None , ftype = None , fmt = 'blast' ,
282- comments = None ):
283- assert fmt in ('blast' , 'infernal' , 'mmseqs' )
284- fts = []
285- headers = None
286- maxsplit = - 1
287- if outfmt is not None :
288- headers = _headers_from_fmtstrings (outfmt .split (), fmt = fmt )
289- for line in f :
290- if comments is not None and line .startswith ('#' ):
291- comments .append (line )
292- if fmt == 'blast' and outfmt is None and line .startswith ('# Fields:' ):
293- headerline = line .removeprefix ('# Fields:' )
294- headers = _headers_from_fmtstrings (map (str .strip , headerline .split (',' )),
295- fmt = fmt , attr = 'long_name' )
296- elif fmt == 'infernal' and headers is None and '--' in line :
297- nheaders = len (line .lstrip ('#' ).split ())
298- fmtv = {18 : '1' , 29 : '2' , 20 : '3' , 27 : '2old' }[nheaders ]
299- headers = _headers_from_fmtstrings (_DEFAULT_OUTFMT [f'{ fmt } _{ fmtv } ' ], fmt = fmt )
300- maxsplit = nheaders - 1
301- elif line .startswith ('#' ) or line .strip () == '' :
302- pass
303- elif (fmt == 'mmseqs' and
304- len (header_fmtstrings := line .strip ().split (sep )) > 1 and
305- set (header_fmtstrings ) <= set (_MMSEQS_HEADER_NAMES )):
306- if headers is None :
307- headers = _headers_from_fmtstrings (header_fmtstrings , fmt = fmt )
308- else :
309- if headers is None :
310- # outfmt not defined in header
311- # (as with BLAST outfmt 7 or MMseqs2 fmtmode 4)
312- # and not supplied by user
313- # assume default
314- headers = _headers_from_fmtstrings (_DEFAULT_OUTFMT [fmt ], fmt = fmt )
315- splitted_line = line .strip ().split (sep , maxsplit = maxsplit )
316- if len (splitted_line ) != len (headers ):
317- raise ValueError ('Number of header fields does not equal '
318- 'number of data values' )
319- attrs = {}
320- for i , v in enumerate (splitted_line ):
321- try :
322- v = headers [i ].type (v )
323- except ValueError :
324- pass
325- attrs [headers [i ].name ] = v
326- c = _CONVERTH [fmt ]
327- start = attrs [c ['sstart' ]]
328- stop = attrs [c ['send' ]]
329- qstart = attrs [c ['qstart' ]]
330- qstop = attrs [c ['qend' ]]
331- if attrs .get ('sstrand' ) == 'N/A' : # only applicable for BLAST
332- strand = '.'
333- elif (start > stop and qstart < qstop ) or (start < stop and qstart > qstop ):
334- if start > stop :
335- start , stop = stop , start
336- else :
337- qstart , qstop = qstop , qstart
338- strand = '-'
339- if attrs .get ('sstrand' ) not in (None , '-' , 'minus' ):
340- raise ValueError ('Expected strand -, got + in sstrand' )
341- elif (start < stop and qstart < qstop ) or (start > stop and qstart > qstop ):
342- if start > stop :
343- start , stop = stop , start
344- qstart , qstop = qstop , qstart
345- strand = '+'
346- if attrs .get ('sstrand' ) not in (None , '+' , 'plus' ):
347- raise ValueError ('Expected strand +, got - in sstrand' )
348- else :
349- strand = attrs .get ('sstrand' , '.' )
350- loc = Location (start - 1 , stop , strand )
351- _fmt = '_' + fmt
352- ft = Feature (attrs .get (ftype , ftype ), [loc ],
353- meta = Meta ({_fmt : attrs }))
354- if 'pident' in ft .meta [_fmt ] and 'fident' not in ft .meta [_fmt ]:
355- ft .meta [_fmt ].fident = ft .meta [_fmt ].pident / 100
356- if 'fident' in ft .meta [_fmt ] and 'pident' not in ft .meta [_fmt ]:
357- ft .meta [_fmt ].pident = ft .meta [_fmt ].fident * 100
358- # TODO adapt
359- for blastattr , metaattr in copyattrs :
360- if c [blastattr ] in ft .meta [_fmt ]:
361- ft .meta [metaattr ] = ft .meta [_fmt ][c [blastattr ]]
362- fts .append (ft )
363- return FeatureList (fts )
33+ return read_tabular (f , sep = sep , outfmt = outfmt , ftype = ftype , comments = comments , fmt = 'blast' )
0 commit comments