Skip to content

Commit 5a5302b

Browse files
committed
refactor tab readers, do not display private members in doc
1 parent 3a747b0 commit 5a5302b

File tree

9 files changed

+359
-339
lines changed

9 files changed

+359
-339
lines changed

docs/conf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ def download_logo():
7575
autodoc_default_options = {
7676
'members': True,
7777
'undoc-members': True,
78-
'private-members': True,
78+
'private-members': False,
7979
'show-inheritance': True,
8080
}
8181

docs/src/sugar._io.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ Submodules
1616
stockholm
1717
tab.blast
1818
tab.infernal
19+
tab.core
1920
tab.mmseqs
2021
util
2122

docs/src/sugar._io.tab.core.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
sugar.\_io.tab.core module
2+
==========================
3+
4+
.. automodule:: sugar._io.tab.core

sugar/_io/tab/blast.py

Lines changed: 2 additions & 332 deletions
Original file line numberDiff line numberDiff line change
@@ -3,243 +3,10 @@
33
`BLAST`_ reader for output generated with outfmt 7 (preferred), 6 or 10
44
"""
55

6-
from collections import namedtuple
7-
from sugar.core.fts import Feature, FeatureList, Location
8-
from sugar.core.meta import Meta
6+
from sugar._io.tab.core import read_tabular
97
from sugar._io.util import _add_fmt_doc
108

119

12-
_BHV = namedtuple('_BlastHeaderValue', 'name long_name type')
13-
_MHV = namedtuple('_MMseqsHeaderValue', 'name type blast_equivalent')
14-
_IHV = namedtuple('_InfernalHeaderValue', 'name type blast_equivalent')
15-
_HEADER = {
16-
'blast': [
17-
_BHV('qseqid', 'query id', str),
18-
_BHV('qgi', 'query gi', str),
19-
_BHV('qacc', 'query acc.', str),
20-
_BHV('qaccver', 'query acc.ver', str),
21-
_BHV('qlen', 'query length', int),
22-
_BHV('sseqid', 'subject id', str),
23-
_BHV('sallseqid', 'subject ids', str),
24-
_BHV('sgi', 'subject gi', str),
25-
_BHV('sallgi', 'subject gis', str),
26-
_BHV('sacc', 'subject acc.', str),
27-
_BHV('saccver', 'subject acc.ver', str),
28-
_BHV('sallacc', 'subject accs.', str),
29-
_BHV('slen', 'subject length', int),
30-
_BHV('qstart', 'q. start', int),
31-
_BHV('qend', 'q. end', int),
32-
_BHV('sstart', 's. start', int),
33-
_BHV('send', 's. end', int),
34-
_BHV('qseq', 'query seq', str),
35-
_BHV('sseq', 'subject seq', str),
36-
_BHV('evalue', 'evalue', float),
37-
_BHV('bitscore', 'bit score', float),
38-
_BHV('score', 'score', float),
39-
_BHV('length', 'alignment length', int),
40-
_BHV('pident', '% identity', float),
41-
_BHV('nident', 'identical', int),
42-
_BHV('mismatch', 'mismatches', int),
43-
_BHV('positive', 'positives', int),
44-
_BHV('gapopen', 'gap opens', int),
45-
_BHV('gaps', 'gaps', int),
46-
_BHV('ppos', '% positives', float),
47-
_BHV('frames', 'query/sbjct frames', str),
48-
_BHV('qframe', 'query frame', int),
49-
_BHV('sframe', 'sbjct frame', int),
50-
_BHV('btop', 'BTOP', str),
51-
_BHV('staxid', 'subject tax id', str),
52-
_BHV('ssciname', 'subject sci name', str),
53-
_BHV('scomname', 'subject com name', str),
54-
_BHV('sblastname', 'subject blast name', str),
55-
_BHV('sskingdom', 'subject super kingdom', str),
56-
_BHV('staxids', 'subject tax ids', str),
57-
_BHV('sscinames', 'subject sci names', str),
58-
_BHV('scomnames', 'subject com names', str),
59-
_BHV('sskingdoms', 'subject super kingdoms', str),
60-
_BHV('stitle', 'subject title', str),
61-
_BHV('salltitles', 'subject titles', str),
62-
_BHV('sstrand', 'subject strand', str),
63-
_BHV('qcovs', '% query coverage per subject', float),
64-
_BHV('qcovhsp', '% query coverage per hsp', float),
65-
_BHV('qcovus', '% query coverage per uniq subject', float),
66-
],
67-
'mmseqs': [
68-
_MHV('query', str, 'qseqid'),
69-
_MHV('qlen', int, 'qlen'),
70-
_MHV('target', str, 'sseqid'),
71-
_MHV('tlen', int, 'slen'),
72-
_MHV('qstart', int, 'qstart'),
73-
_MHV('qend', int, 'qend'),
74-
_MHV('tstart', int, 'sstart'),
75-
_MHV('tend', int, 'send'),
76-
_MHV('qseq', str, 'qseq'),
77-
_MHV('tseq', str, 'sseq'),
78-
_MHV('evalue', float, 'evalue'),
79-
_MHV('bits', float, 'bitscore'),
80-
_MHV('alnlen', int, 'length'),
81-
_MHV('pident', float, 'pident'),
82-
_MHV('nident', int, 'nident'),
83-
_MHV('mismatch', int, 'mismatch'),
84-
_MHV('gapopen', int, 'gapopen'),
85-
_MHV('ppos', float, 'ppos'),
86-
_MHV('qframe', str, 'qframe'),
87-
_MHV('tframe', str, 'sframe'),
88-
_MHV('qcov', float, 'qcovs'),
89-
_MHV('fident', float, None),
90-
_MHV('raw', str, None),
91-
_MHV('cigar', str, None),
92-
_MHV('qheader', str, None),
93-
_MHV('theader', str, None),
94-
_MHV('qaln', str, None),
95-
_MHV('taln', str, None),
96-
_MHV('tcov', float, None),
97-
_MHV('qset', str, None),
98-
_MHV('qsetid', str, None),
99-
_MHV('tset', str, None),
100-
_MHV('tsetid', str, None),
101-
_MHV('taxid', str, None),
102-
_MHV('taxname', str, None),
103-
_MHV('taxlineage', str, None),
104-
_MHV('qorfstart', int, None),
105-
_MHV('qorfend', int, None),
106-
_MHV('torfstart', int, None),
107-
_MHV('torfend', int, None),
108-
],
109-
'infernal': [
110-
# 18 fields from tblout fmt 1
111-
_IHV('target', str, 'sseqid'),
112-
_IHV('target_acc', str, None),
113-
_IHV('query', str, 'qseqid'),
114-
_IHV('query_acc', str, None),
115-
_IHV('model', str, None),
116-
_IHV('mstart', int, 'qstart'),
117-
_IHV('mend', int, 'qend'),
118-
_IHV('sstart', int, 'sstart'),
119-
_IHV('send', int, 'send'),
120-
_IHV('sstrand', str, 'sstrand'),
121-
_IHV('trunc', str, None),
122-
_IHV('pass', int, None),
123-
_IHV('gc', float, None),
124-
_IHV('bias', float, None),
125-
_IHV('bitscore', float, 'bitscore'),
126-
_IHV('evalue', float, 'evalue'),
127-
_IHV('inc', str, None),
128-
_IHV('description', str, None),
129-
# 11 additional field from tblout fmt 2
130-
_IHV('idx', int, None),
131-
_IHV('clan', str, None),
132-
_IHV('overlap', str, None),
133-
_IHV('anyidx', float, None),
134-
_IHV('anyfrct1', float, None),
135-
_IHV('anyfrct2', float, None),
136-
_IHV('winidx', float, None),
137-
_IHV('winfrct1', float, None),
138-
_IHV('winfrct2', float, None),
139-
_IHV('mlen', int, 'qlen'),
140-
_IHV('slen', int, 'slen'),
141-
],
142-
}
143-
_CONVERTH = {
144-
'blast': {hv.name: hv.name for hv in _HEADER['blast']},
145-
'mmseqs': {hv.blast_equivalent: hv.name for hv in _HEADER['mmseqs']},
146-
'infernal': {hv.blast_equivalent: hv.name for hv in _HEADER['infernal']},
147-
}
148-
_DEFAULT_OUTFMT = {
149-
'blast': ( # outfmt 6
150-
'qseqid sseqid pident length mismatch gapopen '
151-
'qstart qend sstart send evalue bitscore'
152-
).split(),
153-
'mmseqs': ( # fmtmode 0
154-
'query target fident alnlen mismatch gapopen '
155-
'qstart qend tstart tend evalue bits'
156-
).split(),
157-
'infernal_1': ( # fmt 1
158-
'target target_acc query query_acc '
159-
'model mstart mend sstart send sstrand '
160-
'trunc pass gc bias bitscore evalue inc '
161-
'description'
162-
).split(),
163-
'infernal_2': ( # fmt 2
164-
'idx target target_acc query query_acc clan '
165-
'model mstart mend sstart send sstrand '
166-
'trunc pass gc bias bitscore evalue inc '
167-
'overlap anyidx anyfrct1 anyfrct2 winidx winfrct1 winfrct2 '
168-
'mlen slen description'
169-
).split(),
170-
# fmt 2, old? version found in the Infernal guide
171-
# and here https://docs.rfam.org/en/latest/genome-annotation.html
172-
'infernal_2old': (
173-
'idx target target_acc query query_acc clan '
174-
'model mstart mend sstart send sstrand '
175-
'trunc pass gc bias bitscore evalue inc '
176-
'overlap anyidx anyfrct1 anyfrct2 winidx winfrct1 winfrct2 '
177-
'description'
178-
).split(),
179-
'infernal_3': ( # fmt 3
180-
'target target_acc query query_acc '
181-
'model mstart mend sstart send sstrand '
182-
'trunc pass gc bias bitscore evalue inc '
183-
'mlen slen description'
184-
).split(),
185-
}
186-
_MMSEQS_HEADER_NAMES = [hv.name for hv in _HEADER['mmseqs']]
187-
188-
copyattrs = [('bitscore', 'score'),
189-
('evalue', 'evalue'),
190-
('sseqid', 'seqid'),
191-
('qseqid', 'name'),
192-
]
193-
194-
195-
def extract_seqs(fts, x='source', change_id=False):
196-
"""
197-
Extract sequences from BLAST features and return `.BioBasket`
198-
199-
:param x: One of ``'source'`` or ``'query'``
200-
"""
201-
# TODO remove change_id parameter
202-
from sugar import BioSeq, BioBasket
203-
x = x[0]
204-
seqs = []
205-
for ft in fts:
206-
seq = BioSeq(ft.meta._blast[f'{x}seq'],
207-
id=ft.meta._blast[f'{x}seqid'],
208-
meta={'fts': FeatureList([ft])})
209-
if change_id:
210-
evalue = ft.meta.evalue
211-
from math import log10
212-
if abs(evalue) > 0:
213-
evalue = log10(evalue)
214-
start = min(ft.meta._blast[f'{x}start'], ft.meta._blast[f'{x}end'])
215-
seq.id = f'{seq.id}/{start}{ft.loc.strand}/e{evalue:.0f}'
216-
seqs.append(seq)
217-
return BioBasket(seqs)
218-
219-
220-
def get_query_fts(fts):
221-
"""
222-
Convert BLAST source features to query features
223-
"""
224-
fts = fts.copy()
225-
for ft in fts:
226-
_b = ft.meta._blast
227-
if 'qseqid' in _b:
228-
ft.meta.seqid = _b.qseqid
229-
if 'sseqid' in _b:
230-
ft.meta.name = _b.sseqid
231-
start, stop = _b.qstart, _b.qend
232-
if start > stop:
233-
start, stop, strand = stop, start, '-'
234-
elif start < stop:
235-
strand = '+'
236-
else:
237-
strand = '.'
238-
loc = Location(start-1, stop, strand)
239-
ft.locs = [loc]
240-
return fts
241-
242-
24310
def is_format_fts(f, outfmt=None, **kw):
24411
content = f.read(1000)
24512
if content.startswith('#') and 'BLAST' in content:
@@ -263,101 +30,4 @@ def read_fts(f, *, sep='\t', outfmt=None, ftype=None, comments=None):
26330
:param list comments: comment lines inside the file are stored in
26431
the comments list (optional)
26532
"""
266-
return _read_tabular(f, sep=sep, outfmt=outfmt, ftype=ftype, comments=comments, fmt='blast')
267-
268-
269-
def _headers_from_fmtstrings(fmtstrs, fmt='blast', attr='name'):
270-
headers = []
271-
for fmtstr in fmtstrs:
272-
for h in _HEADER[fmt]:
273-
if getattr(h, attr) == fmtstr:
274-
headers.append(h)
275-
break
276-
else:
277-
raise ValueError(f'Unknown header string: {fmtstr}')
278-
return headers
279-
280-
281-
def _read_tabular(f, *, sep='\t', outfmt=None, ftype=None, fmt='blast',
282-
comments=None):
283-
assert fmt in ('blast', 'infernal', 'mmseqs')
284-
fts = []
285-
headers = None
286-
maxsplit = -1
287-
if outfmt is not None:
288-
headers = _headers_from_fmtstrings(outfmt.split(), fmt=fmt)
289-
for line in f:
290-
if comments is not None and line.startswith('#'):
291-
comments.append(line)
292-
if fmt == 'blast' and outfmt is None and line.startswith('# Fields:'):
293-
headerline = line.removeprefix('# Fields:')
294-
headers = _headers_from_fmtstrings(map(str.strip, headerline.split(',')),
295-
fmt=fmt, attr='long_name')
296-
elif fmt == 'infernal' and headers is None and '--' in line:
297-
nheaders = len(line.lstrip('#').split())
298-
fmtv = {18: '1', 29: '2', 20: '3', 27: '2old'}[nheaders]
299-
headers = _headers_from_fmtstrings(_DEFAULT_OUTFMT[f'{fmt}_{fmtv}'], fmt=fmt)
300-
maxsplit = nheaders - 1
301-
elif line.startswith('#') or line.strip() == '':
302-
pass
303-
elif (fmt == 'mmseqs' and
304-
len(header_fmtstrings := line.strip().split(sep)) > 1 and
305-
set(header_fmtstrings) <= set(_MMSEQS_HEADER_NAMES)):
306-
if headers is None:
307-
headers = _headers_from_fmtstrings(header_fmtstrings, fmt=fmt)
308-
else:
309-
if headers is None:
310-
# outfmt not defined in header
311-
# (as with BLAST outfmt 7 or MMseqs2 fmtmode 4)
312-
# and not supplied by user
313-
# assume default
314-
headers = _headers_from_fmtstrings(_DEFAULT_OUTFMT[fmt], fmt=fmt)
315-
splitted_line = line.strip().split(sep, maxsplit=maxsplit)
316-
if len(splitted_line) != len(headers):
317-
raise ValueError('Number of header fields does not equal '
318-
'number of data values')
319-
attrs = {}
320-
for i, v in enumerate(splitted_line):
321-
try:
322-
v = headers[i].type(v)
323-
except ValueError:
324-
pass
325-
attrs[headers[i].name] = v
326-
c = _CONVERTH[fmt]
327-
start = attrs[c['sstart']]
328-
stop = attrs[c['send']]
329-
qstart = attrs[c['qstart']]
330-
qstop = attrs[c['qend']]
331-
if attrs.get('sstrand') == 'N/A': # only applicable for BLAST
332-
strand = '.'
333-
elif (start > stop and qstart < qstop) or (start < stop and qstart > qstop):
334-
if start > stop:
335-
start, stop = stop, start
336-
else:
337-
qstart, qstop = qstop, qstart
338-
strand = '-'
339-
if attrs.get('sstrand') not in (None, '-', 'minus'):
340-
raise ValueError('Expected strand -, got + in sstrand')
341-
elif (start < stop and qstart < qstop) or (start > stop and qstart > qstop):
342-
if start > stop:
343-
start, stop = stop, start
344-
qstart, qstop = qstop, qstart
345-
strand = '+'
346-
if attrs.get('sstrand') not in (None, '+', 'plus'):
347-
raise ValueError('Expected strand +, got - in sstrand')
348-
else:
349-
strand = attrs.get('sstrand', '.')
350-
loc = Location(start-1, stop, strand)
351-
_fmt = '_' + fmt
352-
ft = Feature(attrs.get(ftype, ftype), [loc],
353-
meta=Meta({_fmt: attrs}))
354-
if 'pident' in ft.meta[_fmt] and 'fident' not in ft.meta[_fmt]:
355-
ft.meta[_fmt].fident = ft.meta[_fmt].pident / 100
356-
if 'fident' in ft.meta[_fmt] and 'pident' not in ft.meta[_fmt]:
357-
ft.meta[_fmt].pident = ft.meta[_fmt].fident * 100
358-
# TODO adapt
359-
for blastattr, metaattr in copyattrs:
360-
if c[blastattr] in ft.meta[_fmt]:
361-
ft.meta[metaattr] = ft.meta[_fmt][c[blastattr]]
362-
fts.append(ft)
363-
return FeatureList(fts)
33+
return read_tabular(f, sep=sep, outfmt=outfmt, ftype=ftype, comments=comments, fmt='blast')

0 commit comments

Comments
 (0)