Skip to content

Commit 7fd9ace

Browse files
Dom Laetsch
authored and committed
v0.9.14
- added new output format for bam2cov - parses readcov as well
1 parent b6291f3 commit 7fd9ace

File tree

5 files changed: 154 additions (+154) and 99 deletions (-99)

5 files changed: 154 additions (+154) and 99 deletions (-99)

bam2cov.py

Lines changed: 45 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -1,12 +1,16 @@
11
#!/usr/bin/env python
22
# -*- coding: utf-8 -*-
33

4-
"""usage: blobtools bam2cov -i FASTA -b BAM [-h|--help]
5-
4+
"""usage: blobtools bam2cov -i FASTA -b BAM [--mq MQ] [--no_base_cov]
5+
[-h|--help]
6+
67
Options:
78
-h --help show this
8-
-i, --infile FASTA FASTA file of assembly. Headers are split at whitespaces.
9+
-i, --infile FASTA FASTA file of assembly. Headers are split at whitespaces.
910
-b, --bam <BAM> BAM file (requires samtools in $PATH)
11+
--mq <MQ> minimum Mapping Quality (MQ) [default: 1]
12+
--no_base_cov only parse read coverage (faster, but ...
13+
can only be used for "blobtools blobplot --noblobs")
1014
"""
1115

1216
from __future__ import division
@@ -18,11 +22,12 @@
1822

1923
class Fasta():
2024
def __init__(self, name, seq):
21-
self.name = name
25+
self.name = name
2226
self.length = len(seq)
2327
self.n_count = seq.count('N')
2428
self.agct_count = self.length - self.n_count
25-
self.cov = 0.0
29+
self.base_cov = 0.0
30+
self.read_cov = 0
2631

2732
def which(program):
2833
def is_exe(fpath):
@@ -47,9 +52,9 @@ def runCmd(command):
4752
return iter(p.stdout.readline, b'')
4853

4954
def readFasta(infile):
50-
with open(infile) as fh:
55+
with open(infile) as fh:
5156
header, seqs = '', []
52-
for l in fh:
57+
for l in fh:
5358
if l[0] == '>':
5459
if (header):
5560
yield header, ''.join(seqs)
@@ -86,49 +91,58 @@ def readBam(infile, fasta_headers):
8691
progress_unit = int(int(reads_total)/1000)
8792
base_cov_dict = {}
8893
cigar_match_re = re.compile(r"(\d+)M") # only gets digits before M's
89-
# execute samtools to get only mapped reads
90-
command = "samtools view -F 4 " + infile
94+
95+
read_cov_dict = {}
96+
# execute samtools to get only mapped reads from primary alignment
97+
command = "samtools view -q " + str(mq) + " -F 256 -F 4 " + infile
9198
# only one counter since only yields mapped reads
92-
parsed_reads = 0
99+
parsed_reads = 0
93100
for line in runCmd(command):
94101
match = line.split("\t")
95-
if match >= 11:
96-
seq_name = match[2]
97-
base_cov = sum([int(matching) for matching in cigar_match_re.findall(match[5])])
98-
if (base_cov):
99-
parsed_reads += 1
100-
if seq_name not in fasta_headers:
101-
print BtLog.warn_d['2'] % (seq_name, infile)
102-
else:
103-
base_cov_dict[seq_name] = base_cov_dict.get(seq_name, 0) + base_cov
102+
seq_name = match[2]
103+
if seq_name not in fasta_headers:
104+
print BtLog.warn_d['2'] % (seq_name, infile)
105+
else:
106+
read_cov_dict[seq_name] = read_cov_dict.get(seq_name, 0) + 1
107+
if not (no_base_cov_flag):
108+
base_cov = sum([int(matching) for matching in cigar_match_re.findall(match[5])])
109+
if (base_cov):
110+
base_cov_dict[seq_name] = base_cov_dict.get(seq_name, 0) + base_cov
111+
parsed_reads += 1
104112
BtLog.progress(parsed_reads, progress_unit, reads_total)
105113
BtLog.progress(reads_total, progress_unit, reads_total)
106-
107-
if not int(reads_mapped) == int(parsed_reads):
108-
print warn_d['3'] % (reads_mapped, parsed_reads)
109-
return base_cov_dict, reads_total, parsed_reads
114+
return base_cov_dict, read_cov_dict, reads_total, parsed_reads
110115

111116
def parseBam(bam_f, fasta_dict):
112-
base_cov_dict, reads_total, reads_mapped = readBam(bam_f, set(fasta_dict.keys()))
117+
base_cov_dict, read_cov_dict, reads_total, reads_mapped = readBam(bam_f, set(fasta_dict.keys()))
113118
if reads_total == 0:
114119
print BtLog.warn_d['4'] % bam_f
115120
for name, base_cov in base_cov_dict.items():
116-
fasta_dict[name].cov = base_cov / fasta_dict[name].agct_count
117-
return fasta_dict
121+
fasta_dict[name].base_cov = base_cov / fasta_dict[name].agct_count
122+
for name, read_cov in read_cov_dict.items():
123+
fasta_dict[name].read_cov = read_cov
124+
return fasta_dict, reads_total, reads_mapped
118125

119-
def writeCov(fasta_dict, out_f):
126+
def writeCov(fasta_dict, reads_total, reads_mapped, out_f):
120127
with open(out_f, 'w') as fh:
128+
fh.write("# Total Reads = %s\n" % (reads_total))
129+
fh.write("# Mapped Reads = %s\n" % (reads_mapped))
130+
fh.write("# Unmapped Reads = %s\n" % (reads_total - reads_mapped))
131+
fh.write("# Parameters : MQ = %s, No_base_cov_flag = %s\n" % (mq, no_base_cov_flag))
132+
fh.write("# %s\t%s\t%s\n" % ("contig_id", "read_cov", "base_cov"))
121133
for name, fasta_obj in fasta_dict.items():
122-
fh.write("%s\t%s\n" % (name, fasta_obj.cov))
134+
fh.write("%s\t%s\t%s\n" % (name, fasta_obj.read_cov, fasta_obj.base_cov))
123135

124136
if __name__ == '__main__':
125137
args = docopt(__doc__)
126-
138+
127139
fasta_f = args['--infile']
128140
bam_f = args['--bam']
129141
out_f = os.path.basename(bam_f) + ".cov"
142+
mq = int(args['--mq'])
143+
no_base_cov_flag = args['--no_base_cov']
130144

131145
fasta_dict = parseFasta(fasta_f)
132-
fasta_dict = parseBam(bam_f, fasta_dict)
133-
writeCov(fasta_dict, out_f)
146+
fasta_dict, reads_total, reads_mapped = parseBam(bam_f, fasta_dict)
147+
writeCov(fasta_dict, reads_total, reads_mapped, out_f)
134148

0 commit comments

Comments
 (0)