1
1
#!/usr/bin/env python
2
2
# -*- coding: utf-8 -*-
3
3
4
- """usage: blobtools bam2cov -i FASTA -b BAM [-h|--help]
5
-
4
+ """usage: blobtools bam2cov -i FASTA -b BAM [--mq MQ] [--no_base_cov]
5
+ [-h|--help]
6
+
6
7
Options:
7
8
-h --help show this
8
- -i, --infile FASTA FASTA file of assembly. Headers are split at whitespaces.
9
+ -i, --infile FASTA FASTA file of assembly. Headers are split at whitespaces.
9
10
-b, --bam <BAM> BAM file (requires samtools in $PATH)
11
+ --mq <MQ> minimum Mapping Quality (MQ) [default: 1]
12
+ --no_base_cov only parse read coverage (faster, but ...
13
+ can only be used for "blobtools blobplot --noblobs")
10
14
"""
11
15
12
16
from __future__ import division
18
22
19
23
class Fasta ():
20
24
def __init__ (self , name , seq ):
21
- self .name = name
25
+ self .name = name
22
26
self .length = len (seq )
23
27
self .n_count = seq .count ('N' )
24
28
self .agct_count = self .length - self .n_count
25
- self .cov = 0.0
29
+ self .base_cov = 0.0
30
+ self .read_cov = 0
26
31
27
32
def which (program ):
28
33
def is_exe (fpath ):
@@ -47,9 +52,9 @@ def runCmd(command):
47
52
return iter (p .stdout .readline , b'' )
48
53
49
54
def readFasta (infile ):
50
- with open (infile ) as fh :
55
+ with open (infile ) as fh :
51
56
header , seqs = '' , []
52
- for l in fh :
57
+ for l in fh :
53
58
if l [0 ] == '>' :
54
59
if (header ):
55
60
yield header , '' .join (seqs )
@@ -86,49 +91,58 @@ def readBam(infile, fasta_headers):
86
91
progress_unit = int (int (reads_total )/ 1000 )
87
92
base_cov_dict = {}
88
93
cigar_match_re = re .compile (r"(\d+)M" ) # only gets digits before M's
89
- # execute samtools to get only mapped reads
90
- command = "samtools view -F 4 " + infile
94
+
95
+ read_cov_dict = {}
96
+ # execute samtools to get only mapped reads from primary alignment
97
+ command = "samtools view -q " + str (mq ) + " -F 256 -F 4 " + infile
91
98
# only one counter since only yields mapped reads
92
- parsed_reads = 0
99
+ parsed_reads = 0
93
100
for line in runCmd (command ):
94
101
match = line .split ("\t " )
95
- if match >= 11 :
96
- seq_name = match [2 ]
97
- base_cov = sum ([int (matching ) for matching in cigar_match_re .findall (match [5 ])])
98
- if (base_cov ):
99
- parsed_reads += 1
100
- if seq_name not in fasta_headers :
101
- print BtLog .warn_d ['2' ] % (seq_name , infile )
102
- else :
103
- base_cov_dict [seq_name ] = base_cov_dict .get (seq_name , 0 ) + base_cov
102
+ seq_name = match [2 ]
103
+ if seq_name not in fasta_headers :
104
+ print BtLog .warn_d ['2' ] % (seq_name , infile )
105
+ else :
106
+ read_cov_dict [seq_name ] = read_cov_dict .get (seq_name , 0 ) + 1
107
+ if not (no_base_cov_flag ):
108
+ base_cov = sum ([int (matching ) for matching in cigar_match_re .findall (match [5 ])])
109
+ if (base_cov ):
110
+ base_cov_dict [seq_name ] = base_cov_dict .get (seq_name , 0 ) + base_cov
111
+ parsed_reads += 1
104
112
BtLog .progress (parsed_reads , progress_unit , reads_total )
105
113
BtLog .progress (reads_total , progress_unit , reads_total )
106
-
107
- if not int (reads_mapped ) == int (parsed_reads ):
108
- print warn_d ['3' ] % (reads_mapped , parsed_reads )
109
- return base_cov_dict , reads_total , parsed_reads
114
+ return base_cov_dict , read_cov_dict , reads_total , parsed_reads
110
115
111
116
def parseBam (bam_f , fasta_dict ):
112
- base_cov_dict , reads_total , reads_mapped = readBam (bam_f , set (fasta_dict .keys ()))
117
+ base_cov_dict , read_cov_dict , reads_total , reads_mapped = readBam (bam_f , set (fasta_dict .keys ()))
113
118
if reads_total == 0 :
114
119
print BtLog .warn_d ['4' ] % bam_f
115
120
for name , base_cov in base_cov_dict .items ():
116
- fasta_dict [name ].cov = base_cov / fasta_dict [name ].agct_count
117
- return fasta_dict
121
+ fasta_dict [name ].base_cov = base_cov / fasta_dict [name ].agct_count
122
+ for name , read_cov in read_cov_dict .items ():
123
+ fasta_dict [name ].read_cov = read_cov
124
+ return fasta_dict , reads_total , reads_mapped
118
125
119
- def writeCov (fasta_dict , out_f ):
126
+ def writeCov (fasta_dict , reads_total , reads_mapped , out_f ):
120
127
with open (out_f , 'w' ) as fh :
128
+ fh .write ("# Total Reads = %s\n " % (reads_total ))
129
+ fh .write ("# Mapped Reads = %s\n " % (reads_mapped ))
130
+ fh .write ("# Unmapped Reads = %s\n " % (reads_total - reads_mapped ))
131
+ fh .write ("# Parameters : MQ = %s, No_base_cov_flag = %s\n " % (mq , no_base_cov_flag ))
132
+ fh .write ("# %s\t %s\t %s\n " % ("contig_id" , "read_cov" , "base_cov" ))
121
133
for name , fasta_obj in fasta_dict .items ():
122
- fh .write ("%s\t %s\n " % (name , fasta_obj .cov ))
134
+ fh .write ("%s\t %s\t %s \ n " % (name , fasta_obj .read_cov , fasta_obj . base_cov ))
123
135
124
136
if __name__ == '__main__' :
125
137
args = docopt (__doc__ )
126
-
138
+
127
139
fasta_f = args ['--infile' ]
128
140
bam_f = args ['--bam' ]
129
141
out_f = os .path .basename (bam_f ) + ".cov"
142
+ mq = int (args ['--mq' ])
143
+ no_base_cov_flag = args ['--no_base_cov' ]
130
144
131
145
fasta_dict = parseFasta (fasta_f )
132
- fasta_dict = parseBam (bam_f , fasta_dict )
133
- writeCov (fasta_dict , out_f )
146
+ fasta_dict , reads_total , reads_mapped = parseBam (bam_f , fasta_dict )
147
+ writeCov (fasta_dict , reads_total , reads_mapped , out_f )
134
148
0 commit comments