@@ -10,6 +10,44 @@ use crate::bam::record::Cigar;
10
10
use crate :: htslib;
11
11
use std:: collections:: HashMap ;
12
12
13
+ pub struct IterAlignedBlockPairs {
14
+ genome_pos : i64 ,
15
+ read_pos : i64 ,
16
+ cigar_index : usize ,
17
+ cigar : Vec < Cigar > ,
18
+ }
19
+
20
+ impl Iterator for IterAlignedBlockPairs {
21
+ type Item = ( [ i64 ; 2 ] , [ i64 ; 2 ] ) ;
22
+ fn next ( & mut self ) -> Option < Self :: Item > {
23
+ while self . cigar_index < self . cigar . len ( ) {
24
+ let entry = self . cigar [ self . cigar_index ] ;
25
+ match entry {
26
+ Cigar :: Match ( len) | Cigar :: Equal ( len) | Cigar :: Diff ( len) => {
27
+ let qstart = self . read_pos ;
28
+ let qend = qstart + len as i64 ;
29
+ let rstart = self . genome_pos ;
30
+ let rend = self . genome_pos + len as i64 ;
31
+ self . read_pos += len as i64 ;
32
+ self . genome_pos += len as i64 ;
33
+ self . cigar_index += 1 ;
34
+ return Some ( ( [ qstart, qend] , [ rstart, rend] ) ) ;
35
+ }
36
+ Cigar :: Ins ( len) | Cigar :: SoftClip ( len) => {
37
+ self . read_pos += len as i64 ;
38
+ }
39
+ Cigar :: Del ( len) | Cigar :: RefSkip ( len) => {
40
+ self . genome_pos += len as i64 ;
41
+ }
42
+ Cigar :: HardClip ( _) => { } // no advance
43
+ Cigar :: Pad ( _) => panic ! ( "Padding (Cigar::Pad) is not supported." ) , //padding is only used for multiple sequence alignment
44
+ }
45
+ self . cigar_index += 1 ;
46
+ }
47
+ None
48
+ }
49
+ }
50
+
13
51
pub struct IterAlignedBlocks {
14
52
pos : i64 ,
15
53
cigar_index : usize ,
@@ -185,10 +223,22 @@ pub trait BamRecordExtensions {
185
223
/// this happens on insertions.
186
224
///
187
225
/// pysam: blocks
226
+ /// See also: [aligned_block_pairs](#tymethod.aligned_block_pairs) if you need
227
+ /// the read coordinates as well.
188
228
fn aligned_blocks ( & self ) -> IterAlignedBlocks ;
189
229
190
- /// Iter intron positions (start, stop)
230
+ ///Iter over ([read_start, read_stop], [genome_start, genome_stop]) blocks
231
+ ///of continuously aligned reads.
232
+ ///
233
+ ///In contrast to [aligned_blocks](#tymethod.aligned_blocks), this returns
234
+ ///read and genome coordinates.
235
+ ///In contrast to aligned_pairs, this returns just the start-stop
236
+ ///coordinates of each block.
191
237
///
238
+ ///There is not necessarily a gap between blocks in either coordinate space
239
+ ///(this happens in in-dels).
240
+ fn aligned_block_pairs ( & self ) -> IterAlignedBlockPairs ;
241
+
192
242
/// This scans the CIGAR for reference skips
193
243
/// and reports their positions.
194
244
/// It does not inspect the reported regions
@@ -201,6 +251,11 @@ pub trait BamRecordExtensions {
201
251
/// No entry for insertions, deletions or skipped pairs
202
252
///
203
253
/// pysam: get_aligned_pairs(matches_only = True)
254
+ ///
255
+ /// See also [aligned_block_pairs](#tymethod.aligned_block_pairs)
256
+ /// if you just need start&end coordinates of each block.
257
+ /// That way you can allocate less memory for the same
258
+ /// informational content.
204
259
fn aligned_pairs ( & self ) -> IterAlignedPairs ;
205
260
206
261
/// iter list of read and reference positions on a basepair level.
@@ -275,6 +330,15 @@ impl BamRecordExtensions for bam::Record {
275
330
}
276
331
}
277
332
333
+ fn aligned_block_pairs ( & self ) -> IterAlignedBlockPairs {
334
+ IterAlignedBlockPairs {
335
+ genome_pos : self . pos ( ) ,
336
+ read_pos : 0 ,
337
+ cigar : self . cigar ( ) . take ( ) . 0 ,
338
+ cigar_index : 0 ,
339
+ }
340
+ }
341
+
278
342
fn aligned_pairs ( & self ) -> IterAlignedPairs {
279
343
IterAlignedPairs {
280
344
genome_pos : self . pos ( ) ,
@@ -4690,6 +4754,47 @@ mod tests {
4690
4754
assert_eq ! ( none_count( & pairs, 1 ) , 4 ) ;
4691
4755
}
4692
4756
4757
/// Checks that `aligned_block_pairs` describes exactly the same positions as
/// `aligned_pairs`, just compressed into start/stop ranges.
#[test]
fn test_aligned_block_pairs() {
    let mut bam = bam::Reader::from_path("./test/test_spliced_reads.bam").unwrap();
    let mut it = bam.records();

    let read = it.next().unwrap().unwrap();
    let pairs: Vec<_> = read.aligned_pairs().collect();
    let block_pairs: Vec<_> = read.aligned_block_pairs().collect();

    // first coordinates identical
    assert_eq!(pairs[0][0], block_pairs[0].0[0]); // read
    assert_eq!(pairs[0][1], block_pairs[0].1[0]); // genomic

    // end coordinates are + 1, so the ranges are the same...
    assert_eq!(
        pairs[pairs.len() - 1][0],
        block_pairs[block_pairs.len() - 1].0[1] - 1
    );
    assert_eq!(
        pairs[pairs.len() - 1][1],
        block_pairs[block_pairs.len() - 1].1[1] - 1
    );

    // let's see if they're really identical
    for read in it {
        let read = read.unwrap();
        let pairs: Vec<_> = read.aligned_pairs().collect();
        let block_pairs: Vec<_> = read.aligned_block_pairs().collect();
        let mut ii = 0;
        for ([read_start, read_stop], [genome_start, genome_stop]) in block_pairs {
            assert_eq!(read_stop - read_start, genome_stop - genome_start);
            for (read_pos, genome_pos) in (read_start..read_stop).zip(genome_start..genome_stop) {
                assert_eq!(pairs[ii][0], read_pos);
                assert_eq!(pairs[ii][1], genome_pos);
                ii += 1;
            }
        }
        // Every aligned pair must be covered by some block; without this check
        // an iterator that drops trailing blocks (or yields none) would pass.
        assert_eq!(ii, pairs.len());
    }
}
+
4693
4798
#[ test]
4694
4799
fn test_get_cigar_stats ( ) {
4695
4800
let mut bam = bam:: Reader :: from_path ( "./test/test_spliced_reads.bam" ) . unwrap ( ) ;
0 commit comments