Skip to content

Commit fa07d5e

Browse files
Introduced BamRecordExtensions::aligned_block_pairs (#262)
* Introduced BamRecordExtensions::aligned_block_pairs * format Co-authored-by: Roman Valls Guimera <brainstorm@users.noreply.github.com>
1 parent 6eb361b commit fa07d5e

File tree

1 file changed

+106
-1
lines changed

1 file changed

+106
-1
lines changed

src/bam/ext.rs

Lines changed: 106 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,44 @@ use crate::bam::record::Cigar;
1010
use crate::htslib;
1111
use std::collections::HashMap;
1212

13+
pub struct IterAlignedBlockPairs {
14+
genome_pos: i64,
15+
read_pos: i64,
16+
cigar_index: usize,
17+
cigar: Vec<Cigar>,
18+
}
19+
20+
impl Iterator for IterAlignedBlockPairs {
21+
type Item = ([i64; 2], [i64; 2]);
22+
fn next(&mut self) -> Option<Self::Item> {
23+
while self.cigar_index < self.cigar.len() {
24+
let entry = self.cigar[self.cigar_index];
25+
match entry {
26+
Cigar::Match(len) | Cigar::Equal(len) | Cigar::Diff(len) => {
27+
let qstart = self.read_pos;
28+
let qend = qstart + len as i64;
29+
let rstart = self.genome_pos;
30+
let rend = self.genome_pos + len as i64;
31+
self.read_pos += len as i64;
32+
self.genome_pos += len as i64;
33+
self.cigar_index += 1;
34+
return Some(([qstart, qend], [rstart, rend]));
35+
}
36+
Cigar::Ins(len) | Cigar::SoftClip(len) => {
37+
self.read_pos += len as i64;
38+
}
39+
Cigar::Del(len) | Cigar::RefSkip(len) => {
40+
self.genome_pos += len as i64;
41+
}
42+
Cigar::HardClip(_) => {} // no advance
43+
Cigar::Pad(_) => panic!("Padding (Cigar::Pad) is not supported."), //padding is only used for multiple sequence alignment
44+
}
45+
self.cigar_index += 1;
46+
}
47+
None
48+
}
49+
}
50+
1351
pub struct IterAlignedBlocks {
1452
pos: i64,
1553
cigar_index: usize,
@@ -185,10 +223,22 @@ pub trait BamRecordExtensions {
185223
/// this happens on insertions.
186224
///
187225
/// pysam: blocks
226+
/// See also: [aligned_block_pairs](#tymethod.aligned_block_pairs) if you need
227+
/// the read coordinates as well.
188228
fn aligned_blocks(&self) -> IterAlignedBlocks;
189229

190-
/// Iter intron positions (start, stop)
230+
///Iter over <([read_start, read_stop], [genome_start, genome_stop]) blocks
231+
///of continously aligned reads.
232+
///
233+
///In contrast to [aligned_blocks](#tymethod.aligned_blocks), this returns
234+
///read and genome coordinates.
235+
///In contrast to aligned_pairs, this returns just the start-stop
236+
///coordinates of each block.
191237
///
238+
///There is not necessarily a gap between blocks in either coordinate space
239+
///(this happens in in-dels).
240+
fn aligned_block_pairs(&self) -> IterAlignedBlockPairs;
241+
192242
/// This scans the CIGAR for reference skips
193243
/// and reports their positions.
194244
/// It does not inspect the reported regions
@@ -201,6 +251,11 @@ pub trait BamRecordExtensions {
201251
/// No entry for insertions, deletions or skipped pairs
202252
///
203253
/// pysam: get_aligned_pairs(matches_only = True)
254+
///
255+
/// See also [aligned_block_pairs](#tymethod.aligned_block_pairs)
256+
/// if you just need start&end coordinates of each block.
257+
/// That way you can allocate less memory for the same
258+
/// informational content.
204259
fn aligned_pairs(&self) -> IterAlignedPairs;
205260

206261
/// iter list of read and reference positions on a basepair level.
@@ -275,6 +330,15 @@ impl BamRecordExtensions for bam::Record {
275330
}
276331
}
277332

333+
fn aligned_block_pairs(&self) -> IterAlignedBlockPairs {
334+
IterAlignedBlockPairs {
335+
genome_pos: self.pos(),
336+
read_pos: 0,
337+
cigar: self.cigar().take().0,
338+
cigar_index: 0,
339+
}
340+
}
341+
278342
fn aligned_pairs(&self) -> IterAlignedPairs {
279343
IterAlignedPairs {
280344
genome_pos: self.pos(),
@@ -4690,6 +4754,47 @@ mod tests {
46904754
assert_eq!(none_count(&pairs, 1), 4);
46914755
}
46924756

4757+
#[test]
4758+
fn test_aligned_block_pairs() {
4759+
let mut bam = bam::Reader::from_path("./test/test_spliced_reads.bam").unwrap();
4760+
let mut it = bam.records();
4761+
4762+
let read = it.next().unwrap().unwrap();
4763+
let pairs: Vec<_> = read.aligned_pairs().collect();
4764+
let block_pairs: Vec<_> = read.aligned_block_pairs().collect();
4765+
4766+
//first coordinates identical
4767+
assert_eq!(pairs[0][0], block_pairs[0].0[0]); //read
4768+
assert_eq!(pairs[0][1], block_pairs[0].1[0]); // genomic
4769+
4770+
//end coordinates are + 1, so the ranges are the same...
4771+
assert_eq!(
4772+
pairs[pairs.len() - 1][0],
4773+
block_pairs[block_pairs.len() - 1].0[1] - 1
4774+
);
4775+
assert_eq!(
4776+
pairs[pairs.len() - 1][1],
4777+
block_pairs[block_pairs.len() - 1].1[1] - 1
4778+
);
4779+
4780+
//let's see if they're really identical
4781+
for read in it {
4782+
let read = read.unwrap();
4783+
let pairs: Vec<_> = read.aligned_pairs().collect();
4784+
let block_pairs: Vec<_> = read.aligned_block_pairs().collect();
4785+
let mut ii = 0;
4786+
for ([read_start, read_stop], [genome_start, genome_stop]) in block_pairs {
4787+
assert_eq!(read_stop - read_start, genome_stop - genome_start);
4788+
for (read_pos, genome_pos) in (read_start..read_stop).zip(genome_start..genome_stop)
4789+
{
4790+
assert_eq!(pairs[ii][0], read_pos);
4791+
assert_eq!(pairs[ii][1], genome_pos);
4792+
ii += 1;
4793+
}
4794+
}
4795+
}
4796+
}
4797+
46934798
#[test]
46944799
fn test_get_cigar_stats() {
46954800
let mut bam = bam::Reader::from_path("./test/test_spliced_reads.bam").unwrap();

0 commit comments

Comments
 (0)