Skip to content

Commit de703c3

Browse files
Speed up sequence retrieval by avoiding bounds checking for decoded bases (impossible to fail). Offer entirely unchecked variant of base decoding.
1 parent 1c2c259 commit de703c3

File tree

3 files changed

+40
-13
lines changed

3 files changed

+40
-13
lines changed

src/bam/buffer.rs

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
// except according to those terms.
55

66
use std::collections::{vec_deque, VecDeque};
7+
use std::mem;
78
use std::rc::Rc;
89
use std::str;
910

@@ -23,6 +24,7 @@ pub struct RecordBuffer {
2324
overflow: Option<Rc<bam::Record>>,
2425
cache_cigar: bool,
2526
min_refetch_distance: u64,
27+
buffer_record: Rc<bam::Record>,
2628
}
2729

2830
unsafe impl Sync for RecordBuffer {}
@@ -42,6 +44,7 @@ impl RecordBuffer {
4244
overflow: None,
4345
cache_cigar,
4446
min_refetch_distance: 1,
47+
buffer_record: Rc::new(bam::Record::new()),
4548
}
4649
}
4750

@@ -107,26 +110,29 @@ impl RecordBuffer {
107110

108111
// extend to the right
109112
loop {
110-
let mut record = Rc::new(bam::Record::new());
111-
if self
113+
match self
112114
.reader
113-
.read(Rc::get_mut(&mut record).unwrap())
114-
.is_none()
115+
.read(Rc::get_mut(&mut self.buffer_record).unwrap())
115116
{
116-
break;
117+
None => break,
118+
Some(res) => res?,
117119
}
118120

119-
if record.is_unmapped() {
121+
if self.buffer_record.is_unmapped() {
120122
continue;
121123
}
122124

123-
let pos = record.pos();
125+
let pos = self.buffer_record.pos();
124126

125127
// skip records before the start
126128
if pos < start as i64 {
127129
continue;
128130
}
129131

132+
// Record is kept, do not reuse it for next iteration
133+
// and thus create a new one.
134+
let mut record = mem::replace(&mut self.buffer_record, Rc::new(bam::Record::new()));
135+
130136
if self.cache_cigar {
131137
Rc::get_mut(&mut record).unwrap().cache_cigar();
132138
}

src/bam/record.rs

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -717,7 +717,7 @@ impl SequenceRead for Record {
717717
}
718718

719719
fn base(&self, i: usize) -> u8 {
720-
decode_base(encoded_base(self.seq_data(), i))
720+
*decode_base_unchecked(encoded_base(self.seq_data(), i))
721721
}
722722

723723
fn base_qual(&self, i: usize) -> u8 {
@@ -820,12 +820,19 @@ static ENCODE_BASE: [u8; 256] = [
820820
15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
821821
];
822822

823+
#[inline]
823824
fn encoded_base(encoded_seq: &[u8], i: usize) -> u8 {
824825
(encoded_seq[i / 2] >> ((!i & 1) << 2)) & 0b1111
825826
}
826827

827-
fn decode_base(base: u8) -> u8 {
828-
DECODE_BASE[base as usize]
828+
#[inline]
829+
unsafe fn encoded_base_unchecked(encoded_seq: &[u8], i: usize) -> u8 {
830+
(encoded_seq.get_unchecked(i / 2) >> ((!i & 1) << 2)) & 0b1111
831+
}
832+
833+
#[inline]
834+
fn decode_base_unchecked(base: u8) -> &'static u8 {
835+
unsafe { DECODE_BASE.get_unchecked(base as usize) }
829836
}
830837

831838
/// The sequence of a record.
@@ -842,6 +849,20 @@ impl<'a> Seq<'a> {
842849
encoded_base(self.encoded, i)
843850
}
844851

852+
/// Return encoded base. Complexity: O(1).
853+
#[inline]
854+
pub unsafe fn encoded_base_unchecked(&self, i: usize) -> u8 {
855+
encoded_base_unchecked(self.encoded, i)
856+
}
857+
858+
/// Obtain decoded base without performing bounds checking.
859+
/// Use index based access seq()[i], for checked, safe access.
860+
/// Complexity: O(1).
861+
#[inline]
862+
pub unsafe fn decoded_base_unchecked(&self, i: usize) -> u8 {
863+
*decode_base_unchecked(self.encoded_base_unchecked(i))
864+
}
865+
845866
/// Return decoded sequence. Complexity: O(m) with m being the read length.
846867
pub fn as_bytes(&self) -> Vec<u8> {
847868
(0..self.len()).map(|i| self[i]).collect()
@@ -862,7 +883,7 @@ impl<'a> ops::Index<usize> for Seq<'a> {
862883

863884
/// Return decoded base at given position within read. Complexity: O(1).
864885
fn index(&self, index: usize) -> &u8 {
865-
&DECODE_BASE[self.encoded_base(index) as usize]
886+
decode_base_unchecked(self.encoded_base(index))
866887
}
867888
}
868889

src/bcf/mod.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1327,7 +1327,7 @@ mod tests {
13271327
let mut reader = Reader::from_path("test/test_svlen.vcf").unwrap();
13281328

13291329
let mut record = reader.empty_record();
1330-
reader.read(&mut record).unwrap();
1330+
reader.read(&mut record).unwrap().unwrap();
13311331

13321332
assert_eq!(record.info(b"SVLEN").integer().unwrap(), Some(&[-127][..]));
13331333
}
@@ -1366,7 +1366,7 @@ mod tests {
13661366
fn test_alt_allele_dosage() {
13671367
let path = &"test/test_string.vcf";
13681368
let mut bcf = Reader::from_path(path).expect("Error opening file.");
1369-
let header = bcf.header();
1369+
let _header = bcf.header();
13701370
// FORMAT fields of first record of the vcf should look like:
13711371
// GT:FS1:FN1 ./1:LongString1:1 1/1:ss1:2
13721372
let mut first_record = bcf.records().next().unwrap().expect("Fail to read record");

0 commit comments

Comments
 (0)