From 05ee782056b7149dc6777b558f2efdea69cc3343 Mon Sep 17 00:00:00 2001 From: Manuel Landesfeind Date: Wed, 6 Nov 2019 17:30:10 +0100 Subject: [PATCH 1/7] Add basic functionality to read indexed FASTA files --- src/faidx/mod.rs | 90 ++++++++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 3 +- wrapper.h | 1 + 3 files changed, 93 insertions(+), 1 deletion(-) create mode 100644 src/faidx/mod.rs diff --git a/src/faidx/mod.rs b/src/faidx/mod.rs new file mode 100644 index 000000000..b3cbb2c02 --- /dev/null +++ b/src/faidx/mod.rs @@ -0,0 +1,90 @@ +// Copyright 2019 Manuel Landesfeind, Evotec International GmbH +// Licensed under the MIT license (http://opensource.org/licenses/MIT) +// This file may not be copied, modified, or distributed +// except according to those terms. + +//! +//! Module for working with faidx-indexed FASTA files. +//! + +use std::ffi; +use std::path::Path; +use url::Url; + +use crate::htslib; + +pub mod errors; +pub use errors::{Error, Result}; + +fn path_as_bytes<'a, P: 'a + AsRef>(path: P, must_exist: bool) -> Result> { + if path.as_ref().exists() || !must_exist { + Ok(path + .as_ref() + .to_str() + .ok_or(Error::NonUnicodePath)? + .as_bytes() + .to_owned()) + } else { + Err(Error::FileNotFound { + path: path.as_ref().to_owned(), + }) + } +} +/// A Fasta reader. +#[derive(Debug)] +pub struct Reader { + inner: *mut htslib::faidx_t +} + +impl Reader { + /// Create a new Reader from path. + /// + /// # Arguments + /// + /// * `path` - the path to open. + pub fn from_path>(path: P) -> Result { + Self::new(&path_as_bytes(path, true)?) + } + + /// Create a new Reader from URL. + pub fn from_url(url: &Url) -> Result { + Self::new(url.as_str().as_bytes()) + } + + /// Create a new Reader. + /// + /// # Arguments + /// + /// * `path` - the path to open + fn new(path: &[u8]) -> Result { + let cpath = ffi::CString::new(path).unwrap(); + let inner = unsafe { htslib::fai_load(cpath.as_ptr()) }; + Ok(Self { inner }) + } + + /// Fetches the sequence and returns it + /// + /// # Arguments + /// + /// * `name` - the name of the template sequence (e.g., "chr1") + /// * `begin` - the offset within the template sequence (starting with 0) + /// * `end` - the end position to return + pub fn fetch_seq>(&self, name: N, begin: usize, end: usize) -> String { + let cname = ffi::CString::new(name.as_ref().as_bytes()).unwrap(); + let len_out: i32 = 0; + let cseq = unsafe { + let ptr = htslib::faidx_fetch_seq( + self.inner, //*const faidx_t, + cname.as_ptr(), // c_name + begin as ::std::os::raw::c_int, // p_beg_i + end as ::std::os::raw::c_int, // p_end_i + &mut (len_out as ::std::os::raw::c_int) //len + ); + ffi::CStr::from_ptr(ptr) + }; + + cseq.to_str().unwrap().to_owned() + } +} + + diff --git a/src/lib.rs b/src/lib.rs index f07980e2c..8dd6762bd 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -97,5 +97,6 @@ extern crate snafu; pub mod bam; pub mod bcf; pub mod htslib; +pub mod faidx; pub mod tbx; -pub mod utils; +mod utils; diff --git a/wrapper.h b/wrapper.h index 46f4fea2d..8f9161995 100644 --- a/wrapper.h +++ b/wrapper.h @@ -7,6 +7,7 @@ #include "htslib/htslib/tbx.h" #include "htslib/htslib/synced_bcf_reader.h" #include "htslib/htslib/kbitset.h" +#include "htslib/htslib/faidx.h" // The following functions have to be wrapped here because they are inline in htslib. From 4478e1d90e36cb532fa0c9f09682da1ef3378295 Mon Sep 17 00:00:00 2001 From: Manuel Landesfeind Date: Fri, 8 Nov 2019 16:09:25 +0100 Subject: [PATCH 2/7] Add missing file --- src/faidx/errors.rs | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 src/faidx/errors.rs diff --git a/src/faidx/errors.rs b/src/faidx/errors.rs new file mode 100644 index 000000000..d0ae44627 --- /dev/null +++ b/src/faidx/errors.rs @@ -0,0 +1,14 @@ +use snafu::Snafu; +use std::path::PathBuf; + +pub type Result = std::result::Result; + +#[derive(Snafu, Debug, PartialEq)] +#[snafu(visibility = "pub")] +pub enum Error { + #[snafu(display("file not found: {}", path.display()))] + FileNotFound { path: PathBuf }, + #[snafu(display("invalid (non-unique) characters in path"))] + NonUnicodePath, +} + From bf6fd6e1412a06364ff3b4c8f1740f220a901af5 Mon Sep 17 00:00:00 2001 From: Manuel Landesfeind Date: Wed, 6 Nov 2019 17:30:10 +0100 Subject: [PATCH 3/7] Add basic functionality to read indexed FASTA files --- hts-sys/wrapper.h | 1 + src/faidx/mod.rs | 90 +++++++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 3 +- 3 files changed, 93 insertions(+), 1 deletion(-) create mode 100644 src/faidx/mod.rs diff --git a/hts-sys/wrapper.h b/hts-sys/wrapper.h index 46f4fea2d..8f9161995 100644 --- a/hts-sys/wrapper.h +++ b/hts-sys/wrapper.h @@ -7,6 +7,7 @@ #include "htslib/htslib/tbx.h" #include "htslib/htslib/synced_bcf_reader.h" #include "htslib/htslib/kbitset.h" +#include "htslib/htslib/faidx.h" // The following functions have to be wrapped here because they are inline in htslib. diff --git a/src/faidx/mod.rs b/src/faidx/mod.rs new file mode 100644 index 000000000..b3cbb2c02 --- /dev/null +++ b/src/faidx/mod.rs @@ -0,0 +1,90 @@ +// Copyright 2019 Manuel Landesfeind, Evotec International GmbH +// Licensed under the MIT license (http://opensource.org/licenses/MIT) +// This file may not be copied, modified, or distributed +// except according to those terms. + +//! +//! Module for working with faidx-indexed FASTA files. +//! + +use std::ffi; +use std::path::Path; +use url::Url; + +use crate::htslib; + +pub mod errors; +pub use errors::{Error, Result}; + +fn path_as_bytes<'a, P: 'a + AsRef>(path: P, must_exist: bool) -> Result> { + if path.as_ref().exists() || !must_exist { + Ok(path + .as_ref() + .to_str() + .ok_or(Error::NonUnicodePath)? + .as_bytes() + .to_owned()) + } else { + Err(Error::FileNotFound { + path: path.as_ref().to_owned(), + }) + } +} +/// A Fasta reader. +#[derive(Debug)] +pub struct Reader { + inner: *mut htslib::faidx_t +} + +impl Reader { + /// Create a new Reader from path. + /// + /// # Arguments + /// + /// * `path` - the path to open. + pub fn from_path>(path: P) -> Result { + Self::new(&path_as_bytes(path, true)?) + } + + /// Create a new Reader from URL. + pub fn from_url(url: &Url) -> Result { + Self::new(url.as_str().as_bytes()) + } + + /// Create a new Reader. + /// + /// # Arguments + /// + /// * `path` - the path to open + fn new(path: &[u8]) -> Result { + let cpath = ffi::CString::new(path).unwrap(); + let inner = unsafe { htslib::fai_load(cpath.as_ptr()) }; + Ok(Self { inner }) + } + + /// Fetches the sequence and returns it + /// + /// # Arguments + /// + /// * `name` - the name of the template sequence (e.g., "chr1") + /// * `begin` - the offset within the template sequence (starting with 0) + /// * `end` - the end position to return + pub fn fetch_seq>(&self, name: N, begin: usize, end: usize) -> String { + let cname = ffi::CString::new(name.as_ref().as_bytes()).unwrap(); + let len_out: i32 = 0; + let cseq = unsafe { + let ptr = htslib::faidx_fetch_seq( + self.inner, //*const faidx_t, + cname.as_ptr(), // c_name + begin as ::std::os::raw::c_int, // p_beg_i + end as ::std::os::raw::c_int, // p_end_i + &mut (len_out as ::std::os::raw::c_int) //len + ); + ffi::CStr::from_ptr(ptr) + }; + + cseq.to_str().unwrap().to_owned() + } +} + + diff --git a/src/lib.rs b/src/lib.rs index 9dbcc350b..628bcfe0c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -92,5 +92,6 @@ extern crate snafu; pub mod bam; pub mod bcf; pub mod htslib; +pub mod faidx; pub mod tbx; -pub mod utils; +mod utils; From ab53edcb5103ea6d579a5d68a8ca4c8edc02f263 Mon Sep 17 00:00:00 2001 From: Manuel Landesfeind Date: Fri, 8 Nov 2019 16:09:25 +0100 Subject: [PATCH 4/7] Add missing file --- src/faidx/errors.rs | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 src/faidx/errors.rs diff --git a/src/faidx/errors.rs b/src/faidx/errors.rs new file mode 100644 index 000000000..d0ae44627 --- /dev/null +++ b/src/faidx/errors.rs @@ -0,0 +1,14 @@ +use snafu::Snafu; +use std::path::PathBuf; + +pub type Result = std::result::Result; + +#[derive(Snafu, Debug, PartialEq)] +#[snafu(visibility = "pub")] +pub enum Error { + #[snafu(display("file not found: {}", path.display()))] + FileNotFound { path: PathBuf }, + #[snafu(display("invalid (non-unique) characters in path"))] + NonUnicodePath, +} + From 21a7899b153fb6810ef2bbc4c28689416fce9deb Mon Sep 17 00:00:00 2001 From: Manuel Landesfeind Date: Sun, 3 May 2020 16:27:23 +0200 Subject: [PATCH 5/7] Apply cargo fmt --- src/faidx/errors.rs | 1 - src/faidx/mod.rs | 44 +++++++++++++++++++++----------------------- src/lib.rs | 2 +- 3 files changed, 22 insertions(+), 25 deletions(-) diff --git a/src/faidx/errors.rs b/src/faidx/errors.rs index d0ae44627..da717ec46 100644 --- a/src/faidx/errors.rs +++ b/src/faidx/errors.rs @@ -11,4 +11,3 @@ pub enum Error { #[snafu(display("invalid (non-unique) characters in path"))] NonUnicodePath, } - diff --git a/src/faidx/mod.rs b/src/faidx/mod.rs index b3cbb2c02..1772fd68e 100644 --- a/src/faidx/mod.rs +++ b/src/faidx/mod.rs @@ -33,7 +33,7 @@ fn path_as_bytes<'a, P: 'a + AsRef>(path: P, must_exist: bool) -> Result>(path: P) -> Result { + pub fn from_path>(path: P) -> Result { Self::new(&path_as_bytes(path, true)?) } /// Create a new Reader from URL. - pub fn from_url(url: &Url) -> Result { + pub fn from_url(url: &Url) -> Result { Self::new(url.as_str().as_bytes()) } @@ -56,10 +56,10 @@ impl Reader { /// # Arguments /// /// * `path` - the path to open - fn new(path: &[u8]) -> Result { - let cpath = ffi::CString::new(path).unwrap(); - let inner = unsafe { htslib::fai_load(cpath.as_ptr()) }; - Ok(Self { inner }) + fn new(path: &[u8]) -> Result { + let cpath = ffi::CString::new(path).unwrap(); + let inner = unsafe { htslib::fai_load(cpath.as_ptr()) }; + Ok(Self { inner }) } /// Fetches the sequence and returns it @@ -70,21 +70,19 @@ impl Reader { /// * `begin` - the offset within the template sequence (starting with 0) /// * `end` - the end position to return pub fn fetch_seq>(&self, name: N, begin: usize, end: usize) -> String { - let cname = ffi::CString::new(name.as_ref().as_bytes()).unwrap(); - let len_out: i32 = 0; - let cseq = unsafe { - let ptr = htslib::faidx_fetch_seq( - self.inner, //*const faidx_t, - cname.as_ptr(), // c_name - begin as ::std::os::raw::c_int, // p_beg_i - end as ::std::os::raw::c_int, // p_end_i - &mut (len_out as ::std::os::raw::c_int) //len - ); - ffi::CStr::from_ptr(ptr) - }; - - cseq.to_str().unwrap().to_owned() + let cname = ffi::CString::new(name.as_ref().as_bytes()).unwrap(); + let len_out: i32 = 0; + let cseq = unsafe { + let ptr = htslib::faidx_fetch_seq( + self.inner, //*const faidx_t, + cname.as_ptr(), // c_name + begin as ::std::os::raw::c_int, // p_beg_i + end as ::std::os::raw::c_int, // p_end_i + &mut (len_out as ::std::os::raw::c_int), //len + ); + ffi::CStr::from_ptr(ptr) + }; + + cseq.to_str().unwrap().to_owned() } } - - diff --git a/src/lib.rs b/src/lib.rs index 628bcfe0c..3a251c2c0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -91,7 +91,7 @@ extern crate snafu; pub mod bam; pub mod bcf; -pub mod htslib; pub mod faidx; +pub mod htslib; pub mod tbx; mod utils; From 4c120357a924916554c56d2f2c95f9495fb51549 Mon Sep 17 00:00:00 2001 From: Manuel Landesfeind Date: Sun, 3 May 2020 16:57:03 +0200 Subject: [PATCH 6/7] Add test for Faidx loading --- src/faidx/mod.rs | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/src/faidx/mod.rs b/src/faidx/mod.rs index 1772fd68e..e1f9d01e2 100644 --- a/src/faidx/mod.rs +++ b/src/faidx/mod.rs @@ -86,3 +86,49 @@ impl Reader { cseq.to_str().unwrap().to_owned() } } + + +#[cfg(test)] +mod tests { + use super::*; + + fn open_reader() -> Reader { + Reader::from_path(format!("{}/test/test_cram.fa", env!("CARGO_MANIFEST_DIR"))).ok().unwrap() + } + + #[test] + fn faidx_open() { + } + + #[test] + fn faidx_read_chr_first_base() { + let r = open_reader(); + let seq = r.fetch_seq("chr1", 0 , 0); + assert_eq!(seq.len(), 1); + assert_eq!(seq, "G"); + } + + #[test] + fn faidx_read_chr_start() { + let r = open_reader(); + let seq = r.fetch_seq("chr1", 0 , 9); + assert_eq!(seq.len(), 10); + assert_eq!(seq, "GGGCACAGCC"); + } + + #[test] + fn faidx_read_chr_between() { + let r = open_reader(); + let seq = r.fetch_seq("chr1", 4 , 14); + assert_eq!(seq.len(), 11); + assert_eq!(seq, "ACAGCCTCACC"); + } + + #[test] + fn faidx_read_chr_end() { + let r = open_reader(); + let seq = r.fetch_seq("chr1", 110, 120); + assert_eq!(seq.len(), 10); + assert_eq!(seq, "CCCCTCCGTG"); + } +} From d10fe5dff6f0ba5e116a957ef24f80921eef60b9 Mon Sep 17 00:00:00 2001 From: Manuel Landesfeind Date: Sun, 10 May 2020 20:00:31 +0200 Subject: [PATCH 7/7] Add test to read twice from the same faidx --- src/faidx/mod.rs | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/faidx/mod.rs b/src/faidx/mod.rs index e1f9d01e2..8b3f81912 100644 --- a/src/faidx/mod.rs +++ b/src/faidx/mod.rs @@ -131,4 +131,16 @@ mod tests { assert_eq!(seq.len(), 10); assert_eq!(seq, "CCCCTCCGTG"); } + + #[test] + fn faidx_read_twice() { + let r = open_reader(); + let seq = r.fetch_seq("chr1", 110, 120); + assert_eq!(seq.len(), 10); + assert_eq!(seq, "CCCCTCCGTG"); + + let seq = r.fetch_seq("chr1", 5, 9); + assert_eq!(seq.len(), 5); + assert_eq!(seq, "CAGCC"); + } }