Feature/compression type1 #43

Open
wants to merge 2 commits into develop
35 changes: 21 additions & 14 deletions Cargo.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions Cargo.toml
@@ -25,6 +25,7 @@ parquet = { version = "53.0.0", optional = true }
serde = { version = "1.0.210", features = ["derive"], optional = true }
serde_json = { version = "1.0.128", optional = true }
timscompress = {version = "0.1.0", optional=true}
lzf = "1.0.0"

[features]
tdf = ["rusqlite"]
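The Cargo.toml change adds the lzf dependency used by the new decompression path. As a quick illustration (not part of this PR), here is a minimal round-trip sketch assuming the lzf 1.0.0 API (lzf::compress, lzf::decompress, lzf::LzfError), where decompress takes an upper bound on the decompressed size, mirroring the max_peaks_per_scan * 2 * U32_SIZE bound passed in decompress_v1 below.

// Illustrative sketch, not part of this PR: round-trip through the lzf crate
// to show the decompress(data, max_size) signature used by decompress_v1.
fn lzf_roundtrip() -> Result<(), lzf::LzfError> {
    let original = b"scan payload scan payload scan payload scan payload";
    let compressed = lzf::compress(original)?;
    // The second argument is an upper bound on the decompressed size,
    // analogous to max_peaks_per_scan * 2 * U32_SIZE in decompress_v1.
    let decompressed = lzf::decompress(&compressed, original.len())?;
    assert_eq!(decompressed.as_slice(), &original[..]);
    Ok(())
}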
118 changes: 104 additions & 14 deletions src/io/readers/file_readers/tdf_blob_reader.rs
@@ -1,5 +1,6 @@
mod tdf_blobs;

use lzf::decompress as lzf_decompress;
use memmap2::Mmap;
use std::fs::File;
use std::io;
@@ -9,6 +10,7 @@ use zstd::decode_all;
use crate::readers::{TimsTofFileType, TimsTofPathError, TimsTofPathLike};

const U32_SIZE: usize = std::mem::size_of::<u32>();

const HEADER_SIZE: usize = 2;

#[derive(Debug)]
@@ -23,7 +25,14 @@ impl TdfBlobReader {
Ok(reader)
}

pub fn get(&self, offset: usize) -> Result<TdfBlob, TdfBlobReaderError> {
/// Returns a TDF blob containing the decompressed data.
///
/// For `compression_type` 1 the data is decompressed scan by scan with LZF;
/// otherwise it is decoded as a single zstd stream. `max_peaks_per_scan`
/// bounds the decompressed size of a scan and is only relevant for type 1.
pub fn get(
&self,
offset: usize,
compression_type: u8,
max_peaks_per_scan: usize,
) -> Result<TdfBlob, TdfBlobReaderError> {
let offset = self.bin_file_reader.global_file_offset + offset;
let byte_count = self
.bin_file_reader
@@ -36,11 +45,75 @@
if data.len() == 0 {
return Err(TdfBlobReaderError::EmptyData);
}
let bytes =
decode_all(data).map_err(|_| TdfBlobReaderError::Decompression)?;
let blob = TdfBlob::new(bytes)?;
let blob = if compression_type == 1 {
let bytes = self.decompress_v1(offset, data, max_peaks_per_scan)?;
TdfBlob::new(bytes, false)?
} else {
let bytes = decode_all(data)
.map_err(|_| TdfBlobReaderError::Decompression)?;
TdfBlob::new(bytes, true)?
};
Ok(blob)
}

/// Decompresses a TDF blob stored with compression type 1.
/// Essentially a reimplementation of the alphatims implementation.
/// Returns the uncompressed data, laid out as:
/// * scan_count: 4 bytes
/// * scan_indices: scan_count * 4 bytes
/// * scan data: remaining bytes
///
/// # Arguments
/// * `offset` - The offset of the blob in the binary file
/// * `data` - The compressed data
/// * `max_peaks_per_scan` - The maximum number of peaks per scan from the metadata
fn decompress_v1(
&self,
offset: usize,
data: &[u8],
max_peaks_per_scan: usize,
) -> Result<Vec<u8>, TdfBlobReaderError> {
let scan_count = self
.bin_file_reader
.get_scan_count(offset)
.ok_or(TdfBlobReaderError::NoScanCount)?;
let max_peak_count = max_peaks_per_scan * 2;
let scan_offsets = data[..(scan_count + 1) * U32_SIZE]
.chunks_exact(U32_SIZE)
.map(|x| u32::from_le_bytes(x.try_into().unwrap()))
.map(|x| x as usize - HEADER_SIZE * U32_SIZE)
.collect::<Vec<usize>>();
let mut tdf_bytes = vec![];
let mut last_offset = scan_count as u32 + 1;
let mut scan_bytes = last_offset.to_le_bytes().to_vec();
for scan_index in 0..scan_count {
let start = scan_offsets[scan_index];
let end = scan_offsets[scan_index + 1];
if start == end {
scan_bytes.extend(last_offset.to_le_bytes());
continue;
}
let decompressed_bytes =
lzf_decompress(&data[start..end], max_peak_count * U32_SIZE)
.map_err(|_| TdfBlobReaderError::Decompression)?;
if decompressed_bytes.len() % U32_SIZE != 0 {
return Err(TdfBlobReaderError::CorruptData);
}
last_offset += decompressed_bytes.len() as u32 / U32_SIZE as u32;
scan_bytes.extend(last_offset.to_le_bytes());
tdf_bytes.extend(decompressed_bytes);
}
let mut blob_bytes = scan_bytes;
blob_bytes.extend(tdf_bytes);
Ok(blob_bytes)
}
}

#[derive(Debug)]
@@ -68,6 +141,8 @@ impl TdfBinFileReader {
Ok(reader)
}

/// Returns the byte count stored in the first 4 bytes of the blob.
fn get_byte_count(&self, offset: usize) -> Option<usize> {
let start = offset as usize;
let end = start + U32_SIZE as usize;
@@ -77,14 +152,16 @@
Some(byte_count)
}

// fn get_scan_count(&self, offset: usize) -> Option<usize> {
// let start = (offset + U32_SIZE) as usize;
// let end = start + U32_SIZE as usize;
// let raw_scan_count = self.mmap.get(start..end)?;
// let scan_count =
// u32::from_le_bytes(raw_scan_count.try_into().ok()?) as usize;
// Some(scan_count)
// }
/// Returns the scan count stored in the second 4 bytes of the blob.
fn get_scan_count(&self, offset: usize) -> Option<usize> {
let start = (offset + U32_SIZE) as usize;
let end = start + U32_SIZE as usize;
let raw_scan_count = self.mmap.get(start..end)?;
let scan_count =
u32::from_le_bytes(raw_scan_count.try_into().ok()?) as usize;
Some(scan_count)
}

fn get_data(&self, offset: usize, byte_count: usize) -> Option<&[u8]> {
let start = offset + HEADER_SIZE * U32_SIZE;
@@ -109,7 +186,7 @@ impl IndexedTdfBlobReader {
let blob_reader = TdfBlobReader::new(path)?;
let reader = Self {
binary_offsets,
blob_reader: blob_reader,
blob_reader,
};
Ok(reader)
}
@@ -122,7 +199,12 @@
.binary_offsets
.get(index)
.ok_or(IndexedTdfBlobReaderError::InvalidIndex(index))?;
let blob = self.blob_reader.get(offset)?;
let blob = self.blob_reader.get(
offset,
// TODO: Compression type 1 seems to be irrelevant for minitdf. Correct?
// Use compression type 2 (the current scheme) and max_peaks_per_scan = 0,
// which is only relevant for type 1.
2, 0,
)?;
Ok(blob)
}
}
@@ -145,6 +227,14 @@ pub enum TdfBlobReaderError {
TimsTofPathError(#[from] TimsTofPathError),
#[error("No binary file found")]
NoBinary,
#[error("No scan count found")]
NoScanCount,
#[error("No binary size found")]
NoBinarySize,
#[error("Scan offset error")]
ScanOffsetError,
#[error("No scan offsets found")]
NoScanOffsets,
}

#[derive(Debug, thiserror::Error)]
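For context on the new reader path: reading the code above, decompress_v1 appears to produce a buffer that starts with scan_count + 1 little-endian u32 offsets, expressed in u32 units from the start of the buffer, where offsets i and i + 1 delimit the decompressed data of scan i, followed by the concatenated LZF-decompressed scans. The helper below is hypothetical and not part of timsrust; it only illustrates how a consumer could slice out one scan under that assumption.

// Hypothetical helper, not part of timsrust: extract the bytes of one scan
// from a buffer laid out as decompress_v1 appears to produce it.
// Layout assumption: scan_count + 1 little-endian u32 offsets in u32 units,
// then the concatenated decompressed scan data.
fn scan_slice(decompressed: &[u8], scan_count: usize, scan: usize) -> Option<&[u8]> {
    const U32_SIZE: usize = std::mem::size_of::<u32>();
    if scan >= scan_count {
        return None;
    }
    let offset_at = |i: usize| -> Option<usize> {
        let bytes = decompressed.get(i * U32_SIZE..(i + 1) * U32_SIZE)?;
        Some(u32::from_le_bytes(bytes.try_into().ok()?) as usize)
    };
    // Offsets are stored in u32 units, so convert them to byte positions.
    let start = offset_at(scan)? * U32_SIZE;
    let end = offset_at(scan + 1)? * U32_SIZE;
    decompressed.get(start..end)
}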
27 changes: 19 additions & 8 deletions src/io/readers/file_readers/tdf_blob_reader/tdf_blobs.rs
@@ -3,14 +3,15 @@ const BLOB_TYPE_SIZE: usize = std::mem::size_of::<u32>();
#[derive(Clone, Debug, Default, PartialEq)]
pub struct TdfBlob {
bytes: Vec<u8>,
shuffled: bool,
}

impl TdfBlob {
pub fn new(bytes: Vec<u8>) -> Result<Self, TdfBlobError> {
pub fn new(bytes: Vec<u8>, shuffled: bool) -> Result<Self, TdfBlobError> {
if bytes.len() % BLOB_TYPE_SIZE != 0 {
Err(TdfBlobError(bytes.len()))
} else {
Ok(Self { bytes })
Ok(Self { bytes, shuffled })
}
}

@@ -27,12 +28,22 @@ impl TdfBlob {
if index >= self.len() {
None
} else {
Some(Self::concatenate_bytes(
self.bytes[index],
self.bytes[index + self.len()],
self.bytes[index + 2 * self.len()],
self.bytes[index + 3 * self.len()],
))
if self.shuffled {
Some(Self::concatenate_bytes(
self.bytes[index],
self.bytes[index + self.len()],
self.bytes[index + 2 * self.len()],
self.bytes[index + 3 * self.len()],
))
} else {
let index = index * BLOB_TYPE_SIZE;
Some(Self::concatenate_bytes(
self.bytes[index],
self.bytes[index + 1],
self.bytes[index + 2],
self.bytes[index + 3],
))
}
}
}

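Regarding the two branches added to TdfBlob::get: zstd-decoded blobs store their u32 values byte-shuffled (all lowest bytes first, then the next byte plane, and so on), while the type-1 path produces plain little-endian u32s in order. The standalone functions below are a minimal sketch of the two access patterns, assuming concatenate_bytes assembles a little-endian u32; the helper names are illustrative and not part of the crate.

// Illustrative sketch of the two layouts handled by TdfBlob::get; these
// helpers are not part of the crate. `n` is the number of u32 values.
fn get_shuffled(bytes: &[u8], index: usize) -> u32 {
    let n = bytes.len() / 4;
    // Byte-shuffled layout: the four byte planes are stored back to back.
    u32::from_le_bytes([
        bytes[index],
        bytes[index + n],
        bytes[index + 2 * n],
        bytes[index + 3 * n],
    ])
}

fn get_linear(bytes: &[u8], index: usize) -> u32 {
    // Plain little-endian layout, as produced by the compression type 1 path.
    let i = index * 4;
    u32::from_le_bytes([bytes[i], bytes[i + 1], bytes[i + 2], bytes[i + 3]])
}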