add support for compression type 1 #42

Draft · wants to merge 1 commit into develop
35 changes: 21 additions & 14 deletions Cargo.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions Cargo.toml
@@ -25,6 +25,7 @@ parquet = { version = "53.0.0", optional = true }
serde = { version = "1.0.210", features = ["derive"], optional = true }
serde_json = { version = "1.0.128", optional = true }
timscompress = {version = "0.1.0", optional=true}
lzf = "1.0.0"

[features]
tdf = ["rusqlite"]
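The new `lzf` dependency exposes a pair of free functions for LZF compression. A minimal round-trip sketch (standalone, assuming the `lzf` 1.0 crate's `compress`/`decompress` API, the same `decompress` used later in this diff):

```rust
fn main() {
    // Repetitive input compresses well; lzf::compress reports an error
    // (NoCompressionPossible) for input it cannot shrink.
    let input = b"aaaaaaaaaaaaaaaabbbbbbbbbbbbbbbb";
    let compressed = lzf::compress(input).expect("input should be compressible");
    // decompress takes an upper bound on the decompressed size
    let decompressed =
        lzf::decompress(&compressed, input.len()).expect("valid LZF stream");
    assert_eq!(decompressed.as_slice(), &input[..]);
}
```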
191 changes: 176 additions & 15 deletions src/io/readers/file_readers/tdf_blob_reader.rs
@@ -1,5 +1,6 @@
mod tdf_blobs;

use lzf::decompress as lzf_decompress;
use memmap2::Mmap;
use std::fs::File;
use std::io;
@@ -9,6 +10,7 @@ use zstd::decode_all;
use crate::readers::{TimsTofFileType, TimsTofPathError, TimsTofPathLike};

const U32_SIZE: usize = std::mem::size_of::<u32>();

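// The blob header holds two u32 fields: byte_count and scan_count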
const HEADER_SIZE: usize = 2;

#[derive(Debug)]
@@ -23,7 +25,14 @@ impl TdfBlobReader {
Ok(reader)
}

pub fn get(&self, offset: usize) -> Result<TdfBlob, TdfBlobReaderError> {
/// Returns a TDF blob containing the decompressed data.
///
/// Frames with `compression_type == 1` are rebuilt via `decompress_v1`;
/// all other frames are decoded as a single zstd stream.
pub fn get(
&self,
offset: usize,
compression_type: u8,
max_peaks_per_scan: usize,
) -> Result<TdfBlob, TdfBlobReaderError> {
let offset = self.bin_file_reader.global_file_offset + offset;
let byte_count = self
.bin_file_reader
@@ -36,10 +45,126 @@
if data.len() == 0 {
return Err(TdfBlobReaderError::EmptyData);
}
let bytes =
decode_all(data).map_err(|_| TdfBlobReaderError::Decompression)?;
let blob = TdfBlob::new(bytes)?;
Ok(blob)
if compression_type == 1 {
let bytes = self.decompress_v1(
offset,
byte_count,
data,
max_peaks_per_scan,
)?;
let blob = TdfBlob::new(bytes)?;
Ok(blob)
} else {
let bytes = decode_all(data)
.map_err(|_| TdfBlobReaderError::Decompression)?;
let blob = TdfBlob::new(bytes)?;
Ok(blob)
}
}

/// Decompresses a TDF blob compressed with compression type 1.
/// Essentially a reimplementation of the alphatims version.
/// Returns the uncompressed frame in the layout:
/// * scan_count: 4 bytes
/// * scan_indices: scan_count * 4 bytes
/// * scans: remaining bytes
///
/// # Arguments
/// * `offset` - The offset of the blob in the binary file
/// * `byte_count` - The size of the blob payload in bytes
/// * `data` - The compressed data
/// * `max_peaks_per_scan` - The maximum number of peaks per scan, from the metadata
fn decompress_v1(
&self,
offset: usize,
byte_count: usize,
data: &[u8],
max_peaks_per_scan: usize,
) -> Result<Vec<u8>, TdfBlobReaderError> {
// bin_size = int.from_bytes(infile.read(4), "little")
// bin_size == byte_count

// scan_count = int.from_bytes(infile.read(4), "little")
let scan_count = self
.bin_file_reader
.get_scan_count(offset)
.ok_or(TdfBlobReaderError::NoScanCount)?;

// TODO: frame_end - frame_start should be equal to bin_size/byte_count?
// max_peak_count = min(
// max_peaks_per_scan,
// frame_end - frame_start
// )
let max_peak_count = std::cmp::min(max_peaks_per_scan, byte_count);

// compression_offset = 8 + (scan_count + 1) * 4
let compression_offset = (scan_count + 1) * U32_SIZE;

// TODO: For some reason the scan offsets are i32, not u32. Convert to u32, then to usize for easier indexing
// scan_offsets = np.frombuffer(
// infile.read((scan_count + 1) * 4),
// dtype=np.int32
// ) - compression_offset
let mut scan_offsets = self
.bin_file_reader
.get_scan_offsets(offset, scan_count)
.ok_or(TdfBlobReaderError::CorruptData)?;
for x in scan_offsets.iter_mut() {
*x -= compression_offset;
}

// capacity estimate: bin_size + scan_count + scan_offsets + decompressed scan data
let tdf_bytes_capacity = U32_SIZE + U32_SIZE + scan_offsets.len() * U32_SIZE + scan_count * max_peak_count;

// this is basically the uncompressed frame
// scan_count: 4 bytes
// scan_indices: (scan_count) * 4 bytes
// scan: remaining bytes
let mut tdf_bytes = Vec::with_capacity(tdf_bytes_capacity);
// the uncompressed frame leads with scan_count as a little-endian u32
// (usize::to_le_bytes would emit 8 bytes on 64-bit targets)
tdf_bytes.extend_from_slice(&(scan_count as u32).to_le_bytes());

let mut scan_indexes: Vec<u8> = Vec::with_capacity(scan_count * U32_SIZE);
let mut scans: Vec<u8> = Vec::with_capacity(byte_count);

let mut scan_start: u32 = 0;
// for scan_index in range(scan_count):
for scan_index in 0..scan_count {
//start = scan_offsets[scan_index]
let start = scan_offsets[scan_index];

//end = scan_offsets[scan_index + 1]
let end = scan_offsets[scan_index + 1];

//if start == end:
// continue
if start == end {
continue;
}

//decompressed_bytes = lzf.decompress(
// compressed_data[start: end],
// max_peak_count * 4 * 2
//)
let mut decompressed_bytes = lzf_decompress(
&data[start..end],
max_peak_count * U32_SIZE * 2,
)
.map_err(|_| TdfBlobReaderError::Decompression)?;

if decompressed_bytes.len() % U32_SIZE != 0 {
return Err(TdfBlobReaderError::CorruptData);
}

scan_indexes.extend_from_slice(&scan_start.to_le_bytes());
// advance the running start offset for the next scan (prefix sum);
// plain assignment would record each scan's length instead of its start
scan_start += decompressed_bytes.len() as u32;
scans.append(&mut decompressed_bytes);
}

tdf_bytes.append(&mut scan_indexes);
tdf_bytes.append(&mut scans);

Ok(tdf_bytes)
}
}

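For reference, a hedged sketch (hypothetical helper, not part of this diff) of how a consumer could walk the frame layout documented on `decompress_v1` — scan_count, then scan_indices, then the scan data:

```rust
/// Hypothetical helper, not part of this PR: parses the layout documented
/// on `decompress_v1` (scan_count, scan_indices, scans).
fn read_v1_frame(blob_bytes: &[u8]) -> Option<(u32, Vec<u32>, &[u8])> {
    const U32_SIZE: usize = std::mem::size_of::<u32>();
    // scan_count: first 4 bytes, little-endian
    let scan_count =
        u32::from_le_bytes(blob_bytes.get(..U32_SIZE)?.try_into().ok()?);
    // scan_indices: scan_count * 4 bytes, little-endian
    let indices_end = U32_SIZE + scan_count as usize * U32_SIZE;
    let scan_indices: Vec<u32> = blob_bytes
        .get(U32_SIZE..indices_end)?
        .chunks_exact(U32_SIZE)
        .map(|chunk| u32::from_le_bytes(chunk.try_into().unwrap()))
        .collect();
    // scans: remaining bytes
    let scans = blob_bytes.get(indices_end..)?;
    Some((scan_count, scan_indices, scans))
}
```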
@@ -68,6 +193,8 @@ impl TdfBinFileReader {
Ok(reader)
}

/// Returns the byte count, stored in the first 4 bytes of the blob
fn get_byte_count(&self, offset: usize) -> Option<usize> {
let start = offset as usize;
let end = start + U32_SIZE as usize;
@@ -77,14 +204,33 @@
Some(byte_count)
}

// fn get_scan_count(&self, offset: usize) -> Option<usize> {
// let start = (offset + U32_SIZE) as usize;
// let end = start + U32_SIZE as usize;
// let raw_scan_count = self.mmap.get(start..end)?;
// let scan_count =
// u32::from_le_bytes(raw_scan_count.try_into().ok()?) as usize;
// Some(scan_count)
// }
/// Returns the scan count, stored in the second 4 bytes of the blob
fn get_scan_count(&self, offset: usize) -> Option<usize> {
let start = (offset + U32_SIZE) as usize;
let end = start + U32_SIZE as usize;
let raw_scan_count = self.mmap.get(start..end)?;
let scan_count =
u32::from_le_bytes(raw_scan_count.try_into().ok()?) as usize;
Some(scan_count)
}

/// Returns the scan offsets: (scan_count + 1) little-endian u32 values
/// starting at the third 4-byte word of the blob
fn get_scan_offsets(&self, offset: usize, scan_count: usize) -> Option<Vec<usize>> {
let start = offset + U32_SIZE * 2;
let end = start + U32_SIZE * (scan_count + 1);
let raw_scan_offsets = self.mmap.get(start..end)?;
if raw_scan_offsets.len() % U32_SIZE != 0 {
return None;
}
let scan_offsets = raw_scan_offsets
.chunks_exact(U32_SIZE)
.map(|x| u32::from_le_bytes(x.try_into().unwrap()))
.map(|x| x as usize)
.collect::<Vec<usize>>();
Some(scan_offsets)
}

fn get_data(&self, offset: usize, byte_count: usize) -> Option<&[u8]> {
let start = offset + HEADER_SIZE * U32_SIZE;
@@ -106,10 +252,11 @@
path: impl TimsTofPathLike,
binary_offsets: Vec<usize>,
) -> Result<Self, IndexedTdfBlobReaderError> {

let blob_reader = TdfBlobReader::new(path)?;
let reader = Self {
binary_offsets,
blob_reader: blob_reader,
blob_reader,
};
Ok(reader)
}
Expand All @@ -122,7 +269,13 @@ impl IndexedTdfBlobReader {
.binary_offsets
.get(index)
.ok_or(IndexedTdfBlobReaderError::InvalidIndex(index))?;
let blob = self.blob_reader.get(offset)?;
let blob = self.blob_reader.get(
offset,
// TODO: Compression type 1 seems to be irrelevant for minitdf. Correct?
// Use compression type 2 (the current scheme); max_peaks_per_scan is
// only read for type 1, so 0 is fine here.
2,
0,
)?;
Ok(blob)
}
}
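For the full-TDF path (not shown in this hunk), the new parameters would come from the frame metadata. A sketch of a hypothetical call site; the metadata key name is an assumption, not part of this PR:

```rust
// Hypothetical call site, not part of this PR:
fn read_frame(
    reader: &TdfBlobReader,
    offset: usize,
    compression_type: u8,      // from the analysis.tdf metadata
    max_peaks_per_scan: usize, // e.g. a MaxNumPeaksPerScan-style metadata entry
) -> Result<TdfBlob, TdfBlobReaderError> {
    // max_peaks_per_scan is only read for compression type 1;
    // for type 2 any value (e.g. 0) works.
    reader.get(offset, compression_type, max_peaks_per_scan)
}
```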
@@ -145,6 +298,14 @@ pub enum TdfBlobReaderError {
TimsTofPathError(#[from] TimsTofPathError),
#[error("No binary file found")]
NoBinary,
#[error("No scan count found")]
NoScanCount,
#[error("No binary size found")]
NoBinarySize,
#[error("Scan offset error")]
ScanOffsetError,
#[error("No scan offsets found")]
NoScanOffsets,
}
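With the new variants, a caller can distinguish type 1 specific failures from generic decompression errors. A brief hypothetical sketch, not part of this PR:

```rust
// Hypothetical error reporting, not part of this PR:
fn describe(err: &TdfBlobReaderError) -> &'static str {
    match err {
        TdfBlobReaderError::NoScanCount => "type 1 frame header lacks a scan count",
        TdfBlobReaderError::NoScanOffsets => "type 1 scan offset table is missing",
        TdfBlobReaderError::Decompression => "LZF/zstd decoding failed",
        TdfBlobReaderError::CorruptData => "blob contents are inconsistent",
        _ => "other blob reader error",
    }
}
```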

#[derive(Debug, thiserror::Error)]