Feature/compression type1 #43

Open
wants to merge 2 commits into develop
35 changes: 21 additions & 14 deletions Cargo.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions Cargo.toml
@@ -25,6 +25,7 @@ parquet = { version = "53.0.0", optional = true }
serde = { version = "1.0.210", features = ["derive"], optional = true }
serde_json = { version = "1.0.128", optional = true }
timscompress = {version = "0.1.0", optional=true}
lzf = "1.0.0"

[features]
tdf = ["rusqlite"]
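The Cargo.toml change adds the lzf dependency used by the new decompression path. As a quick illustration (not part of this PR), here is a minimal round-trip sketch assuming the lzf 1.0.0 API (lzf::compress, lzf::decompress, lzf::LzfError), where decompress takes an upper bound on the decompressed size, mirroring the max_peaks_per_scan * 2 * U32_SIZE bound passed in decompress_v1 below.

// Illustrative sketch, not part of this PR: round-trip through the lzf crate
// to show the decompress(data, max_size) signature used by decompress_v1.
fn lzf_roundtrip() -> Result<(), lzf::LzfError> {
    let original = b"scan payload scan payload scan payload scan payload";
    let compressed = lzf::compress(original)?;
    // The second argument is an upper bound on the decompressed size,
    // analogous to max_peaks_per_scan * 2 * U32_SIZE in decompress_v1.
    let decompressed = lzf::decompress(&compressed, original.len())?;
    assert_eq!(decompressed.as_slice(), &original[..]);
    Ok(())
}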
118 changes: 104 additions & 14 deletions src/io/readers/file_readers/tdf_blob_reader.rs
@@ -1,5 +1,6 @@
mod tdf_blobs;

use lzf::decompress as lzf_decompress;
use memmap2::Mmap;
use std::fs::File;
use std::io;
@@ -9,6 +10,7 @@ use zstd::decode_all;
use crate::readers::{TimsTofFileType, TimsTofPathError, TimsTofPathLike};

const U32_SIZE: usize = std::mem::size_of::<u32>();

const HEADER_SIZE: usize = 2;

#[derive(Debug)]
@@ -23,7 +25,14 @@ impl TdfBlobReader {
Ok(reader)
}

pub fn get(&self, offset: usize) -> Result<TdfBlob, TdfBlobReaderError> {
/// Returns a TDF blob containing the decompressed data.
///
/// For `compression_type` 1 the data is decompressed scan by scan with LZF;
/// otherwise it is decoded as a single zstd stream. `max_peaks_per_scan`
/// bounds the decompressed size of a scan and is only relevant for type 1.
pub fn get(
&self,
offset: usize,
compression_type: u8,
max_peaks_per_scan: usize,
) -> Result<TdfBlob, TdfBlobReaderError> {
let offset = self.bin_file_reader.global_file_offset + offset;
let byte_count = self
.bin_file_reader
@@ -36,11 +45,75 @@
if data.len() == 0 {
return Err(TdfBlobReaderError::EmptyData);
}
let bytes =
decode_all(data).map_err(|_| TdfBlobReaderError::Decompression)?;
let blob = TdfBlob::new(bytes)?;
let blob = if compression_type == 1 {
let bytes = self.decompress_v1(offset, data, max_peaks_per_scan)?;
TdfBlob::new(bytes, false)?
} else {
let bytes = decode_all(data)
.map_err(|_| TdfBlobReaderError::Decompression)?;
TdfBlob::new(bytes, true)?
};
Ok(blob)
}

/// Decompresses a TDF blob stored with compression type 1.
/// Essentially a reimplementation of the alphatims implementation.
/// Returns the uncompressed data, laid out as:
/// * scan_count: 4 bytes
/// * scan_indices: scan_count * 4 bytes
/// * scan data: remaining bytes
///
/// # Arguments
/// * `offset` - The offset of the blob in the binary file
/// * `data` - The compressed data
/// * `max_peaks_per_scan` - The maximum number of peaks per scan from the metadata
fn decompress_v1(
&self,
offset: usize,
data: &[u8],
max_peaks_per_scan: usize,
) -> Result<Vec<u8>, TdfBlobReaderError> {
let scan_count = self
.bin_file_reader
.get_scan_count(offset)
.ok_or(TdfBlobReaderError::NoScanCount)?;
let max_peak_count = max_peaks_per_scan * 2;
let scan_offsets = data[..(scan_count + 1) * U32_SIZE]
.chunks_exact(U32_SIZE)
.map(|x| u32::from_le_bytes(x.try_into().unwrap()))
.map(|x| x as usize - HEADER_SIZE * U32_SIZE)
.collect::<Vec<usize>>();
let mut tdf_bytes = vec![];
let mut last_offset = scan_count as u32 + 1;
let mut scan_bytes = last_offset.to_le_bytes().to_vec();
for scan_index in 0..scan_count {
let start = scan_offsets[scan_index];
let end = scan_offsets[scan_index + 1];
if start == end {
scan_bytes.extend(last_offset.to_le_bytes());
continue;
}
let decompressed_bytes =
lzf_decompress(&data[start..end], max_peak_count * U32_SIZE)
.map_err(|_| TdfBlobReaderError::Decompression)?;
if decompressed_bytes.len() % U32_SIZE != 0 {
return Err(TdfBlobReaderError::CorruptData);
}
last_offset += decompressed_bytes.len() as u32 / U32_SIZE as u32;
scan_bytes.extend(last_offset.to_le_bytes());
tdf_bytes.extend(decompressed_bytes);
}
let mut blob_bytes = scan_bytes;
blob_bytes.extend(tdf_bytes);
Ok(blob_bytes)
}
}

#[derive(Debug)]
@@ -68,6 +141,8 @@ impl TdfBinFileReader {
Ok(reader)
}

/// Returns the byte count stored in the first 4 bytes of the blob.
fn get_byte_count(&self, offset: usize) -> Option<usize> {
let start = offset as usize;
let end = start + U32_SIZE as usize;
@@ -77,14 +152,16 @@
Some(byte_count)
}

// fn get_scan_count(&self, offset: usize) -> Option<usize> {
// let start = (offset + U32_SIZE) as usize;
// let end = start + U32_SIZE as usize;
// let raw_scan_count = self.mmap.get(start..end)?;
// let scan_count =
// u32::from_le_bytes(raw_scan_count.try_into().ok()?) as usize;
// Some(scan_count)
// }
/// Returns the scan count stored in the second 4 bytes of the blob.
fn get_scan_count(&self, offset: usize) -> Option<usize> {
let start = (offset + U32_SIZE) as usize;
let end = start + U32_SIZE as usize;
let raw_scan_count = self.mmap.get(start..end)?;
let scan_count =
u32::from_le_bytes(raw_scan_count.try_into().ok()?) as usize;
Some(scan_count)
}

fn get_data(&self, offset: usize, byte_count: usize) -> Option<&[u8]> {
let start = offset + HEADER_SIZE * U32_SIZE;
@@ -109,7 +186,7 @@ impl IndexedTdfBlobReader {
let blob_reader = TdfBlobReader::new(path)?;
let reader = Self {
binary_offsets,
blob_reader: blob_reader,
blob_reader,
};
Ok(reader)
}
@@ -122,7 +199,12 @@
.binary_offsets
.get(index)
.ok_or(IndexedTdfBlobReaderError::InvalidIndex(index))?;
let blob = self.blob_reader.get(offset)?;
let blob = self.blob_reader.get(
offset,
// TODO: Compression type 1 seems to be irrelevant for minitdf. Correct?
// Use compression type 2 (the current scheme) and max_peaks_per_scan = 0,
// which is only relevant for type 1.
2, 0,
)?;
Ok(blob)
}
}
@@ -145,6 +227,14 @@ pub enum TdfBlobReaderError {
TimsTofPathError(#[from] TimsTofPathError),
#[error("No binary file found")]
NoBinary,
#[error("No scan count found")]
NoScanCount,
#[error("No binary size found")]
NoBinarySize,
#[error("Scan offset error")]
ScanOffsetError,
#[error("No scan offsets found")]
NoScanOffsets,
}

#[derive(Debug, thiserror::Error)]
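For context on the new reader path: reading the code above, decompress_v1 appears to produce a buffer that starts with scan_count + 1 little-endian u32 offsets, expressed in u32 units from the start of the buffer, where offsets i and i + 1 delimit the decompressed data of scan i, followed by the concatenated LZF-decompressed scans. The helper below is hypothetical and not part of timsrust; it only illustrates how a consumer could slice out one scan under that assumption.

// Hypothetical helper, not part of timsrust: extract the bytes of one scan
// from a buffer laid out as decompress_v1 appears to produce it.
// Layout assumption: scan_count + 1 little-endian u32 offsets in u32 units,
// then the concatenated decompressed scan data.
fn scan_slice(decompressed: &[u8], scan_count: usize, scan: usize) -> Option<&[u8]> {
    const U32_SIZE: usize = std::mem::size_of::<u32>();
    if scan >= scan_count {
        return None;
    }
    let offset_at = |i: usize| -> Option<usize> {
        let bytes = decompressed.get(i * U32_SIZE..(i + 1) * U32_SIZE)?;
        Some(u32::from_le_bytes(bytes.try_into().ok()?) as usize)
    };
    // Offsets are stored in u32 units, so convert them to byte positions.
    let start = offset_at(scan)? * U32_SIZE;
    let end = offset_at(scan + 1)? * U32_SIZE;
    decompressed.get(start..end)
}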
27 changes: 19 additions & 8 deletions src/io/readers/file_readers/tdf_blob_reader/tdf_blobs.rs
@@ -3,14 +3,15 @@ const BLOB_TYPE_SIZE: usize = std::mem::size_of::<u32>();
#[derive(Clone, Debug, Default, PartialEq)]
pub struct TdfBlob {
bytes: Vec<u8>,
shuffled: bool,
}

impl TdfBlob {
pub fn new(bytes: Vec<u8>) -> Result<Self, TdfBlobError> {
pub fn new(bytes: Vec<u8>, shuffled: bool) -> Result<Self, TdfBlobError> {
if bytes.len() % BLOB_TYPE_SIZE != 0 {
Err(TdfBlobError(bytes.len()))
} else {
Ok(Self { bytes })
Ok(Self { bytes, shuffled })
}
}

@@ -27,12 +28,22 @@ impl TdfBlob {
if index >= self.len() {
None
} else {
Some(Self::concatenate_bytes(
self.bytes[index],
self.bytes[index + self.len()],
self.bytes[index + 2 * self.len()],
self.bytes[index + 3 * self.len()],
))
if self.shuffled {
Some(Self::concatenate_bytes(
self.bytes[index],
self.bytes[index + self.len()],
self.bytes[index + 2 * self.len()],
self.bytes[index + 3 * self.len()],
))
} else {
let index = index * BLOB_TYPE_SIZE;
Some(Self::concatenate_bytes(
self.bytes[index],
self.bytes[index + 1],
self.bytes[index + 2],
self.bytes[index + 3],
))
}
}
}

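Regarding the two branches added to TdfBlob::get: zstd-decoded blobs store their u32 values byte-shuffled (all lowest bytes first, then the next byte plane, and so on), while the type-1 path produces plain little-endian u32s in order. The standalone functions below are a minimal sketch of the two access patterns, assuming concatenate_bytes assembles a little-endian u32; the helper names are illustrative and not part of the crate.

// Illustrative sketch of the two layouts handled by TdfBlob::get; these
// helpers are not part of the crate. `n` is the number of u32 values.
fn get_shuffled(bytes: &[u8], index: usize) -> u32 {
    let n = bytes.len() / 4;
    // Byte-shuffled layout: the four byte planes are stored back to back.
    u32::from_le_bytes([
        bytes[index],
        bytes[index + n],
        bytes[index + 2 * n],
        bytes[index + 3 * n],
    ])
}

fn get_linear(bytes: &[u8], index: usize) -> u32 {
    // Plain little-endian layout, as produced by the compression type 1 path.
    let i = index * 4;
    u32::from_le_bytes([bytes[i], bytes[i + 1], bytes[i + 2], bytes[i + 3]])
}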