
Commit b90ce0e

feat: prefetch batches
1 parent c7010dd commit b90ce0e

8 files changed: 220 additions & 226 deletions

Cargo.lock

Lines changed: 110 additions & 165 deletions
Some generated files are not rendered by default.

backends/Cargo.toml

Lines changed: 1 addition & 1 deletion

@@ -6,7 +6,7 @@ authors.workspace = true
 homepage.workspace = true

 [dependencies]
-flume = "^0.10"
+flume = "^0.11"
 clap = { version = "4.1.4", features = ["derive"], optional = true }
 text-embeddings-backend-core = { path = "core" }
 text-embeddings-backend-python = { path = "python", optional = true }

backends/candle/Cargo.toml

Lines changed: 2 additions & 3 deletions

@@ -16,14 +16,13 @@ candle-flash-attn-v1 = { git = "https://github.com/huggingface/candle-flash-attn
 candle-cublaslt = { git = "https://github.com/huggingface/candle-cublaslt", rev = "07e1a5490211e25ed0d096a2b21d3c607666eaae", optional = true }
 candle-layer-norm = { git = "https://github.com/huggingface/candle-layer-norm", rev = "5ed96012a693dff9685320765dd55a57fdaecdd6", optional = true }
 lazy_static = "^1.4"
-flume = "^0.10"
 text-embeddings-backend-core = { path = "../core" }
 tracing = "^0.1"
-safetensors = "^0.3"
+safetensors = "^0.4"
 thiserror = "^1.0"
 serde = { version = "^1.0", features = ["serde_derive"] }
 serde_json = "^1.0"
-memmap2 = "^0.7"
+memmap2 = "^0.9"

 [build-dependencies]
 anyhow = { version = "1", features = ["backtrace"] }

core/Cargo.toml

Lines changed: 2 additions & 2 deletions

@@ -6,11 +6,11 @@ authors.workspace = true
 homepage.workspace = true

 [dependencies]
-flume = "^0.10"
+flume = "^0.11"
 hf-hub = { version = "^0.3.0", features = ["tokio"] }
 metrics = "^0.21"
 text-embeddings-backend = { path = "../backends" }
 thiserror = "^1.0"
-tokenizers = { version = "^0.13", default-features=false, features=["onig"] }
+tokenizers = { version = "^0.14", default-features=false, features=["onig"] }
 tracing = "^0.1"
 tokio = { version = "^1.25", features = ["rt", "rt-multi-thread", "parking_lot", "sync"] }

core/src/infer.rs

Lines changed: 12 additions & 3 deletions

@@ -29,6 +29,12 @@ impl Infer {
     ) -> Self {
         let notify_batching_task = Arc::new(Notify::new());

+        // Create two batching tasks to prefetch batches
+        tokio::spawn(batching_task(
+            backend.clone(),
+            queue.clone(),
+            notify_batching_task.clone(),
+        ));
         tokio::spawn(batching_task(
             backend.clone(),
             queue.clone(),
@@ -86,7 +92,6 @@ impl Infer {
             metadata: Metadata {
                 response_tx,
                 span: Span::current(),
-                temp_span: None,
                 tokenization: start_time.elapsed(),
                 queue_time: Instant::now(),
                 batch_time: None,
@@ -133,12 +138,16 @@
     }
 }

+#[instrument(skip_all)]
 async fn batching_task(backend: Backend, queue: Queue, notify: Arc<Notify>) {
     loop {
         notify.notified().await;

         while let Some(batch) = queue.next_batch().await {
-            match backend.embed(batch.1).await {
+            let results = backend.embed(batch.1).await;
+
+            // Handle sending responses in another thread to not starve the model
+            tokio::task::spawn_blocking(move || match results {
                 Ok(embeddings) => {
                     batch.0.into_iter().zip(embeddings).for_each(|(m, e)| {
                         let _ = m.response_tx.send(Ok(InferResponse {
@@ -160,7 +169,7 @@ async fn batching_task(backend: Backend, queue: Queue, notify: Arc<Notify>) {
                     let _ = m.response_tx.send(Err(err.clone()));
                 });
             }
-            }
+            });
         }
     }
 }
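
The two spawns in Infer::new above are the core of the prefetching change: both batching tasks wait on the same Notify and pull from the same Queue, so one task can assemble and submit the next batch while the other is still waiting on the backend, and responses are handed off to a blocking thread so the embedding loop is not starved. Below is a minimal standalone sketch of that pattern; the flume channel mirrors the queue crate already used by the repo, while the batch type and the sleep standing in for backend.embed are illustrative assumptions, not the project's API.

use tokio::time::{sleep, Duration};

// Each worker drains batches from a shared queue; running two of them lets the
// next batch be picked up while the previous one is still "embedding".
async fn batching_task(id: usize, queue: flume::Receiver<Vec<u32>>) {
    while let Ok(batch) = queue.recv_async().await {
        // Stand-in for `backend.embed(batch).await`.
        sleep(Duration::from_millis(50)).await;

        // Send results from a blocking thread so this loop can immediately
        // pull the next batch, mirroring the `spawn_blocking` in infer.rs.
        tokio::task::spawn_blocking(move || {
            println!("worker {id} finished a batch of {} ids", batch.len());
        });
    }
}

#[tokio::main]
async fn main() {
    let (tx, rx) = flume::unbounded();

    // Two identical workers over the same queue, as in `Infer::new`.
    for id in 0..2 {
        tokio::spawn(batching_task(id, rx.clone()));
    }

    for batch in [vec![1u32, 2, 3], vec![4, 5], vec![6]] {
        tx.send(batch).unwrap();
    }
    drop(tx);

    // Give the toy workers time to drain the queue before exiting.
    sleep(Duration::from_millis(300)).await;
}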

core/src/queue.rs

Lines changed: 80 additions & 44 deletions

@@ -1,7 +1,9 @@
 use crate::infer::InferResponse;
 use crate::tokenization::Encoding;
+use std::alloc::{alloc, Layout};
 use std::cmp::max;
 use std::collections::VecDeque;
+use std::ptr;
 use std::time::{Duration, Instant};
 use text_embeddings_backend::{BackendError, Batch};
 use tokio::sync::oneshot;
@@ -23,8 +25,6 @@ pub struct Metadata {
     pub response_tx: oneshot::Sender<Result<InferResponse, BackendError>>,
     /// Span that will live as long as entry
     pub span: Span,
-    /// Temporary span used as a guard when logging inference, wait times...
-    pub temp_span: Option<Span>,
     /// Tokenization duration
     pub tokenization: Duration,
     /// Instant when this entry was queued
@@ -43,16 +43,22 @@ pub struct Queue {
 }

 impl Queue {
-    pub fn new(max_batch_tokens: usize, max_batch_requests: Option<usize>) -> Self {
+    pub fn new(
+        max_batch_tokens: usize,
+        max_batch_requests: Option<usize>,
+        max_concurrent_requests: usize,
+    ) -> Self {
         // Create channels
         let (queue_sender, queue_receiver) = flume::unbounded();

-        // Launch background queue task
-        tokio::spawn(queue_task(
-            max_batch_tokens,
-            max_batch_requests,
-            queue_receiver,
-        ));
+        tokio::task::spawn_blocking(move || {
+            queue_task(
+                max_batch_tokens,
+                max_batch_requests,
+                max_concurrent_requests,
+                queue_receiver,
+            )
+        });

         Self { queue_sender }
     }
@@ -89,16 +95,17 @@ impl Queue {
     }
 }

 // Background task responsible of the queue state
-async fn queue_task(
+fn queue_task(
     max_batch_tokens: usize,
     max_batch_requests: Option<usize>,
+    max_concurrent_requests: usize,
     queue_receiver: flume::Receiver<QueueCommand>,
 ) {
-    let capacity = max_batch_requests.unwrap_or(512);
+    let capacity = max_batch_requests.unwrap_or(max_concurrent_requests);

-    let mut entries: VecDeque<Entry> = VecDeque::with_capacity(512);
+    let mut entries: VecDeque<Entry> = VecDeque::with_capacity(max_concurrent_requests);

-    while let Ok(cmd) = queue_receiver.recv_async().await {
+    while let Ok(cmd) = queue_receiver.recv() {
         match cmd {
             QueueCommand::Append(entry, span) => {
                 let _span = span.entered();
@@ -108,23 +115,23 @@ async fn queue_task(
             QueueCommand::NextBatch {
                 response_sender,
                 span,
-            } => {
+            } => unsafe {
                 let _span = span.entered();

-                let next_batch_span = info_span!(parent: None, "batch", batch_size = tracing::field::Empty, tokens = tracing::field::Empty);
-                next_batch_span.follows_from(Span::current());
-
                 let mut metadata = Vec::with_capacity(capacity);

-                let mut input_ids = Vec::with_capacity(max_batch_tokens);
-                let mut token_type_ids = Vec::with_capacity(max_batch_tokens);
-                let mut position_ids = Vec::with_capacity(max_batch_tokens);
+                let raw_input_ids = raw_u32_vec(max_batch_tokens);
+                let raw_token_type_ids = raw_u32_vec(max_batch_tokens);
+                let raw_position_ids = raw_u32_vec(max_batch_tokens);
+
                 let mut cu_seq_lengths = Vec::with_capacity(capacity);
                 cu_seq_lengths.push(0);

                 let mut current_tokens = 0;
                 let mut max_length = 0;

+                let batch_time = Instant::now();
+
                 while let Some(mut entry) = entries.pop_front() {
                     // Filter entries where the response receiver was dropped (== entries where the request
                     // was dropped by the client)
@@ -141,37 +148,59 @@ async fn queue_task(
                     }

                     max_length = max(max_length, entry_tokens as u32);
-                    current_tokens += entry_tokens;
-
-                    // Create a new span to link the batch back to this entry
-                    let entry_batch_span = info_span!(parent: &entry.metadata.span, "infer");
-                    // Add relationships
-                    next_batch_span.follows_from(&entry_batch_span);
-                    entry_batch_span.follows_from(&next_batch_span);

-                    entry.metadata.batch_time = Some(Instant::now());
-                    entry.metadata.temp_span = Some(entry_batch_span);
-
-                    metadata.push(entry.metadata);
-                    input_ids.extend(entry.encoding.input_ids);
-                    token_type_ids.extend(entry.encoding.token_type_ids);
-                    position_ids.extend(entry.encoding.position_ids);
-                    cu_seq_lengths.push(current_tokens as u32);
+                    entry.metadata.batch_time = Some(batch_time);
+
+                    {
+                        let _span = info_span!("extend").entered();
+
+                        ptr::copy(
+                            entry.encoding.input_ids.as_mut_ptr(),
+                            raw_input_ids.add(current_tokens),
+                            entry.encoding.input_ids.len(),
+                        );
+                        ptr::copy(
+                            entry.encoding.token_type_ids.as_mut_ptr(),
+                            raw_token_type_ids.add(current_tokens),
+                            entry.encoding.token_type_ids.len(),
+                        );
+                        ptr::copy(
+                            entry.encoding.position_ids.as_mut_ptr(),
+                            raw_position_ids.add(current_tokens),
+                            entry.encoding.position_ids.len(),
+                        );
+
+                        // input_ids.extend_from_slice(entry.encoding.input_ids.as_slice());
+                        // token_type_ids.extend_from_slice(entry.encoding.token_type_ids.as_slice());
+                        // position_ids.extend_from_slice(entry.encoding.position_ids.as_slice());
+
+                        // for i in 0..entry.encoding.input_ids.len() {
+                        //     input_ids.push(entry.encoding.input_ids[i]);
+                        //     token_type_ids.push(entry.encoding.token_type_ids[i]);
+                        //     position_ids.push(entry.encoding.position_ids[i]);
+                        // }
+
+                        current_tokens += entry_tokens;
+                        metadata.push(entry.metadata);
+                        cu_seq_lengths.push(current_tokens as u32);
+                    }

                     if Some(metadata.len()) == max_batch_requests {
                         break;
                     }
                 }

+                let input_ids =
+                    Vec::from_raw_parts(raw_input_ids, current_tokens, max_batch_tokens);
+                let token_type_ids =
+                    Vec::from_raw_parts(raw_token_type_ids, current_tokens, max_batch_tokens);
+                let position_ids =
+                    Vec::from_raw_parts(raw_position_ids, current_tokens, max_batch_tokens);
+
+                let batch_size = metadata.len();
                 let next_batch = if metadata.is_empty() {
                     None
                 } else {
-                    next_batch_span.record("batch_size", metadata.len() as u32);
-                    next_batch_span.record("tokens", current_tokens as u32);
-
-                    metrics::histogram!("te_batch_next_size", metadata.len() as f64);
-                    metrics::histogram!("te_batch_next_tokens", current_tokens as f64);
-
                     Some((
                         metadata,
                         Batch {
@@ -181,18 +210,25 @@ async fn queue_task(
                             cumulative_seq_lengths: cu_seq_lengths,
                             max_length,
                         },
-                        next_batch_span,
                     ))
                 };

                 let _ = response_sender.send(next_batch);
+
+                metrics::histogram!("te_batch_next_size", batch_size as f64);
+                metrics::histogram!("te_batch_next_tokens", current_tokens as f64);
                 metrics::gauge!("te_queue_size", entries.len() as f64);
-            }
+            },
         }
     }
 }

-type NextBatch = (Vec<Metadata>, Batch, Span);
+unsafe fn raw_u32_vec(capacity: usize) -> *mut u32 {
+    let layout = Layout::array::<u32>(capacity).unwrap();
+    alloc(layout).cast::<u32>()
+}
+
+type NextBatch = (Vec<Metadata>, Batch);

 #[derive(Debug)]
 enum QueueCommand {
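
The batch-building loop in queue_task above swaps Vec::with_capacity plus extend for manual writes: an uninitialized buffer is allocated with the same layout a Vec<u32> of that capacity would use, each entry's ids are copied in at the current offset with ptr::copy, and the buffer is finally adopted by Vec::from_raw_parts with the number of tokens actually written. A self-contained sketch of that pattern follows; the null check is an extra guard added for the sketch, and the buffer size and sample data are illustrative only.

use std::alloc::{alloc, handle_alloc_error, Layout};
use std::ptr;

/// Allocate an uninitialized u32 buffer with the layout a `Vec<u32>` of
/// `capacity` elements uses, so `Vec::from_raw_parts` can take it over later.
unsafe fn raw_u32_vec(capacity: usize) -> *mut u32 {
    let layout = Layout::array::<u32>(capacity).unwrap();
    let ptr = alloc(layout).cast::<u32>();
    if ptr.is_null() {
        // Extra guard for the sketch; the commit assumes allocation succeeds.
        handle_alloc_error(layout);
    }
    ptr
}

fn main() {
    let max_batch_tokens = 8;
    let first = vec![101u32, 2023, 102];
    let second = vec![101u32, 102];

    let input_ids = unsafe {
        let raw = raw_u32_vec(max_batch_tokens);
        let mut current_tokens = 0;

        // Append each entry's ids at the current offset, as the queue task
        // does with `ptr::copy` instead of `Vec::extend`.
        for ids in [&first, &second] {
            ptr::copy(ids.as_ptr(), raw.add(current_tokens), ids.len());
            current_tokens += ids.len();
        }

        // Hand the buffer to a Vec: `current_tokens` initialized elements,
        // capacity matching the layout used for the allocation.
        Vec::from_raw_parts(raw, current_tokens, max_batch_tokens)
    };

    assert_eq!(input_ids, [101, 2023, 102, 101, 102]);
}

The layout produced by Layout::array::<u32>(capacity) is exactly what Vec::<u32>::with_capacity(capacity) would request from the global allocator, which is what makes the later Vec::from_raw_parts call sound as long as the length passed in never exceeds that capacity.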

router/Cargo.toml

Lines changed: 8 additions & 7 deletions

@@ -23,26 +23,27 @@ text-embeddings-backend = { path = "../backends", features = ["clap"] }
 text-embeddings-core = { path = "../core" }
 clap = { version = "4.1.4", features = ["derive", "env"] }
 futures = "^0.3"
-flume = "0.10.14"
+flume = "0.11.0"
 init-tracing-opentelemetry = { version = "0.14.1", features = ["opentelemetry-otlp"] }
 hf-hub = { version = "0.3.0", features = ["tokio"] }
 num_cpus = "1.16.0"
 metrics = "0.21.0"
 metrics-exporter-prometheus = { version = "0.12.1", features = [] }
-opentelemetry = { version = "0.19.0", features = ["rt-tokio"] }
-opentelemetry-otlp = "0.12.0"
+opentelemetry = { version = "0.20.0", features = ["rt-tokio"] }
+opentelemetry-otlp = "0.13.0"
 reqwest = { version = "0.11.14", features = [] }
 serde = "1.0.152"
 serde_json = "1.0.93"
 thiserror = "1.0.38"
-tokenizers = { version = "^0.13", default-features=false, features=["onig"] }
+tokenizers = { version = "0.14.1", default-features=false, features=["onig"] }
 tokio = { version = "1.25.0", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync"] }
 tower-http = { version = "0.4.0", features = ["cors"] }
 tracing = "0.1.37"
-tracing-opentelemetry = "0.19.0"
+tracing-chrome = "0.7.1"
+tracing-opentelemetry = "0.21.0"
 tracing-subscriber = { version = "0.3.16", features = ["json", "env-filter"] }
-utoipa = { version = "3.0.1", features = ["axum_extras"] }
-utoipa-swagger-ui = { version = "3.0.2", features = ["axum"] }
+utoipa = { version = "4.0.0", features = ["axum_extras"] }
+utoipa-swagger-ui = { version = "4.0.0", features = ["axum"] }

 [build-dependencies]
 vergen = { version = "8.0.0", features = ["build", "git", "gitcl"] }

router/src/main.rs

Lines changed: 5 additions & 1 deletion

@@ -209,7 +209,11 @@ async fn main() -> Result<()> {
     });

     // Queue logic
-    let queue = Queue::new(args.max_batch_tokens, max_batch_requests);
+    let queue = Queue::new(
+        args.max_batch_tokens,
+        max_batch_requests,
+        args.max_concurrent_requests,
+    );

     // Create infer task
     let infer = Infer::new(tokenization, queue, args.max_concurrent_requests, backend);
