Skip to content

Commit 31a4e21

Browse files
feat: get n tokenization workers from the number of CPUs
1 parent 5c06e62 commit 31a4e21

File tree

5 files changed

+17
-8
lines changed

5 files changed

+17
-8
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

core/src/tokenization.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@ impl Tokenization {
1919
max_input_length: usize,
2020
position_offset: usize,
2121
) -> Self {
22+
tracing::info!("Starting {workers} tokenization workers");
23+
2224
// Create channel
2325
let (sender, receiver) = flume::unbounded();
2426

load_tests/load.js

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,8 @@ export const options = {
2626
load_test: {
2727
executor: 'constant-arrival-rate',
2828
duration: '30s',
29-
preAllocatedVUs: 2000,
30-
rate: 500,
29+
preAllocatedVUs: 10000,
30+
rate: 9000,
3131
timeUnit: '1s',
3232
gracefulStop: '1s',
3333
},

router/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ futures = "^0.3"
2626
flume = "0.10.14"
2727
init-tracing-opentelemetry = { version = "0.14.1", features = ["opentelemetry-otlp"] }
2828
hf-hub = { version = "0.3.0", features = ["tokio"] }
29+
num_cpus = "1.16.0"
2930
metrics = "0.21.0"
3031
metrics-exporter-prometheus = { version = "0.12.1", features = [] }
3132
opentelemetry = { version = "0.19.0", features = ["rt-tokio"] }

router/src/main.rs

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -41,10 +41,11 @@ struct Args {
4141
#[clap(long, env)]
4242
revision: Option<String>,
4343

44-
/// The number of tokenizer workers used for payload validation and truncation inside the
45-
/// router.
46-
#[clap(default_value = "8", long, env)]
47-
tokenization_workers: usize,
44+
/// Optionally control the number of tokenizer workers used for payload tokenization, validation
45+
/// and truncation.
46+
/// Default to the number of CPU cores on the machine.
47+
#[clap(long, env)]
48+
tokenization_workers: Option<usize>,
4849

4950
/// The dtype to be forced upon the model.
5051
#[clap(default_value = "float16", long, env, value_enum)]
@@ -175,9 +176,13 @@ async fn main() -> Result<()> {
175176
config.pad_token_id + 1
176177
};
177178

179+
let tokenization_workers = args
180+
.tokenization_workers
181+
.unwrap_or_else(num_cpus::get_physical);
182+
178183
// Tokenization logic
179184
let tokenization = Tokenization::new(
180-
args.tokenization_workers,
185+
tokenization_workers,
181186
tokenizer,
182187
config.max_position_embeddings,
183188
position_offset,
@@ -217,7 +222,7 @@ async fn main() -> Result<()> {
217222
max_concurrent_requests: args.max_concurrent_requests,
218223
max_input_length: config.max_position_embeddings,
219224
max_batch_tokens: args.max_batch_tokens,
220-
tokenization_workers: args.tokenization_workers,
225+
tokenization_workers,
221226
max_batch_requests,
222227
max_client_batch_size: args.max_client_batch_size,
223228
version: env!("CARGO_PKG_VERSION"),

0 commit comments

Comments (0)