
Support MRL (Matryoshka Representation Learning) #676


Open · wants to merge 6 commits into main

Changes from 5 commits
57 changes: 33 additions & 24 deletions core/src/infer.rs
@@ -151,20 +151,16 @@ impl Infer {
panic!("unexpected enum variant")
};

// Timings
let total_time = start_time.elapsed();

// Metrics
let counter = metrics::counter!("te_embed_success");
counter.increment(1);
let histogram = metrics::histogram!("te_embed_duration");
histogram.record(total_time.as_secs_f64());
let histogram = metrics::histogram!("te_embed_tokenization_duration");
histogram.record(response.metadata.tokenization.as_secs_f64());
let histogram = metrics::histogram!("te_embed_queue_duration");
histogram.record(response.metadata.queue.as_secs_f64());
let histogram = metrics::histogram!("te_embed_inference_duration");
histogram.record(response.metadata.inference.as_secs_f64());
metrics::counter!("te_embed_success").increment(1);
metrics::histogram!("te_embed_duration").record(total_time.as_secs_f64());
metrics::histogram!("te_embed_tokenization_duration")
.record(response.metadata.tokenization.as_secs_f64());
metrics::histogram!("te_embed_queue_duration")
.record(response.metadata.queue.as_secs_f64());
metrics::histogram!("te_embed_inference_duration")
.record(response.metadata.inference.as_secs_f64());

Ok(response)
}
@@ -224,6 +220,7 @@ impl Infer {
Ok(response)
}

#[allow(clippy::too_many_arguments)]
#[instrument(skip(self, inputs, permit))]
pub async fn embed_pooled<I: Into<EncodingInput> + std::fmt::Debug>(
&self,
@@ -232,20 +229,29 @@
truncation_direction: TruncationDirection,
prompt_name: Option<String>,
normalize: bool,
dimensions: Option<usize>,
permit: OwnedSemaphorePermit,
) -> Result<PooledEmbeddingsInferResponse, TextEmbeddingsError> {
let start_time = Instant::now();

if self.is_splade() && normalize {
let counter = metrics::counter!("te_request_failure", "err" => "model_type");
counter.increment(1);
metrics::counter!("te_request_failure", "err" => "model_type").increment(1);
let message = "`normalize` is not available for SPLADE models".to_string();
tracing::error!("{message}");
return Err(TextEmbeddingsError::Backend(BackendError::Inference(
message,
)));
}

if let Some(dimensions) = dimensions {
if dimensions == 0 {
metrics::counter!("te_request_failure", "err" => "validation").increment(1);
let message = "`dimensions` should be positive".to_string();
tracing::error!("{message}");
return Err(TextEmbeddingsError::Validation(message));
}
}

let results = self
.embed(
inputs,
@@ -262,6 +268,11 @@
panic!("unexpected enum variant")
};

if let Some(mrl_dimensions) = dimensions {
let mrl_dimensions = mrl_dimensions.min(response.results.len());
response.results.truncate(mrl_dimensions);
}

if normalize {
// Normalize embedding
let scale = (1.0
@@ -283,16 +294,14 @@
let total_time = start_time.elapsed();

// Metrics
let counter = metrics::counter!("te_embed_success");
counter.increment(1);
let histogram = metrics::histogram!("te_embed_duration");
histogram.record(total_time.as_secs_f64());
let histogram = metrics::histogram!("te_embed_tokenization_duration");
histogram.record(response.metadata.tokenization.as_secs_f64());
let histogram = metrics::histogram!("te_embed_queue_duration");
histogram.record(response.metadata.queue.as_secs_f64());
let histogram = metrics::histogram!("te_embed_inference_duration");
histogram.record(response.metadata.inference.as_secs_f64());
metrics::counter!("te_embed_success").increment(1);
metrics::histogram!("te_embed_duration").record(total_time.as_secs_f64());
metrics::histogram!("te_embed_tokenization_duration")
.record(response.metadata.tokenization.as_secs_f64());
metrics::histogram!("te_embed_queue_duration")
.record(response.metadata.queue.as_secs_f64());
metrics::histogram!("te_embed_inference_duration")
.record(response.metadata.inference.as_secs_f64());

Ok(response)
}
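For intuition, here is a minimal, self-contained sketch of the Matryoshka step that `embed_pooled` now performs: the pooled embedding is truncated to the requested number of dimensions, capped at the model's native size, and then re-normalized so the shortened vector keeps unit L2 norm. This helper and the standalone setting are illustrative, not code from this PR.

/// Illustrative helper mirroring the new `dimensions` handling in `embed_pooled`:
/// truncate the pooled embedding, then re-normalize it (when requested).
fn truncate_and_normalize(mut embedding: Vec<f32>, dimensions: Option<usize>, normalize: bool) -> Vec<f32> {
    if let Some(dims) = dimensions {
        // Never keep more values than the model actually produced.
        embedding.truncate(dims.min(embedding.len()));
    }
    if normalize {
        // Re-scale so the truncated vector has unit L2 norm, as in the diff above.
        let norm = embedding.iter().map(|v| v * v).sum::<f32>().sqrt();
        if norm > 0.0 {
            embedding.iter_mut().for_each(|v| *v /= norm);
        }
    }
    embedding
}

fn main() {
    let pooled = vec![0.5_f32, 0.5, 0.5, 0.5];
    let mrl = truncate_and_normalize(pooled, Some(2), true);
    let norm: f32 = mrl.iter().map(|v| v * v).sum::<f32>().sqrt();
    assert!((norm - 1.0).abs() < 1e-6); // still unit length after truncation
    println!("{mrl:?}"); // ≈ [0.7071, 0.7071]
}

Truncating before normalizing matters: the shortened prefix is re-scaled to unit length, so cosine similarities on the truncated embeddings remain well defined.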
1 change: 1 addition & 0 deletions proto/tei.proto
@@ -80,6 +80,7 @@ message EmbedRequest {
bool normalize = 3;
TruncationDirection truncation_direction = 4;
optional string prompt_name = 5;
optional uint32 dimensions = 6;
}

message EmbedResponse {
1 change: 1 addition & 0 deletions router/src/grpc/server.rs
@@ -91,6 +91,7 @@ impl TextEmbeddingsService {
truncation_direction,
request.prompt_name,
request.normalize,
request.dimensions.map(|v| v as usize),
permit,
)
.await
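For the gRPC surface, the following is a hedged sketch of the request type a prost-generated client would build; the struct below is a stand-in written for this example, not the actual generated code. An `optional uint32` proto field maps to `Option<u32>`, and the handler above forwards it as `request.dimensions.map(|v| v as usize)`.

// Stand-in for the prost-generated EmbedRequest (illustrative only):
// `optional uint32 dimensions = 6;` becomes an `Option<u32>` field, so an
// absent value keeps the embedding at the model's native size.
#[derive(Debug, Default)]
struct EmbedRequest {
    normalize: bool,
    dimensions: Option<u32>,
    // remaining tei.proto fields omitted from this sketch
}

fn main() {
    let request = EmbedRequest {
        normalize: true,
        dimensions: Some(256), // ask for a 256-dimensional Matryoshka embedding
        ..Default::default()
    };
    // The router forwards this as `request.dimensions.map(|v| v as usize)`,
    // matching the grpc/server.rs change above.
    println!("{request:?}");
}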
5 changes: 5 additions & 0 deletions router/src/http/server.rs
@@ -544,6 +544,7 @@ async fn similarity(
truncation_direction: parameters.truncation_direction,
prompt_name: parameters.prompt_name,
normalize: false,
dimensions: None,
};

// Get embeddings
@@ -611,6 +612,7 @@ async fn embed(
req.truncation_direction.into(),
req.prompt_name,
req.normalize,
req.dimensions,
permit,
)
.await
@@ -679,6 +681,7 @@ async fn embed(
req.truncation_direction.into(),
prompt_name,
req.normalize,
req.dimensions,
permit,
)
.await
@@ -1156,6 +1159,7 @@ async fn openai_embed(
tokenizers::TruncationDirection::Right,
None,
true,
req.dimensions,
permit,
)
.await
@@ -1228,6 +1232,7 @@ async fn openai_embed(
tokenizers::TruncationDirection::Right,
None,
true,
req.dimensions,
permit,
)
.await
11 changes: 11 additions & 0 deletions router/src/http/types.rs
@@ -323,6 +323,8 @@ pub(crate) struct OpenAICompatRequest {
#[schema(default = "float", example = "float")]
#[serde(default)]
pub encoding_format: EncodingFormat,
#[schema(default = "null", example = "null", nullable = true)]
pub dimensions: Option<usize>,
}

#[derive(Serialize, ToSchema)]
@@ -406,12 +408,15 @@ pub(crate) struct SimilarityResponse(pub Vec<f32>);
#[derive(Deserialize, ToSchema)]
pub(crate) struct EmbedRequest {
pub inputs: Input,

#[serde(default)]
#[schema(default = "false", example = "false", nullable = true)]
pub truncate: Option<bool>,

#[serde(default)]
#[schema(default = "right", example = "right")]
pub truncation_direction: TruncationDirection,

/// The name of the prompt that should be used for encoding. If not set, no prompt
/// will be applied.
///
@@ -423,9 +428,15 @@ pub(crate) struct EmbedRequest {
/// any text to encode.
#[schema(default = "null", example = "null", nullable = true)]
pub prompt_name: Option<String>,

#[serde(default = "default_normalize")]
#[schema(default = "true", example = "true")]
pub normalize: bool,

/// The number of dimensions the resulting output embeddings should have. If not set, the original
/// shape of the representation will be returned.
#[schema(default = "null", example = "null", nullable = true)]
pub dimensions: Option<usize>,
}

fn default_normalize() -> bool {
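On the HTTP side, the field is optional on both EmbedRequest and OpenAICompatRequest (where it mirrors the `dimensions` parameter of the OpenAI embeddings API). Below is a hedged example of a request body a client might send; the serde_json usage and the sample input text are illustrative, not taken from this PR.

use serde_json::json;

fn main() {
    // Illustrative /embed request body: `dimensions` is optional, and leaving
    // it out returns embeddings at the model's native size.
    let body = json!({
        "inputs": "What is Matryoshka Representation Learning?",
        "normalize": true,
        "dimensions": 256
    });
    println!("{}", serde_json::to_string_pretty(&body).unwrap());
}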