Skip to content

Commit 45df4fa

Browse files
kozistr and alvarobartt authored
Support MRL (Matryoshka Representation Learning) (#676)
Co-authored-by: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com>
1 parent 6e900af commit 45df4fa

14 files changed

+628
-3929
lines changed

core/src/infer.rs

Lines changed: 35 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -151,20 +151,16 @@ impl Infer {
151151
panic!("unexpected enum variant")
152152
};
153153

154-
// Timings
155154
let total_time = start_time.elapsed();
156155

157-
// Metrics
158-
let counter = metrics::counter!("te_embed_success");
159-
counter.increment(1);
160-
let histogram = metrics::histogram!("te_embed_duration");
161-
histogram.record(total_time.as_secs_f64());
162-
let histogram = metrics::histogram!("te_embed_tokenization_duration");
163-
histogram.record(response.metadata.tokenization.as_secs_f64());
164-
let histogram = metrics::histogram!("te_embed_queue_duration");
165-
histogram.record(response.metadata.queue.as_secs_f64());
166-
let histogram = metrics::histogram!("te_embed_inference_duration");
167-
histogram.record(response.metadata.inference.as_secs_f64());
156+
metrics::counter!("te_embed_success").increment(1);
157+
metrics::histogram!("te_embed_duration").record(total_time.as_secs_f64());
158+
metrics::histogram!("te_embed_tokenization_duration")
159+
.record(response.metadata.tokenization.as_secs_f64());
160+
metrics::histogram!("te_embed_queue_duration")
161+
.record(response.metadata.queue.as_secs_f64());
162+
metrics::histogram!("te_embed_inference_duration")
163+
.record(response.metadata.inference.as_secs_f64());
168164

169165
Ok(response)
170166
}
@@ -224,6 +220,7 @@ impl Infer {
224220
Ok(response)
225221
}
226222

223+
#[allow(clippy::too_many_arguments)]
227224
#[instrument(skip(self, inputs, permit))]
228225
pub async fn embed_pooled<I: Into<EncodingInput> + std::fmt::Debug>(
229226
&self,
@@ -232,20 +229,31 @@ impl Infer {
232229
truncation_direction: TruncationDirection,
233230
prompt_name: Option<String>,
234231
normalize: bool,
232+
dimensions: Option<usize>,
235233
permit: OwnedSemaphorePermit,
236234
) -> Result<PooledEmbeddingsInferResponse, TextEmbeddingsError> {
237235
let start_time = Instant::now();
238236

239237
if self.is_splade() && normalize {
240238
let counter = metrics::counter!("te_request_failure", "err" => "model_type");
241239
counter.increment(1);
240+
242241
let message = "`normalize` is not available for SPLADE models".to_string();
243242
tracing::error!("{message}");
244243
return Err(TextEmbeddingsError::Backend(BackendError::Inference(
245244
message,
246245
)));
247246
}
248247

248+
if let Some(dimensions) = dimensions {
249+
if dimensions == 0 {
250+
metrics::counter!("te_request_failure", "err" => "validation").increment(1);
251+
let message = "`dimensions` should be positive".to_string();
252+
tracing::error!("{message}");
253+
return Err(TextEmbeddingsError::Validation(message));
254+
}
255+
}
256+
249257
let results = self
250258
.embed(
251259
inputs,
@@ -262,6 +270,21 @@ impl Infer {
262270
panic!("unexpected enum variant")
263271
};
264272

273+
if let Some(mrl_dimensions) = dimensions {
274+
if mrl_dimensions > response.results.len() {
275+
metrics::counter!("te_request_failure", "err" => "validation").increment(1);
276+
277+
let message =
278+
"`dimensions` should be smaller than the maximum embedding dimension."
279+
.to_string();
280+
tracing::error!("{message}");
281+
282+
return Err(TextEmbeddingsError::Validation(message));
283+
}
284+
285+
response.results.truncate(mrl_dimensions);
286+
}
287+
265288
if normalize {
266289
// Normalize embedding
267290
let scale = (1.0

proto/tei.proto

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@ message EmbedRequest {
8080
bool normalize = 3;
8181
TruncationDirection truncation_direction = 4;
8282
optional string prompt_name = 5;
83+
optional uint32 dimensions = 6;
8384
}
8485

8586
message EmbedResponse {

router/src/grpc/server.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@ impl TextEmbeddingsService {
9191
truncation_direction,
9292
request.prompt_name,
9393
request.normalize,
94+
request.dimensions.map(|v| v as usize),
9495
permit,
9596
)
9697
.await

router/src/http/server.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -544,6 +544,7 @@ async fn similarity(
544544
truncation_direction: parameters.truncation_direction,
545545
prompt_name: parameters.prompt_name,
546546
normalize: false,
547+
dimensions: None,
547548
};
548549

549550
// Get embeddings
@@ -611,6 +612,7 @@ async fn embed(
611612
req.truncation_direction.into(),
612613
req.prompt_name,
613614
req.normalize,
615+
req.dimensions,
614616
permit,
615617
)
616618
.await
@@ -679,6 +681,7 @@ async fn embed(
679681
req.truncation_direction.into(),
680682
prompt_name,
681683
req.normalize,
684+
req.dimensions,
682685
permit,
683686
)
684687
.await
@@ -1156,6 +1159,7 @@ async fn openai_embed(
11561159
tokenizers::TruncationDirection::Right,
11571160
None,
11581161
true,
1162+
req.dimensions,
11591163
permit,
11601164
)
11611165
.await
@@ -1228,6 +1232,7 @@ async fn openai_embed(
12281232
tokenizers::TruncationDirection::Right,
12291233
None,
12301234
true,
1235+
req.dimensions,
12311236
permit,
12321237
)
12331238
.await

router/src/http/types.rs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -323,6 +323,8 @@ pub(crate) struct OpenAICompatRequest {
323323
#[schema(default = "float", example = "float")]
324324
#[serde(default)]
325325
pub encoding_format: EncodingFormat,
326+
#[schema(default = "null", example = "null", nullable = true)]
327+
pub dimensions: Option<usize>,
326328
}
327329

328330
#[derive(Serialize, ToSchema)]
@@ -406,12 +408,15 @@ pub(crate) struct SimilarityResponse(pub Vec<f32>);
406408
#[derive(Deserialize, ToSchema)]
407409
pub(crate) struct EmbedRequest {
408410
pub inputs: Input,
411+
409412
#[serde(default)]
410413
#[schema(default = "false", example = "false", nullable = true)]
411414
pub truncate: Option<bool>,
415+
412416
#[serde(default)]
413417
#[schema(default = "right", example = "right")]
414418
pub truncation_direction: TruncationDirection,
419+
415420
/// The name of the prompt that should be used by for encoding. If not set, no prompt
416421
/// will be applied.
417422
///
@@ -423,9 +428,15 @@ pub(crate) struct EmbedRequest {
423428
/// any text to encode.
424429
#[schema(default = "null", example = "null", nullable = true)]
425430
pub prompt_name: Option<String>,
431+
426432
#[serde(default = "default_normalize")]
427433
#[schema(default = "true", example = "true")]
428434
pub normalize: bool,
435+
436+
/// The number of dimensions that the output embeddings should have. If not set, the original
437+
/// shape of the representation will be returned instead.
438+
#[schema(default = "null", example = "null", nullable = true)]
439+
pub dimensions: Option<usize>,
429440
}
430441

431442
fn default_normalize() -> bool {

0 commit comments

Comments
 (0)